In [36]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

In [37]:
# Load the voter summary for the earlier rounds. The preview below shows the
# columns round_num, voter, donated_usd and num_projects for rounds 1-19.
# NOTE(review): presumably one row per (round, voter) pair -- confirm against the export query.
gg_voters = pd.read_csv('data/gg_voters_summary_may23_2024.csv')
gg_voters


Unnamed: 0,round_num,voter,donated_usd,num_projects
0,1,0x00de4b13153673bcae2616b67bf822500d325fc3,46.000000,3
1,1,0x09cabec1ead1c0ba254b09efb3ee13841712be14,4507.000000,15
2,1,0x11111254369792b2ca5d084ab5eea397ca8fa48b,110.000000,3
3,1,0x12dd91e8d046b225fb70d5c4cdb7831c886521ef,50.000000,1
4,1,0x14094949152eddbfcd073717200da82fed8dc960,3.050000,9
...,...,...,...,...
395828,19,0xfffcfe7a5d540e51ec8086af6d7c8a053beee592,11.255382,11
395829,19,0xfffd27e68da8ae553bc5c4b80c1e969fde023693,3.089099,3
395830,19,0xfffd95bc5a23de688b7f58c8a5e26c2696ff64eb,0.978136,1
395831,19,0xfffebd07cfe14b224598e21a6eb4ffa147f54e56,10.435431,10


In [38]:
# Load the round-20 voter summary; it has the same columns as gg_voters
# (round_num, voter, donated_usd, num_projects) so the two can be concatenated.
gg20_voters = pd.read_csv('data/gg20_voter_summary_may23_2024.csv')
gg20_voters


Unnamed: 0,round_num,voter,donated_usd,num_projects
0,20,0x000000000a38444e0a6e37d3b630d7e855a7cb13,21.904093,20
1,20,0x00000002d88f9b3f4eb303564817fff4adcde46f,38.985012,3
2,20,0x0000006d14ce3cf81449c3ba1f26108df0a4de8b,0.119688,1
3,20,0x000000df34e2422bb7744bc93ab7594d371e2d4e,1.614920,1
4,20,0x0000040f1111c5c3d2037940658ee770bb37e0a2,1.245631,1
...,...,...,...,...
34882,20,0xffe92bf689d7d2b7286e3c9ee34c89e24a4a3115,1.762367,1
34883,20,0xfff09621f09caa2c939386b688e62e5be19d2d56,14.969156,1
34884,20,0xfff401de598bd133cff26be5e79bbee4eed2a0e4,0.107024,1
34885,20,0xfff4a769b3ea7824f5f0100ccdfa585e1e34f4fd,0.451768,3


In [39]:
# Stack the earlier rounds with round 20 and keep only rows whose voter is a
# 42-character string starting with '0x'.
all_voters = (
    pd.concat([gg_voters, gg20_voters])
    .loc[lambda df: df['voter'].str.startswith('0x') & (df['voter'].str.len() == 42)]
)

# One row per voter: lifetime USD donated, summed per-round project counts and
# the number of distinct rounds the voter appears in.
# NOTE(review): num_projects is summed across rounds, so a project backed in two
# rounds is presumably counted twice -- confirm this is the intended metric.
voter_stats = (
    all_voters
    .groupby('voter')
    .agg(
        donated_usd=('donated_usd', 'sum'),
        num_projects=('num_projects', 'sum'),
        round_num=('round_num', 'nunique'),
    )
    .reset_index()
)

# Lifetime value per voter
average_voter_value = voter_stats['donated_usd'].mean()
median_voter_value = voter_stats['donated_usd'].median()

# Population size and total volume
total_unique_voters = voter_stats['voter'].nunique()
total_donated_usd = voter_stats['donated_usd'].sum()

# Breadth of giving (projects) per voter
average_projects_per_voter = voter_stats['num_projects'].mean()
median_projects_per_voter = voter_stats['num_projects'].median()

# Participation depth (distinct rounds) per voter
average_rounds_per_voter = voter_stats['round_num'].mean()
median_rounds_per_voter = voter_stats['round_num'].median()

# Emit the summary as one block of text, one statistic per line
print("\n".join([
    f"Total Unique Voters: {total_unique_voters:,}",
    f"Total Donated USD: ${total_donated_usd:,.2f}",
    f"Average Projects per Voter: {average_projects_per_voter:.2f}",
    f"Median Projects per Voter: {median_projects_per_voter:.2f}",
    f"Average Rounds per Voter: {average_rounds_per_voter:.2f}",
    f"Median Rounds per Voter: {median_rounds_per_voter:.2f}",
    f"Average Voter Value: ${average_voter_value:,.2f}",
    f"Median Voter Value: ${median_voter_value:,.2f}",
]))



Total Unique Voters: 282,424
Total Donated USD: $16,318,343.56
Average Projects per Voter: 13.10
Median Projects per Voter: 5.00
Average Rounds per Voter: 1.52
Median Rounds per Voter: 1.00
Average Voter Value: $57.78
Median Voter Value: $10.48


In [6]:
# `go` is already imported in the notebook's first cell, so it is not re-imported here.

# Per-round summary: number of distinct voters, plus the mean and median of the
# per-row donated_usd values within that round.
round_stats = all_voters.groupby('round_num').agg({
    'voter': 'nunique',
    'donated_usd': ['mean', 'median']
}).reset_index()

# Flatten the MultiIndex columns produced by the mixed aggregation
round_stats.columns = ['round_num', 'unique_voters', 'average_donated_usd', 'median_donated_usd']

# Create a figure
fig = go.Figure()

# Unique voters are drawn against the left-hand axis
fig.add_trace(go.Scatter(
    x=round_stats['round_num'],
    y=round_stats['unique_voters'],
    mode='lines+markers',
    name='Unique Voters',
    line=dict(color='royalblue', width=2),
    marker=dict(size=8),
    yaxis='y1'
))

# Both USD series share the right-hand axis
fig.add_trace(go.Scatter(
    x=round_stats['round_num'],
    y=round_stats['average_donated_usd'],
    mode='lines+markers',
    name='Average Donated USD',
    line=dict(color='firebrick', width=2),
    marker=dict(size=8),
    yaxis='y2'
))

fig.add_trace(go.Scatter(
    x=round_stats['round_num'],
    y=round_stats['median_donated_usd'],
    mode='lines+markers',
    name='Median Donated USD',
    line=dict(color='green', width=2),
    marker=dict(size=8),
    yaxis='y2'
))

# Dual y-axis layout. Axis titles use the nested title.text / title.font form
# because the old `titlefont` shorthand is deprecated and is rejected by newer
# Plotly releases.
fig.update_layout(
    height=600,
    width=1000,
    title_text="Voter Statistics by Round Number",
    title_x=0.5,
    xaxis=dict(
        title=dict(text="Round Number", font=dict(size=14, color='black')),
        tickmode='linear',
        showline=True,
    ),
    yaxis=dict(
        title=dict(text="Number of Unique Voters", font=dict(size=14, color='royalblue')),
        showline=True,
        side='left'
    ),
    yaxis2=dict(
        title=dict(text="Donated USD", font=dict(size=14, color='firebrick')),
        overlaying='y',
        side='right'
    )
)

# Show the figure
fig.show()


In [7]:
# `go` is already imported in the notebook's first cell, so it is not re-imported here.

# Bucket voters by how many distinct rounds they took part in; everything above
# 5 rounds collapses into a single "6+" bucket (stored as 6). clip() is the
# vectorised equivalent of mapping each value through `x if x <= 5 else 6`.
voter_stats['round_group'] = voter_stats['round_num'].clip(upper=6)

# Calculate average and median voter value for each round group
average_voter_value_per_round = voter_stats.groupby('round_group')['donated_usd'].mean()
median_voter_value_per_round = voter_stats.groupby('round_group')['donated_usd'].median()

# Number of voters in each round group (not plotted below; kept for inspection)
voters_per_round_group = voter_stats['round_group'].value_counts().sort_index()

# Create a figure
fig = go.Figure()

# Plot average voter value
fig.add_trace(go.Scatter(
    x=average_voter_value_per_round.index,
    y=average_voter_value_per_round.values,
    mode='lines+markers',
    name='Average Voter Value',
    line=dict(color='royalblue', width=2),
    marker=dict(size=8)
))

# Plot median voter value
fig.add_trace(go.Scatter(
    x=median_voter_value_per_round.index,
    y=median_voter_value_per_round.values,
    mode='lines+markers',
    name='Median Voter Value',
    line=dict(color='firebrick', width=2),
    marker=dict(size=8)
))

# Layout. Axis titles use the nested title.text / title.font form because the
# old `titlefont` shorthand is deprecated and is rejected by newer Plotly releases.
fig.update_layout(
    height=600,
    width=1000,
    title_text="Voter Statistics by Round Group",
    title_x=0.5,
    xaxis=dict(
        title=dict(text="Number of Rounds", font=dict(size=14, color='black')),
        tickmode='linear',
        showline=True,  # Add axis line
        linecolor='black',  # Axis line color
        linewidth=2  # Axis line width
    ),
    yaxis=dict(
        title=dict(text="Value in USD", font=dict(size=14, color='black')),
        tickfont=dict(color='black'),
        showline=True,  # Add axis line
        linecolor='black',  # Axis line color
        linewidth=2  # Axis line width
    ),
    legend=dict(
        x=0.01,
        y=0.99,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)'
    ),
    plot_bgcolor='white',
    margin=dict(l=50, r=50, t=50, b=50)
)

# Remove gridlines for a cleaner look
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

fig.show()


In [8]:
# Round-20 donors reshaped to the (voter, round_num) layout: tag every row with
# round 20, drop the program column, rename the address column and remove
# rows that end up identical.
gg20 = (
    pd.read_csv('data/gg20_donors.csv')
    .assign(round_num=20)
    .drop(columns=['program'])
    .rename(columns={'donor_address': 'voter'})
    .drop_duplicates()
)
gg20

Unnamed: 0,voter,round_num
0,0x57c8e4e19f47acf5c05e04ddbd9b1d27065349d2,20
1,0xf913da8d4725988cdf1ae6bfaf3c3b7836ae8faa,20
2,0x17824caa7c9519674ba3f3e27c64d0a8103630dd,20
3,0xb0455085e33f1290ea9f603c1db0e69ee443bb4d,20
4,0x917fb7fbe8be99694a380d319d1f56433b608fc3,20
...,...,...
180110,0x9a8da1f1196e8422d2cb948eb88fc2a483681c51,20
180151,0x8eddc30b12a5cc60d046fc1b1b887eb2c1353a87,20
180163,0x17e4285bee986ae686504131d3d5abe0009f297a,20
180219,0x3946214ea5af34258ecfa95c2a64e20b69065d7d,20


In [40]:
# Combine the historical (voter, round) rows with the round-20 donors.
data = pd.concat([pd.read_csv('data/round_voters.csv'), gg20])

# Keep only rows whose voter is a 42-character string starting with '0x'
is_address = data['voter'].str.startswith('0x') & (data['voter'].str.len() == 42)

# Build the cleaned frame in one chain and finish with an explicit copy, so the
# column assignment below writes to an independent frame rather than to a
# filtered slice (the pattern that triggers SettingWithCopyWarning).
data = (
    data[is_address]
    # Convert 'round_num' to numeric; unparseable values become NaN ...
    .assign(round_num=lambda df: pd.to_numeric(df['round_num'], errors='coerce'))
    # ... and are dropped here
    .dropna(subset=['round_num'])
    .copy()
)

# A voter's cohort is the earliest round they appear in
data['cohort'] = data.groupby('voter')['round_num'].transform('min')
data.shape

(430856, 3)

In [18]:
# Preview the cleaned participation table (round_num, voter, cohort).
data

Unnamed: 0,round_num,voter,cohort
0,1.0,0x00de4b13153673bcae2616b67bf822500d325fc3,1.0
39,1.0,0xff16d64179a02d6a56a1183a28f1d6293646e2dd,1.0
41,2.0,0x00de4b13153673bcae2616b67bf822500d325fc3,1.0
146,3.0,0x00de4b13153673bcae2616b67bf822500d325fc3,1.0
152,3.0,0x05a1ff0a32bc24265bcb39499d0c5d9a6cb2011c,3.0
...,...,...,...
179599,20.0,0x2e99892cbcf20c76bba592aab67e6982886d7104,18.0
179620,20.0,0x836643f81312e79680c06f32c4f4de48c7e77ea8,20.0
179886,20.0,0x699baef464defa37b89850b712f8cd30bd659c60,20.0
179892,20.0,0x29864b53c8d7509de2ccf5fb76488cfd102ddfd0,20.0


In [33]:
## FILTER TO FC
# Optional step: keeps only the rows whose voter appears in the `address` column
# of the Farcaster export. It overwrites `data`, so every later cell then works
# on this subset; re-run the cell that builds `data` to get the full set back.
# NOTE(review): isin() is an exact string match -- confirm both address columns
# use the same letter case.
# low_memory=False makes pandas infer each column's dtype from the whole file,
# which removes the "mixed types" DtypeWarning this read otherwise raises.
fc = pd.read_csv('data/farcaster_may9_oso.csv', low_memory=False)
# .copy() so later column assignments write to an independent frame, not a slice
data = data[data['voter'].isin(fc['address'])].copy()
data


Columns (3) have mixed types. Specify dtype option on import or set low_memory=False.



Unnamed: 0,round_num,voter,cohort,prev_round,voter_type
0,1.0,0x00de4b13153673bcae2616b67bf822500d325fc3,1.0,,new
39,1.0,0xff16d64179a02d6a56a1183a28f1d6293646e2dd,1.0,,new
41,2.0,0x00de4b13153673bcae2616b67bf822500d325fc3,1.0,1.0,retained
146,3.0,0x00de4b13153673bcae2616b67bf822500d325fc3,1.0,2.0,retained
152,3.0,0x05a1ff0a32bc24265bcb39499d0c5d9a6cb2011c,3.0,,new
...,...,...,...,...,...
179599,20.0,0x2e99892cbcf20c76bba592aab67e6982886d7104,18.0,18.0,resurrected
179620,20.0,0x836643f81312e79680c06f32c4f4de48c7e77ea8,20.0,,new
179886,20.0,0x699baef464defa37b89850b712f8cd30bd659c60,20.0,,new
179892,20.0,0x29864b53c8d7509de2ccf5fb76488cfd102ddfd0,20.0,,new


In [10]:

# data_grouped represents the number of unique voters per round
data_grouped = data.groupby('round_num')['voter'].nunique()

# Line chart of unique voters per round
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=data_grouped.index,
    y=data_grouped.values,
    mode='lines+markers',
    marker=dict(size=8, color='darkgreen', line=dict(width=1, color='DarkSlateGrey')),
    line=dict(color='darkgreen', width=2),
    name='Unique Voters'
))

# Layout. Axis titles use the nested title.text / title.font form because the
# old `titlefont` shorthand is deprecated and is rejected by newer Plotly releases.
fig.update_layout(
    title=dict(
        text='Number of Unique Voters per Round',
        x=0.5,
        xanchor='center',
        font=dict(size=20, color='DarkSlateGrey')
    ),
    xaxis=dict(
        title=dict(text='Round Number', font=dict(size=16, color='DarkSlateGrey')),
        tickfont=dict(size=14, color='DarkSlateGrey'),
        showgrid=False,
        showline=True,
        linecolor='DarkSlateGrey'
    ),
    yaxis=dict(
        title=dict(text='Number of Unique Voters', font=dict(size=16, color='DarkSlateGrey')),
        tickfont=dict(size=14, color='DarkSlateGrey'),
        showgrid=False,
        showline=True,
        linecolor='DarkSlateGrey'
    ),
    plot_bgcolor='white',
    margin=dict(l=40, r=40, t=60, b=40),
    hovermode='x unified',
    legend=dict(
        x=0.01, y=0.99,
        bgcolor='rgba(255, 255, 255, 0.5)',
        bordercolor='DarkSlateGrey',
        borderwidth=1
    )
)

fig.show()


In [11]:

# Split each round's voters into new and returning.
# A row is a voter's first round exactly when round_num equals their cohort
# (the per-voter minimum round computed when `data` was built). Using the cohort
# column keeps the split independent of row order, and a repeated (voter, round)
# row can no longer make someone count as "returning" in their first round.
is_first_round = data['round_num'] == data['cohort']
new_users = data[is_first_round].groupby('round_num')['voter'].nunique()
returning_users = data[~is_first_round].groupby('round_num')['voter'].nunique()

# One line per group
fig = go.Figure()

# Add scatter plot for new users
fig.add_trace(go.Scatter(
    x=new_users.index,
    y=new_users.values,
    mode='lines+markers',
    marker=dict(size=8, color='blue', line=dict(width=1, color='DarkSlateGrey')),
    line=dict(color='blue', width=2),
    name='New Voters'
))

# Add scatter plot for returning users
fig.add_trace(go.Scatter(
    x=returning_users.index,
    y=returning_users.values,
    mode='lines+markers',
    marker=dict(size=8, color='red', line=dict(width=1, color='DarkSlateGrey')),
    line=dict(color='red', width=2),
    name='Returning Voters'
))

# Layout. Axis titles use the nested title.text / title.font form because the
# old `titlefont` shorthand is deprecated and is rejected by newer Plotly releases.
fig.update_layout(
    title=dict(
        text='Number of New and Returning Voters per Round',
        x=0.5,
        xanchor='center',
        font=dict(size=20, color='DarkSlateGrey')
    ),
    xaxis=dict(
        title=dict(text='Round Number', font=dict(size=16, color='DarkSlateGrey')),
        tickfont=dict(size=14, color='DarkSlateGrey'),
        showgrid=False,
        showline=True,
        linecolor='DarkSlateGrey'
    ),
    yaxis=dict(
        title=dict(text='Number of Voters', font=dict(size=16, color='DarkSlateGrey')),
        tickfont=dict(size=14, color='DarkSlateGrey'),
        showgrid=False,
        showline=True,
        linecolor='DarkSlateGrey'
    ),
    plot_bgcolor='white',
    margin=dict(l=40, r=40, t=60, b=40),
    hovermode='x unified',
    legend=dict(
        x=0.01, y=0.99,
        bgcolor='rgba(255, 255, 255, 0.5)',
        bordercolor='DarkSlateGrey',
        borderwidth=1
    )
)

fig.show()


In [12]:
# Unique voters per round shown as a table (the same aggregation that feeds the
# "Number of Unique Voters per Round" chart).
data_grouped = data.groupby('round_num')['voter'].nunique()
data_grouped

round_num
1.0        41
2.0       103
3.0       324
4.0       660
5.0       883
6.0       850
7.0      1401
8.0      5350
9.0     17610
10.0    21870
11.0    23224
12.0    38670
13.0    25271
14.0    61838
15.0    58426
16.0    29369
17.0    17869
18.0    47454
19.0    44534
20.0    35109
Name: voter, dtype: int64

In [13]:
# Distinct rounds each voter took part in
user_rounds = data.groupby('voter')['round_num'].nunique()

# Typical participation depth: mean and median rounds per voter
average_rounds, median_rounds = user_rounds.mean(), user_rounds.median()

# Show both figures as the cell output
average_rounds, median_rounds


(1.5245549536288396, 1.0)

In [16]:
# Determine the range of cohorts and rounds dynamically
min_cohort = int(data['cohort'].min())
max_cohort = int(data['cohort'].max())
min_round = int(data['round_num'].min())
max_round = int(data['round_num'].max())

# Rounds elapsed since the voter's first round (0 = the cohort's own round).
# Computing the offset directly, instead of left-shifting each pivot row by its
# number of leading zeros, keeps the alignment correct even if some round number
# is absent from the data.
cohort_activity = data.assign(period=(data['round_num'] - data['cohort']).astype(int))

# Distinct voters per (cohort, offset); nunique so a repeated (voter, round) row
# cannot inflate a cell.
cohort_table = pd.pivot_table(
    cohort_activity,
    values='voter',
    index='cohort',
    columns='period',
    aggfunc='nunique',
    fill_value=0,
)

# Ensure every cohort from min_cohort to max_cohort and every offset from 0 to
# (max_round - min_round) is present, filling the gaps with 0
all_cohorts = pd.Index(range(min_cohort, max_cohort + 1), name='cohort')
all_periods = list(range(max_round - min_round + 1))
cohort_table = cohort_table.reindex(index=all_cohorts, columns=all_periods, fill_value=0)

# Plain integer column labels representing the relative round number
cohort_table.columns = all_periods
cohort_table

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
cohort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,41,26,14,11,9,6,4,3,3,2,3,3,3,2,1,1,2,2,1,2
2,77,24,24,19,15,4,2,4,1,1,0,1,2,2,0,0,1,1,0,0
3,286,120,94,54,8,7,5,4,2,6,6,2,4,2,1,2,0,2,0,0
4,505,141,46,37,33,22,17,14,20,15,18,15,9,5,6,6,10,0,0,0
5,620,55,27,31,29,24,19,16,8,8,10,10,8,7,11,9,0,0,0,0
6,674,41,49,36,37,17,23,19,16,20,13,11,14,11,12,0,0,0,0,0
7,1280,387,291,253,186,173,121,148,119,49,49,45,52,47,0,0,0,0,0,0
8,4838,1369,1039,718,675,332,640,466,262,174,219,191,140,0,0,0,0,0,0,0
9,15851,5125,3480,3223,1616,2260,1369,803,417,482,394,264,0,0,0,0,0,0,0,0
10,15368,3953,3287,1979,2329,1506,766,338,564,393,353,0,0,0,0,0,0,0,0,0


In [17]:
# First column of the cohort table, used as each cohort's starting size
initial_cohort_sizes = cohort_table.iloc[:, 0]

# Divide every row by its starting size; a size of 0 is swapped for 1 so that an
# empty cohort yields a row of zeros instead of a division by zero.
retention_table = cohort_table.div(initial_cohort_sizes.replace({0: 1}), axis='index')

retention_table

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
cohort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,0.634146,0.341463,0.268293,0.219512,0.146341,0.097561,0.073171,0.073171,0.04878,0.073171,0.073171,0.073171,0.04878,0.02439,0.02439,0.04878,0.04878,0.02439,0.04878
2,1.0,0.311688,0.311688,0.246753,0.194805,0.051948,0.025974,0.051948,0.012987,0.012987,0.0,0.012987,0.025974,0.025974,0.0,0.0,0.012987,0.012987,0.0,0.0
3,1.0,0.41958,0.328671,0.188811,0.027972,0.024476,0.017483,0.013986,0.006993,0.020979,0.020979,0.006993,0.013986,0.006993,0.003497,0.006993,0.0,0.006993,0.0,0.0
4,1.0,0.279208,0.091089,0.073267,0.065347,0.043564,0.033663,0.027723,0.039604,0.029703,0.035644,0.029703,0.017822,0.009901,0.011881,0.011881,0.019802,0.0,0.0,0.0
5,1.0,0.08871,0.043548,0.05,0.046774,0.03871,0.030645,0.025806,0.012903,0.012903,0.016129,0.016129,0.012903,0.01129,0.017742,0.014516,0.0,0.0,0.0,0.0
6,1.0,0.060831,0.0727,0.053412,0.054896,0.025223,0.034125,0.02819,0.023739,0.029674,0.019288,0.01632,0.020772,0.01632,0.017804,0.0,0.0,0.0,0.0,0.0
7,1.0,0.302344,0.227344,0.197656,0.145313,0.135156,0.094531,0.115625,0.092969,0.038281,0.038281,0.035156,0.040625,0.036719,0.0,0.0,0.0,0.0,0.0,0.0
8,1.0,0.282968,0.214758,0.148408,0.13952,0.068623,0.132286,0.096321,0.054155,0.035965,0.045267,0.039479,0.028938,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.0,0.323323,0.219545,0.203331,0.101949,0.142578,0.086367,0.050659,0.026307,0.030408,0.024856,0.016655,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,1.0,0.257223,0.213886,0.128774,0.151549,0.097996,0.049844,0.021994,0.0367,0.025573,0.02297,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Starting size per cohort (first column), with 0 replaced by 1 to avoid division by zero
initial_cohort_sizes = cohort_table.iloc[:, 0].replace({0: 1})

# Share of each cohort's starting size present at every relative round
retention_table = cohort_table.div(initial_cohort_sizes, axis='index')
retention_table

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
cohort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,0.634146,0.341463,0.268293,0.219512,0.146341,0.097561,0.073171,0.073171,0.04878,0.073171,0.073171,0.073171,0.04878,0.02439,0.02439,0.04878,0.04878,0.02439,0.04878
2,1.0,0.311688,0.311688,0.246753,0.194805,0.051948,0.025974,0.051948,0.012987,0.012987,0.0,0.012987,0.025974,0.025974,0.0,0.0,0.012987,0.012987,0.0,0.0
3,1.0,0.41958,0.328671,0.188811,0.027972,0.024476,0.017483,0.013986,0.006993,0.020979,0.020979,0.006993,0.013986,0.006993,0.003497,0.006993,0.0,0.006993,0.0,0.0
4,1.0,0.279208,0.091089,0.073267,0.065347,0.043564,0.033663,0.027723,0.039604,0.029703,0.035644,0.029703,0.017822,0.009901,0.011881,0.011881,0.019802,0.0,0.0,0.0
5,1.0,0.08871,0.043548,0.05,0.046774,0.03871,0.030645,0.025806,0.012903,0.012903,0.016129,0.016129,0.012903,0.01129,0.017742,0.014516,0.0,0.0,0.0,0.0
6,1.0,0.060831,0.0727,0.053412,0.054896,0.025223,0.034125,0.02819,0.023739,0.029674,0.019288,0.01632,0.020772,0.01632,0.017804,0.0,0.0,0.0,0.0,0.0
7,1.0,0.302344,0.227344,0.197656,0.145313,0.135156,0.094531,0.115625,0.092969,0.038281,0.038281,0.035156,0.040625,0.036719,0.0,0.0,0.0,0.0,0.0,0.0
8,1.0,0.282968,0.214758,0.148408,0.13952,0.068623,0.132286,0.096321,0.054155,0.035965,0.045267,0.039479,0.028938,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.0,0.323323,0.219545,0.203331,0.101949,0.142578,0.086367,0.050659,0.026307,0.030408,0.024856,0.016655,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,1.0,0.257223,0.213886,0.128774,0.151549,0.097996,0.049844,0.021994,0.0367,0.025573,0.02297,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
import numpy as np

# Blank out the trailing cells of each row: the row at position k loses its last
# k columns (row 0 is left untouched).
# NOTE(review): presumably these are offsets a later cohort has not reached yet,
# which holds when cohorts and rounds are both consecutive -- confirm.
for offset in range(1, len(retention_table)):
    retention_table.iloc[offset, -offset:] = np.nan

# Heatmap of retention by cohort (rows) and relative round (columns)
fig = px.imshow(
    retention_table,
    labels={"x": "Round Number", "y": "Cohort", "color": "Retention Rate"},
    x=retention_table.columns,
    y=retention_table.index,
    color_continuous_scale=px.colors.sequential.Blues,
    text_auto='.2%',
    aspect="auto",
)

# Axis titles, background and size
fig.update_layout(
    xaxis={'title': 'Rounds Since Cohort Joined', 'side': 'top'},
    yaxis_title='Cohort',
    plot_bgcolor='white',
    font={'size': 11},
    width=800,
    height=600,
)

# Show a tick for every cohort and every relative round
for update_axis in (fig.update_xaxes, fig.update_yaxes):
    update_axis(tickmode='linear', dtick=1)

# Show the plot
fig.show()

In [20]:
# Column totals of the cohort table: the summed counts, across all cohorts,
# for each relative round number.
cohort_table.sum()

0     282611
1      52906
2      32135
3      20963
4      15321
5      10240
6       6714
7       3725
8       2710
9       1546
10      1065
11       542
12       232
13        76
14        31
15        18
16        13
17         5
18         1
19         2
dtype: int64

In [21]:

# Re-apply the trailing-cell mask (row at position k loses its last k columns)
# so those cells are NaN and therefore skipped by the column mean below.
for offset in range(1, len(retention_table)):
    retention_table.iloc[offset, -offset:] = np.nan

# Unweighted mean across cohorts for each relative round
mean_retention = retention_table.mean()

# Line chart with the value printed above every point
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=mean_retention.index,
    y=mean_retention.values,
    mode='lines+markers+text',
    name='Mean Retention',
    text=[f'{rate:.2f}' for rate in mean_retention.values],
    textposition='top center',
    line={'color': 'royalblue', 'width': 2},
    marker={'color': 'royalblue', 'size': 8},
))

# Styling shared by both axes: visible black axis line, outside ticks, no mirrored box
axis_style = dict(
    showline=True,
    linewidth=2,
    linecolor='black',
    mirror=False,
    ticks='outside',
    tickmode='linear',
    tickfont=dict(size=12, color='black'),
)

fig.update_layout(
    title=dict(
        text='Retention Rate by Rounds Retained',
        x=0.5,
        xanchor='center',
        font=dict(size=20, color='black'),
    ),
    xaxis=dict(
        title='Rounds Retained',
        dtick=1,
        range=[0, max(mean_retention.index) + 1],  # Ensure x-axis starts from 0
        **axis_style,
    ),
    yaxis=dict(
        title='Retention Rate',
        dtick=0.1,
        range=[0, 1],  # Ensure y-axis goes from 0 to 1
        **axis_style,
    ),
    plot_bgcolor='white',
    font=dict(size=12, color='black'),
    width=800,
    height=500,
    margin=dict(l=50, r=50, t=50, b=50),
    showlegend=False,
)

# Show the plot
fig.show()


In [22]:
import plotly.graph_objects as go

# One retention curve per cohort
fig = go.Figure()

# Skip the first column so every curve starts at x = 1
for cohort, rates in retention_table.iloc[:, 1:].iterrows():
    fig.add_trace(go.Scatter(
        x=rates.index,
        y=rates,
        mode='lines+markers',
        name=f'Cohort {cohort}',
    ))

# Titles, background and size
fig.update_layout(
    title='Retention by Cohort',
    xaxis_title='Rounds Retained',
    yaxis_title='Retention Rate',
    plot_bgcolor='white',
    font={'size': 11},
    width=800,
    height=600,
)

# Show the plot
fig.show()


In [30]:
# Sorted copy ordered by voter, then round, so each voter's rounds read
# chronologically; the first 15 rows are shown as a spot check.
data_sorted = data.sort_values(by=['voter','round_num'])
data_sorted.head(15)


Unnamed: 0,round_num,voter,cohort,prev_round,voter_type
468,4.0,0x000000000000541e251335090ac5b47176af4f7e,4.0,,new
2012,6.0,0x000000000000541e251335090ac5b47176af4f7e,4.0,4.0,resurrected
49101,11.0,0x00000000000360176d958e11c140308cd0863679,11.0,,new
49102,11.0,0x00000000000cd56832ce5dfbcbff02e7ec639bc9,11.0,,new
110997,13.0,0x00000000000cd56832ce5dfbcbff02e7ec639bc9,11.0,11.0,resurrected
136324,14.0,0x00000000000cd56832ce5dfbcbff02e7ec639bc9,11.0,13.0,retained
198179,15.0,0x00000000000cd56832ce5dfbcbff02e7ec639bc9,11.0,14.0,retained
9620,9.0,0x00000000005dbcb0d0513fcda746382fe8a53468,9.0,,new
49103,11.0,0x00000000005ef87f8ca7014309ece7260bbcdaeb,11.0,,new
136325,14.0,0x0000000002732779240fe05873611dc4203dfb71,14.0,,new


In [41]:
def categorize_voter(row):
    """Label one participation row as 'new', 'retained' or 'resurrected'.

    Args:
        row: Mapping-like object (for example a DataFrame row) exposing
            'prev_round' and 'round_num'.

    Returns:
        'new' when there is no previous round but the current round is known,
        'retained' when the previous round is exactly one before the current
        round, 'resurrected' when the previous round is more than one before
        it, and None in every other case (both values missing, or a previous
        round that is not earlier than the current one).
    """
    prev_round = row['prev_round']
    round_num = row['round_num']

    # No earlier participation on record
    if pd.isnull(prev_round):
        return 'new' if pd.notnull(round_num) else None

    # Immediately preceding round -> retained; any earlier round -> resurrected
    if prev_round == round_num - 1:
        return 'retained'
    if prev_round < round_num - 1:
        return 'resurrected'

    return None

# Add each row's previous round for the same voter (NaN on a voter's first row).
# shift() returns the preceding row within each voter group, so the rows are
# visited in ascending round order (stable sort) rather than trusting the order
# they were loaded in. The result is written back by position, which leaves the
# frame's row order untouched and does not depend on index labels being unique.
round_order = np.argsort(data['round_num'].to_numpy(), kind='stable')
prev_in_round_order = (
    data.iloc[round_order]
    .groupby('voter')['round_num']
    .shift()
    .to_numpy()
)
prev_round = np.empty(len(data), dtype=float)
prev_round[round_order] = prev_in_round_order
data['prev_round'] = prev_round

# Apply the function to categorize voters
data['voter_type'] = data.apply(categorize_voter, axis=1)

# Count the unique voters per (round, type); rows whose voter_type is missing
# are left out by the groupby
voter_counts = data.groupby(['round_num', 'voter_type'])['voter'].nunique().reset_index()

# Pivot the data for better readability: one row per round, one column per type
voter_counts_pivot = voter_counts.pivot(index='round_num', columns='voter_type', values='voter')

# Show the pivot table
voter_counts_pivot

voter_type,new,resurrected,retained
round_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,41.0,,
2.0,77.0,,26.0
3.0,286.0,1.0,37.0
4.0,505.0,8.0,147.0
5.0,620.0,25.0,238.0
6.0,674.0,36.0,140.0
7.0,1280.0,62.0,59.0
8.0,4838.0,65.0,447.0
9.0,15851.0,132.0,1627.0
10.0,15368.0,433.0,6069.0


In [42]:
import plotly.graph_objects as go

# One bar series per voter type, in stacking order: new, resurrected, retained
trace1, trace2, trace3 = (
    go.Bar(x=voter_counts_pivot.index, y=voter_counts_pivot[column_key], name=legend_label)
    for column_key, legend_label in (
        ('new', 'New'),
        ('resurrected', 'Resurrected'),
        ('retained', 'Retained'),
    )
)

# Stacked bars on a white background with black axis lines and no gridlines
layout = go.Layout(
    title='Voter Counts by Round and Type',
    xaxis={'title': 'Round Number', 'showgrid': False, 'linecolor': 'black'},
    yaxis={'title': 'Voter Counts', 'showgrid': False, 'linecolor': 'black'},
    plot_bgcolor='white',
    barmode='stack',
)

# Assemble and render the figure
fig = go.Figure(data=[trace1, trace2, trace3], layout=layout)
fig.show()

