In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.dates as mdates
import seaborn as sns
import pandas as pd
import numpy as np

#%load_ext sql_magic
%matplotlib inline

# Render floats in displayed frames as whole numbers (no decimals, no scientific notation)
pd.set_option('display.float_format', '{:.0f}'.format)

In [None]:
# Every participant of the 2019-2020 campaigns
all_2019_2020_participants = pd.read_csv(
    "raw_data/campaign_participants_20192020.csv",
    sep=',',
    encoding='utf-8',
)

# Participants that returned in 09/2021
cp_countries = pd.read_csv(
    "output_data/output.csv",
    sep=',',
    encoding='utf-8',
)
# Disabled date handling, kept for reference:
#cp_countries['user_registration'] = pd.to_datetime(cp_countries['user_registration'], unit='s').dt.strftime('%Y-%m-%d') #handle dates

# Build registration year-month ('reg_YM') and year ('reg_Y') keys by slicing
# the leading digits of the integer-cast registration value; approach from
# https://stackoverflow.com/questions/42442408/take-first-6-digits-of-pandas-column
# Rows without a user_registration are dropped before the cast, so after index
# alignment they hold NaN in both new columns.
# NOTE(review): assumes user_registration begins with YYYYMM digits — confirm.
reg_digits = cp_countries['user_registration'].dropna().astype(int).astype(str)
cp_countries['reg_YM'] = reg_digits.str[:6]
cp_countries['reg_Y'] = reg_digits.str[:4]



african_countries = pd.read_csv(
    "output_data/african_countries_w_ssa.csv",
    sep=',',
    encoding='utf-8',
)
# Keep only the rows whose Region is Sub-Saharan Africa
is_ssa_region = african_countries['Region'] == 'Sub-Saharan Africa'
ssa_countries = african_countries.loc[is_ssa_region]

# Left-join the SSA country info onto the 09/2021 returners via country_code
df_all_d_r = pd.merge(cp_countries, ssa_countries, how='left', on='country_code')

# Same frame with fully duplicated rows removed
df_all_r = df_all_d_r.drop_duplicates()

# Returners whose joined Region is Sub-Saharan Africa (i.e. edited from SSA)
df_r_ssa = df_all_r[df_all_r['Region'].eq('Sub-Saharan Africa')]

In [None]:
# Number of rows with no user_registration value
int(cp_countries['user_registration'].isna().sum())

In [None]:
# Unique participant names for each list.
# Missing names are dropped first: Series.unique() keeps NaN as a value, so
# without dropna() a blank username would be counted as one extra participant
# and these lengths would disagree with the nunique()-based denominators used
# in later cells (nunique() ignores NaN).
up_all = all_2019_2020_participants['username'].dropna().unique() #2019/2020 participants
up_all_r = df_all_r['user_name'].dropna().unique() #2019/2020 participants that returned in sept 2021
up_r_ssa = df_r_ssa['user_name'].dropna().unique() #those that returned in sept 2021 and edited from ssa

print(len(up_all))
print(len(up_all_r))
print(len(up_r_ssa))

### Exploration

In [None]:
# Eyeball outliers in user_editcount across all returning participants
sns.stripplot(
    y='user_editcount',
    data=df_all_r,
    jitter=True,
);

In [None]:
# Eyeball outliers in user_editcount among returning participants from SSA
sns.stripplot(
    y='user_editcount',
    data=df_r_ssa,
    jitter=True,
);

#### Question: How many 2019-2020 campaign participants edited in September 2021?

In [None]:
# Count of returning participants, then their share of all 2019-2020 participants (%)
returned_total = len(up_all_r)
print(returned_total)
returned_total / len(up_all) * 100

#### Question: How many 2019-2020 campaign participants edited in September (09/01-09/30) 2021 from SSA?

In [None]:
# Count of returning SSA participants, then their share of all 2019-2020 participants (%)
returned_ssa_total = len(up_r_ssa)
print(returned_ssa_total)
returned_ssa_total / len(up_all) * 100

#### Question: What percentage of editors in Sub-Saharan Africa (SSA) participated in campaign activities in the 2019-2020 period?

In [None]:
# NOTE(review): 550000 is presumably the total number of editors in
# Sub-Saharan Africa — confirm the figure and record its source.
ssa_editor_total = 550000
len(up_r_ssa) / ssa_editor_total * 100

#### countries

In [None]:
# Rows and distinct user names per country, largest row count first
(
    df_r_ssa
    .groupby('country_name')['user_name']
    .agg(['count', 'nunique'])
    .sort_values(by='count', ascending=False)
)

In [None]:
# Distinct user names per country, most users first
c = (
    df_r_ssa
    .groupby('country_name')['user_name']
    .nunique()
    .sort_values(ascending=False)
    .reset_index(name='unique_user_names')
)

In [None]:
# Top five countries by distinct users
c.iloc[:5]

In [None]:
# Bar chart of distinct users per country
c.plot(kind='bar', x='country_name', y='unique_user_names')

##### How many participants edited from more than one country?

In [None]:
# SSA list: number of distinct country_name values per user
mcssa = pd.pivot_table(
    df_r_ssa,
    index=["user_name"],
    values=["country_name"],
    aggfunc=pd.Series.nunique,
).reset_index()

# Users seen in more than one country
multi_country_ssa = len(mcssa.loc[mcssa['country_name'] > 1])

print("# of participants from more than one country:", multi_country_ssa)

print("percent:", multi_country_ssa / df_r_ssa['user_name'].nunique() * 100)

In [None]:
# Returner list: number of distinct country_name values per user
mcr = pd.pivot_table(
    df_all_r,
    index=["user_name"],
    values=["country_name"],
    aggfunc=pd.Series.nunique,
).reset_index()

# Users seen in more than one country
multi_country_all = len(mcr.loc[mcr['country_name'] > 1])

print("# of participants from more than one country:", multi_country_all)
print("percent:", multi_country_all / df_all_r['user_name'].nunique() * 100)

#### wikis

##### How many participants edited in more than one wiki?

In [None]:
# Returner list: number of distinct wikis (wiki_db) per user
mwr = (
    pd.pivot_table(
        df_all_r,
        index=["user_name"],
        values=["wiki_db"],
        aggfunc=pd.Series.nunique,
    )
    .reset_index()
    .rename(columns={'wiki_db': 'wiki_db_counts'})
)

# Users active on more than one wiki
multi_wiki_all = len(mwr.loc[mwr['wiki_db_counts'] > 1])

print("# of participants contributing to more than one wiki:", multi_wiki_all)
print("percent:", multi_wiki_all / df_all_r['user_name'].nunique() * 100)

In [None]:
# Bin edges 1..10 for the wikis-per-user tallies
wiki_bins = list(range(1, 11))

In [None]:
# Tally returners per wikis-per-user bin.
# pd.cut uses right-closed intervals by default, so a count equal to the lowest
# edge or above the highest edge falls in no bin and is left out of the tally.
mwr_wiki_bin = pd.cut(mwr['wiki_db_counts'], bins=wiki_bins)
mwr_wikis_binned = mwr_wiki_bin.value_counts().reset_index()
mwr_wikis_binned

In [None]:
# SSA list: number of distinct wikis (wiki_db) per user
mws = (
    pd.pivot_table(
        df_r_ssa,
        index=["user_name"],
        values=["wiki_db"],
        aggfunc=pd.Series.nunique,
    )
    .reset_index()
    .rename(columns={'wiki_db': 'wiki_db_counts'})
)

# Users active on more than one wiki
multi_wiki_ssa = len(mws.loc[mws['wiki_db_counts'] > 1])

print("# of participants contributing to more than one wiki:", multi_wiki_ssa)
print("percent:", multi_wiki_ssa / df_r_ssa['user_name'].nunique() * 100)

In [None]:
# Tally SSA returners per wikis-per-user bin.
# pd.cut uses right-closed intervals by default, so a count equal to the lowest
# edge or above the highest edge falls in no bin and is left out of the tally.
mws_wiki_bin = pd.cut(mws['wiki_db_counts'], bins=wiki_bins)
mws_wikis_binned = mws_wiki_bin.value_counts().reset_index()
mws_wikis_binned

##### which wikis are most represented?

In [None]:
# Rows and distinct user names per wiki for all returners; 15 largest by row count
(
    df_all_r
    .groupby('wiki_db')['user_name']
    .agg(['count', 'nunique'])
    .sort_values(by='count', ascending=False)
    .head(15)
)

In [None]:
# Rows and distinct user names per wiki for SSA returners; 15 largest by row count
(
    df_r_ssa
    .groupby('wiki_db')['user_name']
    .agg(['count', 'nunique'])
    .sort_values(by='count', ascending=False)
    .head(15)
)

In [None]:
# Distinct SSA user names per wiki, most users first
w = (
    df_r_ssa
    .groupby('wiki_db')['user_name']
    .nunique()
    .sort_values(ascending=False)
    .reset_index(name='unique_user_names')
)

In [None]:
# Ten wikis with the most distinct SSA users
w.iloc[:10]

In [None]:
# Keep the 25 wikis with the most distinct SSA users for plotting
ws = w.iloc[:25]

In [None]:
# Bar chart of distinct SSA users per wiki
ws.plot(kind='bar', x='wiki_db', y='unique_user_names')

#### edit count

In [None]:
# Eyeball outliers in user_editcount across all returning participants
sns.stripplot(
    y='user_editcount',
    data=df_all_r,
    jitter=True,
);

In [None]:
# Eyeball outliers in user_editcount among returning participants from SSA
sns.stripplot(
    y='user_editcount',
    data=df_r_ssa,
    jitter=True,
);

In [None]:
# Distinct (user_editcount, user_name) pairs for the SSA returners
prep = df_r_ssa[['user_editcount', 'user_name']].drop_duplicates()

# Keep each user's highest edit count only.
# A vectorised groupby max replaces the earlier per-group
# apply(lambda x: x.loc[x.user_editcount.idxmax()]), which ran a Python lambda
# per user, operated on the grouping column, and failed or warned (depending
# on pandas version) for a user whose edit counts are all missing. max()
# yields NaN for such a user instead.
# The result has one row per user with columns user_name and user_editcount.
preps = prep.groupby('user_name', as_index=False)['user_editcount'].max()

In [None]:
# Edit-count bin edges
bins = [0, 50, 100, 300, 600, 1000, 2000, 3000]

# Assign each user's edit count to a bin, then tally users per bin
editcount_bin = pd.cut(preps['user_editcount'], bins)
participants_output_binned = editcount_bin.value_counts().reset_index()

In [None]:
# Users per edit-count bin, shown as a Series
preps['user_editcount'].pipe(pd.cut, bins=bins).value_counts()

In [None]:
# Tally editors per edit-count bin in ascending bin order.
# The tick labels used to be a hand-typed tuple in the frequency order of one
# particular run (and contained the typo '0-59'); because value_counts() orders
# bins by count, any change in the data silently attached labels to the wrong
# bars. Bars and labels are now both derived from the same bin intervals.
editcount_bin_counts = (
    pd.cut(preps['user_editcount'], bins=bins)
    .value_counts()
    .sort_index()
)


def _format_bin_edge(edge):
    """Render a bin edge compactly: whole thousands as e.g. '2k', otherwise the plain integer."""
    edge = int(edge)
    return f'{edge // 1000}k' if edge >= 1000 and edge % 1000 == 0 else str(edge)


# One 'left-right' label per bin, in the same order as the bars
editcount_bin_labels = [
    f'{_format_bin_edge(interval.left)}-{_format_bin_edge(interval.right)}'
    for interval in editcount_bin_counts.index
]

uec_plot = editcount_bin_counts.plot(kind='bar')

uec_plot.set_ylabel('No. of Editors')
uec_plot.set_title('Editors by edit count groupings')
uec_plot.set_xticklabels(editcount_bin_labels);

In [None]:
# TODO (future work): break down edit count by registration date

#### user_reg

In [None]:
# Strip plot of registration year-month (reg_YM) for returning participants from SSA
sns.stripplot(
    y='reg_YM',
    data=df_r_ssa,
    jitter=True,
);

In [None]:
# Strip plot of registration year-month (reg_YM) for all returning participants
sns.stripplot(
    y='reg_YM',
    data=df_all_r,
    jitter=True,
);

##### newcomers during the campaign times

In [None]:
# SSA returners whose registration year string lies in '2019'..'2020' (inclusive)
campaign_years = df_r_ssa['reg_Y'].between('2019', '2020')
newcomers = df_r_ssa.loc[campaign_years]
newcomers['user_name'].nunique()

In [None]:
# Newcomers as a percentage of the unique SSA returners
newcomers['user_name'].nunique() / len(up_r_ssa) * 100

#### join with campaign info

In [None]:
# Peek at three random rows of the campaign participants table
all_2019_2020_participants.sample(n=3)

In [None]:
# Unique user names of all returners and of SSA returners, as plain lists
all_rs_un = pd.unique(df_all_r['user_name']).tolist()
all_ssas_un = pd.unique(df_r_ssa['user_name']).tolist()

In [None]:
# Campaign participation rows whose username appears among the returners / SSA returners
in_all_rs = all_2019_2020_participants['username'].isin(all_rs_un)
in_all_ssas = all_2019_2020_participants['username'].isin(all_ssas_un)

all_rs = all_2019_2020_participants.loc[in_all_rs]
all_ssas = all_2019_2020_participants.loc[in_all_ssas]

In [None]:
# Number of distinct campaigns per username, for all returners
cc_all_r = (
    pd.pivot_table(
        all_rs,
        index=["username"],
        values=["campaign"],
        aggfunc=pd.Series.nunique,
    )
    .reset_index()
    .rename(columns={'campaign': 'campaign_counts'})
)

# Number of distinct campaigns per username, for SSA returners
cc_ssa_r = (
    pd.pivot_table(
        all_ssas,
        index=["username"],
        values=["campaign"],
        aggfunc=pd.Series.nunique,
    )
    .reset_index()
    .rename(columns={'campaign': 'campaign_counts'})
)

In [None]:
# Returners that took part in more than one campaign
multi_campaign_all = len(cc_all_r.loc[cc_all_r['campaign_counts'] > 1])

print("# of all r participants contributing to more than one campaign:", multi_campaign_all)
print("percent:", multi_campaign_all / df_all_r['user_name'].nunique() * 100)

# Bin edges for campaigns-per-user: 1..10, then 15, 20, 30, 50
campaign_bins = [*range(1, 11), 15, 20, 30, 50]

# Tally returners per campaigns-per-user bin.
# pd.cut uses right-closed intervals by default, so a count equal to the lowest
# edge or above the highest edge falls in no bin and is left out of the tally.
campaign_bin_all = pd.cut(cc_all_r['campaign_counts'], bins=campaign_bins)
n2 = campaign_bin_all.value_counts().reset_index()
n2

In [None]:
# SSA returners that took part in more than one campaign
multi_campaign_ssa = len(cc_ssa_r.loc[cc_ssa_r['campaign_counts'] > 1])

print("# of ssa participants contributing to more than one campaign:", multi_campaign_ssa)
print("percent:", multi_campaign_ssa / df_r_ssa['user_name'].nunique() * 100)

# Tally SSA returners per campaigns-per-user bin.
# pd.cut uses right-closed intervals by default, so a count equal to the lowest
# edge or above the highest edge falls in no bin and is left out of the tally.
campaign_bin_ssa = pd.cut(cc_ssa_r['campaign_counts'], bins=campaign_bins)
n3 = campaign_bin_ssa.value_counts().reset_index()
n3