In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

# Process Participation Data

### Load the Participation Data

In [2]:
participation = pd.read_pickle('../../data/raw/participation_data')

In [3]:
participation.head()

Unnamed: 0,authenticated,duration,extra,index_,magnitude,origin,profile,profile_origin,profile_referrer,profile_utm_campaign,profile_utm_content,profile_utm_medium,profile_utm_source,profile_utm_term,project,repetitions,type,when,where
0,True,0,,1,1,Unspecified,c3174748ab29f73d8c6226d0c2171aeb,,,,,,,,25.0,1,Participated,2016-07-22 14:07:43,
1,True,0,,2,1,Unspecified,c3174748ab29f73d8c6226d0c2171aeb,,,,,,,,25.0,1,Participated,2016-08-26 17:43:54,
2,True,0,,3,1,Unspecified,59d1c4ccba844b6f4722d2967c531441,,,,,,,,25.0,1,Participated,2016-08-26 17:46:32,
3,True,0,,4,1,Unspecified,59d1c4ccba844b6f4722d2967c531441,,,,,,,,25.0,1,Participated,2016-08-26 17:48:18,
4,True,0,,5,1,Unspecified,59d1c4ccba844b6f4722d2967c531441,,,,,,,,25.0,1,Participated,2016-08-28 23:57:25,


In [4]:
participation.describe()

Unnamed: 0,duration,index_,magnitude,project,repetitions
count,1474748.0,1474748.0,1474748.0,1261569.0,1474748.0
mean,0.0,737375.1,17.46123,1369.494,1.0
std,0.0,425723.6,17.83508,2715.116,0.0
min,0.0,1.0,0.0,4.0,1.0
25%,0.0,368687.8,1.0,1014.0,1.0
50%,0.0,737375.5,1.0,1014.0,1.0
75%,0.0,1106062.0,39.0,1014.0,1.0
max,0.0,1474749.0,40.0,20655.0,1.0


### One-Hot Encoding the type field

#### Clean the values

In [5]:
def reformat_type(text):
    return text.replace(' ', '_').replace('/', 'or')

participation['type'] = participation['type'].apply(reformat_type)

#### Add new one-hot encoding for type column

In [6]:
type_dataframe = pd.get_dummies(participation['type'],prefix='type')

In [7]:
participation = participation.join(type_dataframe)

In [8]:
participation.head()

Unnamed: 0,authenticated,duration,extra,index_,magnitude,origin,profile,profile_origin,profile_referrer,profile_utm_campaign,...,type_Marked_themselves_a_participant,type_Negate_previous_event,type_OAuth_Login,type_Participated,type_Rated_the_project,type_Removed_a_bookmark,type_Removed_from_dashboard,type_Search_in_finder,type_Started_a_project,type_Viewed_a_profile
0,True,0,,1,1,Unspecified,c3174748ab29f73d8c6226d0c2171aeb,,,,...,0,0,0,1,0,0,0,0,0,0
0,True,0,,1,1,Unspecified,c3174748ab29f73d8c6226d0c2171aeb,,,,...,0,0,0,0,0,0,0,0,0,0
0,True,0,,1,1,Unspecified,c3174748ab29f73d8c6226d0c2171aeb,,,,...,0,0,0,1,0,0,0,0,0,0
0,True,0,,537250,39,Unspecified,5aab8db0481044218e02eb042e95dd8d,,https://blog.eyesonalz.com/top-10-citizenscien...,,...,0,0,0,1,0,0,0,0,0,0
0,True,0,,537250,39,Unspecified,5aab8db0481044218e02eb042e95dd8d,,https://blog.eyesonalz.com/top-10-citizenscien...,,...,0,0,0,0,0,0,0,0,0,0


### Explore the participation

In [9]:
participation.describe()

Unnamed: 0,duration,index_,magnitude,project,repetitions,type_Added_a_bookmark,type_Added_to_dashboard,type_Asked_a_question,type_Bookmarked_the_project,type_Classification_or_Transcription,...,type_Marked_themselves_a_participant,type_Negate_previous_event,type_OAuth_Login,type_Participated,type_Rated_the_project,type_Removed_a_bookmark,type_Removed_from_dashboard,type_Search_in_finder,type_Started_a_project,type_Viewed_a_profile
count,3045646.0,3045646.0,3045646.0,2625379.0,3045646.0,3045646.0,3045646.0,3045646.0,3045646.0,3045646.0,...,3045646.0,3045646.0,3045646.0,3045646.0,3045646.0,3045646.0,3045646.0,3045646.0,3045646.0,3045646.0
mean,0.0,563486.5,15.10397,1346.708,1.0,0.01666083,0.01010295,1.05068e-05,0.01734312,0.418129,...,0.002926801,0.001761531,2.758036e-05,0.3278503,0.0001172165,0.001208611,0.0004714927,0.1273139,3.283376e-07,0.008619518
std,0.0,380416.2,17.44207,2537.021,0.0,0.1279971,0.1000044,0.003241403,0.1305463,0.4932517,...,0.0540207,0.04193362,0.005251629,0.46943,0.01082602,0.03474407,0.02170877,0.3333243,0.0005730075,0.09244038
min,0.0,1.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,253804.2,1.0,1014.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,513211.0,1.0,1014.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,775028.0,37.0,1014.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0,1474749.0,40.0,20655.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Drop some columns

In [10]:
participation = participation.drop(columns=['duration', 'repetitions'])    

In [11]:
participation.describe()

Unnamed: 0,index_,magnitude,project,type_Added_a_bookmark,type_Added_to_dashboard,type_Asked_a_question,type_Bookmarked_the_project,type_Classification_or_Transcription,type_Clicked_to_join_the_project,type_Data_collection,...,type_Marked_themselves_a_participant,type_Negate_previous_event,type_OAuth_Login,type_Participated,type_Rated_the_project,type_Removed_a_bookmark,type_Removed_from_dashboard,type_Search_in_finder,type_Started_a_project,type_Viewed_a_profile
count,3045646.0,3045646.0,2625379.0,3045646.0,3045646.0,3045646.0,3045646.0,3045646.0,3045646.0,3045646.0,...,3045646.0,3045646.0,3045646.0,3045646.0,3045646.0,3045646.0,3045646.0,3045646.0,3045646.0,3045646.0
mean,563486.5,15.10397,1346.708,0.01666083,0.01010295,1.05068e-05,0.01734312,0.418129,0.02534766,0.007501528,...,0.002926801,0.001761531,2.758036e-05,0.3278503,0.0001172165,0.001208611,0.0004714927,0.1273139,3.283376e-07,0.008619518
std,380416.2,17.44207,2537.021,0.1279971,0.1000044,0.003241403,0.1305463,0.4932517,0.1571788,0.08628591,...,0.0540207,0.04193362,0.005251629,0.46943,0.01082602,0.03474407,0.02170877,0.3333243,0.0005730075,0.09244038
min,1.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,253804.2,1.0,1014.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,513211.0,1.0,1014.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,775028.0,37.0,1014.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1474749.0,40.0,20655.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Group by Project and Profile

In [12]:
profile_project_interactions = participation.groupby(['profile','project']).size().reset_index()

In [13]:
profile_project_interactions['num_interactions'] = profile_project_interactions[0]
profile_project_interactions = profile_project_interactions.drop(columns=0)

In [14]:
profile_project_interactions.head()

Unnamed: 0,profile,project,num_interactions
0,000073c3675ea9a1d0fe0ee3ca57e2bf,413.0,6
1,0002b85e757486c6d80ed6f73f465eaa,16864.0,3
2,0003a41bbdb3371df4c1829913f17537,19794.0,1
3,000436aaa487e461e6e16e02ab3e89eb,659.0,3
4,000476d8680db78d75b3b9edefc4a6d2,1510.0,1


In [15]:
profile_project_interactions.sort_values(by='num_interactions', ascending=False)

Unnamed: 0,profile,project,num_interactions
35719,5aab8db0481044218e02eb042e95dd8d,1014.0,455319
95477,f512f2afac71936a18d212929f30a9db,1014.0,253989
4465,0bb971b361b9fcc5c62f107149901df6,1014.0,202287
64357,a5015692c9fabeed95ff84469768ef20,1014.0,173026
65897,a900f4940eb7a493c3ed1239636e3438,1014.0,156110
15782,299a04f44bdef72485d628557c056d8a,1014.0,100175
20241,3492d8f7af1fd35a3fcd2e91e94ac301,1014.0,96965
8262,1546695d482d49e51146efaab7fae423,1014.0,85040
15630,291f5dc8c587ad44a093f001d71627a8,1014.0,72018
55972,8f52d4c68da61cc0dab830ed40fd970e,1014.0,46488
