In [1]:
import numpy as np
import matplotlib as plt
import seaborn as sns
import pandas as pd
import datetime

In [2]:
participations = pd.read_pickle('../../data/raw/participation_data')

## Remove Participations with bad types

In [3]:
types = participations.groupby('type').size().reset_index()
types

Unnamed: 0,type,0
0,Added a bookmark,24177
1,Added to dashboard,15281
2,Asked a question,32
3,Bookmarked the project,24917
4,Classification / Transcription,719976
5,Clicked to join the project,37274
6,Data collection,18207
7,Joined the project,82906
8,Marked themselves a participant,3216
9,Negate previous event,2708


In [4]:
participations = participations[participations['type'] != 'Removed a bookmark']
participations = participations[participations['type'] != 'Removed from dashboard']

In [5]:
len(participations)

1472257

## Split into Train, Val, Test

In [6]:
min(participations['when'])

'1996-04-01 00:00:00'

In [7]:
max(participations['when'])

'2018-08-31 21:46:45'

In [8]:
participations = participations.sort_values(by=['when'])

In [9]:
# Create our training test set
split_idx = int(len(participations)*0.8)
train_participations = participations.iloc[:split_idx]
val_test_participations = participations.iloc[split_idx:]
print('Our training set contains: %d participations' % (len(train_participations)))
print('Our validation and test sets together contain: %d participations' % (len(val_test_participations)))

Our training set contains: 1177805 participations
Our validation and test sets together contain: 294452 participations


In [11]:
# Randomly split our val_test set into validation and test set
mask = np.random.rand(len(val_test_participations)) <= 0.5
print('The length of our mask: %d' % (len(mask)))

val_participations = val_test_participations[mask]
test_participations = val_test_participations[~mask]
print('Our validation set contains: %d participations' % (len(val_participations)))
print('Our test set contains: %d participations' % (len(test_participations)))

The length of our mask: 294452
Our validation set contains: 146995 participations
Our test set contains: 147457 participations


## Generate the CF-Matrices

In [12]:
# Generate a list of profiles (*note* we are only going to use profiles that appear in the test set)
profiles = list(set(train_participations['profile']))
profiles.sort()
print('We have %d profiles in our dataset' % (len(profiles)))

We have 56967 profiles in our dataset


In [13]:
# Load in our projects dataset
projects = pd.read_pickle('../../data/processed/project_data')
project_ids = list(set(projects['project_id']))
project_ids.sort()
print('We have %d projects in our dataset' % (len(projects)))

We have 1781 projects in our dataset


In [14]:
# Create pandas dataframe with profiles as columns and projects as rows
cf = pd.DataFrame(columns=profiles, index=project_ids)

In [15]:
# Set all values in the dataframe to zero
for col in cf.columns:
    cf[col].values[:] = 0

In [16]:
cf.loc[:5].iloc[:5]

Unnamed: 0,000073c3675ea9a1d0fe0ee3ca57e2bf,0002b85e757486c6d80ed6f73f465eaa,0003a41bbdb3371df4c1829913f17537,000436aaa487e461e6e16e02ab3e89eb,000476d8680db78d75b3b9edefc4a6d2,0004853e76f038acb207adaa98cc4afb,00060e78dfb9091883494f288a1e20e4,0006e4b3c3848d18512fdf38df7b0a60,0007997ece92bc1a4ea4e570093da014,0008be86a1b78ca8b274eeadc7c172f2,...,ffec8cb90395feaab9d1ad8a71ab032c,ffedec8c17ec6f24dfa94dfc0faf68f1,ffee921319221a4453992c401ff65aab,fff207eb43c06973fc7b6b75b9b57856,fff533919de6cb46896be4fc079cc6a2,fff5a2b4907aac15f8dead80fec10f8c,fff9d5d756cb28d954fc1e7252585b55,fffa095bfa6c965df8fb14d4aa513397,fffa7925d905995c73168e4f710a852b,fffe67794536df60188d62d0b8b5b0ee
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Create a dataframe for Train, Validation and Test
train_cf = cf.copy()
val_cf = cf.copy()
test_cf = cf.copy()

In [38]:
# Method for generating the CF-Matrix
def generate_cf_matrix(this_cf, this_participations):
    projects_profiles = this_participations.groupby('project')['profile'].apply(set)
    max_id = max(projects_profiles.index)
    
    this_project_ids = [project_id for project_id in projects_profiles.index if project_id in project_ids]
    
    for project_id in this_project_ids:
        print('%d / %d' % (project_id, max_id))
        
        profiles_list = list(projects_profiles[project_id])
        project_id = int(project_id)
        
        this_cf.loc[project_id].loc[profiles_list] = 1
    return this_cf

In [33]:
train_cf = generate_cf_matrix(train_cf, train_participations)

4 / 20022
5 / 20022
6 / 20022
7 / 20022
8 / 20022
19 / 20022
20 / 20022
22 / 20022
23 / 20022
24 / 20022
25 / 20022
26 / 20022
27 / 20022
28 / 20022
29 / 20022
30 / 20022
31 / 20022
32 / 20022
33 / 20022
34 / 20022
35 / 20022
36 / 20022
37 / 20022
38 / 20022
39 / 20022
40 / 20022
41 / 20022
42 / 20022
43 / 20022
44 / 20022
46 / 20022
47 / 20022
48 / 20022
49 / 20022
50 / 20022
51 / 20022
52 / 20022
53 / 20022
54 / 20022
55 / 20022
57 / 20022
59 / 20022
92 / 20022
93 / 20022
94 / 20022
95 / 20022
96 / 20022
97 / 20022
101 / 20022
102 / 20022
103 / 20022
104 / 20022
106 / 20022
107 / 20022
108 / 20022
110 / 20022
111 / 20022
112 / 20022
113 / 20022
114 / 20022
115 / 20022
116 / 20022
117 / 20022
119 / 20022
120 / 20022
121 / 20022
125 / 20022
133 / 20022
134 / 20022
135 / 20022
136 / 20022
137 / 20022
142 / 20022
143 / 20022
144 / 20022
145 / 20022
150 / 20022
164 / 20022
165 / 20022
168 / 20022
169 / 20022
184 / 20022
185 / 20022
187 / 20022
200 / 20022
205 / 20022
206 / 20022
209 / 200

1009 / 20022
1010 / 20022
1011 / 20022
1012 / 20022
1014 / 20022
1015 / 20022
1017 / 20022
1018 / 20022
1019 / 20022
1020 / 20022
1021 / 20022
1024 / 20022
1025 / 20022
1026 / 20022
1027 / 20022
1028 / 20022
1032 / 20022
1033 / 20022
1034 / 20022
1035 / 20022
1036 / 20022
1037 / 20022
1038 / 20022
1039 / 20022
1040 / 20022
1041 / 20022
1042 / 20022
1043 / 20022
1044 / 20022
1046 / 20022
1047 / 20022
1049 / 20022
1050 / 20022
1051 / 20022
1052 / 20022
1053 / 20022
1054 / 20022
1055 / 20022
1057 / 20022
1058 / 20022
1059 / 20022
1061 / 20022
1062 / 20022
1063 / 20022
1064 / 20022
1065 / 20022
1066 / 20022
1068 / 20022
1069 / 20022
1075 / 20022
1076 / 20022
1078 / 20022
1079 / 20022
1081 / 20022
1082 / 20022
1085 / 20022
1086 / 20022
1087 / 20022
1088 / 20022
1089 / 20022
1096 / 20022
1108 / 20022
1113 / 20022
1114 / 20022
1115 / 20022
1116 / 20022
1119 / 20022
1120 / 20022
1121 / 20022
1124 / 20022
1126 / 20022
1127 / 20022
1128 / 20022
1129 / 20022
1132 / 20022
1134 / 20022
1137 / 20022

17770 / 20022
17786 / 20022
17793 / 20022
17797 / 20022
17818 / 20022
17822 / 20022
17828 / 20022
17833 / 20022
17848 / 20022
17862 / 20022
17873 / 20022
17875 / 20022
17900 / 20022
17911 / 20022
17913 / 20022
17914 / 20022
17925 / 20022
17929 / 20022
17940 / 20022
17948 / 20022
17957 / 20022
17964 / 20022
17969 / 20022
17972 / 20022
17973 / 20022
17974 / 20022
17979 / 20022
17993 / 20022
17995 / 20022
17998 / 20022
18005 / 20022
18010 / 20022
18034 / 20022
18060 / 20022
18066 / 20022
18069 / 20022
18079 / 20022
18088 / 20022
18106 / 20022
18111 / 20022
18115 / 20022
18116 / 20022
18120 / 20022
18121 / 20022
18126 / 20022
18134 / 20022
18136 / 20022
18143 / 20022
18153 / 20022
18155 / 20022
18162 / 20022
18190 / 20022
18191 / 20022
18195 / 20022
18197 / 20022
18206 / 20022
18220 / 20022
18234 / 20022
18247 / 20022
18251 / 20022
18259 / 20022
18260 / 20022
18275 / 20022
18283 / 20022
18289 / 20022
18290 / 20022
18297 / 20022
18299 / 20022
18304 / 20022
18310 / 20022
18324 / 20022
18331 

In [34]:
# Find zero columns in the training set
empty_profiles = train_cf.columns[(train_cf == 0).all()]
print('We have %d profiles that have interacted with no projects' % (len(empty_profiles)))

In [37]:
# Find zero rows
empty_projects = (train_cf==0).all(axis=1)
print('We have %d projects that have had no profile interactions' % (len(empty_projects)))

We have 1781 projects that have had no profile interactions
