In [1]:
import numpy as np
import matplotlib as plt
import seaborn as sns
import pandas as pd
import datetime
import scipy.sparse as sparse
from functools import reduce

In [2]:
def log_progress(sequence, every=None, size=None, name='Items'):
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )

In [3]:
participations = pd.read_json('../../data/raw/new-sci-participation-data.txt')

### Split the participations into train/val/test

In [4]:
len(participations)

42159

### Clean data

In [5]:
interaction_types = participations.groupby('type')['project'].agg('count').reset_index()
interaction_types = interaction_types.rename(index=str, columns={"project": "num_interactions"})

In [6]:
interaction_types

Unnamed: 0,type,num_interactions
0,Classification / Transcription,12074
1,Data collection,23717
2,Joined the project,3020
3,Participated,3348


In [7]:
# Remove participations with bad types
types = participations.groupby('type').size().reset_index()
participations = participations[participations['type'] != 'Removed a bookmark']
participations = participations[participations['type'] != 'Removed from dashboard']

In [8]:
print('Our new dataset has %d participations in it' % (len(participations)))

Our new dataset has 42159 participations in it


In [9]:
# Remove participations with NaN project_id
participations = participations[np.isfinite(participations['project'])]

In [10]:
len(set(participations['profile']))

4782

In [11]:
len(participations[participations['project'] == 92])

357

### Group by profile

We want to
1. Group by profile to get all profiles with at least 8 unique project interactions
2. Order the participations by 'when'
3. Get number of unique participations

In [12]:
# Get the profile ids for profiles that have more than 8 unique project interactions
profiles = participations.groupby('profile')['project'].nunique()
active_profiles = profiles[profiles >= 2]
print('So we have a total of %d profiles that have more than 8 project interactions' % (len(active_profiles)))

So we have a total of 896 profiles that have more than 8 project interactions


In [13]:
np.mean(active_profiles.sort_values(ascending=False))

2.3013392857142856

In [14]:
# Only select profiles that are in active_probiles
active_participations = participations[participations['profile'].isin(active_profiles.index)]
print('So we have a total of %d participations' % (len(active_participations)))

So we have a total of 17658 participations


In [15]:
# Get the first 80% of participations from all profile groups
grouping = active_participations.groupby('profile') #.apply(lambda x: x)
print('Our grouping has %d profiles' % (len(grouping)))

Our grouping has 896 profiles


In [16]:
train_grouping = grouping.apply(lambda x: x.sort_values('when', ascending=True).head(int(len(x)*0.8)))
print('So our train_grouping has %d participations' % (len(train_grouping)))

So our train_grouping has 13691 participations


In [17]:
val_test_grouping = grouping.apply(lambda x: x.sort_values('when', ascending=True).tail(int(len(x)*0.2)))
print('So our val_test_grouping has %d participations' % (len(val_test_grouping)))

So our val_test_grouping has 3145 participations


### Create our participation dataframes

In [18]:
# Create the training dataframe
train_participations = train_grouping.reset_index(drop=True)
print('So our train_cf has %d participations' % (len(train_participations)))

So our train_cf has 13691 participations


In [19]:
# Create our validation and testing dataframes
val_test_participations = val_test_grouping.reset_index(drop=True)
print('So our val_test_cf has %d participations' % (len(val_test_participations)))

# Randomly split our val_test set into validation and test set
mask = np.random.rand(len(val_test_participations)) <= 0.5
print('The length of our mask: %d' % (len(mask)))

val_participations = val_test_participations[mask]
test_participations = val_test_participations[~mask]
print('Our validation set contains: %d participations' % (len(val_participations)))
print('Our test set contains: %d participations' % (len(test_participations)))

So our val_test_cf has 3145 participations
The length of our mask: 3145
Our validation set contains: 1571 participations
Our test set contains: 1574 participations


### Create our CF Matrices

In [20]:
# Load in our projects dataset
projects = pd.read_pickle('../../data/processed/project_data')
project_ids = list(set(projects['project_id']))
project_ids.sort()
print('We have %d projects in our dataset' % (len(projects)))

We have 1781 projects in our dataset


In [21]:
profiles = active_profiles.index

In [22]:
print('And we have %d profiles in our dataset' % (len(profiles)))

And we have 896 profiles in our dataset


In [23]:
# Create pandas dataframe with profiles as columns and projects as rows
cf = pd.DataFrame(columns=profiles, index=project_ids)

In [24]:
cf.shape

(1781, 896)

In [25]:
# Set all values in the dataframe to zero
for col in cf.columns:
    cf[col].values[:] = 0

In [26]:
# Create a dataframe for Train, Validation and Test
train_cf = cf.copy()
val_cf = cf.copy()
test_cf = cf.copy()

In [27]:
# Generate the train_cf matrix
train_projects_profiles = train_participations.groupby('project')['profile'].apply(set)
train_max_id = max(train_projects_profiles.index)

train_project_ids = [project_id for project_id in train_projects_profiles.index if project_id in train_cf.index]

for project_id in log_progress(train_project_ids):
    train_profiles_list = list(train_projects_profiles[project_id])
    project_id = int(project_id)

    train_cf.loc[project_id].loc[train_profiles_list] = 1

VBox(children=(HTML(value=''), IntProgress(value=0, max=27)))

In [28]:
print('Our training dataset has %d interactions in it' % (np.count_nonzero(train_cf.values)))

Our training dataset has 437 interactions in it


In [29]:
# Generate the val_cf matrix
val_projects_profiles = val_participations.groupby('project')['profile'].apply(set)
val_max_id = max(val_projects_profiles.index)

val_project_ids = [project_id for project_id in val_projects_profiles.index if project_id in val_cf.index]

for project_id in log_progress(val_project_ids):
    val_profiles_list = list(val_projects_profiles[project_id])
    project_id = int(project_id)

    val_cf.loc[project_id].loc[val_profiles_list] = 1

VBox(children=(HTML(value=''), IntProgress(value=0, max=12)))

In [30]:
print('Our validation dataset has %d interactions in it' % (np.count_nonzero(val_cf.values)))

Our validation dataset has 70 interactions in it


In [31]:
# Generate the test_cf matrix
test_projects_profiles = test_participations.groupby('project')['profile'].apply(set)
test_max_id = max(test_projects_profiles.index)

test_project_ids = [project_id for project_id in test_projects_profiles.index if project_id in test_cf.index]

for project_id in log_progress(test_project_ids):
    test_profiles_list = list(test_projects_profiles[project_id])
    project_id = int(project_id)

    test_cf.loc[project_id].loc[test_profiles_list] = 1

VBox(children=(HTML(value=''), IntProgress(value=0, max=11)))

In [32]:
print('Our test dataset has %d interactions in it' % (np.count_nonzero(test_cf.values)))

Our test dataset has 68 interactions in it


In [33]:
print('Our train_cf has size (%d, %d)' % (train_cf.shape[0], train_cf.shape[1]))
print('Our val_cf has size (%d, %d)' % (val_cf.shape[0], val_cf.shape[1]))
print('Our test_cf has size (%d, %d)' % (test_cf.shape[0], test_cf.shape[1]))

Our train_cf has size (1781, 896)
Our val_cf has size (1781, 896)
Our test_cf has size (1781, 896)


### Remove any projects or profiles that have all zero values in the training set

In [34]:
# Find zero columns in the training set
empty_profiles = train_cf.columns[(train_cf == 0).all()]
print('We have %d profiles that have interacted with no projects in the training set' % (len(empty_profiles)))

We have 552 profiles that have interacted with no projects in the training set


In [35]:
# Find zero rows
empty_projects_train = train_cf[train_cf.eq(0).all(1)].index
empty_projects_val = val_cf[val_cf.eq(0).all(1)].index
empty_projects_test = test_cf[test_cf.eq(0).all(1)].index

empty_projects = reduce(np.intersect1d, (empty_projects_train, empty_projects_val, empty_projects_test))

print('We have %d projects that have had no profile interactions in the training set' % (len(empty_projects_train)))
print('We have %d projects that have had no profile interactions in all sets' % (len(empty_projects)))

We have 1754 projects that have had no profile interactions in the training set
We have 1754 projects that have had no profile interactions in all sets


In [36]:
print('Our train_cf has size (%d, %d)' % (train_cf.shape[0], train_cf.shape[1]))
print('Our val_cf has size (%d, %d)' % (val_cf.shape[0], val_cf.shape[1]))
print('Our test_cf has size (%d, %d)' % (test_cf.shape[0], test_cf.shape[1]))

Our train_cf has size (1781, 896)
Our val_cf has size (1781, 896)
Our test_cf has size (1781, 896)


In [37]:
# Remove columns from dataframes
clean_train_cf = train_cf.drop(columns=empty_profiles)
clean_val_cf = val_cf.drop(columns=empty_profiles)
clean_test_cf = test_cf.drop(columns=empty_profiles)

In [38]:
# Remove rows from dataframes
clean_train_cf = clean_train_cf.drop(empty_projects)
clean_val_cf = clean_val_cf.drop(empty_projects)
clean_test_cf = clean_test_cf.drop(empty_projects)

In [39]:
print('Our train_cf has size (%d, %d)' % (clean_train_cf.shape[0], clean_train_cf.shape[1]))
print('Our val_cf has size (%d, %d)' % (clean_val_cf.shape[0], clean_val_cf.shape[1]))
print('Our test_cf has size (%d, %d)' % (clean_test_cf.shape[0], clean_test_cf.shape[1]))

Our train_cf has size (27, 344)
Our val_cf has size (27, 344)
Our test_cf has size (27, 344)


In [40]:
print('Our training dataset has %d interactions in it' % (np.count_nonzero(clean_train_cf.values)))
print('Our validation dataset has %d interactions in it' % (np.count_nonzero(clean_val_cf.values)))
print('Our test dataset has %d interactions in it' % (np.count_nonzero(clean_test_cf.values)))

Our training dataset has 437 interactions in it
Our validation dataset has 65 interactions in it
Our test dataset has 62 interactions in it


# Save Pickles

In [41]:
clean_train_cf.to_pickle('../../data/raw/new_train_cf.pkl')
clean_val_cf.to_pickle('../../data/raw/new_val_cf.pkl')
clean_test_cf.to_pickle('../../data/raw/new_test_cf.pkl')

# Save Pickles

In [42]:
train_sparse = sparse.csr_matrix(clean_train_cf.values.astype(int))
sparse.save_npz('../../data/raw/new_train_sparse.npz', train_sparse)

In [43]:
val_sparse = sparse.csr_matrix(clean_val_cf.values.astype(int))
sparse.save_npz('../../data/raw/new_val_sparse.npz', val_sparse)

In [44]:
test_sparse = sparse.csr_matrix(clean_test_cf.values.astype(int))
sparse.save_npz('../../data/raw/new_test_sparse.npz', test_sparse)

# Create new dataframes for profiles and projects

In [45]:
train_cf = pd.read_pickle('../../data/processed/new_train_cf.pkl')
profile_ids = train_cf.columns
train_labels = pd.DataFrame(profile_ids, index=np.arange(0, len(profile_ids)))
train_labels.to_pickle('../../data/processed/new_cf_profiles.pkl')

In [46]:
train_cf = pd.read_pickle('../../data/processed/new_train_cf.pkl')
project_ids = train_cf.index
train_labels = pd.DataFrame(project_ids, index=np.arange(0, len(project_ids)))
train_labels.to_pickle('../../data/processed/new_cf_projects.pkl')