In [1]:
import numpy as np
import matplotlib as plt
import seaborn as sns
import pandas as pd
import datetime
import scipy.sparse as sparse

In [2]:
def log_progress(sequence, every=None, size=None, name='Items'):
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )

In [3]:
participations = pd.read_pickle('../../data/raw/participation_data')

### Clean data

In [4]:
# Remove participations with bad types
types = participations.groupby('type').size().reset_index()
participations = participations[participations['type'] != 'Removed a bookmark']
participations = participations[participations['type'] != 'Removed from dashboard']

In [9]:
# Remove participations with NaN project_id
participations = participations[np.isfinite(participations['project'])]

### Group by profile

We want to
1. Group by profile to get all profiles with at least 8 unique project interactions
2. Order the participations by 'when'
3. Get number of unique participations

In [97]:
# Get the profile ids for profiles that have more than 8 unique project interactions
profiles = participations.groupby('profile')['project'].nunique()
active_profiles = profiles[profiles >= 8]
print('So we have a total of %d profiles that have more than 8 project interactions' % (len(active_profiles)))

So we have a total of 1032 profiles that have more than 8 project interactions


In [40]:
# Only select profiles that are in active_probiles
active_participations = participations[participations['profile'].isin(active_profiles.index)]
print('So we have a total of %d participations' % (len(active_participations)))

So we have a total of 164803 participations


In [44]:
# Get the first 80% of participations from all profile groups
grouping = active_participations.groupby('profile').apply(lambda x: x.sort_values('when'))
train_grouping = grouping.apply(lambda x: x.head(int(len(x)*0.8)))
print('So our train_grouping has %d participations' % (len(train_grouping)))

So our train_grouping has 131842 participations


In [78]:
val_test_grouping = grouping.apply(lambda x: x.tail(int(len(x)*0.2)))
print('So our val_test_grouping has %d participations' % (len(val_test_grouping)))

So our val_test_grouping has 32960 participations


### Create our participation dataframes

In [81]:
# Create the training dataframe
train_participations = train_grouping.reset_index(drop=True)
print('So our train_cf has %d participations' % (len(train_cf)))

So our train_cf has 131842 participations


In [82]:
# Create our validation and testing dataframes
val_test_participations = val_test_grouping.reset_index(drop=True)
print('So our val_test_cf has %d participations' % (len(val_test_cf)))

# Randomly split our val_test set into validation and test set
mask = np.random.rand(len(val_test_participations)) <= 0.5
print('The length of our mask: %d' % (len(mask)))

val_participations = val_test_participations[mask]
test_participations = val_test_participations[~mask]
print('Our validation set contains: %d participations' % (len(val_participations)))
print('Our test set contains: %d participations' % (len(test_participations)))

So our val_test_cf has 32960 participations
The length of our mask: 32960
Our validation set contains: 16518 participations
Our test set contains: 16442 participations


### Create our CF Matrices

In [83]:
# Load in our projects dataset
projects = pd.read_pickle('../../data/processed/project_data')
project_ids = list(set(projects['project_id']))
project_ids.sort()
print('We have %d projects in our dataset' % (len(projects)))

We have 1781 projects in our dataset


In [108]:
profiles = active_profiles.index

In [109]:
# Create pandas dataframe with profiles as columns and projects as rows
cf = pd.DataFrame(columns=profiles, index=project_ids)

In [110]:
cf.shape

(1781, 1032)

In [111]:
# Set all values in the dataframe to zero
for col in cf.columns:
    cf[col].values[:] = 0

In [112]:
# Create a dataframe for Train, Validation and Test
train_cf = cf.copy()
val_cf = cf.copy()
test_cf = cf.copy()

In [113]:
# Generate the train_cf matrix
train_projects_profiles = train_participations.groupby('project')['profile'].apply(set)
train_max_id = max(train_projects_profiles.index)

train_project_ids = [project_id for project_id in train_projects_profiles.index if project_id in train_cf.index]

for project_id in log_progress(train_project_ids):
    train_profiles_list = list(train_projects_profiles[project_id])
    project_id = int(project_id)

    train_cf.loc[project_id].loc[train_profiles_list] = 1

VBox(children=(HTML(value=''), IntProgress(value=0, max=1029)))

In [114]:
# Generate the val_cf matrix
val_projects_profiles = val_participations.groupby('project')['profile'].apply(set)
val_max_id = max(val_projects_profiles.index)

val_project_ids = [project_id for project_id in val_projects_profiles.index if project_id in val_cf.index]

for project_id in log_progress(val_project_ids):
    val_profiles_list = list(val_projects_profiles[project_id])
    project_id = int(project_id)

    val_cf.loc[project_id].loc[val_profiles_list] = 1

VBox(children=(HTML(value=''), IntProgress(value=0, max=593)))

In [115]:
# Generate the test_cf matrix
test_projects_profiles = test_participations.groupby('project')['profile'].apply(set)
test_max_id = max(test_projects_profiles.index)

test_project_ids = [project_id for project_id in test_projects_profiles.index if project_id in test_cf.index]

for project_id in log_progress(test_project_ids):
    test_profiles_list = list(test_projects_profiles[project_id])
    project_id = int(project_id)

    test_cf.loc[project_id].loc[test_profiles_list] = 1

VBox(children=(HTML(value=''), IntProgress(value=0, max=603)))

In [116]:
print('Our train_cf has size (%d, %d)' % (train_cf.shape[0], train_cf.shape[1]))
print('Our val_cf has size (%d, %d)' % (val_cf.shape[0], val_cf.shape[1]))
print('Our test_cf has size (%d, %d)' % (test_cf.shape[0], test_cf.shape[1]))

Our train_cf has size (1781, 1032)
Our val_cf has size (1781, 1032)
Our test_cf has size (1781, 1032)


### Remove any projects or profiles that have all zero values in the training set

In [117]:
# Find zero columns in the training set
empty_profiles = train_cf.columns[(train_cf == 0).all()]
print('We have %d profiles that have interacted with no projects in the training set' % (len(empty_profiles)))

We have 152 profiles that have interacted with no projects in the training set


In [118]:
# Find zero rows
empty_projects = train_cf[train_cf.eq(0).all(1)].index
print('We have %d projects that have had no profile interactions in the training set' % (len(empty_projects)))

We have 752 projects that have had no profile interactions in the training set


In [119]:
print('Our train_cf has size (%d, %d)' % (train_cf.shape[0], train_cf.shape[1]))
print('Our val_cf has size (%d, %d)' % (val_cf.shape[0], val_cf.shape[1]))
print('Our test_cf has size (%d, %d)' % (test_cf.shape[0], test_cf.shape[1]))

Our train_cf has size (1781, 1032)
Our val_cf has size (1781, 1032)
Our test_cf has size (1781, 1032)


In [120]:
# Remove columns from dataframes
train_cf = train_cf.drop(columns=empty_profiles)
val_cf = val_cf.drop(columns=empty_profiles)
test_cf = test_cf.drop(columns=empty_profiles)

In [121]:
# Remove rows from dataframes
train_cf = train_cf.drop(empty_projects)
val_cf = val_cf.drop(empty_projects)
test_cf = test_cf.drop(empty_projects)

In [122]:
print('Our train_cf has size (%d, %d)' % (train_cf.shape[0], train_cf.shape[1]))
print('Our val_cf has size (%d, %d)' % (val_cf.shape[0], val_cf.shape[1]))
print('Our test_cf has size (%d, %d)' % (test_cf.shape[0], test_cf.shape[1]))

Our train_cf has size (1029, 880)
Our val_cf has size (1029, 880)
Our test_cf has size (1029, 880)


## Save Pickles

In [123]:
train_cf.to_pickle('../../data/raw/train_cf.pkl')
val_cf.to_pickle('../../data/raw/val_cf.pkl')
test_cf.to_pickle('../../data/raw/test_cf.pkl')

## Save Sparse Matrices

In [124]:
train_sparse = sparse.csr_matrix(train_cf.values.astype(int))
sparse.save_npz('train_sparse.npz', train_sparse)

In [125]:
val_sparse = sparse.csr_matrix(val_cf.values.astype(int))
sparse.save_npz('val_sparse.npz', val_sparse)

In [126]:
test_sparse = sparse.csr_matrix(test_cf.values.astype(int))
sparse.save_npz('test_sparse.npz', test_sparse)