In [1]:
import numpy as np
import matplotlib as plt
import seaborn as sns
import pandas as pd
import datetime
import scipy.sparse as sparse

In [2]:
def log_progress(sequence, every=None, size=None, name='Items'):
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )

In [3]:
participations = pd.read_pickle('../../data/raw/participation_data')

### Clean data

In [4]:
# Remove participations with bad types
types = participations.groupby('type').size().reset_index()
participations = participations[participations['type'] != 'Removed a bookmark']
participations = participations[participations['type'] != 'Removed from dashboard']

In [5]:
# Remove participations with NaN project_id
participations = participations[np.isfinite(participations['project'])]

### Group by profile

We want to
1. Group by profile to get all profiles with at least 8 unique project interactions
2. Order the participations by 'when'
3. Get number of unique participations

In [6]:
# Get the profile ids for profiles that have more than 8 unique project interactions
profiles = participations.groupby('profile')['project'].nunique()
active_profiles = profiles[profiles >= 8]
print('So we have a total of %d profiles that have more than 8 project interactions' % (len(active_profiles)))

So we have a total of 1032 profiles that have more than 8 project interactions


In [7]:
# Only select profiles that are in active_probiles
active_participations = participations[participations['profile'].isin(active_profiles.index)]
print('So we have a total of %d participations' % (len(active_participations)))

So we have a total of 164803 participations


## <font color='red'>The below cells are were the issues are coming from. It's not taking the first 80% of each group </font>

In [79]:
# Get the first 80% of participations from all profile groups
grouping = active_participations.groupby('profile') #.apply(lambda x: x)
print('Our grouping has %d profiles' % (len(grouping)))

Our grouping has 1032 profiles


In [80]:
train_grouping = grouping.apply(lambda x: x.sort_values('when', ascending=True).head(int(len(x)*0.8)))
print('So our train_grouping has %d participations' % (len(train_grouping)))

So our train_grouping has 131433 participations


In [81]:
val_test_grouping = grouping.apply(lambda x: x.sort_values('when', ascending=True).tail(int(len(x)*0.2)))
print('So our val_test_grouping has %d participations' % (len(val_test_grouping)))

So our val_test_grouping has 32558 participations


### Create our participation dataframes

In [82]:
# Create the training dataframe
train_participations = train_grouping.reset_index(drop=True)
print('So our train_cf has %d participations' % (len(train_participations)))

So our train_cf has 131433 participations


In [83]:
# Create our validation and testing dataframes
val_test_participations = val_test_grouping.reset_index(drop=True)
print('So our val_test_cf has %d participations' % (len(val_test_participations)))

# Randomly split our val_test set into validation and test set
mask = np.random.rand(len(val_test_participations)) <= 0.5
print('The length of our mask: %d' % (len(mask)))

val_participations = val_test_participations[mask]
test_participations = val_test_participations[~mask]
print('Our validation set contains: %d participations' % (len(val_participations)))
print('Our test set contains: %d participations' % (len(test_participations)))

So our val_test_cf has 32558 participations
The length of our mask: 32558
Our validation set contains: 16306 participations
Our test set contains: 16252 participations


### Create our CF Matrices

In [12]:
# Load in our projects dataset
projects = pd.read_pickle('../../data/processed/project_data')
project_ids = list(set(projects['project_id']))
project_ids.sort()
print('We have %d projects in our dataset' % (len(projects)))

We have 1781 projects in our dataset


In [13]:
profiles = active_profiles.index

In [14]:
print('And we have %d profiles in our dataset' % (len(profiles)))

And we have 1032 profiles in our dataset


In [15]:
# Create pandas dataframe with profiles as columns and projects as rows
cf = pd.DataFrame(columns=profiles, index=project_ids)

In [16]:
cf.shape

(1781, 1032)

In [17]:
# Set all values in the dataframe to zero
for col in cf.columns:
    cf[col].values[:] = 0

In [18]:
# Create a dataframe for Train, Validation and Test
train_cf = cf.copy()
val_cf = cf.copy()
test_cf = cf.copy()

In [19]:
# Generate the train_cf matrix
train_projects_profiles = train_participations.groupby('project')['profile'].apply(set)
train_max_id = max(train_projects_profiles.index)

train_project_ids = [project_id for project_id in train_projects_profiles.index if project_id in train_cf.index]

for project_id in log_progress(train_project_ids):
    train_profiles_list = list(train_projects_profiles[project_id])
    project_id = int(project_id)

    train_cf.loc[project_id].loc[train_profiles_list] = 1

VBox(children=(HTML(value=''), IntProgress(value=0, max=1029)))

In [20]:
print('Our training dataset has %d interactions in it' % (np.count_nonzero(train_cf.values)))

Our training dataset has 16345 interactions in it


In [21]:
# Generate the val_cf matrix
val_projects_profiles = val_participations.groupby('project')['profile'].apply(set)
val_max_id = max(val_projects_profiles.index)

val_project_ids = [project_id for project_id in val_projects_profiles.index if project_id in val_cf.index]

for project_id in log_progress(val_project_ids):
    val_profiles_list = list(val_projects_profiles[project_id])
    project_id = int(project_id)

    val_cf.loc[project_id].loc[val_profiles_list] = 1

VBox(children=(HTML(value=''), IntProgress(value=0, max=583)))

In [22]:
print('Our validation dataset has %d interactions in it' % (np.count_nonzero(val_cf.values)))

Our validation dataset has 2405 interactions in it


In [23]:
# Generate the test_cf matrix
test_projects_profiles = test_participations.groupby('project')['profile'].apply(set)
test_max_id = max(test_projects_profiles.index)

test_project_ids = [project_id for project_id in test_projects_profiles.index if project_id in test_cf.index]

for project_id in log_progress(test_project_ids):
    test_profiles_list = list(test_projects_profiles[project_id])
    project_id = int(project_id)

    test_cf.loc[project_id].loc[test_profiles_list] = 1

VBox(children=(HTML(value=''), IntProgress(value=0, max=603)))

In [24]:
print('Our test dataset has %d interactions in it' % (np.count_nonzero(test_cf.values)))

Our test dataset has 2424 interactions in it


In [25]:
print('Our train_cf has size (%d, %d)' % (train_cf.shape[0], train_cf.shape[1]))
print('Our val_cf has size (%d, %d)' % (val_cf.shape[0], val_cf.shape[1]))
print('Our test_cf has size (%d, %d)' % (test_cf.shape[0], test_cf.shape[1]))

Our train_cf has size (1781, 1032)
Our val_cf has size (1781, 1032)
Our test_cf has size (1781, 1032)


### Remove any projects or profiles that have all zero values in the training set

In [26]:
# Find zero columns in the training set
empty_profiles = train_cf.columns[(train_cf == 0).all()]
print('We have %d profiles that have interacted with no projects in the training set' % (len(empty_profiles)))

We have 152 profiles that have interacted with no projects in the training set


In [27]:
# Find zero rows
empty_projects = train_cf[train_cf.eq(0).all(1)].index
print('We have %d projects that have had no profile interactions in the training set' % (len(empty_projects)))

We have 752 projects that have had no profile interactions in the training set


In [28]:
print('Our train_cf has size (%d, %d)' % (train_cf.shape[0], train_cf.shape[1]))
print('Our val_cf has size (%d, %d)' % (val_cf.shape[0], val_cf.shape[1]))
print('Our test_cf has size (%d, %d)' % (test_cf.shape[0], test_cf.shape[1]))

Our train_cf has size (1781, 1032)
Our val_cf has size (1781, 1032)
Our test_cf has size (1781, 1032)


In [29]:
# Remove columns from dataframes
#clean_train_cf = train_cf.drop(columns=empty_profiles)
#clean_val_cf = val_cf.drop(columns=empty_profiles)
#clean_test_cf = test_cf.drop(columns=empty_profiles)

In [30]:
# Remove rows from dataframes
clean_train_cf = train_cf.drop(empty_projects)
clean_val_cf = val_cf.drop(empty_projects)
clean_test_cf = test_cf.drop(empty_projects)

In [31]:
print('Our train_cf has size (%d, %d)' % (clean_train_cf.shape[0], clean_train_cf.shape[1]))
print('Our val_cf has size (%d, %d)' % (clean_val_cf.shape[0], clean_val_cf.shape[1]))
print('Our test_cf has size (%d, %d)' % (clean_test_cf.shape[0], clean_test_cf.shape[1]))

Our train_cf has size (1029, 1032)
Our val_cf has size (1029, 1032)
Our test_cf has size (1029, 1032)


In [32]:
print('Our training dataset has %d interactions in it' % (np.count_nonzero(clean_train_cf.values)))
print('Our validation dataset has %d interactions in it' % (np.count_nonzero(clean_val_cf.values)))
print('Our test dataset has %d interactions in it' % (np.count_nonzero(clean_test_cf.values)))

Our training dataset has 16345 interactions in it
Our validation dataset has 2371 interactions in it
Our test dataset has 2386 interactions in it


## Save Pickles

In [33]:
clean_train_cf.to_pickle('../../data/raw/train_cf.pkl')
clean_val_cf.to_pickle('../../data/raw/val_cf.pkl')
clean_test_cf.to_pickle('../../data/raw/test_cf.pkl')

## Save Sparse Matrices

In [34]:
train_sparse = sparse.csr_matrix(clean_train_cf.values.astype(int))
sparse.save_npz('train_sparse.npz', train_sparse)

In [35]:
val_sparse = sparse.csr_matrix(clean_val_cf.values.astype(int))
sparse.save_npz('val_sparse.npz', val_sparse)

In [36]:
test_sparse = sparse.csr_matrix(clean_test_cf.values.astype(int))
sparse.save_npz('test_sparse.npz', test_sparse)

# EXAMINE THINGS

In [84]:
train_profiles = set(train_participations['profile'])
val_profiles = set(val_participations['profile'])
test_profiles = set(test_participations['profile'])
print('We have %d profiles in the training set' % (len(train_profiles)))
print('We have %d profiles in the validation set' % (len(val_profiles)))
print('We have %d profiles in the testing set' % (len(test_profiles)))

We have 1032 profiles in the training set
We have 992 profiles in the validation set
We have 992 profiles in the testing set


In [85]:
set.intersection(train_profiles, val_profiles)

{'001bedb58aa43c8d3596b5b522ba1040',
 '007350803bc77f892db9c666fc106e95',
 '0075788f90e355569d544d9c63078c9e',
 '00be614f60877db80975ae8bb2f5f7a9',
 '00c7ab6e95719c9ac3d9e7877366f60c',
 '00d4005887bcce01071866724babfe28',
 '0132ed8d86c34003806ca57d9cb0980e',
 '01b5f42d8970e242737997104edbb3d4',
 '01fc03843e310b53fe00c76c6a2f1e60',
 '0325bc8e6012ad8af877ef9c2e725170',
 '0354226c8ac8b7b893545a547b52d7c3',
 '03774154ceb67d74620d4f480eca5149',
 '037d4760ed41c4eb227b20ce0564eae4',
 '04ee978ce79302c92d3cb43bddad4c6d',
 '0510761b7894865537070a1511629c99',
 '055d2fe39f4c06a8c6c3ae1c4c336d33',
 '05f1c31d77c6a2cd7c40603e95954f36',
 '06218b96c81734acf53ca0be9428758f',
 '06574bdbd790dd97626f87c1506b852f',
 '06576caa6169160034d4e3da3cebf6f7',
 '065e9247172f34e03db0bb142cb26217',
 '0663da67c65c818a9a8c7f9f2592a05a',
 '06c7d3c131d126653ab6be3621a2dd4c',
 '0724ccae6ca1f867209969c6d3b27438',
 '0750f298e9acb8f1f8092a839b6b5a52',
 '07572e4833389ec4d8e5194b95710979',
 '07f91ce1ac65934c95a83f78ec82d65e',
 

In [86]:
set.intersection(train_profiles, test_profiles)

{'001bedb58aa43c8d3596b5b522ba1040',
 '007350803bc77f892db9c666fc106e95',
 '0075788f90e355569d544d9c63078c9e',
 '009e5dfe604e3dc2568717dccc438392',
 '00be614f60877db80975ae8bb2f5f7a9',
 '00c7ab6e95719c9ac3d9e7877366f60c',
 '00d4005887bcce01071866724babfe28',
 '0132ed8d86c34003806ca57d9cb0980e',
 '01b5f42d8970e242737997104edbb3d4',
 '01fc03843e310b53fe00c76c6a2f1e60',
 '0325bc8e6012ad8af877ef9c2e725170',
 '0354226c8ac8b7b893545a547b52d7c3',
 '03774154ceb67d74620d4f480eca5149',
 '037d4760ed41c4eb227b20ce0564eae4',
 '04ee978ce79302c92d3cb43bddad4c6d',
 '0510761b7894865537070a1511629c99',
 '055d2fe39f4c06a8c6c3ae1c4c336d33',
 '05f1c31d77c6a2cd7c40603e95954f36',
 '06218b96c81734acf53ca0be9428758f',
 '06574bdbd790dd97626f87c1506b852f',
 '06576caa6169160034d4e3da3cebf6f7',
 '0663da67c65c818a9a8c7f9f2592a05a',
 '06c7d3c131d126653ab6be3621a2dd4c',
 '0724ccae6ca1f867209969c6d3b27438',
 '0750f298e9acb8f1f8092a839b6b5a52',
 '07572e4833389ec4d8e5194b95710979',
 '07f91ce1ac65934c95a83f78ec82d65e',
 