# Create matrix for relationships between users and projects

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import json
from pandas.io.json import json_normalize
from scipy import sparse

### Group by project and profile

In [2]:
# Load in our raw data files
# `participations` is grouped by its 'profile' and 'project' columns below;
# `projects` supplies the 'project_id' values used to align the matrix columns.
# NOTE(review): read_pickle can execute arbitrary code while unpickling, so
# only load these files if they come from a trusted source.
participations = pd.read_pickle('../../data/raw/participation_data')
projects = pd.read_pickle("../../data/raw/project_data")

In [3]:
# Count the participation rows for every (profile, project) pair.
pair_groups = participations.groupby(['profile', 'project'])[['profile', 'project']]
interaction_counts = pair_groups.size()

# Turn the counts into a flat frame: one row per pair, with the count stored
# in an 'interactions' column.
profile_projects = interaction_counts.to_frame('interactions').reset_index()
profile_projects.head()

Unnamed: 0,profile,project,interactions
0,000073c3675ea9a1d0fe0ee3ca57e2bf,413.0,2
1,0002b85e757486c6d80ed6f73f465eaa,16864.0,1
2,0003a41bbdb3371df4c1829913f17537,19794.0,1
3,000436aaa487e461e6e16e02ab3e89eb,659.0,1
4,000476d8680db78d75b3b9edefc4a6d2,1510.0,1


In [4]:
# Save this dataframe to a pickle file
# (written before the per-profile project count is added in the next cell,
# so the stored frame holds only profile, project and interactions)
profile_projects.to_pickle("../../data/processed/profiles_projects")

In [5]:
# Attach to every row the number of rows its profile has in this frame. Each
# row is a unique (profile, project) pair, so this is the number of distinct
# projects the profile interacted with.
#
# transform('size') broadcasts the per-profile group size back onto the rows,
# and assign() overwrites 'project_count' if it already exists. That keeps the
# cell idempotent: re-running it gives the same frame, whereas a join with
# rsuffix='_count' would collide with the column created by the first run.
profile_projects = profile_projects.assign(
    project_count=profile_projects.groupby('profile')['project'].transform('size')
)

### Find profiles with more than one project interaction

In [6]:
# Keep only the rows of profiles that interacted with more than one project
is_active = profile_projects['project_count'] > 1
active_profiles = profile_projects.loc[is_active]

# Cross-tabulate the active profiles (rows) against the projects (columns)
# to obtain the adjacency matrix for this subset of users
active_profile_projects = pd.crosstab(
    index=active_profiles['profile'],
    columns=active_profiles['project'],
)

In [7]:
# Align the adjacency-matrix columns with the projects listed in `projects`:
# every listed project gets a column, and columns for projects that are not
# listed are removed.
project_ids = list(projects['project_id'])

# De-duplicated, non-null ids in their original order. Null ids are ignored
# because they cannot identify a project column.
catalogue_ids = pd.Index(project_ids).dropna().unique()

# Add any missing columns, zero-filled, in a single reindex. Appending them
# after the existing columns reproduces the column order of a one-by-one
# insertion, without the repeated frame copies/fragmentation of a Python loop.
missing_ids = [
    project_id
    for project_id in catalogue_ids
    if project_id not in active_profile_projects.columns
]
active_profile_projects = active_profile_projects.reindex(
    columns=[*active_profile_projects.columns, *missing_ids],
    fill_value=0,
)

print(active_profile_projects.shape)

# Remove any columns that aren't in project_data. A vectorised membership
# mask replaces rebuilding and scanning the id list once per column.
in_catalogue = active_profile_projects.columns.isin(catalogue_ids)
active_profile_projects = active_profile_projects.loc[:, in_catalogue]

print(active_profile_projects.shape)

(4866, 1783)
(4866, 1781)


In [8]:
# Move the 'profile' index level into a regular column so that it is stored
# alongside the project columns; rows are then labelled 0..n-1.
active_profile_projects = active_profile_projects.reset_index(level='profile')

In [9]:
# Build a CSR sparse matrix from every column except the 'profile'
# identifier, so only the profile-by-project counts are stored.
sA = sparse.csr_matrix(active_profile_projects.drop(columns='profile'))

# Persist the sparse matrix to disk
sparse.save_npz(
    file="../../data/processed/active_profile_projects_matrix",
    matrix=sA,
)

In [10]:
# Sanity check: (rows, columns) of the sparse matrix; it should match the
# second shape printed when the columns were aligned with project_data.
sA.shape

(4866, 1781)

In [11]:
# Also save the dense frame (including the 'profile' column), which keeps the
# profile ids and project column labels that the sparse matrix does not carry.
active_profile_projects.to_pickle('../../data/processed/active_profile_projects')

# Test: reload the saved profile/project matrix and inspect it

In [12]:
# Read back the frame that was just pickled to check that it round-trips.
users_projects = pd.read_pickle('../../data/processed/active_profile_projects')

In [13]:
# Preview the first rows: a 'profile' column followed by one column per project.
users_projects.head()

project,profile,4.0,5.0,6.0,7.0,8.0,19.0,20.0,22.0,24.0,...,296.0,297.0,298.0,302.0,303.0,304.0,309.0,310.0,312.0,19847.0
0,001bedb58aa43c8d3596b5b522ba1040,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0021e5df03d7feb6ba9558cc2828d616,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,00300cba5401183830a6a82b80c8ff7f,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0033882471572a66322d0747c6a4b12d,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,00536e1575193e409e255cd02ed9d205,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
