# Create matrix for relationships between users and projects

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import json
from pandas.io.json import json_normalize
from scipy import sparse

### Group by project and profile

In [2]:
# Read the two raw inputs this notebook works from.
# NOTE: pickle files execute arbitrary code when loaded, so these paths must
# only ever point at trusted, locally produced files.
projects = pd.read_pickle("../../data/raw/project_data")
participations = pd.read_pickle('../../data/raw/participation_data')

In [3]:
# Count how many participation rows exist for every (profile, project) pair
pair_sizes = participations.groupby(['profile', 'project']).size()

# Flatten the grouped result back into a regular frame, with the count stored
# in an 'interactions' column next to the two key columns
profile_projects = pair_sizes.reset_index(name='interactions')
profile_projects.head()

Unnamed: 0,profile,project,interactions
0,000073c3675ea9a1d0fe0ee3ca57e2bf,413.0,2
1,0002b85e757486c6d80ed6f73f465eaa,16864.0,1
2,0003a41bbdb3371df4c1829913f17537,19794.0,1
3,000436aaa487e461e6e16e02ab3e89eb,659.0,1
4,000476d8680db78d75b3b9edefc4a6d2,1510.0,1


In [4]:
# Persist the per-pair interaction counts so later steps can reload them
profiles_projects_path = "../../data/processed/profiles_projects"
profile_projects.to_pickle(profiles_projects_path)

### Find profiles with more than one project interaction

In [5]:
# Keep the rows whose interaction count is above one.
# NOTE(review): each row is a (profile, project) pair, so this selects pairs
# that were seen more than once, not profiles that touched more than one
# project -- confirm this is the intended definition of "active".
is_repeated = profile_projects['interactions'] > 1
active_profiles = profile_projects.loc[is_repeated]

# Cross-tabulate the remaining pairs: one row per profile, one column per project
active_adj_mat = pd.crosstab(
    index=active_profiles['profile'],
    columns=active_profiles['project'],
)

In [6]:
# Align the columns of active_adj_mat with the project ids in project_data:
# first add an all-zero column for every project that has no column yet,
# then drop every column whose id is not listed in project_data.
project_ids = list(projects['project_id'])
# Built once so the membership tests below do not rescan the list each time
known_project_ids = set(project_ids)

# Add any missing columns (dict.fromkeys removes repeated ids but keeps their
# order, so new columns are appended in the same order as before)
existing_columns = active_adj_mat.columns
missing_ids = [
    project_id
    for project_id in dict.fromkeys(project_ids)
    if project_id not in existing_columns
]
# A single reindex appends all new columns at once instead of inserting them
# one by one, which fragments the frame
active_adj_mat = active_adj_mat.reindex(
    columns=[*existing_columns, *missing_ids],
    fill_value=0,
)

print(active_adj_mat.shape)

# Remove any columns that aren't in project_data
unknown_columns = [
    col for col in active_adj_mat.columns if col not in known_project_ids
]
active_adj_mat = active_adj_mat.drop(columns=unknown_columns)

print(active_adj_mat.shape)

(21347, 1783)
(21347, 1781)


In [7]:
# Build the adjacency matrix over every (profile, project) pair:
# one row per profile, one column per project
adj_mat = pd.crosstab(
    index=profile_projects['profile'],
    columns=profile_projects['project'],
)

In [8]:
# Store the adjacency matrix in compressed sparse row form
# NOTE(review): the output file is named "active_..." but it is built from
# adj_mat (all profiles); active_adj_mat is never written anywhere --
# confirm which matrix is meant to be saved here.
sA = sparse.csr_matrix(adj_mat)

# Write the sparse matrix to disk in .npz format
sparse.save_npz(file="../../data/processed/active_profile_projects_matrix", matrix=sA)

In [9]:
# Move the 'profile' index level back into an ordinary column, then pickle
# the dense adjacency frame.
# NOTE(review): the file name says "active" but adj_mat covers all profiles
# -- confirm the intended source frame.
adj_mat_with_profile = adj_mat.reset_index(level='profile')
adj_mat_with_profile.to_pickle('../../data/processed/active_profile_projects')