# Create matrix for relationships between users and projects

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import json
from pandas.io.json import json_normalize
from scipy import sparse

### Group by project and profile

In [3]:
# Read the two raw inputs: project data (pickle) and participation data (JSON).
# An alternative participation source that was used previously:
# participations = pd.read_pickle('../../data/raw/participation_data')
projects = pd.read_pickle("../../data/raw/project_data")
participations = pd.read_json('../../data/raw/new-sci-participation-data.txt')

In [4]:
# Count the participation rows for every (profile, project) pair and
# flatten the result into a table with an 'interactions' column
profile_projects = (
    participations
    .groupby(['profile', 'project'])
    .size()
    .reset_index(name='interactions')
)
profile_projects.head()

Unnamed: 0,profile,project,interactions
0,000e1777-660b-56b9-9888-cea36c3548e8,75,46
1,001a6b8c-26d3-51d9-8b49-e58a456be67f,106,1
2,002f0f67-affe-5096-b4fb-082747884075,131,1
3,002f52c8-c618-5260-bd8b-619be5226417,75,96
4,002f52c8-c618-5260-bd8b-619be5226417,474,20


In [5]:
# Persist the profile/project interaction table as a pickle
profile_projects.to_pickle(path="../../data/processed/profiles_projects")

In [6]:
# Attach to every row the number of rows its profile has in this table, as
# 'project_count'. The column is assigned from a group-wise transform instead
# of joining the frame onto itself, so re-running this cell simply overwrites
# 'project_count' rather than clashing with the column from a previous run.
profile_projects = profile_projects.assign(
    project_count=profile_projects.groupby('profile')['project'].transform('size')
)

### Find profiles with more than one project interaction

In [7]:
# Active profiles: rows whose profile has a project_count above one
active_profiles = profile_projects.loc[profile_projects['project_count'] > 1]

# Cross-tabulate profile against project to build the adjacency matrix
# for this subset of users (rows: profiles, columns: projects)
active_profile_projects = pd.crosstab(
    index=active_profiles['profile'],
    columns=active_profiles['project'],
)

In [8]:
project_ids = list(projects['project_id'])

# Add an all-zero column for every project listed in project_data that the
# matrix does not have yet. A single reindex replaces column-by-column
# insertion; new columns are appended after the existing ones, in the order
# they appear in project_data (duplicates in project_ids are added once).
existing_ids = set(active_profile_projects.columns)
missing_ids = [pid for pid in dict.fromkeys(project_ids) if pid not in existing_ids]
active_profile_projects = active_profile_projects.reindex(
    columns=list(active_profile_projects.columns) + missing_ids,
    fill_value=0,
)

print(active_profile_projects.shape)

# Remove any columns that aren't in project_data (one vectorised membership
# test instead of rebuilding the id list and dropping inside a loop)
in_project_data = active_profile_projects.columns.isin(project_ids)
active_profile_projects = active_profile_projects.loc[:, in_project_data]

print(active_profile_projects.shape)

(896, 1822)
(896, 1781)


In [9]:
# Move the 'profile' index level back into an ordinary column
active_profile_projects = active_profile_projects.reset_index(level='profile')

In [10]:
# Build a CSR sparse matrix from every column except 'profile'
sA = sparse.csr_matrix(active_profile_projects.drop('profile', axis=1))

# Write the sparse matrix to disk
sparse.save_npz(file="../../data/processed/active_profile_projects_matrix", matrix=sA)

In [11]:
# Show the dimensions of the sparse matrix
sA.shape

(896, 1781)

In [12]:
# Also pickle the table itself, which still includes the 'profile' column
active_profile_projects.to_pickle(path='../../data/processed/active_profile_projects')

# Test: reload the saved table

In [13]:
# Read back the pickle written from active_profile_projects
users_projects = pd.read_pickle('../../data/processed/active_profile_projects')

In [14]:
# Preview the first rows of the reloaded table
users_projects.head()

project,profile,92,94,95,97,101,102,103,106,107,...,307,308,309,310,311,312,18862,18999,19535,19847
0,002f52c8-c618-5260-bd8b-619be5226417,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,00487450-5210-5b29-99fb-1957de69e2fb,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,00716eb8-499e-5fe3-b71f-98a991274193,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,00ab9da3-38c7-5ae8-ac95-6f589a73c44e,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,00fbf5e2-56d4-5e8d-9302-d8f07ce844ab,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
