# Recommend Projects to Users with Time Consistency

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import json
from pandas.io.json import json_normalize
from scipy import sparse
import math

import os
import sys
module_path = os.path.abspath(os.path.join('../../'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.models.cdea.evaluate import evaluate

In [2]:
users_projects_list = pd.read_pickle('../../data/processed/profile_projects_time_consistent')

In [3]:
projects = pd.read_pickle('../../data/raw/project_data')

In [4]:
users_projects_matrix =  pd.read_pickle('../../data/processed/active_profile_projects')

In [5]:
project_ids = np.load('../../data/processed/project_ids')

In [6]:
similarity_matrix = pd.read_pickle('../../data/processed/similarity_matrix')

In [7]:
# Normalise the similarity
similarity_matrix = (similarity_matrix + 1) / 2

In [8]:
project_ids.shape

(1783,)

### Prepare user for recommendation

In [9]:
# Get our users project list
user_projects_list = users_projects_list[users_projects_list['num_projects'] > 7].iloc[1000]
user_id = user_projects_list['profile']


projects_list = [val for val in user_projects_list['projects'] if not math.isnan(val)]
projects_list

[16864.0,
 16864.0,
 737.0,
 737.0,
 169.0,
 169.0,
 17862.0,
 17862.0,
 17248.0,
 17248.0,
 16978.0,
 16978.0,
 1970.0,
 1970.0,
 564.0,
 564.0,
 1208.0,
 1208.0]

In [10]:
# Get our users adjcacency matrix row
adj_matrix = users_projects_matrix[users_projects_matrix['profile'] == user_id]
adj_matrix = adj_matrix.drop(columns=['profile'])
adj_matrix

project,4.0,5.0,6.0,7.0,8.0,19.0,20.0,22.0,24.0,25.0,...,296.0,297.0,298.0,302.0,303.0,304.0,309.0,310.0,312.0,19847.0
589,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Cut out projects that occured after the 80% time step
cutoff_idx = int(np.ceil(len(projects_list)*0.8))
cutoff_idx

15

In [12]:
# Project Ids of projects before the cutoff
before_cutoff = list(set(projects_list[:cutoff_idx]))
before_cutoff

[16864.0, 737.0, 17248.0, 17862.0, 169.0, 16978.0, 1970.0, 564.0]

In [13]:
# Project Ids of projects after the cutoff
after_cutoff = list(set(projects_list[cutoff_idx:]))
after_cutoff

[1208.0, 564.0]

In [14]:
# Figure out which projects to cut out of the 
projects_to_cut = np.setdiff1d(after_cutoff, before_cutoff)
projects_to_cut

array([1208.])

In [15]:
# Set the project to 0
adj_matrix[projects_to_cut] = 0

### Make Predictions

In [16]:
adj_matrix = np.array(adj_matrix.T)

In [17]:
# Calculate the similarity between user and projects
num_projects = np.count_nonzero(adj_matrix)
user_projects_sim = np.sum(adj_matrix * similarity_matrix.values, axis=1) / num_projects

In [18]:
similar_items = pd.DataFrame(user_projects_sim)
similar_items.columns = ['similarity_score']
similar_items['project_id'] = projects['project_id']

In [19]:
# Pick the Top-N item
N = 5
similar_items = similar_items.sort_values('similarity_score', ascending=False)
similar_items = similar_items.head(N)

In [20]:
similar_items

Unnamed: 0,similarity_score,project_id
835,114.444882,1306.0
739,114.423375,1140.0
325,114.270954,553.0
867,108.347876,1397.0
815,108.20638,1238.0


### Evaluate Recommender

In [21]:
# Set y_true
y_true = np.zeros(projects.shape[0])
after_cutoff = np.array(after_cutoff, dtype=int)
print(after_cutoff)
y_true[after_cutoff] = 1
print(np.count_nonzero(y_true))
print(y_true.shape)

[1208  564]
2
(1781,)


In [22]:
# Set y_pred
y_pred = np.zeros(projects.shape[0])
predicted_projects = np.array(similar_items.index, dtype=int)
y_pred[predicted_projects] = 1
print(np.count_nonzero(y_pred))
print(y_pred.shape)

5
(1781,)


In [23]:
precision, recall = evaluate(y_true, y_pred)

### Calculate the Refined Precision

In [24]:
# Loop over everything in y_pred
# Find it's max similarity with elements in y_true
pred_sim_matrix = similarity_matrix[similar_items.index]
pred_sim_matrix

Unnamed: 0,835,739,325,867,815
4,0.003768,0.009200,0.021989,0.981199,0.995018
5,0.000058,0.002844,0.030216,0.975377,0.991639
6,0.000020,0.003473,0.028472,0.976156,0.992636
7,0.999902,0.997016,0.970866,0.026482,0.008256
8,0.001488,0.004878,0.038122,0.974415,0.988653
19,0.006047,0.003502,0.058426,0.964082,0.976100
20,0.000018,0.003475,0.028655,0.976113,0.992578
22,0.000126,0.003928,0.027327,0.977168,0.992766
23,0.000020,0.003453,0.028662,0.976139,0.992567
24,0.999929,0.997119,0.970378,0.024105,0.008110


In [25]:
print(y_true.shape)
print(np.count_nonzero(y_true))
true_idx = np.nonzero(y_true)
print(true_idx)

(1781,)
2
(array([ 564, 1208]),)


In [26]:
pred_sim_matrix.shape

(1781, 5)

In [27]:
masked_pred_sim_matrix = pred_sim_matrix.iloc[true_idx]
masked_pred_sim_matrix

Unnamed: 0,835,739,325,867,815
873,0.000237,0.002305,0.032327,0.973957,0.99035
17329,5e-05,0.003186,0.030266,0.975112,0.991992


In [28]:
masked_pred_sim_matrix.shape

(2, 5)

In [29]:
refined_precision = np.mean(masked_pred_sim_matrix.max(axis=0)) + precision

In [30]:
refined_precision

0.400570809841156