In [1]:
import pandas as pd
import json
import numpy as np

In [2]:
import os
import sys
from time import gmtime, strftime

from keras.models import load_model
from sklearn.metrics import precision_recall_fscore_support, mean_squared_error, average_precision_score
import numpy as np
import pandas as pd
import math
import keras.backend as K
from scipy import sparse
from scipy.sparse import vstack

# Content based recommender imports
module_path = os.path.abspath(os.path.join('../../'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.models.recommenders.content_recommender import ContentRecommender
from gensim.models import doc2vec
from collections import namedtuple
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Model
from sklearn.metrics.pairwise import cosine_similarity

sys.path.append('../../data')
sys.path.append('../../src/models')
from recommenders.cf_recommender import CFRecommender
from autoencoders import hyb2, hyb3

Using TensorFlow backend.


In [3]:
# Name of the projects-table text column that load_projects_tfidf() vectorises with TF-IDF.
field = 'description'

In [4]:
def load_projects_tfidf(field, max_features=3000, data_path="../../data/processed/cf_projects_data"):
    """Build TF-IDF features for one text column of the projects table and split them.

    Rows are split by position, without shuffling: roughly the first 72% form
    the training set, the next 8% the validation set and the last 20% the
    test set (the validation boundary is 90% of the 80% test boundary).

    Note that the vectorizer is fitted on *all* rows before splitting, so the
    vocabulary and IDF weights are shared by the three splits.

    Parameters
    ----------
    field : str
        Column of the projects table whose text is vectorised.
    max_features : int, optional
        Vocabulary size limit handed to ``TfidfVectorizer``.
    data_path : str, optional
        Path of the pickled projects table.

    Returns
    -------
    tuple
        ``(train_labels, train_x, val_labels, val_x, test_labels, test_x)``.
        Each ``*_x`` is a row slice of the sparse TF-IDF matrix; each
        ``*_labels`` is a one-column DataFrame (``project_id``) indexed by the
        row positions of the corresponding slice in the full matrix.
    """
    # Load the full project data from the pickle file.
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    content_projects = pd.read_pickle(data_path)

    # Get the TF-IDF for the requested text field
    v = TfidfVectorizer(max_features=max_features)
    desc_idf = v.fit_transform(content_projects[field])

    # Train/Val/Test split boundaries (row positions)
    n_rows = desc_idf.shape[0]
    content_test_split_idx = int(np.floor(n_rows * 0.8))
    content_val_split_idx = int(content_test_split_idx * 0.9)

    def _labels(start, stop):
        # Attach the positional index to the raw values. Passing the Series
        # itself together with index= would make pandas *reindex* it, which
        # silently yields NaN whenever the table's own index is not 0..n-1.
        project_ids = content_projects['project_id'].iloc[start:stop].values
        return pd.DataFrame({'project_id': project_ids}, index=np.arange(start, stop))

    content_train_x = desc_idf[:content_val_split_idx]
    content_val_x = desc_idf[content_val_split_idx:content_test_split_idx]
    content_test_x = desc_idf[content_test_split_idx:]

    content_train_labels = _labels(0, content_val_split_idx)
    content_val_labels = _labels(content_val_split_idx, content_test_split_idx)
    content_test_labels = _labels(content_test_split_idx, n_rows)

    return content_train_labels, content_train_x, content_val_labels, content_val_x, content_test_labels, content_test_x

project_train_labels, project_train_x, project_val_labels, project_val_x, project_test_labels, project_test_x = load_projects_tfidf(field)

# Re-assemble the three splits into one TF-IDF matrix covering every project (CSR for row indexing)
x = vstack([project_train_x, project_val_x, project_test_x]).tocsr()
# Stack the label frames row-wise in the same order as the matrix rows.
# (Using `+` here would add the frames element-wise on disjoint indices and give all-NaN.)
x_projects = pd.concat([project_train_labels, project_val_labels, project_test_labels])

In [5]:
# Pairwise cosine similarity between all project TF-IDF rows.
# The sparse matrix is passed as-is: cosine_similarity supports sparse input and
# returns a dense array by default, whereas x.todense() builds an np.matrix and
# materialises the whole feature matrix in memory first.
similarity = cosine_similarity(x)

In [32]:
# Read results
autoencoder_name = 'train_autoencoder_1024_hyb3_new_users_projects_0.8_10'
results = pd.read_json('../../data/experiment-results/hybrid/%s.json' % (autoencoder_name), lines=True)

# After the transpose every row carries one user's result record (a dict) in
# column 0; expose that column under a readable name.
results = results.T
results['json'] = results[0]


def _bracketed(values):
    """Render a sequence as the string '[a, b, c]' stored in the y_pred / y_true columns."""
    return '[' + ', '.join(np.array(values).astype('str')) + ']'


# Build every row first and create the frame in one go instead of growing it
# row by row. y_pred / y_true are kept as bracketed strings because later cells
# parse that format.
result_columns = ['user_index', 'precision', 'recall', 'y_pred', 'y_true', 'len_yPred', 'len_yTrue', 'avg_precision', 'rmse']
records = [
    {
        'user_index': record['user_index'],
        'precision': record['precision'],
        'recall': record['recall'],
        'y_pred': _bracketed(record['y_pred']),
        'y_true': _bracketed(record['y_true']),
        'len_yPred': len(record['y_pred']),
        'len_yTrue': len(record['y_true']),
        'avg_precision': record['avg_precision'],
        'rmse': record['rmse'],
    }
    for record in results['json']
]
newDataframe = pd.DataFrame(records, index=results.index, columns=result_columns)

# Calculate similarity
# Refined precision per user: for every true project take its highest cosine
# similarity to any predicted project, average those maxima, and add the
# user's plain precision.
refined_precisions = []
for record in results['json']:
    y_pred = np.asarray(record['y_pred'], dtype=int)
    y_true = np.asarray(record['y_true'], dtype=int)

    # Nothing to compare when either side is empty.
    if y_pred.size == 0 or y_true.size == 0:
        continue

    # Rows: predicted projects, columns: true projects.
    pred_vs_true = similarity[y_pred][:, y_true]

    refined_precisions.append(np.mean(pred_vs_true.max(axis=0)) + record['precision'])

np.mean(refined_precisions)

0.7825011257388353

In [33]:
# Summary statistics (count, mean, std, quartiles) of the numeric per-user result columns.
newDataframe.describe()

Unnamed: 0,user_index,precision,recall,avg_precision,rmse
count,344.0,344.0,344.0,344.0,344.0
mean,171.5,0.018023,0.150145,0.010013,0.04706947
std,99.448479,0.045441,0.354963,0.021339,0.08843143
min,0.0,0.0,0.0,0.0,3.308094e-27
25%,85.75,0.0,0.0,0.0,3.3323260000000005e-27
50%,171.5,0.0,0.0,0.0,3.333521e-27
75%,257.25,0.0,0.0,0.0,3.3336860000000004e-27
max,343.0,0.3,1.0,0.185185,0.4303315


In [34]:
# Gather the predicted ids of *every* user (one row per user) into a single
# integer array. The y_pred column holds strings of the form '[a, b, c]';
# an empty prediction list ('[]') contributes nothing.
all_preds = np.array(
    [
        int(token)
        for pred_string in newDataframe['y_pred']
        for token in pred_string[1:-1].split(', ')
        if token
    ],
    dtype=int,
)

# Number of distinct ids that were recommended to at least one user.
len(set(all_preds))

10

In [35]:
# Show the distinct values collected in all_preds.
set(all_preds)

{1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 11.0, 24.0, 25.0, 26.0}

In [36]:
# Inspect the complete result row of a single user (user_index 81).
newDataframe[newDataframe['user_index'] == 81.0]

Unnamed: 0,user_index,precision,recall,y_pred,y_true,len_yPred,len_yTrue,avg_precision,rmse
81,81.0,0.2,0.4,"[1, 2, 3, 4, 5, 6, 11, 24, 25, 26]","[1, 6, 9, 10, 20]",10,5,0.185185,0.430331
