In [7]:
import os
import sys
from time import gmtime, strftime

from keras.models import load_model
from sklearn.metrics import precision_recall_fscore_support, mean_squared_error, average_precision_score
import numpy as np
import pandas as pd
import math
import keras.backend as K
from scipy import sparse
from scipy.sparse import vstack

# Content based recommender imports
module_path = os.path.abspath(os.path.join('../../'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.models.recommenders.content_recommender import ContentRecommender
from gensim.models import doc2vec
from collections import namedtuple
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Model
from sklearn.metrics.pairwise import cosine_similarity

sys.path.append('../../data')
sys.path.append('../../src/models')
from recommenders.cf_recommender import CFRecommender
from autoencoders import hyb2, hyb3

In [3]:
# Name of the text column of the projects data that gets TF-IDF vectorized;
# it is passed to load_projects_tfidf() below.
field = 'description'

# Collaborative Filtering Data

In [4]:
def load_users_projects():
    """Read the collaborative-filtering labels and the three sparse splits from disk.

    Returns:
        A 6-tuple ``(train_labels, train_x, val_labels, val_x, test_labels,
        test_x)``. The three ``*_labels`` entries are the *same* object (the
        frame unpickled from ``new_cf_projects.pkl``); the ``*_x`` entries are
        the sparse matrices stored in the corresponding ``.npz`` files.
    """
    labels = pd.read_pickle('../../data/processed/new_cf_projects.pkl')

    # Load the splits in train -> val -> test order.
    split_files = (
        "../../data/processed/new_train_sparse.npz",
        "../../data/processed/new_val_sparse.npz",
        "../../data/processed/new_test_sparse.npz",
    )
    train_x, val_x, test_x = [sparse.load_npz(path) for path in split_files]

    # Every split shares one label frame.
    return labels, train_x, labels, val_x, labels, test_x

def load_profile_labels():
    """Return the object unpickled from ``new_cf_profiles.pkl`` (the CF profile labels)."""
    return pd.read_pickle('../../data/processed/new_cf_profiles.pkl')

# Load our time-consistent collaborative filtering data.
# NOTE: all three *_labels names refer to the same object returned by
# load_users_projects(); only the sparse *_x matrices differ per split.
train_labels, train_x, val_labels, val_x, test_labels, test_x = load_users_projects()

# Content Data

In [5]:
def load_projects_tfidf(field, max_features=3000):
    """Vectorize one text column of the projects with TF-IDF and split it by row order.

    The rows are split positionally (no shuffling): the first 80% form the
    train+validation portion, whose last 10% is the validation split; the
    remaining 20% of rows form the test split.

    Args:
        field: Name of the text column in the pickled projects frame to
            vectorize.
        max_features: Vocabulary size cap passed to ``TfidfVectorizer``.
            Defaults to 3000, the value that was previously hard-coded.

    Returns:
        A 6-tuple ``(train_labels, train_x, val_labels, val_x, test_labels,
        test_x)``. Each ``*_x`` is a slice of the sparse TF-IDF matrix and each
        ``*_labels`` is a one-column (``project_id``) DataFrame whose index is
        the row position of that project in the full TF-IDF matrix.
    """
    # Load the full project data from the pickle file.
    # NOTE(review): unpickling executes arbitrary code -- only load trusted files.
    content_projects = pd.read_pickle("../../data/processed/cf_projects_data")

    # TF-IDF of the requested text field
    vectorizer = TfidfVectorizer(max_features=max_features)
    desc_idf = vectorizer.fit_transform(content_projects[field])

    # Train/Val/Test split boundaries (row positions)
    n_projects = desc_idf.shape[0]
    content_test_split_idx = int(np.floor(n_projects * 0.8))
    content_val_split_idx = int(content_test_split_idx * 0.9)

    # Use the raw values rather than the Series: pd.DataFrame(series, index=...)
    # *reindexes* the Series by label, which silently yields NaN ids whenever the
    # pickled frame's index is not exactly 0..n-1. Positional values are what
    # line up with the rows of desc_idf.
    project_ids = content_projects['project_id'].values

    def _split(start, stop):
        """Return (labels, features) for matrix rows ``start`` to ``stop - 1``."""
        labels = pd.DataFrame(
            {'project_id': project_ids[start:stop]},
            index=np.arange(start, stop),
        )
        return labels, desc_idf[start:stop]

    content_train_labels, content_train_x = _split(0, content_val_split_idx)
    content_val_labels, content_val_x = _split(content_val_split_idx, content_test_split_idx)
    content_test_labels, content_test_x = _split(content_test_split_idx, n_projects)

    return content_train_labels, content_train_x, content_val_labels, content_val_x, content_test_labels, content_test_x

project_train_labels, project_train_x, project_val_labels, project_val_x, project_test_labels, project_test_x = load_projects_tfidf(field)

# Re-assemble the full project TF-IDF matrix (train, then val, then test rows)
x = vstack([project_train_x, project_val_x, project_test_x]).tocsr()
# Stack the label frames in the same order so row k of x_projects labels row k of x.
# `+` must not be used here: DataFrame addition is element-wise and aligned on
# the index, and because the three splits have disjoint indices it produces a
# frame that is entirely NaN.
x_projects = pd.concat([project_train_labels, project_val_labels, project_test_labels])

# Get User-Project Similarity

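First, build a TF-IDF profile for each user by summing the TF-IDF rows of the projects that user has a nonzero entry for in the training matrix. Each user's profile should have shape (3000,), one value per TF-IDF feature.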

In [6]:
# Build one TF-IDF profile per user (one user per column of train_x).
# Rows are collected in a list and stacked once at the end; re-stacking the
# growing matrix inside the loop copies it on every iteration (quadratic) and
# depended on vstack() tolerating a None placeholder as its first block.
user_profiles = []
for user_index in range(train_x.shape[1]):
    # Row indices with a nonzero entry in this user's column
    user_project_idx = train_x[:, user_index].nonzero()[0]
    # Sum the TF-IDF rows of those projects into a single 1-D feature vector
    user_tf_idf = np.squeeze(np.asarray(x[user_project_idx].sum(axis=0)))
    # csr_matrix() turns the 1-D vector into a 1 x n_features sparse row
    user_profiles.append(sparse.csr_matrix(user_tf_idf))
users_tf_idf = vstack(user_profiles).tocsr()

In [13]:
users_tf_idf.shape

(896, 3000)

In [11]:
x.shape

(1021, 3000)

In [47]:
# Cosine similarity of every user profile (rows of users_tf_idf) against every
# project (rows of x), giving a matrix of shape (n_users, n_projects).
#
# The previous per-user loop always read users_tf_idf.getrow(0), so every row
# of the result was user 0's similarities. cosine_similarity() accepts sparse
# inputs and handles all users in one call, which also removes the hard-coded
# feature count (3000) and the repeated x.todense() conversion.
#
# The dense result is wrapped in a COO matrix so that `cosine_sim` keeps the
# sparse type the following cells expect (they call cosine_sim.todense()).
cosine_sim = sparse.coo_matrix(cosine_similarity(users_tf_idf, x))

In [48]:
cosine_sim.shape

(896, 1021)

In [49]:
type(cosine_sim)

scipy.sparse.coo.coo_matrix

In [50]:
# Densify the sparse similarity matrix into a plain numpy.ndarray
cosine_sim_mat = cosine_sim.toarray()

In [51]:
cosine_sim_mat.shape

(896, 1021)

In [52]:
type(cosine_sim_mat)

numpy.ndarray

In [55]:
# Persist the dense user-by-project cosine similarity matrix as a .npy file
np.save('../../data/processed/user-project-similarity.npy', cosine_sim_mat)