# Create Hybrid Recommender

In [1]:
import os
import sys
from time import gmtime, strftime

from keras.models import load_model
from sklearn.metrics import precision_recall_fscore_support
import numpy as np
import pandas as pd
import math
import keras.backend as K
from scipy import sparse
from scipy.sparse import vstack

Using TensorFlow backend.


In [2]:
# Presumably the number of recommendations to produce per user (top-k) — TODO confirm.
# Fixed to 5 here; the trailing comment shows the command-line form it replaces.
k = 5 #int(sys.argv[1])

### Load Collaborative Filtering Model

In [3]:
# Extend the module search path (relative to the current working directory) so that
# the `recommenders` package can be imported by its short name below.
sys.path.append('../../data')
sys.path.append('../../src/models')
from recommenders.cf_recommender import CFRecommender

In [4]:
# Load the autoencoder to use
# The model file name and data source are fixed here; the trailing comments show the
# command-line arguments they stand in for.
autoencoder_model = 'train_autoencoder_0_deep2_users_projects' #str(sys.argv[2])
dataSource = 'users_projects' # str(sys.argv[3])
# Keras model read from the .h5 file; its predict() is used later for the
# collaborative-filtering scores.
model = load_model('../../data/autoencoders/' + autoencoder_model + '.h5')
# Projects table read from the processed-data pickle.
projects = pd.read_pickle('../../data/processed/cf_projects_data')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.


In [5]:
def load_users_projects():
    """Load the collaborative-filtering labels and the train/val/test sparse matrices.

    Returns:
        A 6-tuple ``(train_labels, train_x, val_labels, val_x, test_labels, test_x)``.
        All three label slots hold the same object read from ``cf_projects.pkl``;
        the ``*_x`` slots hold the matrices read from the corresponding
        ``*_sparse.npz`` files.
    """
    # One label object is shared by every split.
    project_labels = pd.read_pickle('../../data/processed/cf_projects.pkl')

    # Read the three splits in train, val, test order.
    train_matrix, val_matrix, test_matrix = [
        sparse.load_npz(npz_path)
        for npz_path in (
            "../../data/processed/train_sparse.npz",
            "../../data/processed/val_sparse.npz",
            "../../data/processed/test_sparse.npz",
        )
    ]

    return (
        project_labels, train_matrix,
        project_labels, val_matrix,
        project_labels, test_matrix,
    )

def load_profile_labels():
    """Read and return the pickled object stored in ``cf_profiles.pkl``."""
    return pd.read_pickle('../../data/processed/cf_profiles.pkl')

# Load our time-consistent collaborative filtering data.
# The three *_labels names all refer to the same labels object; only the *_x
# matrices differ between the splits.
train_labels, train_x, val_labels, val_x, test_labels, test_x = load_users_projects()

### Load the content based models

In [6]:
import os
import sys
# Put the directory two levels above the working directory on sys.path (only once)
# so that the `src.` package imports that follow can be resolved.
module_path = os.path.abspath(os.path.join('../../'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.models.recommenders.content_recommender import ContentRecommender
from gensim.models import doc2vec
from collections import namedtuple
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Model

In [7]:
def load_projects_tfidf(field):
    """Build TF-IDF features for one text column of the projects data and split them.

    The projects table is read from ``../../data/processed/cf_projects_data``, the
    requested column is vectorised with a default ``TfidfVectorizer`` fitted on all
    rows, and the rows are split by position: the first 80% (floored) form
    train + validation, of which the first 90% (truncated) are train; the remaining
    rows are test.

    Args:
        field: Name of the text column to vectorise (for example ``'description'``).

    Returns:
        ``(train_labels, train_x, val_labels, val_x, test_labels, test_x)`` where each
        ``*_x`` is a row slice of the TF-IDF matrix and each ``*_labels`` is a
        one-column (``project_id``) DataFrame indexed by the row positions of that
        slice.
    """
    # Load the full project data from the pickle file. Everything below works on this
    # local table, so the function no longer depends on a notebook-level `projects`
    # global having been defined by an earlier cell.
    content_projects = pd.read_pickle("../../data/processed/cf_projects_data")

    # Get the TF-IDF for the requested text field
    v = TfidfVectorizer()
    desc_idf = v.fit_transform(content_projects[field])

    # Train/Val/Test split points (row positions)
    n_rows = desc_idf.shape[0]
    content_test_split_idx = int(np.floor(n_rows * 0.8))
    content_val_split_idx = int(content_test_split_idx * 0.9)

    def _split(start, stop):
        """Return (labels, features) for the rows in the half-open range [start, stop)."""
        # Take the ids positionally and attach the positional index explicitly.
        # Passing the Series together with `index=` would re-align by label and
        # produce NaN whenever the table's own index is not 0..n-1.
        ids = content_projects['project_id'].iloc[start:stop].values
        labels = pd.DataFrame({'project_id': ids}, index=np.arange(start, stop))
        return labels, desc_idf[start:stop]

    content_train_labels, content_train_x = _split(0, content_val_split_idx)
    content_val_labels, content_val_x = _split(content_val_split_idx, content_test_split_idx)
    content_test_labels, content_test_x = _split(content_test_split_idx, n_rows)

    return content_train_labels, content_train_x, content_val_labels, content_val_x, content_test_labels, content_test_x

In [8]:
# Name of the projects column that load_projects_tfidf vectorises with TF-IDF.
field = 'description'

In [9]:
# Load the content autoencoder to use.
# Note: this rebinds `autoencoder_model`, which earlier held the collaborative-filtering
# model's file name.
autoencoder_model = 'train_autoencoder_32_cdae_tfidf_description'
autoencoder = load_model('../../data/autoencoders/' + autoencoder_model + '.h5')

project_train_labels, project_train_x, project_val_labels, project_val_x, project_test_labels, project_test_x = load_projects_tfidf(field)

# Re-assemble the full TF-IDF matrix and its labels, both in train, val, test order.
x = vstack([project_train_x, project_val_x, project_test_x]).tocsr()
# The three label frames have disjoint indices, so `+` (element-wise addition after
# index alignment) would yield only NaN. Stack the rows instead so that x_projects
# lines up row-for-row with x.
x_projects = pd.concat([project_train_labels, project_val_labels, project_test_labels])

In [10]:
# Sparse train/test matrices read from train.npz / test.npz (different files from the
# *_sparse.npz splits returned by load_users_projects).
train = sparse.load_npz("../../data/processed/train.npz")
test = sparse.load_npz("../../data/processed/test.npz")

# Build our recommender and similarity matrix
# `x` is the stacked TF-IDF matrix of all projects.
# NOTE(review): later code reads similarity_matrix.values / .iloc, so similarity()
# presumably returns a pandas DataFrame — confirm against ContentRecommender.
recommender = ContentRecommender()
similarity_matrix = recommender.similarity(x)

Recommender


# Make Recommendations

In [11]:
# Per-profile evaluation results, appended in profile order.
precisions = []
recalls = []
refined_precisions = []


def _normalise(scores):
    """Scale `scores` so they sum to 1; return them unchanged if the sum is 0 or not finite.

    Leaving an all-zero vector untouched (instead of dividing by zero) means a profile
    with no content-based signal is ranked by the other score source alone rather than
    turning every combined score into NaN.
    """
    total = scores.sum()
    if total == 0 or not np.isfinite(total):
        return scores
    return scores / total


# Loop invariants, hoisted so they are not rebuilt for every profile.
labels = np.asarray(train_labels.index)
project_ids = train_labels.values.flatten()

# Pick the user
# NOTE(review): the loop runs len(train_labels) times while profile_idx selects a
# *column* of train_x / test_x — confirm the label count equals the number of columns.
for profile_idx in range(0, len(train_labels)):
    # Dense column of train_x for this profile, computed once and reused below.
    profile_dense = np.asarray(train_x.getcol(profile_idx).todense())

    # Collaborative Filtering Predictions
    profile_col = np.squeeze(profile_dense).reshape(1,-1)
    predictions = model.predict([profile_col, labels])
    cf_scores = np.asarray(predictions[0], dtype=float).ravel()

    # Content-based score per project: interaction-weighted column sums of the
    # similarity matrix. The constant divisor cancels out in the normalisation below.
    # NOTE(review): 1021 is presumably the number of projects — confirm.
    user_projects_sim = np.sum(profile_dense * similarity_matrix.values, axis=0) / 1021
    cb_scores = np.asarray(user_projects_sim, dtype=float).ravel()

    # Combine the two normalised score vectors position by position. Both are kept in
    # their original project order; sorting one of them before adding (as was done
    # previously) pairs the i-th best content score with the i-th project's CF score.
    combined_scores = _normalise(cb_scores) + _normalise(cf_scores)

    # Positions of the k highest combined scores (stable sort; NaN scores sort last).
    top_k_idx = np.argsort(-combined_scores, kind='mergesort')[:k]
    top_k_project_ids = project_ids[top_k_idx]

    # Ground truth for this profile taken from the test matrix.
    y_true = np.squeeze(np.asarray(test_x.getcol(profile_idx).todense()))

    # NOTE(review): as before, the label values are used directly as positions in
    # y_pred — this assumes they are integer row positions of test_x; confirm.
    y_pred = np.zeros(y_true.shape)
    y_pred[top_k_project_ids] = 1

    # Get precision and recall
    precision, recall, fscore, support = precision_recall_fscore_support(y_true, y_pred, average='binary', pos_label=1)

    # Similarity-matrix columns of the recommended projects (selected by position, to
    # match the positional meaning of top_k_idx).
    pred_sim_matrix = similarity_matrix.iloc[:, top_k_idx]

    # Row positions with a non-zero entry in this profile's test column.
    true_idx = np.flatnonzero(y_true)

    # Rows: the profile's true projects; columns: the k recommended projects.
    masked_pred_sim_matrix = pred_sim_matrix.iloc[true_idx]

    # Mean, over the recommendations, of the best similarity to any true project, added
    # to the plain precision. This is NaN when the profile has no true projects.
    refined_precision = np.mean(masked_pred_sim_matrix.max(axis=0)) + precision

    precisions.append(precision)
    recalls.append(recall)
    refined_precisions.append(refined_precision)


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


KeyboardInterrupt: 