# Create Hybird Recommender (with content in the first layer)

In [1]:
import os
import sys
from time import gmtime, strftime

from keras.models import load_model
from sklearn.metrics import precision_recall_fscore_support, mean_squared_error, average_precision_score
import numpy as np
import pandas as pd
import math
import keras.backend as K
from scipy import sparse
from scipy.sparse import vstack

# Content based recommender imports
module_path = os.path.abspath(os.path.join('../../'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.models.recommenders.content_recommender import ContentRecommender
from gensim.models import doc2vec
from collections import namedtuple
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Model

sys.path.append('../../data')
sys.path.append('../../src/models')
from recommenders.cf_recommender import CFRecommender
from autoencoders import hyb1, hyb2

Using TensorFlow backend.


In [2]:
field = 'description'

### Load Collaborative Filtering Data

In [3]:
def load_users_projects():
    cf = pd.read_pickle('../../data/processed/new_cf_projects.pkl')
    #cf = pd.read_pickle('data/processed/cf_profiles.pkl')
    train_x = sparse.load_npz("../../data/processed/new_train_sparse.npz")
    val_x = sparse.load_npz("../../data/processed/new_val_sparse.npz")
    test_x = sparse.load_npz("../../data/processed/new_test_sparse.npz")
    train_labels = cf
    val_labels = cf
    test_labels = cf
    return train_labels, train_x, val_labels, val_x, test_labels, test_x

def load_profile_labels():
    cf_profiles = pd.read_pickle('../../data/processed/new_cf_profiles.pkl')
    return cf_profiles

# Load out time consistent collaborative filtering data
train_labels, train_x, val_labels, val_x, test_labels, test_x = load_users_projects()

### Load the Content Data

In [4]:
def load_projects_tfidf(field):
    # Load the full project data from the pickle file
    content_projects = pd.read_pickle("../../data/processed/cf_projects_data")

    # Get the TF-IDF for the description fields
    v = TfidfVectorizer(max_features=3000)
    desc_idf = v.fit_transform(content_projects[field])

    # Train/Val/Test Split
    content_test_split_idx = int(np.floor(desc_idf.shape[0] * 0.8))
    content_val_split_idx = int(content_test_split_idx * 0.9)

    content_train_x = desc_idf[:content_val_split_idx]
    content_val_x = desc_idf[content_val_split_idx:content_test_split_idx]
    content_test_x = desc_idf[content_test_split_idx:]

    content_train_labels_idx = np.arange(0, content_val_split_idx)
    content_val_labels_idx = np.arange(content_val_split_idx, content_test_split_idx)
    content_test_labels_idx = np.arange(content_test_split_idx, desc_idf.shape[0])

    content_train_labels = pd.DataFrame(content_projects['project_id'].iloc[:content_val_split_idx], index=content_train_labels_idx)
    content_val_labels = pd.DataFrame(content_projects['project_id'].iloc[content_val_split_idx:content_test_split_idx], index=content_val_labels_idx)
    content_test_labels = pd.DataFrame(content_projects['project_id'].iloc[content_test_split_idx:], index=content_test_labels_idx)

    return content_train_labels, content_train_x, content_val_labels, content_val_x, content_test_labels, content_test_x

project_train_labels, project_train_x, project_val_labels, project_val_x, project_test_labels, project_test_x = load_projects_tfidf(field)

# Generate the embeddings
x = vstack([project_train_x, project_val_x, project_test_x]).tocsr()
x_projects = project_train_labels + project_val_labels + project_test_labels

# Make Recommendations

First create the users TF-IDF vector -- should be shape (3000,)

In [5]:
users_tf_idf = None
for user_index in range(0, train_x.shape[1]):
    user_project_idx = np.nonzero(train_x[:, user_index])[0]
    user_tf_idf = np.squeeze(np.asarray(x[user_project_idx].sum(axis=0)))
    users_tf_idf = vstack([users_tf_idf, user_tf_idf])
users_tf_idf = sparse.csr_matrix(users_tf_idf)

Create the autoencoder

In [6]:
U = train_x.shape[1]
I = train_x.shape[0]
embedding_size = 32
q = 0.8

# Create our autoencoder model
model = hyb2.create(I=I, U=U, K=embedding_size,
                    hidden_activation='relu', output_activation='sigmoid', q=q, l=0.001)
model.compile(loss='mean_absolute_error', optimizer='adam')
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


  h_item = Dense(K, W_regularizer=l2(l), b_regularizer=l2(l))(h_item)
  h_content = Dense(K, W_regularizer=l2(l), b_regularizer=l2(l))(content_item)
  h_content_1024 = Dense(1024, W_regularizer=l2(l), b_regularizer=l2(l))(content_item)
  h_content_512 = Dense(512, W_regularizer=l2(l), b_regularizer=l2(l))(content_item)
  h_user = Embedding(input_dim=U, output_dim=K, input_length=1, W_regularizer=l2(l), name='embedding_layer')(x_user)


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
x_item (InputLayer)             (None, 1021)         0                                            
__________________________________________________________________________________________________
x_user (InputLayer)             (None, 1)            0                                            
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 1021)         0           x_item[0][0]                     
__________________________________________________________________________________________________
embedding_layer (Embedding)     (None, 1, 32)        33024       x_user[0][0]                     
__________________________________________________________________________________________________
content_it

  return Model(input=[x_item, content_item, x_user], output=decoded)


Train the autoencoder

In [7]:
# Train our autoencoder
train_x_t = train_x.T
val_x_t = val_x.T
test_x_t = test_x.T

train_val_x = train_x_t + val_x_t
train_test_x = train_x_t + test_x_t

users = np.arange(0, train_x_t.shape[0])

In [8]:
print('The train_x shape is %s' % (str(train_x_t.shape)))
print('The x_projects shape is %s' % (str(users.shape)))
print('The user_tf_idf shape is %s' % (str(users_tf_idf.shape)))

The train_x shape is (1032, 1021)
The x_projects shape is (1032,)
The user_tf_idf shape is (1032, 3000)


In [9]:
history = model.fit(x=[train_x_t, users_tf_idf, users], y=train_x_t,
                    batch_size=32, nb_epoch=10, verbose=1,
                    validation_data=[[train_x_t, users_tf_idf, users], train_val_x])

  This is separate from the ipykernel package so we can avoid doing imports until


Instructions for updating:
Use tf.cast instead.
Train on 1032 samples, validate on 1032 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Make Predictions

In [10]:
profile_idx = 0
profile_col = np.squeeze(np.asarray(train_x.getcol(profile_idx).todense())).reshape(1,-1)
labels = np.asarray(train_labels.index)
this_users_tf_idf = np.squeeze(np.asarray(users_tf_idf.getrow(profile_idx).todense())).reshape(1,-1)

In [11]:
# Make a prediction for 
predictions = model.predict([profile_col, this_users_tf_idf, labels])

In [12]:
k = 5
recommender = CFRecommender(k)

In [13]:
# Get the Top-K Recommendataions
recommendations = recommender.top_projects(profile_col, predictions, train_labels)

In [14]:
y_true, y_pred = recommender.generate_y(recommendations, train_labels, test_x.getcol(profile_idx), val_x=val_x.getcol(profile_idx))


In [15]:
# Get precision and recall
precision, recall, fscore, support = precision_recall_fscore_support(y_true, y_pred, average='binary', pos_label=1)
avg_precision = average_precision_score(y_true, predictions.reshape(y_true.shape), average='weighted', pos_label=1)
rmse = math.sqrt(mean_squared_error(y_true, predictions.reshape(y_true.shape)))

In [16]:
precision

0.0

In [17]:
recall

0.0

In [18]:
avg_precision

0.0019588638589618022

In [19]:
rmse

0.04425905397725761