# Create CF Model Using Autoencoderes

1. ~~Split data into train and test data~~
2. Create a custom loss function for our autoencoder
3. ~~Train our autoencoder~~
4. Get the precision and recall of our autoencoder

In [22]:
import os
import sys
module_path = os.path.abspath(os.path.join('../../'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import src.models.cdea.CDAE as CDAE
import src.models.cdea.load_data as load_data
import src.models.cdea.metrics as metrics

from sklearn.metrics import precision_recall_fscore_support
import numpy as np
import pandas as pd
import math
import keras.backend as K

In [23]:
batch_size = 32
epochs = 10
embedding_size = 32

In [24]:
# Load the proejct data
users_projects = pd.read_pickle('../../data/processed/active_profile_projects')

In [25]:
users_projects.head()

project,profile,4.0,5.0,6.0,7.0,8.0,19.0,20.0,22.0,24.0,...,296.0,297.0,298.0,302.0,303.0,304.0,309.0,310.0,312.0,19847.0
0,001bedb58aa43c8d3596b5b522ba1040,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0021e5df03d7feb6ba9558cc2828d616,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,00300cba5401183830a6a82b80c8ff7f,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0033882471572a66322d0747c6a4b12d,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,00536e1575193e409e255cd02ed9d205,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Split into Train and Test data

In [26]:
def train_test_split(users_projects_matrix):
    # Get the adjacency vector for this user
    users_projects_matrix = users_projects_matrix.drop(columns=['profile'])
        
    split_percentage = 0.2
    split_column_index = int(split_percentage * len(users_projects_matrix.columns))
    
    train = users_projects_matrix.copy()
    test = users_projects_matrix.copy()
    
    # Set a certain amount of the projects to 0
    train.iloc[:, :split_column_index] = 0
    
    return train, test, split_column_index

In [27]:
train, test, split_column_index = train_test_split(users_projects)

In [28]:
train.shape

(4866, 1781)

In [29]:
test.shape

(4866, 1781)

## Create new loss function for autoender

In [30]:
def customLoss(yTrue,yPred):
    zeros_idx = np.where(yTrue == 0)[0]
    if len(zeros_idx) > 0:
        yPred[zeros_idx] = 0
    return K.sum(K.log(yTrue) - K.log(yPred))

## Train autoencoder

In [31]:
# Create our model
model = CDAE.create(I=train.shape[1], U=train.shape[0]+1, K=embedding_size,
                    hidden_activation='relu', output_activation='sigmoid', q=0.50, l=0.01)
model.compile(loss=customLoss, optimizer='adam')
model.summary()

hello
[]
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
x_item (InputLayer)             (None, 1781)         0                                            
__________________________________________________________________________________________________
x_user (InputLayer)             (None, 1)            0                                            
__________________________________________________________________________________________________
dropout_2 (Dropout)             (None, 1781)         0           x_item[0][0]                     
__________________________________________________________________________________________________
embedding_layer (Embedding)     (None, 1, 32)        155744      x_user[0][0]                     
__________________________________________________________________________________________________
d

  h_item = Dense(K, W_regularizer=l2(l), b_regularizer=l2(l))(h_item)
  h_user = Embedding(input_dim=U, output_dim=K, input_length=1, W_regularizer=l2(l), name='embedding_layer')(x_user)
  return Model(input=[x_item, x_user], output=y)


In [32]:
things = np.arange(0,train.shape[0]).reshape(train.shape[0],1)

In [33]:
# Train our Autoencoder
history = model.fit(x=[train, things], y=train,
                    batch_size=batch_size, nb_epoch=epochs, verbose=1)

  This is separate from the ipykernel package so we can avoid doing imports until


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Evalute Recommender System

In [34]:
full_pred = model.predict([train, things])

In [35]:
y_pred_floats = full_pred[:, :split_column_index]
y_pred_indices = y_pred_floats.argsort()[-5:][::-1]

In [36]:
y_pred = np.zeros(y_pred_floats.shape)
for i in range(0, y_pred_indices.shape[1]):
    y_pred[y_pred_indices[:, i], i] = 1

In [37]:
y_true = test.iloc[:, :split_column_index]

In [38]:
y_true.shape

(4866, 356)

In [39]:
np.nonzero(y_true.values)

(array([   0,    0,    0, ..., 4864, 4864, 4865]),
 array([ 64, 234, 237, ..., 190, 234, 182]))

In [40]:
# Get precision and recall
precision, recall, fscore, support = precision_recall_fscore_support(y_true, y_pred)

In [41]:
np.mean(precision)

0.007443820224719102

In [42]:
np.mean(recall)

0.0033074903163714717