# Create CF Model Using Autoencoders

1. ~~Split data into train and test data~~
2. Create a custom loss function for our autoencoder
3. ~~Train our autoencoder~~
4. ~~Get the precision and recall of our autoencoder~~

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('../../'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import src.models.cdea.CDAE as CDAE
import src.models.cdea.load_data as load_data
import src.models.cdea.metrics as metrics

from sklearn.metrics import precision_recall_fscore_support
import numpy as np
import pandas as pd
import math
import keras.backend as K

Using TensorFlow backend.


In [2]:
# Training hyperparameters, kept in one cell so they are easy to tune
batch_size = 32       # mini-batch size passed to model.fit
epochs = 10           # number of training epochs passed to model.fit
embedding_size = 32   # passed to CDAE.create as K

In [3]:
# Load the project data (one row per profile, one column per project)
# NOTE: read_pickle can execute arbitrary code while unpickling - only load files from a trusted source.
users_projects = pd.read_pickle('../../data/processed/active_profile_projects')

In [4]:
# Preview the first rows: a `profile` identifier followed by one column per project id
users_projects.head()

project,profile,4.0,5.0,6.0,7.0,8.0,19.0,20.0,22.0,24.0,...,296.0,297.0,298.0,302.0,303.0,304.0,309.0,310.0,312.0,19847.0
0,001bedb58aa43c8d3596b5b522ba1040,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0021e5df03d7feb6ba9558cc2828d616,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,00300cba5401183830a6a82b80c8ff7f,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0033882471572a66322d0747c6a4b12d,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,00536e1575193e409e255cd02ed9d205,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Split into Train and Test data

In [5]:
def train_test_split(users_projects_matrix, split_percentage=0.2):
    """Build a masked training matrix and an unmasked test matrix.

    The ``profile`` column is dropped, then the first ``split_percentage`` of
    the remaining columns are set to 0 in the training copy. The test copy
    keeps every value, so the masked columns act as the held-out targets.

    Note: the same columns are zeroed for every row, so a model fitted on
    ``train`` never sees a non-zero value in the held-out columns.

    Args:
        users_projects_matrix: DataFrame with a ``profile`` column plus one
            column per project. It is not modified.
        split_percentage: Fraction (0 to 1) of the project columns to hold
            out. Defaults to 0.2.

    Returns:
        Tuple ``(train, test, split_column_index)`` where the columns
        ``[:split_column_index]`` are the held-out ones.

    Raises:
        ValueError: If ``split_percentage`` is outside ``[0, 1]``.
        KeyError: If the ``profile`` column is missing (raised by ``drop``).
    """
    if not 0 <= split_percentage <= 1:
        raise ValueError(
            'split_percentage must be between 0 and 1, got %r' % (split_percentage,))

    # Keep only the project columns; `drop` returns a new frame
    interactions = users_projects_matrix.drop(columns=['profile'])

    # Number of leading columns that are hidden from the training data
    split_column_index = int(split_percentage * len(interactions.columns))

    train = interactions.copy()
    test = interactions.copy()

    # Hide the held-out projects from the training data
    train.iloc[:, :split_column_index] = 0

    return train, test, split_column_index

In [6]:
# `test` keeps every interaction; `train` has the first `split_column_index` project columns zeroed
train, test, split_column_index = train_test_split(users_projects)

In [7]:
# (rows, project columns) - the `profile` column was dropped by train_test_split
train.shape

(4866, 1781)

In [8]:
# Same dimensions as train: masking changes values, not the shape
test.shape

(4866, 1781)

## Create new loss function for autoencoder

In [9]:
# Reference for casting some of the values of a tensor to 0 (not applied yet):
# https://stackoverflow.com/questions/41043894/setting-all-negative-values-of-a-tensor-to-zero-in-tensorflow
# K is already imported in the first cell; re-imported so this cell runs on its own.
import keras.backend as K

def dice_coef(y_true, y_pred):
    """Work-in-progress custom loss: mean squared error over all entries.

    Despite the name, no Dice coefficient is computed yet. Both tensors are
    flattened and the mean of the squared differences is returned.

    Args:
        y_true: Tensor of target values.
        y_pred: Tensor of predicted values, same number of elements as y_true.

    Returns:
        Scalar tensor holding the mean squared error.
    """
    y_true_f = K.flatten(y_true)
    y_pred_f = K.flatten(y_pred)

    # The inputs are 1-D after flattening, so the mean over the last axis is a scalar
    return K.mean(K.square(y_pred_f - y_true_f), axis=-1)

## Train autoencoder

In [10]:
# Create our model
# I = number of project columns, U = number of rows + 1, K = embedding size.
# NOTE(review): q looks like the input dropout rate and l the L2 regularisation weight - confirm against CDAE.create.
model = CDAE.create(I=train.shape[1], U=train.shape[0]+1, K=embedding_size,
                    hidden_activation='relu', output_activation='sigmoid', q=0.50, l=0.01)
# The model is compiled with the built-in MAE loss; the custom dice_coef defined above is not used here.
model.compile(loss='mean_absolute_error', optimizer='adam')
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
x_item (InputLayer)             (None, 1781)         0                                            
__________________________________________________________________________________________________
x_user (InputLayer)             (None, 1)            0                                            
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 1781)         0           x_item[0][0]                     
__________________________________________________________________________________________________
embedding_layer (Embe

  h_item = Dense(K, W_regularizer=l2(l), b_regularizer=l2(l))(h_item)
  h_user = Embedding(input_dim=U, output_dim=K, input_length=1, W_regularizer=l2(l), name='embedding_layer')(x_user)
  return Model(input=[x_item, x_user], output=y)


In [11]:
# One integer row index (0 .. n_rows-1) per user, shaped as a column vector; used as the model's second input
things = np.arange(0,train.shape[0]).reshape(train.shape[0],1)

In [12]:
# Train our Autoencoder
# Inputs are the masked interaction matrix plus the per-row user index (`things`);
# the target is the same masked matrix. `epochs` replaces the deprecated `nb_epoch` kwarg.
history = model.fit(x=[train, things], y=train,
                    batch_size=batch_size, epochs=epochs, verbose=1)

Instructions for updating:
Use tf.cast instead.


  This is separate from the ipykernel package so we can avoid doing imports until


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Evaluate Recommender System

In [13]:
# Score every project for every user, using the masked (train) interactions as input
full_pred = model.predict([train, things])

In [14]:
y_pred_floats = full_pred[:, :split_column_index]
y_pred_indices = y_pred_floats.argsort()[-5:][::-1]

In [15]:
y_pred = np.zeros(y_pred_floats.shape)
for i in range(0, y_pred_indices.shape[1]):
    y_pred[y_pred_indices[:, i], i] = 1

In [16]:
# Ground truth for the held-out project columns, taken from the unmasked matrix
y_true = test.iloc[:, :split_column_index]

In [17]:
# (rows, number of held-out project columns)
y_true.shape

(4866, 356)

In [18]:
# (row indices, column indices) of the non-zero held-out interactions
np.nonzero(y_true.values)

(array([   0,    0,    0, ..., 4864, 4864, 4865]),
 array([ 64, 234, 237, ..., 190, 234, 182]))

In [19]:
# Get precision and recall
# No `average` argument is given, so each result is an array with one entry per held-out project column.
precision, recall, fscore, support = precision_recall_fscore_support(y_true, y_pred)

In [20]:
# Unweighted mean of the per-project precision values
np.mean(precision)

0.011001872659176029

In [21]:
# Unweighted mean of the per-project recall values
np.mean(recall)

0.001654855321321292