## Deep Factorization Machine (DeepFM)

#### Acknowledgement

Guo, H., Tang, R., Ye, Y., Li, Z., and He, X. (2017). DeepFM: A
Factorization-Machine based Neural Network for CTR Prediction. arXiv e-prints. 

In [None]:
# import later used packages
import numpy as np
import pandas as pd
from Rec_split import rec_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences
import torch
from torch import nn
from itertools import product

from deepctr_torch.inputs import SparseFeat, VarLenSparseFeat, get_feature_names
from deepctr_torch.models import DeepFM

# custom function see .py file
from Kendall_distance import kendall_distance_with_penalty

# Silence pandas' chained-assignment (SettingWithCopy) warning: later cells assign columns
# directly on the frames returned by rec_split and on filtered merge results.
pd.set_option('mode.chained_assignment', None)

### Data Loading and Preprocessing

In [2]:
# encode a '|'-separated genre string as a list of integer ids
def split(x):
    """Translate every '|'-separated token of ``x`` into its integer id.

    Ids are looked up in (and, for unseen tokens, added to) the module-level dict
    ``key2index``. New tokens get the next free id starting at 1, because 0 is the
    special padding value for sequence inputs and must not encode a valid feature.

    Returns the list of ids in the order the tokens appear in ``x``.
    """
    encoded = []
    for genre in x.split('|'):
        if genre not in key2index:
            key2index[genre] = len(key2index) + 1
        encoded.append(key2index[genre])
    return encoded

In [3]:
# read the full interaction table from disk
data = pd.read_csv("data/ml_1M_full.csv")

# split with the project helper rec_split (given the 'User' and 'Timestamp' columns);
# val_share=0.0 requests no separate validation part here
train, val, test = rec_split(data, 'User', 'Timestamp', train_share=0.85, val_share=0.0)

# binarise the training target: 1 where Rating >= 4, otherwise 0
train['Rating'] = (train['Rating'] >= 4).astype('int64')

In [4]:
# Generate the dataset the model will later score: every (movie, user) pair that can be
# formed from the training data, minus the pairs already present in train.

# create one dataframe with each movie and one with each user
movie_columns = ['Movie', 'Genre', 'Release_Year']
user_columns = ['User', 'Gender', 'Age', 'Occupation']

movie_df = train[movie_columns].drop_duplicates()
user_df = train[user_columns].drop_duplicates()

# Create combinations of rows from both DataFrames
# NOTE(review): this builds len(movie_df) * len(user_df) Python lists in memory before the
# DataFrame is created; `movie_df.merge(user_df, how='cross')` would produce the same pairs
# without the Python-level loop — TODO confirm the pandas version supports how='cross'.
combined_rows = [list(row1) + list(row2) for row1, row2 in product(movie_df.values, user_df.values)]

# Create a new DataFrame with columns from both DataFrames and one row for each possible user-item interaction
columns = list(movie_df.columns) + list(user_df.columns)
prediction_df = pd.DataFrame(combined_rows, columns=columns)

# Remove pairs that already occur in train: after the outer merge, '_merge' == 'left_only'
# marks rows that exist only in prediction_df. Non-key columns present on both sides get the
# default '_x' (left) / '_y' (right) suffixes; the right-hand copies are dropped and '_x' is
# stripped again. Note that the rename removes '_x' from every column name containing it.
merged_df = pd.merge(prediction_df, train, on=['User', 'Movie'], how='outer', indicator=True)
prediction_df = merged_df[merged_df['_merge']=='left_only'].drop(columns=['_merge', 'Rating', 'Gender_y', 'Age_y', 'Occupation_y', 'Genre_y', 'Release_Year_y']).rename(columns=lambda x: x.replace('_x', ''))

# Remove test interactions whose (User, Movie) pair is not part of prediction_df
# ('right_only' rows). 'Rating' is not dropped here, so the resulting frame keeps the
# Rating coming from test where the pair has a test row and a missing value otherwise.
# From this point on `test` is this prediction frame, not the raw split returned by rec_split.
prediction_df = pd.merge(prediction_df, test, on=['User', 'Movie'], how='outer', indicator=True)
test = prediction_df[prediction_df['_merge']!='right_only'].drop(columns=['_merge', 'Gender_y', 'Age_y', 'Occupation_y', 'Genre_y', 'Release_Year_y']).rename(columns=lambda x: x.replace('_x', ''))


In [5]:
# define features used for training and predicting
sparse_features = ["User", "Movie", "Gender", "Age", "Occupation"]
target = ['Rating']

# Label-encode the sparse features.
# Each encoder is fitted on the training data only and then *re-used* for the prediction
# frame. Refitting on `test` would derive the ids from whatever values happen to be in that
# frame, so a given user/movie could get a different id than the one its embedding was
# trained with. `transform` raises a ValueError if `test` contains a value never seen in
# train, instead of silently mis-aligning the ids.
label_encoders = {}  # fitted encoder per feature, e.g. to map encoded ids back later
for feat in sparse_features:
    lbe = LabelEncoder()
    train[feat] = lbe.fit_transform(train[feat])
    test[feat] = lbe.transform(test[feat])
    label_encoders[feat] = lbe

# preprocess the Genre feature for the train data
key2index = {}  # genre token -> integer id, filled by split() (0 is reserved for padding)
genres_list = list(map(split, train['Genre'].values))
genres_length = np.array(list(map(len, genres_list)))

# preprocess the Genre feature for the test data (shares key2index with train)
genres_list_test = list(map(split, test['Genre'].values))
genres_length_test = np.array(list(map(len, genres_list_test)))

# pad both genre sequences to the same length: the longest genre list in either frame
max_len = max([max(genres_length), max(genres_length_test)])

genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', )
genres_list_test = pad_sequences(genres_list_test, maxlen=max_len, padding='post', )

In [6]:
# define input features

# one embedding column (dimension 4) per sparse feature; the vocabulary size is the
# number of distinct values of that feature in train
fixlen_feature_columns_sparce = [
    SparseFeat(feat, train[feat].nunique(), embedding_dim=4) for feat in sparse_features
]

# variable-length genre column; value 0 is the padding id for sequence inputs, hence the +1
genre_feat = SparseFeat('genres', vocabulary_size=len(key2index) + 1, embedding_dim=4)
varlen_feature_columns = [VarLenSparseFeat(genre_feat, maxlen=max_len, combiner='mean')]

# the linear part and the DNN part are given the same set of columns
linear_feature_columns = [*fixlen_feature_columns_sparce, *varlen_feature_columns]
dnn_feature_columns = [*fixlen_feature_columns_sparce, *varlen_feature_columns]

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [7]:
# input dict for training: one entry per sparse feature plus the padded genre ids
model_input = {
    **{name: train[name] for name in sparse_features},
    "genres": genres_list,
}

# input dict for the predictions, built the same way from the test frame
model_input_test = {
    **{name: test[name] for name in sparse_features},
    "genres": genres_list_test,
}

### Hyperparameter Tuning

In [40]:
# values varied in the grid search
# hidden layer layouts: widths 100 and 400, each with 2 and 3 layers
hidden_layer = [[width] * depth for width in (100, 400) for depth in (2, 3)]
learning_rate = [0.01, 0.001, 0.0001]
epochs = [3, 5, 10]

# values held fixed for every run
activation = nn.ReLU
dropout_rate = 0.5
batch_size = 256

In [41]:
# collects one summary row per hyperparameter combination
results = pd.DataFrame()

# product() iterates in the same order as nesting layers -> learning rate -> epochs
for layers, lr, epoch in product(hidden_layer, learning_rate, epochs):
    # a fresh model is built and trained for every combination
    model = DeepFM(
        linear_feature_columns,
        dnn_feature_columns,
        use_fm=True,
        dnn_hidden_units=layers,
        dnn_dropout=dropout_rate,
        task='binary',
        dnn_activation=activation,
    )

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    model.compile(optimizer, 'binary_crossentropy', metrics=['binary_crossentropy', 'auc'])
    history = model.fit(
        model_input,
        train[target].values,
        batch_size=batch_size,
        epochs=epoch,
        verbose=2,
        validation_split=15/85,
    )

    # keep the metrics of the last epoch together with the settings that produced them
    row = {key: values[-1] for key, values in history.history.items()}
    row['epoch'] = epoch
    row['learning rate'] = lr
    row['hidden layers'] = str(layers)
    res = pd.DataFrame(row, index=[0])

    # append to the overall table; the CSV is rewritten after every combination,
    # so the file always holds the results obtained so far
    results = pd.concat([results, res], ignore_index=True)
    results.to_csv('results/DeepFM_hyperparameter.csv')

cpu
Train on 697783 samples, validate on 149526 samples, 2726 steps per epoch
Epoch 1/3
41s - loss:  0.5545 - binary_crossentropy:  0.5543 - auc:  0.7765 - val_binary_crossentropy:  0.6018 - val_auc:  0.7141
Epoch 2/3
40s - loss:  0.5203 - binary_crossentropy:  0.5197 - auc:  0.8096 - val_binary_crossentropy:  0.6038 - val_auc:  0.7115
Epoch 3/3
42s - loss:  0.5073 - binary_crossentropy:  0.5064 - auc:  0.8210 - val_binary_crossentropy:  0.6020 - val_auc:  0.7142
cpu
Train on 697783 samples, validate on 149526 samples, 2726 steps per epoch
Epoch 1/5
45s - loss:  0.5545 - binary_crossentropy:  0.5543 - auc:  0.7765 - val_binary_crossentropy:  0.6018 - val_auc:  0.7141
Epoch 2/5
41s - loss:  0.5203 - binary_crossentropy:  0.5197 - auc:  0.8096 - val_binary_crossentropy:  0.6038 - val_auc:  0.7115
Epoch 3/5
42s - loss:  0.5073 - binary_crossentropy:  0.5064 - auc:  0.8210 - val_binary_crossentropy:  0.6020 - val_auc:  0.7142
Epoch 4/5
43s - loss:  0.5004 - binary_crossentropy:  0.4993 - a

In [43]:
# load the saved grid-search results and rank them by validation AUC, best first
results = (
    pd.read_csv('results/DeepFM_hyperparameter.csv')
    .drop(columns='Unnamed: 0')
    .sort_values(by=['val_auc'], ascending=False)
)

results

Unnamed: 0,loss,binary_crossentropy,auc,val_binary_crossentropy,val_auc,epoch,learning rate,hidden layers
22,0.51926,0.51924,0.80814,0.594707,0.723408,5,0.001,"[400, 400]"
17,0.530083,0.530079,0.79948,0.596942,0.723351,10,0.0001,"[100, 100, 100]"
35,0.530657,0.53066,0.79903,0.597083,0.723314,10,0.0001,"[400, 400, 400]"
25,0.531886,0.53189,0.797952,0.595699,0.723273,5,0.0001,"[400, 400]"
16,0.531448,0.531445,0.798568,0.59814,0.723119,5,0.0001,"[100, 100, 100]"
26,0.530541,0.530543,0.799026,0.595682,0.723037,10,0.0001,"[400, 400]"
4,0.524921,0.524887,0.803021,0.594686,0.723035,5,0.001,"[100, 100]"
7,0.531261,0.531256,0.798682,0.595591,0.723023,5,0.0001,"[100, 100]"
33,0.533755,0.533759,0.796693,0.597328,0.723018,3,0.0001,"[400, 400, 400]"
34,0.532274,0.532275,0.797886,0.59665,0.722985,5,0.0001,"[400, 400, 400]"


### Model Evaluation

In [11]:
# hyperparameters used for the final model
# (the combination with the highest val_auc in the grid-search results)
layers_opt = [400, 400]
lr_opt = 0.001
epoch_opt = 5

In [12]:
# build the final model with the selected layer layout
model_opt = DeepFM(
    linear_feature_columns,
    dnn_feature_columns,
    use_fm=True,
    dnn_hidden_units=layers_opt,
    dnn_dropout=dropout_rate,
    task='binary',
    dnn_activation=activation,
)

# train it with the selected learning rate and number of epochs
optimizer_opt = torch.optim.Adam(model_opt.parameters(), lr=lr_opt)
model_opt.compile(optimizer_opt, 'binary_crossentropy', metrics=['binary_crossentropy', 'auc'])
history_opt = model_opt.fit(
    model_input,
    train[target].values,
    batch_size=batch_size,
    epochs=epoch_opt,
    verbose=2,
    validation_split=15/85,
)

cpu
Train on 697783 samples, validate on 149526 samples, 2726 steps per epoch
Epoch 1/5
42s - loss:  0.5596 - binary_crossentropy:  0.5596 - auc:  0.7706 - val_binary_crossentropy:  0.5985 - val_auc:  0.7199
Epoch 2/5
41s - loss:  0.5392 - binary_crossentropy:  0.5392 - auc:  0.7912 - val_binary_crossentropy:  0.5978 - val_auc:  0.7211
Epoch 3/5
43s - loss:  0.5332 - binary_crossentropy:  0.5332 - auc:  0.7963 - val_binary_crossentropy:  0.5954 - val_auc:  0.7216
Epoch 4/5
44s - loss:  0.5249 - binary_crossentropy:  0.5249 - auc:  0.8033 - val_binary_crossentropy:  0.5950 - val_auc:  0.7227
Epoch 5/5
51s - loss:  0.5193 - binary_crossentropy:  0.5192 - auc:  0.8081 - val_binary_crossentropy:  0.5947 - val_auc:  0.7234


In [13]:
# Score every row of the prediction frame with the final model and store the score next to
# it, so the evaluation below can rank each user's rows by 'Prediction'.
# NOTE(review): assumes predict() returns exactly one score per row of `test`, in row order — confirm.
predictions = model_opt.predict(model_input_test)
test['Prediction'] = predictions

In [31]:
# Evaluate the top-k recommendations of every user.
#
# Metrics, each averaged over all users and reported per k:
#   * weighted hit rate: sum of the rating weights of the top-k rows, divided by k
#   * user satisfaction:   1 if at least one top-k row has Rating 4 or 5, else 0
#   * user satisfaction 2: 1 if at least one top-k row has Rating 5, else 0
# plus the Kendall distance with penalty, which uses the user's whole rated sequence and
# is therefore computed once per user, independently of k.

# Contribution of one recommended row to the weighted hit rate, keyed by its Rating.
# Rows with any other Rating (e.g. missing, because the pair has no test rating) add 0.
WHR_WEIGHTS = {1: -5, 2: -2, 3: 2, 4: 6, 5: 12}
K_VALUES = [1, 5, 10, 20, 50]

# All users in the prediction frame, in order of first appearance
users = test.User.unique()

# Per-k accumulators with one entry per user
whrs_by_k = {k: [] for k in K_VALUES}
sat_us_by_k = {k: [] for k in K_VALUES}
sat_us_2_by_k = {k: [] for k in K_VALUES}
# Per-k flat list of every recommended movie (one entry per user and recommendation)
recommendations_by_k = {k: [] for k in K_VALUES}

# Lists to store the Kendall distance of each user (p = 0.05 and p = 0.2)
kendal_sum = []
kendal_sum_2 = []

# One pass over the frame: groupby hands out each user's rows once (same rows and row
# order as test[test['User'] == user]) instead of re-scanning the whole frame for every
# user and every k. sort=False keeps the users in order of first appearance.
for user, predictions_user in test.groupby('User', sort=False):
    # Rank the user's rows once; every top-k list is a prefix of this ranking
    ranked = predictions_user.sort_values('Prediction', ascending=False)

    for k in K_VALUES:
        recommendations = ranked.head(k)
        ratings = list(recommendations['Rating'])

        # Divide by k (not by the number of rows found), as in the metric definition above
        whr = sum(WHR_WEIGHTS.get(rating, 0) for rating in ratings) / k
        sat = int(any(rating in (4, 5) for rating in ratings))
        sat_2 = int(any(rating == 5 for rating in ratings))

        whrs_by_k[k].append(whr)
        sat_us_by_k[k].append(sat)
        sat_us_2_by_k[k].append(sat_2)
        recommendations_by_k[k].extend(recommendations['Movie'])

    # Kendall distance with penalty on the rows that actually have a rating
    rated = predictions_user[~predictions_user['Rating'].isna()]
    kendal_sum.append(kendall_distance_with_penalty(rated, rated, 'Movie', 'Movie', 'Rating_x', 'Prediction_x', p = 0.05))
    kendal_sum_2.append(kendall_distance_with_penalty(rated, rated, 'Movie', 'Movie', 'Rating_x', 'Prediction_x', p = 0.2))

# Average the per-user metrics: one row per k
awhrs = pd.DataFrame({'Average Weigthed Hit Rate': [np.mean(whrs_by_k[k]) for k in K_VALUES], 'k': K_VALUES})
asats = pd.DataFrame({'Average User Satisfaction': [np.mean(sat_us_by_k[k]) for k in K_VALUES], 'k': K_VALUES})
asats_2 = pd.DataFrame({'Average User Satisfaction': [np.mean(sat_us_2_by_k[k]) for k in K_VALUES], 'k': K_VALUES})

# Store the recommendation distribution for each k: how often every movie was recommended.
# (Counting is done on the flat list of movies; a Series of per-user lists would only
# yield the user position and that user's list, not occurrence counts.)
for k in K_VALUES:
    occurrence_counts = pd.Series(recommendations_by_k[k]).value_counts()
    recommendations_k = pd.DataFrame({'Element': occurrence_counts.index, 'Occurrence Count': occurrence_counts.values})
    recommendations_k.to_csv(f'results/Recommendation_distribution@{k}.csv')

# Calculate average Kendall distance
kendal = pd.DataFrame({'Kendall Distance':np.mean(kendal_sum), 'p': 0.05}, index=[0])
kendal_2 = pd.DataFrame({'Kendall Distance':np.mean(kendal_sum_2), 'p': 0.2}, index=[0])
# Concatenate results for different values of p
kendal = pd.concat([kendal, kendal_2], ignore_index=True)

In [33]:
# write every evaluation table to its own csv file
result_files = {
    'results/DeepFM_awhrs.csv': awhrs,
    'results/DeepFM_asats.csv': asats,
    'results/DeepFM_asats2.csv': asats_2,
    'results/DeepFM_Kendall.csv': kendal,
}
for result_path, result_frame in result_files.items():
    result_frame.to_csv(result_path)