In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import train_test_split,ShuffleSplit
from sklearn.metrics import mean_squared_error
from math import sqrt
import matplotlib.pyplot as plt
import os
os.environ["TF_ENABLE_ONEDNN_OPTS"]='0'

#NN-based model
from deepctr_torch.models import PNN
from deepctr_torch.models import CCPM

from deepctr_torch.models import WDL 
from deepctr_torch.models import DCN
from deepctr_torch.models import NFM
from deepctr_torch.models import DeepFM

#recent nn-based approach
from deepctr_torch.models import AFM
from deepctr_torch.models import xDeepFM
from deepctr_torch.inputs import SparseFeat,get_feature_names,DenseFeat,VarLenSparseFeat
import torch
from tensorflow.keras.utils import pad_sequences
from sklearn.metrics import mean_squared_error,recall_score,ndcg_score
import json
torch.cuda.is_available()


2023-05-02 23:09:12.464809: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


True

In [2]:
# Load the MovieLens sample ratings. The CSV's first column is an unnamed saved
# index, which pandas surfaces as `Unnamed: 0` in the displayed frame.
data = pd.read_csv("../hw3_data/Movielens/movielens_sample.csv")
# Bare last expression: renders the frame as this cell's output.
data

Unnamed: 0,user,movie,age,occupation,genre,rating
0,196,242,5,3,5,3
1,186,302,4,4,6|10|13|16,3
2,22,377,3,3,4|5,1
3,244,51,3,1,8|14|17|18,2
4,166,346,5,8,6|8,1
...,...,...,...,...,...,...
99538,880,476,2,6,5,3
99539,716,204,4,5,5|15,5
99540,276,1090,3,6,16,1
99541,13,225,5,8,4|5,2


In [4]:
def Average(lst):
    """Return the arithmetic mean of ``lst``, rounded to four decimal places.

    Raises ``ZeroDivisionError`` when ``lst`` is empty.
    """
    total = sum(lst)
    count = len(lst)
    return round(total / count, 4)


def split(x):
    """Encode one '|'-separated genre field as a list of integer ids.

    Every distinct token is given the next free id in the module-level
    ``key2index`` dict (mutated in place) the first time it is seen. Ids start
    at 1 because 0 is reserved as the padding value for sequence inputs.

    Args:
        x: Raw genre field, e.g. ``'6|10|13'`` or ``'5'``. Non-string scalars
            (for example an ``int`` when the column is parsed as numeric) are
            converted with ``str`` first. Missing values are not treated
            specially -- ``str(nan)`` would become the token ``'nan'`` -- so
            clean the column before calling this.

    Returns:
        list[int]: Ids of the tokens of ``x``, in their original order.
    """
    # Normalise to ``str`` once. ``'5'.split('|')`` already yields ``['5']``,
    # so single-token fields need no special case, and non-string inputs no
    # longer fail on ``list(x)`` / ``x.split``.
    key_ans = str(x).split('|')
    for key in key_ans:
        if key not in key2index:
            # Notice: 0 is the padding value, so valid tokens are numbered from 1.
            key2index[key] = len(key2index) + 1
    return [key2index[key] for key in key_ans]

In [45]:
# ---- 1. Label-encode the categorical columns (overwrites them in `data`) ----
sparse_features = ['user', 'movie', 'age', 'occupation']
target = ['rating']
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

# ---- 2. Encode the multi-valued `genre` column as padded id sequences ----
# `split` fills `key2index` as a side effect, so the dict must be reset first.
key2index = {}
genres_list = [split(genre) for genre in data['genre'].values]
genres_length = np.array([len(ids) for ids in genres_list])
max_len = max(genres_length)
# Notice: sequences are padded at the end (padding='post') up to the longest one.
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post')

# ---- 3. Describe the features for deepctr_torch ----
fixlen_feature_columns = [
    SparseFeat(feat, data[feat].nunique(), embedding_dim=4)
    for feat in sparse_features
]

# Notice: id 0 is the padding value of the sequence feature, hence the `+ 1`.
varlen_feature_columns = [
    VarLenSparseFeat(
        SparseFeat('genres', vocabulary_size=len(key2index) + 1, embedding_dim=4),
        maxlen=max_len,
        combiner='mean',
    )
]

linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# ---- 4. Instantiate one regression model per architecture ----
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Keyword arguments shared by every model below.
_regression_kwargs = {'task': 'regression', 'device': device}

model_dict = {
    # PNN variants take only the DNN feature columns.
    "IPNN": PNN(dnn_feature_columns, use_inner=True, use_outter=False, **_regression_kwargs),
    "OPNN": PNN(dnn_feature_columns, use_inner=False, use_outter=True, **_regression_kwargs),
    "PNN": PNN(dnn_feature_columns, use_inner=True, use_outter=True, **_regression_kwargs),
    # The remaining models take both the linear and the DNN feature columns.
    "CCPM": CCPM(linear_feature_columns, dnn_feature_columns, **_regression_kwargs),
    "WDL": WDL(linear_feature_columns, dnn_feature_columns, **_regression_kwargs),
    "DCN": DCN(linear_feature_columns, dnn_feature_columns, **_regression_kwargs),
    "NFM": NFM(linear_feature_columns, dnn_feature_columns, **_regression_kwargs),
    "DeepFM": DeepFM(linear_feature_columns, dnn_feature_columns, **_regression_kwargs),
    "AFM": AFM(linear_feature_columns, dnn_feature_columns, **_regression_kwargs),
    "xDeepFM": xDeepFM(linear_feature_columns, dnn_feature_columns, **_regression_kwargs),
}

In [None]:
# Five random 90/10 train/test splits. The fixed seed means every architecture
# is evaluated on exactly the same folds.
rs = ShuffleSplit(n_splits=5, test_size=.1, random_state=42)

# One zero-argument factory per architecture. Each fold calls the factory to get
# a freshly initialised model, so weights never carry over between folds and
# only the model that is actually trained gets built.
model_factories = {
    "IPNN": lambda: PNN(dnn_feature_columns, use_inner=True, use_outter=False, task='regression', device=device),
    "OPNN": lambda: PNN(dnn_feature_columns, use_inner=False, use_outter=True, task='regression', device=device),
    "PNN": lambda: PNN(dnn_feature_columns, use_inner=True, use_outter=True, task='regression', device=device),
    "CCPM": lambda: CCPM(linear_feature_columns, dnn_feature_columns, task='regression', device=device),
    "WDL": lambda: WDL(linear_feature_columns, dnn_feature_columns, task='regression', device=device),
    "DCN": lambda: DCN(linear_feature_columns, dnn_feature_columns, task='regression', device=device),
    "NFM": lambda: NFM(linear_feature_columns, dnn_feature_columns, task='regression', device=device),
    "DeepFM": lambda: DeepFM(linear_feature_columns, dnn_feature_columns, task='regression', device=device),
    "AFM": lambda: AFM(linear_feature_columns, dnn_feature_columns, task='regression', device=device),
    "xDeepFM": lambda: xDeepFM(linear_feature_columns, dnn_feature_columns, task='regression', device=device),
}

store_final_result = {}
for model_name, build_model in model_factories.items():
    # Per-fold metrics for the current architecture.
    RMSE_M = []
    recall_M = []
    NDCG_M = []
    print(model_name)

    for i, (train_index, test_index) in enumerate(rs.split(data)):
        X_train, y_train = data.iloc[train_index], data.iloc[train_index]['rating']
        X_test, y_test = data.iloc[test_index], data.iloc[test_index]['rating']

        # 3.generate input data for model
        # `genres_list` is a positional array, so it is indexed with the same
        # positional fold indices that `data.iloc` uses.
        train_model_input = {name: X_train[name] for name in sparse_features}
        train_model_input["genres"] = genres_list[train_index]
        test_model_input = {name: X_test[name] for name in sparse_features}
        test_model_input["genres"] = genres_list[test_index]

        model = build_model()
        model.compile("adam", "mse", metrics=['mse'], )
        history = model.fit(train_model_input, y_train.values, batch_size=256, epochs=15, verbose=2, validation_split=0.2)
        pred_ans = model.predict(test_model_input, batch_size=256)

        # scikit-learn metrics take (y_true, y_pred / y_score) in that order.
        y_true = y_test.values
        y_pred = pred_ans.ravel()

        # sqrt(MSE) instead of `squared=False`, which newer scikit-learn
        # releases deprecate.
        RMSE = sqrt(mean_squared_error(y_true, y_pred))
        RMSE_M.append(RMSE)
        # Predictions are rounded to the nearest rating so they can be scored
        # as class labels.
        recall = recall_score(y_true, np.round(y_pred), average='micro')
        recall_M.append(recall)
        # The whole test fold is scored as one ranked list: true ratings are the
        # relevance values, predictions are the ranking scores. The original
        # call passed them the other way round, which computes a different number.
        NDCG = ndcg_score(y_true.reshape(1, -1), y_pred.reshape(1, -1), k=10)
        NDCG_M.append(NDCG)
        print(f"RMSE: {RMSE}, Recall: {recall}, NDCG: {NDCG}")

    summary = f"Average RMSE: {Average(RMSE_M)}, Average Recall: {Average(recall_M)}, Average NDCG: {Average(NDCG_M)}"
    print(summary)
    store_final_result[model_name] = summary

# NOTE(review): figures noted in the original notebook, model unspecified and
# recorded before the NDCG argument order was corrected -- presumably
# Average RMSE: 0.8353, Average Recall: 0.4685, Average NDCG: 0.7758
with open('final_result.txt', 'w') as file:
    json.dump(store_final_result, file)

IPNN
cuda
Train on 71670 samples, validate on 17918 samples, 280 steps per epoch
Epoch 1/15
2s - loss:  2.4871 - mse:  2.4869 - val_mse:  0.9116
Epoch 2/15
1s - loss:  0.8781 - mse:  0.8781 - val_mse:  0.8851
Epoch 3/15
2s - loss:  0.8499 - mse:  0.8499 - val_mse:  0.8726
Epoch 4/15
2s - loss:  0.8285 - mse:  0.8285 - val_mse:  0.8697
Epoch 5/15
2s - loss:  0.8141 - mse:  0.8141 - val_mse:  0.8751
Epoch 6/15
2s - loss:  0.8059 - mse:  0.8059 - val_mse:  0.8724
Epoch 7/15
2s - loss:  0.8001 - mse:  0.8001 - val_mse:  0.8718
Epoch 8/15
2s - loss:  0.7960 - mse:  0.7960 - val_mse:  0.8702
Epoch 9/15
2s - loss:  0.7900 - mse:  0.7900 - val_mse:  0.8718
Epoch 10/15
2s - loss:  0.7823 - mse:  0.7823 - val_mse:  0.8685
Epoch 11/15
2s - loss:  0.7666 - mse:  0.7666 - val_mse:  0.8712
Epoch 12/15
2s - loss:  0.7493 - mse:  0.7493 - val_mse:  0.8694
Epoch 13/15
2s - loss:  0.7328 - mse:  0.7328 - val_mse:  0.8656
Epoch 14/15
2s - loss:  0.7189 - mse:  0.7189 - val_mse:  0.8658
Epoch 15/15
2s - l

# source code

In [40]:
# Load the Criteo sample and prepare deepctr_torch inputs from it.
data = pd.read_csv('./criteo_sample.txt')

# Column names used below: C1..C26 are treated as categorical, I1..I13 as numeric.
# The list was originally bound only to `sparse_features_clo`, while every later
# statement read `sparse_features`, so the cell picked up whatever list an
# earlier cell had left in the kernel. Bind the intended name here.
sparse_features = ['C' + str(i) for i in range(1, 27)]
sparse_features_clo = sparse_features  # legacy alias, kept for any remaining users
dense_features = ['I' + str(i) for i in range(1, 14)]

# Missing categorical values become their own '-1' category; missing numeric
# values become 0.
data[sparse_features] = data[sparse_features].fillna('-1', )
data[dense_features] = data[dense_features].fillna(0, )
target = ['label']

# Label-encode each categorical column in place.
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

# One embedded sparse feature per categorical column, one 1-dimensional dense
# feature per numeric column.
fixlen_feature_columns = (
    [SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
     for feat in sparse_features]
    + [DenseFeat(feat, 1) for feat in dense_features]
)
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# 80/20 split; no random_state is set, so the split differs between runs.
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

# Last expression of the cell: display the training inputs.
train_model_input