## Baseline model

Use logistic regression (LR) as the baseline model.

In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tensorboardX import SummaryWriter
from torch.utils.data import Dataset, DataLoader

# Ignore warnings
warnings.filterwarnings("ignore")

plt.ion()   # interactive mode

### Load the training and test samples and drop unused columns

In [2]:
# Directory that holds the sample data. The default is the original author's local
# checkout; set the SPARROW_SAMPLEDATA_DIR environment variable to point at your own
# copy instead of editing this cell.
folder_path = os.environ.get(
    "SPARROW_SAMPLEDATA_DIR",
    "/home/leon/Documents/SparrowRecSys/src/main/resources/webroot/sampledata",
)

In [3]:
training_path = folder_path + "/Pytorch_data/trainingSamples.csv"

In [4]:
test_path = folder_path + "/Pytorch_data/testSamples.csv"

In [5]:
training_df = pd.read_csv(training_path, index_col=False)

In [6]:
test_df = pd.read_csv(test_path, index_col=False)

In [7]:
training_df.head()

Unnamed: 0,movieId,userId,rating,timestamp,label,releaseYear,movieGenre1,movieGenre2,movieGenre3,movieRatingCount,...,userGenre3,userGenre4,userGenre5,scaledReleaseYear,scaledmovieRatingCount,scaledmovieAvgRating,scaledmovieRatingStddev,scaleduserRatingCount,scaleduserAvgRating,scaleduserRatingStddev
0,593,10096,4.0,954365552,1,1991.0,13,4,12,13692.0,...,17,12,0,0.915493,0.936777,0.90625,0.449735,0.030612,0.688889,0.279874
1,832,10351,3.0,851791379,0,1996.0,13,12,0,3052.0,...,5,12,13,0.985916,0.208758,0.649306,0.486773,0.112245,0.726667,0.22956
2,85,10351,3.0,851791395,0,1995.0,11,5,0,592.0,...,5,12,13,0.971831,0.040438,0.690972,0.502645,0.122449,0.713333,0.22956
3,588,10351,5.0,851792205,1,1992.0,3,15,18,8980.0,...,12,5,13,0.929577,0.614369,0.729167,0.486773,0.22449,0.675555,0.207547
4,370,1090,2.0,1117852491,0,1994.0,2,7,0,3087.0,...,0,0,0,0.957747,0.211153,0.482639,0.555555,0.030612,0.2,0.204403


In [8]:
# Replace NaN with 0 in both frames, in place. This is what pads the missing
# 'userRatedMovie1' entries (movieId starts from 1, so 0 is free to act as padding).
# NOTE: fillna(0) is applied to the whole frame, so a NaN in any other column
# is turned into 0 as well, not only the ones in 'userRatedMovie1'.
training_df.fillna(0, inplace=True)
test_df.fillna(0, inplace=True)

In [9]:
# Columns fed to the model. The order matters: the sparse/dense column lists used
# by the dataset refer to these columns by position.
columns2Keep = [
    # user side
    'userId',
    'userGenre1', 'userGenre2', 'userGenre3', 'userGenre4', 'userGenre5',
    'scaleduserRatingCount', 'scaleduserAvgRating', 'scaleduserRatingStddev',
    'userRatedMovie1',
    # movie side
    'movieId',
    'movieGenre1', 'movieGenre2', 'movieGenre3',
    'scaledReleaseYear',
    'scaledmovieRatingCount', 'scaledmovieAvgRating', 'scaledmovieRatingStddev',
]

In [10]:
# Take an explicit copy: a column of this frame is reassigned further down, and
# assigning into a bare selection of training_df triggers pandas'
# SettingWithCopyWarning (silenced in this notebook by the global warnings filter).
training_feature = training_df[columns2Keep].copy()

In [11]:
training_label = training_df['label']

In [12]:
# Take an explicit copy: a column of this frame is reassigned further down, and
# assigning into a bare selection of test_df triggers pandas'
# SettingWithCopyWarning (silenced in this notebook by the global warnings filter).
test_feature = test_df[columns2Keep].copy()

In [13]:
test_label = test_df['label']

In [14]:
training_feature['userRatedMovie1'] = training_feature['userRatedMovie1'].astype('int64')

In [15]:
test_feature['userRatedMovie1'] = test_feature['userRatedMovie1'].astype('int64')

In [16]:
for i, col_name in enumerate(training_feature.columns):
    print(str(i) + " -> " + col_name)

0 -> userId
1 -> userGenre1
2 -> userGenre2
3 -> userGenre3
4 -> userGenre4
5 -> userGenre5
6 -> scaleduserRatingCount
7 -> scaleduserAvgRating
8 -> scaleduserRatingStddev
9 -> userRatedMovie1
10 -> movieId
11 -> movieGenre1
12 -> movieGenre2
13 -> movieGenre3
14 -> scaledReleaseYear
15 -> scaledmovieRatingCount
16 -> scaledmovieAvgRating
17 -> scaledmovieRatingStddev


In [17]:
training_feature

Unnamed: 0,userId,userGenre1,userGenre2,userGenre3,userGenre4,userGenre5,scaleduserRatingCount,scaleduserAvgRating,scaleduserRatingStddev,userRatedMovie1,movieId,movieGenre1,movieGenre2,movieGenre3,scaledReleaseYear,scaledmovieRatingCount,scaledmovieAvgRating,scaledmovieRatingStddev
0,10096,13,11,17,12,0,0.030612,0.688889,0.279874,50,593,13,4,12,0.915493,0.936777,0.906250,0.449735
1,10351,11,7,5,12,13,0.112245,0.726667,0.229560,26,832,13,12,0,0.985916,0.208758,0.649306,0.486773
2,10351,11,7,5,12,13,0.122449,0.713333,0.229560,26,85,11,5,0,0.971831,0.040438,0.690972,0.502645
3,10351,11,7,12,5,13,0.224490,0.675555,0.207547,582,588,3,15,18,0.929577,0.614369,0.729167,0.486773
4,1090,0,0,0,0,0,0.030612,0.200000,0.204403,0,370,2,7,0,0.957747,0.211153,0.482639,0.555555
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84480,9515,2,12,13,3,7,0.367347,0.684444,0.270440,153,485,2,3,7,0.943662,0.196989,0.420139,0.529101
84481,9515,2,12,13,3,7,0.418367,0.668889,0.261006,153,720,3,15,7,0.985916,0.123298,0.861111,0.550265
84482,9515,2,12,13,3,7,0.428571,0.671111,0.257862,720,296,7,13,11,0.957747,1.000000,0.902778,0.518518
84483,9515,2,12,13,3,7,0.448980,0.682222,0.261006,527,318,13,11,0,0.957747,0.945946,1.000000,0.380952


### DataSet and DataLoader

In [18]:
# Positional column indices of the sparse (categorical id) features
sparse_col = [
    0,              # userId
    1, 2, 3, 4, 5,  # userGenre1 .. userGenre5
    9,              # userRatedMovie1
    10,             # movieId
    11, 12, 13,     # movieGenre1 .. movieGenre3
]

In [19]:
# Number of classes of each sparse feature, in the same order as sparse_col.
# Each feature is one-hot encoded with this width, so every id found in a column
# has to be strictly smaller than the size listed for it.
sparse_col_size = [
    30001,               # userId
    20, 20, 20, 20, 20,  # userGenre1 .. userGenre5
    1001,                # userRatedMovie1
    1001,                # movieId
    20, 20, 20,          # movieGenre1 .. movieGenre3
]

In [20]:
# Positional column indices of the dense (real-valued) features
dense_col = [
    6, 7, 8,         # scaleduserRatingCount, scaleduserAvgRating, scaleduserRatingStddev
    14, 15, 16, 17,  # scaledReleaseYear, scaledmovieRatingCount, scaledmovieAvgRating, scaledmovieRatingStddev
]

In [21]:
class ModelDataSet(Dataset):
    """Map-style dataset serving (sparse, dense) feature rows together with their label.

    The feature frame is split by column position into a dense part (cast to
    float32) and a sparse part (cast to int64); the labels are cast to float32.
    All three are converted to tensors once, at construction time, so fetching an
    item is a plain tensor index instead of a pandas row lookup per sample.

    Args:
        input_DF: DataFrame holding every feature column.
        label_DF: labels aligned row-by-row (by position) with ``input_DF``.
        sparse_col: positional indices of the sparse (categorical id) columns.
        dense_col: positional indices of the dense (real-valued) columns.
    """

    def __init__(self, input_DF, label_DF, sparse_col, dense_col):
        self.df = input_DF

        self.dense_df = input_DF.iloc[:, dense_col].astype(np.float32)
        self.sparse_df = input_DF.iloc[:, sparse_col].astype('int64')

        self.label = label_DF.astype(np.float32)

        # Tensor copies used by __getitem__. Rows are addressed by position, which
        # matches the positional .iloc semantics regardless of the frame's index.
        self._dense_tensor = torch.tensor(self.dense_df.to_numpy())
        self._sparse_tensor = torch.tensor(self.sparse_df.to_numpy())
        self._label_tensor = torch.tensor(self.label.to_numpy())

    def __len__(self):
        return len(self.label)

    def __getitem__(self, idx):
        """Return ``{'Feature': (sparse, dense), 'Label': label}`` for ``idx``.

        ``idx`` may be an int, a tensor, or a list of positions. An int yields 1-D
        feature tensors and a 0-d label; a list yields tensors with a leading
        batch dimension. For an int index the returned tensors are views into the
        dataset's storage, so clone them before modifying them in place.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        sparse_feature = self._sparse_tensor[idx]
        dense_feature = self._dense_tensor[idx]
        label = self._label_tensor[idx]
        return {'Feature': (sparse_feature, dense_feature), 'Label': label}

In [22]:
training_dataset = ModelDataSet(training_feature, training_label, sparse_col, dense_col)

In [25]:
test_dataset = ModelDataSet(test_feature, test_label, sparse_col, dense_col)

In [26]:
BATCH_SIZE = 100

In [27]:
training_dataloader = DataLoader(training_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [28]:
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

### Model and loss function

First, define the LR module

In [29]:

class LR(nn.Module):
    """Logistic-regression baseline over one-hot encoded sparse features plus dense features.

    Every sparse feature is one-hot encoded to its class count, the encodings are
    concatenated with the dense features, and the result goes through a single
    linear layer followed by a sigmoid.

    Args:
        sparse_col_size (list[int]): number of classes of each sparse feature, in
            the column order of the sparse input.
        dense_col_size (int): number of dense features.
    """

    def __init__(self, sparse_col_size, dense_col_size):
        super().__init__()
        self.sparse_col_size = sparse_col_size
        self.dense_col_size = dense_col_size

        # 1st order linear layer: one weight per one-hot slot plus one per dense
        # feature. Built-in sum()/int() keep in_features a plain Python int.
        first_order_size = int(sum(sparse_col_size)) + int(dense_col_size)
        self.linear_firstOrder = nn.Linear(first_order_size, 1)

    def forward(self, sparse_feature, dense_feature):
        """Return the predicted probability of the positive class, shape ``(batch,)``.

        Args:
            sparse_feature: integer tensor of class ids, ``(batch, n_sparse)``, or
                ``(n_sparse,)`` for a single sample.
            dense_feature: tensor of dense values, ``(batch, n_dense)``, or
                ``(n_dense,)`` for a single sample.
        """
        if sparse_feature.dim() == 1:  # a single sample: add the batch dimension
            sparse_feature = sparse_feature.view(1, -1)
            dense_feature = dense_feature.view(1, -1)

        # F.one_hot returns int64; cast every piece to the layer's dtype so the
        # concatenation does not depend on implicit type promotion.
        feature_dtype = self.linear_firstOrder.weight.dtype
        pieces = [
            # (batch,) -> (batch, class_size). No squeeze here: it would drop the
            # class dimension of a feature that has exactly one class.
            F.one_hot(sparse_feature[:, i], num_classes=class_size).to(feature_dtype)
            for i, class_size in enumerate(self.sparse_col_size)
        ]
        pieces.append(dense_feature.to(feature_dtype))

        # batch x (sum(sparse_col_size) + dense_col_size)
        linear_input = torch.cat(pieces, dim=1)
        logit = self.linear_firstOrder(linear_input)
        # torch.sigmoid replaces the deprecated F.sigmoid
        return torch.sigmoid(logit).view(-1)

### Training and evaluation

In [30]:
from sklearn.metrics import roc_auc_score

In [31]:
model = LR(sparse_col_size, 7)

In [32]:
loss_fn = nn.BCELoss()

In [33]:
EPOCHS = 5

In [34]:
LR = 0.01

In [35]:
optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=0.001)

In [36]:
# optimizer = optim.SGD(model.parameters(), lr=LR, momentum=0.9)

In [37]:
# Run on the GPU when one is available, otherwise on the CPU
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [38]:
# summary = SummaryWriter()

In [39]:
class Train_Eval():
    """Bundle a model with its loss, optimizer and data loaders for training and evaluation.

    Both loaders must yield dict batches in which ``batch['Feature'][0]`` is the
    sparse feature tensor, ``batch['Feature'][1]`` the dense feature tensor and
    ``batch['Label']`` the label tensor.

    Args:
        model: module called as ``model(sparse_feature, dense_feature)``.
        loss_fn: callable taking ``(prediction, label)`` and returning a scalar loss.
        optim: optimizer built over ``model``'s parameters.
        device: device the model and every batch are moved to.
        train_dataloader: iterable of training batches.
        test_dataloader: iterable of evaluation batches.
    """

    def __init__(self, model, loss_fn, optim, device, train_dataloader, test_dataloader):
        self.device = device
        self.model = model.to(self.device)
        self.optim = optim
        self.loss_fn = loss_fn
        self.train_dataloader = train_dataloader
        self.test_dataloader = test_dataloader
        # Predictions >= this value count as the positive class.
        # (The attribute name is misspelled; it is kept as-is for compatibility.)
        self.threashold = 0.5

    def _run_batch(self, batch):
        # Move one batch to the target device and return (prediction, label, loss).
        sparse_feature = batch['Feature'][0].to(self.device)
        dense_feature = batch['Feature'][1].to(self.device)
        label = batch['Label'].to(self.device)
        prediction = self.model(sparse_feature, dense_feature)
        return prediction, label, self.loss_fn(prediction, label)

    def train(self, epochs):
        """Train for ``epochs`` epochs, printing train and test metrics after each one.

        The training metrics of an epoch are computed from the predictions made
        while that epoch was running, i.e. while the weights were still changing.
        """
        for epoch in range(epochs):
            # self.eval() at the end of the previous epoch left the model in eval
            # mode, so training mode has to be restored at the start of every epoch.
            self.model.train()
            print("==========================================================")
            print("start training epoch: {}".format(epoch+1))
            loss_list = []
            pred_list = []
            label_list = []

            for iteration, train_data in enumerate(self.train_dataloader, start=1):
                # Clear gradients first so nothing left over from an earlier
                # backward pass leaks into this step.
                self.optim.zero_grad()
                prediction, label, cur_loss = self._run_batch(train_data)
                cur_loss.backward()
                self.optim.step()

                loss_list.append(cur_loss.item())
                pred_list.extend(prediction.detach().tolist())
                label_list.extend(label.tolist())

                # logging every 20 iteration
                if iteration % 20 == 0:
                    print("---------------------------------------------------------")
                    print("epoch {}/{}, cur_iteration is {}, logloss is {:.2f}"
                          .format(epoch+1, epochs, iteration, cur_loss.item()))

            # validation every epoch
            training_loss, training_accuracy, training_roc_score = self._getMetric(loss_list, pred_list, label_list)
            print("==========================================================")
            print("Result of epoch {}".format(epoch+1))
            print(f"training loss: {training_loss:.2f}, accuracy: {training_accuracy:.3f}, roc_score: {training_roc_score:.2f}")

            test_loss, test_accuracy, test_roc_score = self.eval()
            print(f"test loss: {test_loss:.2f}, accuracy: {test_accuracy:.3f}, roc_score: {test_roc_score:.2f}")
            # summary.add_embedding(np.reshape(np.array(loss_list), (1, -1)), tag="loss_list")
            # summary.add_embedding(np.reshape(np.array(pred_list), (1, -1)), tag="pred_list")
            # summary.add_embedding(np.reshape(np.array(label_list), (1, -1)), tag="label_list")
            # summary.add_scalar("training_loss", training_loss)
            # summary.add_scalar("training_accuracy", training_accuracy)
            # summary.add_scalar("training_roc_score", training_roc_score)

    def eval(self):
        """Evaluate on the test loader and return ``(logloss, accuracy, roc_score)``.

        Switches the model to eval mode and leaves it there.
        """
        self.model.eval()
        loss_list = []
        pred_list = []
        label_list = []
        with torch.no_grad():
            for test_data in self.test_dataloader:
                prediction, label, cur_loss = self._run_batch(test_data)

                loss_list.append(cur_loss.item())
                pred_list.extend(prediction.tolist())
                label_list.extend(label.tolist())
        return self._getMetric(loss_list, pred_list, label_list)

    def _getMetric(self, loss_list, pred_list, label_list):
        """Return ``(average logloss, accuracy, roc_score)``.

        Args:
            loss_list: one loss value per batch; the result is their plain mean,
                so a smaller final batch weighs as much as a full one.
            pred_list: one predicted probability per sample.
            label_list: one label per sample, aligned with ``pred_list``.
        """
        # average logloss
        avg_loss = np.mean(loss_list)
        # roc_score
        roc_score = roc_auc_score(label_list, pred_list)
        # average accuracy: a prediction is positive when it reaches the threshold
        correct_count = sum(
            1
            for pred, label in zip(pred_list, label_list)
            if (1 if pred >= self.threashold else 0) == label
        )
        avg_accuracy = correct_count / len(label_list)

        return avg_loss, avg_accuracy, roc_score

In [40]:
train_eval = Train_Eval(model, loss_fn, optimizer, dev, training_dataloader, test_dataloader)

In [None]:
train_eval.train(EPOCHS)

start training epoch: 1
---------------------------------------------------------
epoch 1/5, cur_iteration is 20, logloss is 0.68
---------------------------------------------------------
epoch 1/5, cur_iteration is 40, logloss is 0.63
---------------------------------------------------------
epoch 1/5, cur_iteration is 60, logloss is 0.67
---------------------------------------------------------
epoch 1/5, cur_iteration is 80, logloss is 0.62
---------------------------------------------------------
epoch 1/5, cur_iteration is 100, logloss is 0.64
---------------------------------------------------------
epoch 1/5, cur_iteration is 120, logloss is 0.61
---------------------------------------------------------
epoch 1/5, cur_iteration is 140, logloss is 0.57
---------------------------------------------------------
epoch 1/5, cur_iteration is 160, logloss is 0.65
---------------------------------------------------------
epoch 1/5, cur_iteration is 180, logloss is 0.59
-----------------

---------------------------------------------------------
epoch 2/5, cur_iteration is 660, logloss is 0.60
---------------------------------------------------------
epoch 2/5, cur_iteration is 680, logloss is 0.54
---------------------------------------------------------
epoch 2/5, cur_iteration is 700, logloss is 0.58
---------------------------------------------------------
epoch 2/5, cur_iteration is 720, logloss is 0.61
---------------------------------------------------------
epoch 2/5, cur_iteration is 740, logloss is 0.55
---------------------------------------------------------
epoch 2/5, cur_iteration is 760, logloss is 0.65
---------------------------------------------------------
epoch 2/5, cur_iteration is 780, logloss is 0.60
---------------------------------------------------------
epoch 2/5, cur_iteration is 800, logloss is 0.53
---------------------------------------------------------
epoch 2/5, cur_iteration is 820, logloss is 0.62
-------------------------------------

---------------------------------------------------------
epoch 4/5, cur_iteration is 420, logloss is 0.56
---------------------------------------------------------
epoch 4/5, cur_iteration is 440, logloss is 0.60
---------------------------------------------------------
epoch 4/5, cur_iteration is 460, logloss is 0.56
---------------------------------------------------------
epoch 4/5, cur_iteration is 480, logloss is 0.63
---------------------------------------------------------
epoch 4/5, cur_iteration is 500, logloss is 0.57
---------------------------------------------------------
epoch 4/5, cur_iteration is 520, logloss is 0.60
---------------------------------------------------------
epoch 4/5, cur_iteration is 540, logloss is 0.56
---------------------------------------------------------
epoch 4/5, cur_iteration is 560, logloss is 0.57
---------------------------------------------------------
epoch 4/5, cur_iteration is 580, logloss is 0.58
-------------------------------------

In [307]:
# summary.close()

In [49]:
model.parameters

<bound method Module.parameters of LR(
  (linear_firstOrder): Linear(in_features=32170, out_features=1, bias=True)
)>