In [168]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from statistics import mode


In [181]:
df_users = pd.read_csv('training.csv')
df_users.head()

Unnamed: 0,user_id,item_id,context_feature_id
0,0,28366,2
1,0,16109,2
2,0,11500,3
3,0,20750,2
4,0,8759,2


In [182]:
df_items = pd.read_csv('item_feature.csv')
df_items.head()

Unnamed: 0,item_id,item_feature_id
0,0,139
1,1,55
2,2,11
3,3,138
4,4,138


In [184]:
df = pd.merge(df_users.drop('context_feature_id',axis=1),df_items,on='item_id')
df.head()

Unnamed: 0,user_id,item_id,item_feature_id
0,0,28366,7
1,1731,28366,7
2,10168,28366,7
3,18883,28366,7
4,19763,28366,7


In [170]:
mode(df.loc[df.user_id==0]['item_feature_id'])

7

In [189]:
df['rating'] = 1  #Setting implicit rating

In [190]:
num_users = df.user_id.max()
num_items = len(np.unique(df_items.item_id))
num_users, num_items

(200152, 39901)

In [191]:
df.columns

Index(['user_id', 'item_id', 'item_feature_id', 'rating'], dtype='object')

In [192]:
user_set = set(df.user_id.values)

In [193]:
#Random uniform negative sampling
cols = list(df.columns)
val_list = []
for itr in range(df.shape[0]//2):
    user_id_sample = np.random.randint(0,num_users)
    item_id_sample = np.random.randint(0,num_items)
    val_list.append([user_id_sample,item_id_sample,\
                     None,0])


In [194]:
val_df = pd.DataFrame(val_list,columns = df.columns)


In [199]:
neg_df = pd.merge(val_df.drop('item_feature_id',axis=1),df_items,on='item_id')

In [200]:
df_new = pd.concat([df,neg_df])
df_new.reset_index(drop=True, inplace=True)
df_new

Unnamed: 0,user_id,item_id,item_feature_id,rating
0,0,28366,7,1
1,1731,28366,7,1
2,10168,28366,7,1
3,18883,28366,7,1
4,19763,28366,7,1
...,...,...,...,...
1455362,111069,8140,138,0
1455363,121046,8140,138,0
1455364,31024,8140,138,0
1455365,83958,11704,147,0


In [201]:
#Splitting into train and validation set
train = df_new.sample(frac=0.8, random_state=25)
val = df_new.drop(train.index)

## Model 1 without using extra features

In [279]:
class MF_bias(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100, p=0.5):
        super(MF_bias, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        # init 
        self.user_emb.weight.data.uniform_(0,0.05)
        self.item_emb.weight.data.uniform_(0,0.05)
        self.user_bias.weight.data.uniform_(-0.01,0.01)
        self.item_bias.weight.data.uniform_(-0.01,0.01)
        
        self.linear1 = nn.Linear(emb_size*2, 10)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p)
        self.linear2 = nn.Linear(10,1)
    def forward(self, u, v):
        U = self.user_emb(u)
        V = self.item_emb(v)
        b_u = self.user_bias(u).squeeze()
        b_v = self.item_bias(v).squeeze()
        x = self.linear1(torch.cat((U,V),1))
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x

In [280]:
def train_epocs1(model, epochs=10, lr=0.01, wd=0.0):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr,
                                 weight_decay=wd)
    for i in range(epochs):
        model.train()
        users = torch.LongTensor(train.user_id.values)
        items = torch.LongTensor(train.item_id.values) 
        ratings = torch.FloatTensor(train.rating.values)  
    
        y_hat = model(users, items)
        loss = F.binary_cross_entropy_with_logits(y_hat, ratings.unsqueeze(-1)) # binary cross entropy with logits
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        testloss = valid_loss1(model)
        if i%10 ==0:
            print("train loss %.3f valid loss %.3f" % (loss.item(), testloss)) 


In [281]:
def valid_loss1(model):
    model.eval()
    users = torch.LongTensor(val.user_id.values) # .cuda()
    items = torch.LongTensor(val.item_id.values) #.cuda()
    ratings = torch.FloatTensor(val.rating.values) #.cuda()
    y_hat = model(users, items)
    loss = F.binary_cross_entropy_with_logits(y_hat, ratings.unsqueeze(-1))
    return loss.item()

In [223]:
num_users = df.user_id.max()+1
num_items = df.item_id.max()+1
print(num_users, num_items) 

200153 39901


In [224]:
model = MF_bias(num_users, num_items, emb_size=50)
train_epocs(model, epochs=60, lr=0.01, wd=1e-5)

train loss 0.654 valid loss 0.649
train loss 0.542 valid loss 0.522
train loss 0.413 valid loss 0.416
train loss 0.325 valid loss 0.373
train loss 0.261 valid loss 0.344
train loss 0.230 valid loss 0.345


## Model 2 with extra feature

In [254]:
class MF_extra(nn.Module):
    def __init__(self, num_users, num_items,num_features, emb_size=100, p=0.5, feature_emb = 5):
        super(MF_extra, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.feature_emb = nn.Embedding(num_features,emb_size)
        # init 
        self.user_emb.weight.data.uniform_(0,0.05)
        self.item_emb.weight.data.uniform_(0,0.05)
        self.feature_emb.weight.data.uniform_(0,0.05)
        
        self.linear1 = nn.Linear(emb_size*3, 10)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p)
        self.linear2 = nn.Linear(10,1)
    def forward(self, u, v, f):
        U = self.user_emb(u)
        V = self.item_emb(v)
        Fe = self.feature_emb(f)
        x = self.linear1(torch.cat((U,V,Fe),1))
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x

In [255]:
num_users = df.user_id.max()+1
num_items = df.item_id.max()+1
num_features = len(np.unique(df_items.item_feature_id))
print(num_users, num_items,num_features) 

200153 39901 195


In [256]:
def train_epocs2(model, epochs=10, lr=0.01, wd=0.0):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr,
                                 weight_decay=wd)
    for i in range(epochs):
        model.train()
        users = torch.LongTensor(train.user_id.values)
        items = torch.LongTensor(train.item_id.values) 
        features = torch.LongTensor(train.item_feature_id.values)
        ratings = torch.FloatTensor(train.rating.values)  
        y_hat = model(users, items, features)
        loss = F.binary_cross_entropy_with_logits(y_hat, ratings.unsqueeze(-1)) # binary cross entropy with logits
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        testloss = valid_loss2(model)
        if i%10 ==0:
            print("train loss %.3f valid loss %.3f" % (loss.item(), testloss)) 
            
            
def valid_loss2(model):
    model.eval()
    users = torch.LongTensor(val.user_id.values) # .cuda()
    items = torch.LongTensor(val.item_id.values) #.cuda()
    features = torch.LongTensor(val.item_feature_id.values)
    ratings = torch.FloatTensor(val.rating.values) #.cuda()
    y_hat = model(users, items, features)
    loss = F.binary_cross_entropy_with_logits(y_hat, ratings.unsqueeze(-1))
    return loss.item()

In [272]:
model = MF_extra(num_users, num_items,num_features, emb_size=50)
train_epocs2(model, epochs=40, lr=0.01, wd=1e-5)

train loss 0.656 valid loss 0.649
train loss 0.517 valid loss 0.491
train loss 0.347 valid loss 0.331
train loss 0.260 valid loss 0.308


# Hyper parameters search

## Learning rate

In [282]:
# learning rate 1
model = MF_bias(num_users, num_items, emb_size=3)
train_epocs1(model, epochs=140, lr=1, wd=1e-5)

train loss 0.661 valid loss 1.932
train loss 0.601 valid loss 0.427
train loss 0.502 valid loss 0.469
train loss 0.403 valid loss 0.487
train loss 0.403 valid loss 0.446
train loss 0.380 valid loss 0.435
train loss 0.368 valid loss 0.416
train loss 0.354 valid loss 0.400
train loss 0.341 valid loss 0.394
train loss 0.332 valid loss 0.395
train loss 0.327 valid loss 0.402
train loss 0.326 valid loss 0.403
train loss 0.324 valid loss 0.401
train loss 0.320 valid loss 0.396


In [284]:
# learning rate 0.1
model = MF_bias(num_users, num_items, emb_size=3)
train_epocs1(model, epochs=140, lr=0.1, wd=1e-5)

train loss 0.766 valid loss 0.690
train loss 0.353 valid loss 0.383
train loss 0.252 valid loss 0.389
train loss 0.225 valid loss 0.389
train loss 0.191 valid loss 0.452
train loss 0.186 valid loss 0.496
train loss 0.184 valid loss 0.540
train loss 0.183 valid loss 0.573
train loss 0.185 valid loss 0.589
train loss 0.186 valid loss 0.609
train loss 0.186 valid loss 0.626
train loss 0.187 valid loss 0.637
train loss 0.187 valid loss 0.649
train loss 0.188 valid loss 0.659


In [285]:
# learning rate 0.01
model = MF_bias(num_users, num_items, emb_size=3)
train_epocs1(model, epochs=140, lr=0.01, wd=1e-5)

train loss 0.647 valid loss 0.644
train loss 0.628 valid loss 0.623
train loss 0.581 valid loss 0.573
train loss 0.494 valid loss 0.484
train loss 0.432 valid loss 0.427
train loss 0.391 valid loss 0.408
train loss 0.357 valid loss 0.398
train loss 0.329 valid loss 0.393
train loss 0.304 valid loss 0.392
train loss 0.280 valid loss 0.392
train loss 0.252 valid loss 0.388
train loss 0.229 valid loss 0.388
train loss 0.217 valid loss 0.396
train loss 0.212 valid loss 0.407


In [286]:
# learning rate 0.001
model = MF_bias(num_users, num_items, emb_size=3)
train_epocs1(model, epochs=140, lr=0.001, wd=1e-5)

train loss 0.663 valid loss 0.663
train loss 0.660 valid loss 0.660
train loss 0.657 valid loss 0.657
train loss 0.654 valid loss 0.654
train loss 0.651 valid loss 0.651
train loss 0.647 valid loss 0.647
train loss 0.643 valid loss 0.643
train loss 0.639 valid loss 0.639
train loss 0.633 valid loss 0.634
train loss 0.628 valid loss 0.629
train loss 0.621 valid loss 0.623
train loss 0.614 valid loss 0.616
train loss 0.607 valid loss 0.608
train loss 0.598 valid loss 0.600


### Conclusion: LR 0.01 is enough to quickly converge to minimum loss. 1 and 0.1 would be too large and 0.001 would be too small

## Embedding Size

In [289]:
# emb_size 3
model = MF_bias(num_users, num_items, emb_size=3)
train_epocs1(model, epochs=50, lr=0.01, wd=1e-5)

train loss 0.654 valid loss 0.651
train loss 0.627 valid loss 0.623
train loss 0.571 valid loss 0.563
train loss 0.471 valid loss 0.457
train loss 0.377 valid loss 0.362


In [290]:
# emb_size 10
model = MF_bias(num_users, num_items, emb_size=5)
train_epocs1(model, epochs=50, lr=0.01, wd=1e-5)

train loss 0.710 valid loss 0.708
train loss 0.679 valid loss 0.675
train loss 0.627 valid loss 0.617
train loss 0.547 valid loss 0.527
train loss 0.473 valid loss 0.445


In [291]:
# emb_size 50
model = MF_bias(num_users, num_items, emb_size=5)
train_epocs1(model, epochs=50, lr=0.01, wd=1e-5)

train loss 0.657 valid loss 0.653
train loss 0.617 valid loss 0.610
train loss 0.521 valid loss 0.509
train loss 0.409 valid loss 0.403
train loss 0.337 valid loss 0.355


In [292]:
# emb_size 100
model = MF_bias(num_users, num_items, emb_size=5)
train_epocs1(model, epochs=50, lr=0.01, wd=1e-5)

train loss 0.727 valid loss 0.721
train loss 0.675 valid loss 0.670
train loss 0.603 valid loss 0.596
train loss 0.494 valid loss 0.478
train loss 0.404 valid loss 0.373


### Conclusion: Embedding size 50 is can converge to minimum loss.

## Dropout rate

In [295]:
nb_epochs = 20
learning_rate = 0.001
p_vals = []
for p in [0.1,0.5,0.9]:
    print('Dropout rate:',p)
    model = MF_bias(num_users, num_items, emb_size=50,p=p)
    train_epocs1(model, epochs=50, lr=0.01, wd=1e-5)

Dropout rate: 0.1
train loss 0.737 valid loss 0.727
train loss 0.588 valid loss 0.567
train loss 0.344 valid loss 0.352
train loss 0.252 valid loss 0.311
train loss 0.201 valid loss 0.304
Dropout rate: 0.5
train loss 0.739 valid loss 0.728
train loss 0.599 valid loss 0.577
train loss 0.384 valid loss 0.357
train loss 0.284 valid loss 0.310
train loss 0.238 valid loss 0.310
Dropout rate: 0.9
train loss 0.664 valid loss 0.657
train loss 0.576 valid loss 0.531
train loss 0.511 valid loss 0.398
train loss 0.477 valid loss 0.361
train loss 0.463 valid loss 0.353


### Dropout rate of 0.5 achieves lowest validation loss

## Generate submission file

In [263]:
df_test_ = pd.read_csv('test_kaggle.csv')
df_test_.drop('context_feature_id',inplace=True, axis=1)
df_test_.head()

Unnamed: 0,id,user_id,item_id
0,0,4,16835
1,1,4,22590
2,2,4,1978
3,3,4,28916
4,4,4,14427


In [266]:
df_test = pd.merge(df_test,df_items,on='item_id')
df_test.head()

Unnamed: 0,id,user_id,item_id,item_feature_id
0,0,4,16835,142
1,434,188,16835,142
2,48540,25437,16835,142
3,51161,26834,16835,142
4,73056,38465,16835,142


In [267]:
users = torch.LongTensor(df_test.user_id.values) # .cuda()
items = torch.LongTensor(df_test.item_id.values) #.cuda()
features = torch.LongTensor(df_test.item_feature_id.values)
y_pred = torch.sigmoid(model(users,items,features))

In [268]:
y_pred.detach()

tensor([[0.7819],
        [0.7404],
        [0.9642],
        ...,
        [0.0279],
        [0.0928],
        [0.6124]])

In [269]:
df_test['rating'] = y_pred.detach()

In [270]:
df_test

Unnamed: 0,id,user_id,item_id,item_feature_id,rating
0,0,4,16835,142,0.781930
1,434,188,16835,142,0.740351
2,48540,25437,16835,142,0.964247
3,51161,26834,16835,142,0.761634
4,73056,38465,16835,142,0.609190
...,...,...,...,...,...
381380,263826,139361,7203,142,0.180864
381381,314245,165372,7203,142,0.028509
381382,318929,167864,7203,142,0.027872
381383,356013,186485,7203,142,0.092848


In [296]:
df_test

Unnamed: 0,id,user_id,item_id,item_feature_id,rating
0,0,4,16835,142,0.781930
1,434,188,16835,142,0.740351
2,48540,25437,16835,142,0.964247
3,51161,26834,16835,142,0.761634
4,73056,38465,16835,142,0.609190
...,...,...,...,...,...
381380,263826,139361,7203,142,0.180864
381381,314245,165372,7203,142,0.028509
381382,318929,167864,7203,142,0.027872
381383,356013,186485,7203,142,0.092848


In [297]:
df_test[['id','rating']].to_csv('trial4.csv',index=False)