In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Load the interaction log; the preview below shows the columns
# user_id, item_id and context_feature_id.
df = pd.read_csv('training.csv')

In [3]:
# Preview the raw interactions (pandas truncates the display to head/tail rows).
df

Unnamed: 0,user_id,item_id,context_feature_id
0,0,28366,2
1,0,16109,2
2,0,11500,3
3,0,20750,2
4,0,8759,2
...,...,...,...
970240,200152,30710,2
970241,200152,30710,2
970242,200152,12006,2
970243,200152,25030,2


In [4]:
# Load the item catalog: one item_feature_id per item_id (see preview below).
df_features = pd.read_csv('item_feature.csv')
df_features

Unnamed: 0,item_id,item_feature_id
0,0,139
1,1,55
2,2,11
3,3,138
4,4,138
...,...,...
39896,39896,138
39897,39897,95
39898,39898,142
39899,39899,131


In [5]:
# Implicit feedback: every logged (user, item) interaction is labelled as a positive (1).
# NOTE: this adds the column to `df` in place.
df['rating'] = 1

In [6]:
# Confirm the new `rating` column is present.
df.head()

Unnamed: 0,user_id,item_id,context_feature_id,rating
0,0,28366,2,1
1,0,16109,2,1
2,0,11500,3,1
3,0,20750,2,1
4,0,8759,2,1


In [7]:
# Distinct user ids that occur in the training interactions.
# np.unique already returns its result sorted ascending, so no extra sort is needed.
# The preview shows the ids are sparse (e.g. 4, 6 and 9 are missing).
train_user_ids = np.unique(df.user_id.values)
train_user_ids[:15]

array([ 0,  1,  2,  3,  5,  7,  8, 10, 12, 14, 15, 17, 18, 19, 20])

In [8]:
# Number of distinct user ids in the training data.
# NOTE: this is a count, not max_id + 1 -- the ids are sparse, so it must not be
# used as an upper bound for valid ids or as an embedding-table size.
num_users = len(train_user_ids)
num_users

169698

In [9]:
# Distinct item ids taken from the item-feature table (not from the interactions).
# np.unique returns the ids sorted ascending, so an additional np.sort is redundant.
train_item_ids = np.unique(df_features.item_id)
num_items = len(train_item_ids)
print(num_items)
train_item_ids[:15]

39901


array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [10]:
# Random uniform negative sampling: one synthetic (user, item) pair labelled 0
# for every observed interaction.
#
# Ids are drawn from the arrays of ids that actually exist instead of from
# range(num_users) / range(num_items). `num_users` is the COUNT of distinct
# users, while the user ids themselves are sparse and reach far beyond that
# count, so sampling from a range would give negatives to ids that never occur
# in the data and none to the users with the largest ids.
#
# Sampled pairs may coincide with observed positives; those collisions are
# expected to be removed by a later de-duplication step.
cols = list(df.columns)
rng = np.random.default_rng(25)  # fixed seed so the negatives are reproducible
n_negatives = df.shape[0]
neg_users = rng.choice(train_user_ids, size=n_negatives)
neg_items = rng.choice(train_item_ids, size=n_negatives)
# Row layout matches df's columns: user_id, item_id, context_feature_id (unknown -> None), rating.
val_list = [[u, i, None, 0] for u, i in zip(neg_users.tolist(), neg_items.tolist())]

In [11]:
# Frame of sampled negatives with the same columns as `df`.
# NOTE: despite its name, `val_df` is NOT a validation split -- it holds the rating-0 rows.
val_df = pd.DataFrame(val_list,columns = df.columns)

In [12]:
# Sanity check: one negative row per positive row, four columns.
val_df.shape

(970245, 4)

In [13]:
# Stack positives first, then negatives. The index is not reset here, so index
# labels repeat across the two halves until a later step resets them.
df_new = pd.concat([df,val_df])

In [14]:
# Preview: positives at the top, sampled negatives (missing context_feature_id) at the bottom.
df_new

Unnamed: 0,user_id,item_id,context_feature_id,rating
0,0,28366,2,1
1,0,16109,2,1
2,0,11500,3,1
3,0,20750,2,1
4,0,8759,2,1
...,...,...,...,...
970240,109838,13210,,0
970241,54044,16869,,0
970242,68198,18628,,0
970243,10267,36497,,0


In [15]:
# Keep a single row per (user_id, item_id) pair.
# keep='first' matters: positives were concatenated before the negatives, so when
# a sampled negative collides with an observed interaction the positive row wins.
# This also collapses repeated positive interactions of the same pair into one row.
# reset_index(drop=True) replaces the repeated index labels left by the concat
# with a fresh unique index, which the index-based train/validation split relies on.
df_new = df_new.drop_duplicates(
  subset = ['user_id','item_id'],
  keep = 'first').reset_index(drop = True)

df_new

Unnamed: 0,user_id,item_id,context_feature_id,rating
0,0,28366,2,1
1,0,16109,2,1
2,0,11500,3,1
3,0,20750,2,1
4,0,8759,2,1
...,...,...,...,...
1833031,109838,13210,,0
1833032,54044,16869,,0
1833033,68198,18628,,0
1833034,10267,36497,,0


In [16]:
# Seeded 80/20 split: `train` is a random 80% sample of the rows and `val` is
# every row whose index label was not drawn into `train`.
train = df_new.sample(frac=0.8, random_state=25)
held_out_mask = ~df_new.index.isin(train.index)
val = df_new.loc[held_out_mask]

## MF with bias

In [17]:
class MF_bias(nn.Module):
    """Matrix-factorisation scorer with per-user and per-item bias terms.

    The score of a (user, item) pair is the dot product of the two latent
    vectors plus the user's bias plus the item's bias. The result is a raw
    score (logit); no sigmoid is applied inside the module.

    Args:
        num_users: number of rows in the user embedding and user bias tables.
        num_items: number of rows in the item embedding and item bias tables.
        emb_size: length of each latent vector (default 100).
    """

    def __init__(self, num_users, num_items, emb_size=100):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)

        # Overwrite the default initialisation: small non-negative latent
        # factors and near-zero biases. The order of this tuple fixes which
        # random draws each table receives.
        init_ranges = (
            (self.user_emb, 0, 0.05),
            (self.item_emb, 0, 0.05),
            (self.user_bias, -0.01, 0.01),
            (self.item_bias, -0.01, 0.01),
        )
        for table, low, high in init_ranges:
            table.weight.data.uniform_(low, high)

    def forward(self, u, v):
        """Return one score per (u[k], v[k]) pair of user and item indices."""
        user_vecs = self.user_emb(u)
        item_vecs = self.item_emb(v)
        interaction = (user_vecs * item_vecs).sum(1)
        # The bias tables have a trailing dimension of size 1; squeeze removes it.
        user_offset = self.user_bias(u).squeeze()
        item_offset = self.item_bias(v).squeeze()
        return interaction + user_offset + item_offset

## Training MF model with bias

In [18]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0):
    """Fit ``model`` with full-batch Adam on the notebook-level ``train`` frame.

    Every epoch is a single gradient step over all training rows, minimising
    binary cross-entropy between the model's logits and the 0/1 ``rating``
    column. Every 10th epoch (starting with epoch 0) the training loss of that
    step and the current validation loss are printed.

    Args:
        model: module called as ``model(users, items)`` and returning logits.
        epochs: number of full-batch steps to run.
        lr: Adam learning rate.
        wd: Adam weight decay (L2 penalty).

    Relies on the globals ``train`` (DataFrame with user_id, item_id, rating)
    and ``valid_loss``.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr,
                                 weight_decay=wd)

    # The training tensors do not change between epochs, so build them once
    # instead of converting the whole frame on every iteration.
    users = torch.LongTensor(train.user_id.values)
    items = torch.LongTensor(train.item_id.values)
    ratings = torch.FloatTensor(train.rating.values)

    for i in range(epochs):
        # valid_loss() switches the model to eval mode, so re-enable train mode.
        model.train()
        y_hat = model(users, items)
        loss = F.binary_cross_entropy_with_logits(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # The validation pass is only needed for the epochs that are reported.
        if i % 10 == 0:
            testloss = valid_loss(model)
            print("train loss %.3f valid loss %.3f" % (loss.item(), testloss))

    # Hand the model back in eval mode, which is the state the per-epoch
    # validation pass used to leave it in.
    model.eval()

In [19]:
def valid_loss(model):
    """Return the binary cross-entropy of ``model`` on the notebook-level ``val`` frame.

    Puts the model in eval mode (it is left in eval mode on return) and scores
    all validation rows in a single batch.

    Args:
        model: module called as ``model(users, items)`` and returning logits.

    Returns:
        The validation loss as a Python float.
    """
    model.eval()
    users = torch.LongTensor(val.user_id.values)
    items = torch.LongTensor(val.item_id.values)
    ratings = torch.FloatTensor(val.rating.values)
    # No gradients are needed for evaluation; skipping autograd bookkeeping
    # avoids building a graph over the whole validation set.
    with torch.no_grad():
        y_hat = model(users, items)
        loss = F.binary_cross_entropy_with_logits(y_hat, ratings)
    return loss.item()

In [20]:
# Embedding-table sizes. Raw ids are used directly as row indices, so each
# table needs max_id + 1 rows (not the number of distinct ids).
# NOTE: this rebinds num_users / num_items, which earlier cells used for the
# count of distinct ids.
num_users = int(df_new.user_id.max()) + 1
# Size the item table from the item catalog as well, so it does not depend on
# which item ids the random negative sampling happened to draw.
num_items = int(max(df_new.item_id.max(), df_features.item_id.max())) + 1

In [21]:
# Show the embedding-table sizes (rows in the user table, rows in the item table).
print(num_users, num_items)

200153 39901


In [22]:
# Instantiation check only: this instance is never trained, because the next
# cell rebinds `model` before calling train_epocs.
model = MF_bias(num_users, num_items, emb_size=3)

# Hyperparameter search

## Learning Rate

In [23]:
# Learning-rate sweep, lr = 1.
# A fresh model is built for each run; emb_size, epochs and wd are the same
# in every learning-rate cell, so only lr differs.
model = MF_bias(num_users, num_items, emb_size=3)
train_epocs(model, epochs=100, lr=1, wd=1e-5)

train loss 0.693 valid loss 1.486
train loss 0.515 valid loss 0.481
train loss 0.424 valid loss 0.484
train loss 0.418 valid loss 0.443
train loss 0.427 valid loss 0.447
train loss 0.425 valid loss 0.444
train loss 0.427 valid loss 0.442
train loss 0.426 valid loss 0.442
train loss 0.427 valid loss 0.442
train loss 0.428 valid loss 0.442


In [24]:
# Learning-rate sweep, lr = 0.1 (fresh model; all other settings unchanged).
model = MF_bias(num_users, num_items, emb_size=3)
train_epocs(model, epochs=100, lr=0.1, wd=1e-5)

train loss 0.693 valid loss 0.658
train loss 0.469 valid loss 0.478
train loss 0.450 valid loss 0.465
train loss 0.433 valid loss 0.445
train loss 0.427 valid loss 0.443
train loss 0.429 valid loss 0.443
train loss 0.427 valid loss 0.442
train loss 0.428 valid loss 0.443
train loss 0.428 valid loss 0.442
train loss 0.428 valid loss 0.442


In [25]:
# Learning-rate sweep, lr = 0.01 (fresh model; all other settings unchanged).
model = MF_bias(num_users, num_items, emb_size=3)
train_epocs(model, epochs=100, lr=0.01, wd=1e-5)

train loss 0.693 valid loss 0.689
train loss 0.642 valid loss 0.648
train loss 0.604 valid loss 0.615
train loss 0.576 valid loss 0.589
train loss 0.554 valid loss 0.568
train loss 0.536 valid loss 0.550
train loss 0.521 valid loss 0.537
train loss 0.510 valid loss 0.526
train loss 0.500 valid loss 0.517
train loss 0.493 valid loss 0.510


In [26]:
# Learning-rate sweep, lr = 0.001 (fresh model; all other settings unchanged).
model = MF_bias(num_users, num_items, emb_size=3)
train_epocs(model, epochs=100, lr=0.001, wd=1e-5)

train loss 0.693 valid loss 0.693
train loss 0.687 valid loss 0.688
train loss 0.682 valid loss 0.684
train loss 0.677 valid loss 0.680
train loss 0.672 valid loss 0.676
train loss 0.667 valid loss 0.672
train loss 0.662 valid loss 0.668
train loss 0.657 valid loss 0.664
train loss 0.653 valid loss 0.660
train loss 0.649 valid loss 0.657


In [27]:
# Learning-rate sweep, lr = 0.0001 (fresh model; all other settings unchanged).
model = MF_bias(num_users, num_items, emb_size=3)
train_epocs(model, epochs=100, lr=0.0001, wd=1e-5)

train loss 0.693 valid loss 0.693
train loss 0.693 valid loss 0.693
train loss 0.692 valid loss 0.692
train loss 0.691 valid loss 0.692
train loss 0.691 valid loss 0.691
train loss 0.690 valid loss 0.691
train loss 0.690 valid loss 0.690
train loss 0.689 valid loss 0.690
train loss 0.689 valid loss 0.689
train loss 0.688 valid loss 0.689


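### Conclusion: LR 0.1 reaches the same final validation loss (0.442) as LR 1 within 100 epochs, without LR 1's initial validation-loss spike (1.486); learning rates of 0.01 and below have not converged after 100 epochs.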

## Embedding Size

In [28]:
# Embedding-size sweep, emb_size = 3, using the learning rate chosen above (0.1).
# This is the same configuration as the lr = 0.1 run and serves as the baseline.
model = MF_bias(num_users, num_items, emb_size=3)
train_epocs(model, epochs=100, lr=0.1, wd=1e-5)

train loss 0.693 valid loss 0.658
train loss 0.469 valid loss 0.478
train loss 0.450 valid loss 0.465
train loss 0.433 valid loss 0.445
train loss 0.427 valid loss 0.443
train loss 0.429 valid loss 0.443
train loss 0.427 valid loss 0.442
train loss 0.428 valid loss 0.443
train loss 0.428 valid loss 0.442
train loss 0.428 valid loss 0.442


In [29]:
# Embedding-size sweep, emb_size = 4 (fresh model; lr, epochs and wd unchanged).
model = MF_bias(num_users, num_items, emb_size=4)
train_epocs(model, epochs=100, lr=0.1, wd=1e-5)

train loss 0.693 valid loss 0.660
train loss 0.469 valid loss 0.478
train loss 0.450 valid loss 0.465
train loss 0.433 valid loss 0.445
train loss 0.427 valid loss 0.443
train loss 0.429 valid loss 0.443
train loss 0.427 valid loss 0.442
train loss 0.428 valid loss 0.443
train loss 0.428 valid loss 0.442
train loss 0.428 valid loss 0.442


In [30]:
# Embedding-size sweep, emb_size = 5 (fresh model; lr, epochs and wd unchanged).
model = MF_bias(num_users, num_items, emb_size=5)
train_epocs(model, epochs=100, lr=0.1, wd=1e-5)

train loss 0.693 valid loss 0.662
train loss 0.469 valid loss 0.478
train loss 0.450 valid loss 0.465
train loss 0.433 valid loss 0.445
train loss 0.427 valid loss 0.443
train loss 0.429 valid loss 0.443
train loss 0.427 valid loss 0.442
train loss 0.428 valid loss 0.443
train loss 0.428 valid loss 0.442
train loss 0.428 valid loss 0.442


In [31]:
# Embedding-size sweep, emb_size = 6 (fresh model; lr, epochs and wd unchanged).
# NOTE: no later visible cell rebinds `model`, so this emb_size=6 model is the
# one that remains bound when the notebook reaches the submission section.
model = MF_bias(num_users, num_items, emb_size=6)
train_epocs(model, epochs=100, lr=0.1, wd=1e-5)

train loss 0.693 valid loss 0.664
train loss 0.469 valid loss 0.478
train loss 0.450 valid loss 0.465
train loss 0.433 valid loss 0.445
train loss 0.427 valid loss 0.443
train loss 0.429 valid loss 0.443
train loss 0.427 valid loss 0.442
train loss 0.428 valid loss 0.443
train loss 0.428 valid loss 0.442
train loss 0.428 valid loss 0.442


### Conclusion: Embedding sizes 3 to 6 all converge to the same loss (train 0.428, validation 0.442), so the smallest size, 3, is sufficient.

# Generate submission file

In [33]:
# Load the Kaggle test pairs; later cells read its id, user_id and item_id columns.
df_test = pd.read_csv('test_kaggle.csv')

In [34]:
# Score every (user_id, item_id) pair of the test set with the currently bound
# `model` and map the logits to probabilities with a sigmoid.
# NOTE(review): `model` is whatever the last training cell left behind (the
# emb_size=6 run in the visible cells), not necessarily the emb_size=3 model
# named in the conclusion above -- confirm this is intended.
# NOTE(review): ids are used as embedding row indices, so any test id outside
# the table sizes would raise an index error -- verify against the test file.
users = torch.LongTensor(df_test.user_id.values)
items = torch.LongTensor(df_test.item_id.values)
model.eval()
# Inference only: no autograd graph is needed for the predictions.
with torch.no_grad():
    y_pred = torch.sigmoid(model(users, items))

In [35]:
# Peek at the predicted probabilities (detached from the autograd graph for display).
y_pred.detach()

tensor([0.5968, 0.4197, 0.8415,  ..., 0.8995, 0.8995, 0.4179])

In [42]:
# Store the predictions as a plain NumPy array rather than handing pandas a
# torch tensor and relying on implicit conversion.
df_test['rating'] = y_pred.detach().cpu().numpy()

In [43]:
# Preview the two columns that go into the submission file.
df_test[['id','rating']]

Unnamed: 0,id,rating
0,0,0.596785
1,1,0.419673
2,2,0.841454
3,3,0.383190
4,4,0.355154
...,...,...
381380,381380,0.771630
381381,381381,0.937398
381382,381382,0.899503
381383,381383,0.899503


In [44]:
# Write the submission: id and predicted probability, without the DataFrame index.
df_test[['id','rating']].to_csv('trial2.csv', index=False)

In [45]:
# Read the file back to verify what was written.
pd.read_csv('trial2.csv')

Unnamed: 0,id,rating
0,0,0.596785
1,1,0.419673
2,2,0.841454
3,3,0.383190
4,4,0.355154
...,...,...
381380,381380,0.771630
381381,381381,0.937398
381382,381382,0.899503
381383,381383,0.899503
