In [870]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from statistics import mode
from torch.utils.data import Dataset, DataLoader

In [871]:
df_users = pd.read_csv('training.csv')
df_users.head()

Unnamed: 0,user_id,item_id,context_feature_id
0,0,28366,2
1,0,16109,2
2,0,11500,3
3,0,20750,2
4,0,8759,2


In [872]:
df_items = pd.read_csv('item_feature.csv')
df_items.head()

Unnamed: 0,item_id,item_feature_id
0,0,139
1,1,55
2,2,11
3,3,138
4,4,138


In [873]:
df = pd.merge(df_users.drop('context_feature_id',axis=1),df_items,on='item_id')
df.head()

Unnamed: 0,user_id,item_id,item_feature_id
0,0,28366,7
1,1731,28366,7
2,10168,28366,7
3,18883,28366,7
4,19763,28366,7


In [875]:
df['rating'] = 1  #Setting implicit rating

In [876]:
num_users = df.user_id.max()
num_items = len(np.unique(df_items.item_id))
num_users, num_items

(200152, 39901)

## Negative sampling based on user popularity

In [956]:
df_group_item = df.groupby('item_id').count().sort_values('user_id',ascending=False)
df_group_item.head(5)

Unnamed: 0_level_0,user_id,item_feature_id,rating
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
400,9029,9029,9029
5619,7814,7814,7814
26467,7742,7742,7742
24490,7061,7061,7061
20489,6865,6865,6865


In [880]:
ar = df_group_item.user_id.values/sum(df_group_item.user_id.values)
br = np.array([1/x for x in ar])
cr = br/sum(br)
cr

array([6.19559999e-09, 7.15895473e-09, 7.22553246e-09, ...,
       5.59400723e-05, 5.59400723e-05, 5.59400723e-05])

In [955]:
df_group = df.groupby('user_id').count()

In [892]:
val_list = []
for user in df_group.index:
    items = np.random.choice(np.array(df_group_item.index),size=int(df_group.loc[user]['item_id']),p=cr)
    [val_list.append([user,item, None,0]) for item in items]

In [893]:
val_df = pd.DataFrame(val_list,columns = df.columns)


In [894]:
val_df

Unnamed: 0,user_id,item_id,item_feature_id,rating
0,0,33994,,0
1,0,35239,,0
2,0,31314,,0
3,0,32651,,0
4,0,17531,,0
...,...,...,...,...
970240,200152,37105,,0
970241,200152,21124,,0
970242,200152,27424,,0
970243,200152,16072,,0


In [895]:
neg_df = pd.merge(val_df.drop('item_feature_id',axis=1),df_items,on='item_id')

In [957]:
neg_df.head(5)

Unnamed: 0,user_id,item_id,rating,item_feature_id
0,0,33994,0,168
1,3723,33994,0,168
2,5088,33994,0,168
3,7083,33994,0,168
4,10349,33994,0,168


In [899]:
df_new = pd.concat([df,neg_df])
df_new.reset_index(drop=True, inplace=True)
df_new

Unnamed: 0,user_id,item_id,item_feature_id,rating
0,0,28366,7,1
1,1731,28366,7,1
2,10168,28366,7,1
3,18883,28366,7,1
4,19763,28366,7,1
...,...,...,...,...
1940485,199678,16276,138,0
1940486,199704,11331,138,0
1940487,199917,416,142,0
1940488,200011,39633,55,0


In [900]:
df_new = df_new.drop_duplicates(
  subset = ['user_id','item_id'],
  keep = 'first').reset_index(drop = True)

## Cold start user training by setting random users to 0

In [902]:
#Increase user id by 1 to make unk at 0 position
df_new['user_id'] = df_new.user_id.values+1

In [903]:
#Splitting into train and validation set
train = df_new.sample(frac=0.8, random_state=25)
val = df_new.drop(train.index)

In [904]:
test_unk = len(set(df_test.user_id.values) - set(df_new.user_id.values))/len(set(df_test.user_id.values))

In [905]:
test_unk

0.1539639378019483

In [906]:
num_user_unk = int(len(set(train.user_id.values))*test_unk)
num_user_unk

26121

In [907]:
idx = np.random.randint(0,train.shape[0],size = int(num_user_unk*1.5))
idx

array([ 194753,  106760,  682633, ...,  475268,  723151, 1095614])

In [911]:
train_unk = train.iloc[idx].copy()

In [912]:
train_unk.iloc[:,0] = 0

In [913]:
train_unk

Unnamed: 0,user_id,item_id,item_feature_id,rating
1812659,0,27052,138,0
1084908,0,21277,148,0
11192,0,11275,139,1
568950,0,21095,20,1
1559612,0,29940,18,0
...,...,...,...,...
627074,0,23310,129,1
37905,0,27433,138,1
790655,0,35714,6,1
850415,0,8451,139,1


In [914]:
train = pd.concat([train,train_unk]) 

In [915]:
train

Unnamed: 0,user_id,item_id,item_feature_id,rating
1801164,98468,372,11,0
1486763,79460,3673,11,0
881749,184206,25035,111,0
846061,79268,18986,154,1
1404311,195051,24040,142,0
...,...,...,...,...
627074,0,23310,129,1
37905,0,27433,138,1
790655,0,35714,6,1
850415,0,8451,139,1


## Dataloader implementation

In [922]:
class user_feature_dataset(Dataset):
    def __init__(self,train):
        u = torch.LongTensor(train.user_id.values)
        v = torch.LongTensor(train.item_id.values)
        features = torch.LongTensor(train.item_feature_id.values)
        ratings = torch.FloatTensor(train.rating.values) 
        self.u, self.v, self.features, self.ratings = u, v, features, ratings
        
    def __len__(self):
        return len(self.ratings)
    def __getitem__(self, idx):
        return self.u[idx],self.v[idx],self.features[idx], self.ratings[idx]

In [924]:
train_ds = user_feature_dataset(train)
valid_ds = user_feature_dataset(val)

train_dl = DataLoader(train_ds, batch_size=100000)
valid_dl = DataLoader(valid_ds, batch_size=100000)

## Model 1 without using extra features

In [925]:
class MF_bias(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100, p=0.5):
        super(MF_bias, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        # init 
        self.user_emb.weight.data.uniform_(0,0.05)
        self.item_emb.weight.data.uniform_(0,0.05)
        self.user_bias.weight.data.uniform_(-0.01,0.01)
        self.item_bias.weight.data.uniform_(-0.01,0.01)
        
        self.linear1 = nn.Linear(emb_size*2, 30)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p)
        self.linear2 = nn.Linear(30,1)
    def forward(self, u, v):
        U = self.user_emb(u)
        V = self.item_emb(v)
        b_u = self.user_bias(u).squeeze()
        b_v = self.item_bias(v).squeeze()
        x = self.linear1(torch.cat((U,V),1))
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x

In [926]:
def train_epocs1(model, epochs=10, lr=0.01, wd=0.0):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr,
                                 weight_decay=wd)
    for i in range(epochs):
        model.train()
        users = torch.LongTensor(train.user_id.values)
        items = torch.LongTensor(train.item_id.values) 
        ratings = torch.FloatTensor(train.rating.values)  
    
        y_hat = model(users, items)
        loss = F.binary_cross_entropy_with_logits(y_hat, ratings.unsqueeze(-1)) # binary cross entropy with logits
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        testloss = valid_loss1(model)
        if i%10 ==0:
            print("train loss %.3f valid loss %.3f" % (loss.item(), testloss)) 


In [927]:
def valid_loss1(model):
    model.eval()
    users = torch.LongTensor(val.user_id.values) # .cuda()
    items = torch.LongTensor(val.item_id.values) #.cuda()
    ratings = torch.FloatTensor(val.rating.values) #.cuda()
    y_hat = model(users, items)
    loss = F.binary_cross_entropy_with_logits(y_hat, ratings.unsqueeze(-1))
    return loss.item()

In [928]:
def train_epocs1(model, epochs=10, lr=0.01, wd=0.0):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr,
                                 weight_decay=wd)
    losses = []
    for i in range(epochs):
        for users, items, features, ratings in train_dl: 
            model.train()  
            y_hat = model(users, items)
            loss = F.binary_cross_entropy_with_logits(y_hat, ratings.unsqueeze(-1)) # binary cross entropy with logits
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        train_loss = np.mean(losses)
        testloss = valid_loss1(model)
        print("train loss %.3f valid loss %.3f" % (train_loss, testloss)) 
            
            
def valid_loss1(model):
    model.eval()
    losses = []
    for users, items, features, ratings in valid_dl: 
        y_hat = model(users, items)
        loss = F.binary_cross_entropy_with_logits(y_hat, ratings.unsqueeze(-1))
        losses.append(loss.item())
    val_loss = np.mean(losses)
    return val_loss

In [929]:
num_users = df_new.user_id.max()+1
num_items = df_new.item_id.max()+1
print(num_users, num_items) 

200154 39901


In [954]:
model = MF_bias(num_users, num_items, emb_size=30, p=0.5)
train_epocs1(model, epochs=3, lr=0.001, wd=1e-5)

train loss 0.693 valid loss 0.688
train loss 0.686 valid loss 0.667
train loss 0.674 valid loss 0.622


## Model 2 with extra feature

In [806]:
class MF_extra(nn.Module):
    def __init__(self, num_users, num_items,num_features, emb_size=100, p=0.5):
        super(MF_extra, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.feature_emb = nn.Embedding(num_features,emb_size)
        # init 
        self.user_emb.weight.data.uniform_(0,0.05)
        self.item_emb.weight.data.uniform_(0,0.05)
        self.feature_emb.weight.data.uniform_(0,0.05)
        
        self.linear1 = nn.Linear(emb_size*3, 30)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p)
        self.linear2 = nn.Linear(30,1)
    def forward(self, u, v, f):
        U = self.user_emb(u)
        V = self.item_emb(v)
        Fe = self.feature_emb(f)
        x = self.linear1(torch.cat((U,V,Fe),1))
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x

In [807]:
num_users = df_new.user_id.max()+1
num_items = df_new.item_id.max()+1
num_features = len(np.unique(df_items.item_feature_id))
print(num_users, num_items,num_features) 

200154 39901 195


In [808]:
def train_epocs2(model, epochs=10, lr=0.01, wd=0.0):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr,
                                 weight_decay=wd)
    losses = []
    for i in range(epochs):
        for users, items, features, ratings in train_dl: 
            model.train() 
            y_hat = model(users, items, features)
            loss = F.binary_cross_entropy_with_logits(y_hat, ratings.unsqueeze(-1)) # binary cross entropy with logits
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        train_loss = np.mean(losses)
        testloss = valid_loss2(model)
        print("train loss %.3f valid loss %.3f" % (train_loss, testloss)) 
            
            
def valid_loss2(model):
    model.eval()
    losses = []
    for users, items, features, ratings in valid_dl: 
        y_hat = model(users, items, features)
        loss = F.binary_cross_entropy_with_logits(y_hat, ratings.unsqueeze(-1))
        losses.append(loss.item())
    val_loss = np.mean(losses)
    return val_loss

In [816]:
model = MF_extra(num_users, num_items,num_features, emb_size=30)
train_epocs2(model, epochs=3, lr=0.01, wd=1e-5)

train loss 0.595 valid loss 0.380
train loss 0.485 valid loss 0.346
train loss 0.434 valid loss 0.343


# Hyper parameters search

## Learning rate

In [282]:
# learning rate 1
model = MF_bias(num_users, num_items, emb_size=3)
train_epocs1(model, epochs=140, lr=1, wd=1e-5)

train loss 0.661 valid loss 1.932
train loss 0.601 valid loss 0.427
train loss 0.502 valid loss 0.469
train loss 0.403 valid loss 0.487
train loss 0.403 valid loss 0.446
train loss 0.380 valid loss 0.435
train loss 0.368 valid loss 0.416
train loss 0.354 valid loss 0.400
train loss 0.341 valid loss 0.394
train loss 0.332 valid loss 0.395
train loss 0.327 valid loss 0.402
train loss 0.326 valid loss 0.403
train loss 0.324 valid loss 0.401
train loss 0.320 valid loss 0.396


In [284]:
# learning rate 0.1
model = MF_bias(num_users, num_items, emb_size=3)
train_epocs1(model, epochs=140, lr=0.1, wd=1e-5)

train loss 0.766 valid loss 0.690
train loss 0.353 valid loss 0.383
train loss 0.252 valid loss 0.389
train loss 0.225 valid loss 0.389
train loss 0.191 valid loss 0.452
train loss 0.186 valid loss 0.496
train loss 0.184 valid loss 0.540
train loss 0.183 valid loss 0.573
train loss 0.185 valid loss 0.589
train loss 0.186 valid loss 0.609
train loss 0.186 valid loss 0.626
train loss 0.187 valid loss 0.637
train loss 0.187 valid loss 0.649
train loss 0.188 valid loss 0.659


In [285]:
# learning rate 0.01
model = MF_bias(num_users, num_items, emb_size=3)
train_epocs1(model, epochs=140, lr=0.01, wd=1e-5)

train loss 0.647 valid loss 0.644
train loss 0.628 valid loss 0.623
train loss 0.581 valid loss 0.573
train loss 0.494 valid loss 0.484
train loss 0.432 valid loss 0.427
train loss 0.391 valid loss 0.408
train loss 0.357 valid loss 0.398
train loss 0.329 valid loss 0.393
train loss 0.304 valid loss 0.392
train loss 0.280 valid loss 0.392
train loss 0.252 valid loss 0.388
train loss 0.229 valid loss 0.388
train loss 0.217 valid loss 0.396
train loss 0.212 valid loss 0.407


In [286]:
# learning rate 0.001
model = MF_bias(num_users, num_items, emb_size=3)
train_epocs1(model, epochs=140, lr=0.001, wd=1e-5)

train loss 0.663 valid loss 0.663
train loss 0.660 valid loss 0.660
train loss 0.657 valid loss 0.657
train loss 0.654 valid loss 0.654
train loss 0.651 valid loss 0.651
train loss 0.647 valid loss 0.647
train loss 0.643 valid loss 0.643
train loss 0.639 valid loss 0.639
train loss 0.633 valid loss 0.634
train loss 0.628 valid loss 0.629
train loss 0.621 valid loss 0.623
train loss 0.614 valid loss 0.616
train loss 0.607 valid loss 0.608
train loss 0.598 valid loss 0.600


### Conclusion: LR 0.01 is enough to quickly converge to minimum loss. 1 and 0.1 would be too large and 0.001 would be too small

## Embedding Size

In [289]:
# emb_size 3
model = MF_bias(num_users, num_items, emb_size=3)
train_epocs1(model, epochs=50, lr=0.01, wd=1e-5)

train loss 0.654 valid loss 0.651
train loss 0.627 valid loss 0.623
train loss 0.571 valid loss 0.563
train loss 0.471 valid loss 0.457
train loss 0.377 valid loss 0.362


In [290]:
# emb_size 10
model = MF_bias(num_users, num_items, emb_size=5)
train_epocs1(model, epochs=50, lr=0.01, wd=1e-5)

train loss 0.710 valid loss 0.708
train loss 0.679 valid loss 0.675
train loss 0.627 valid loss 0.617
train loss 0.547 valid loss 0.527
train loss 0.473 valid loss 0.445


In [291]:
# emb_size 50
model = MF_bias(num_users, num_items, emb_size=5)
train_epocs1(model, epochs=50, lr=0.01, wd=1e-5)

train loss 0.657 valid loss 0.653
train loss 0.617 valid loss 0.610
train loss 0.521 valid loss 0.509
train loss 0.409 valid loss 0.403
train loss 0.337 valid loss 0.355


In [292]:
# emb_size 100
model = MF_bias(num_users, num_items, emb_size=5)
train_epocs1(model, epochs=50, lr=0.01, wd=1e-5)

train loss 0.727 valid loss 0.721
train loss 0.675 valid loss 0.670
train loss 0.603 valid loss 0.596
train loss 0.494 valid loss 0.478
train loss 0.404 valid loss 0.373


### Conclusion: Embedding size 50 is can converge to minimum loss.

## Dropout rate

In [295]:
nb_epochs = 20
learning_rate = 0.001
p_vals = []
for p in [0.1,0.5,0.9]:
    print('Dropout rate:',p)
    model = MF_bias(num_users, num_items, emb_size=50,p=p)
    train_epocs1(model, epochs=50, lr=0.01, wd=1e-5)

Dropout rate: 0.1
train loss 0.737 valid loss 0.727
train loss 0.588 valid loss 0.567
train loss 0.344 valid loss 0.352
train loss 0.252 valid loss 0.311
train loss 0.201 valid loss 0.304
Dropout rate: 0.5
train loss 0.739 valid loss 0.728
train loss 0.599 valid loss 0.577
train loss 0.384 valid loss 0.357
train loss 0.284 valid loss 0.310
train loss 0.238 valid loss 0.310
Dropout rate: 0.9
train loss 0.664 valid loss 0.657
train loss 0.576 valid loss 0.531
train loss 0.511 valid loss 0.398
train loss 0.477 valid loss 0.361
train loss 0.463 valid loss 0.353


### Dropout rate of 0.5 achieves lowest validation loss

## Generate submission file

In [943]:
df_test_ = pd.read_csv('test_kaggle.csv')
df_test_.drop('context_feature_id',inplace=True, axis=1)
df_test_.head()

Unnamed: 0,id,user_id,item_id
0,0,4,16835
1,1,4,22590
2,2,4,1978
3,3,4,28916
4,4,4,14427


In [944]:
df_test_['user_id'] = df_test_.user_id.values+1

In [945]:
test_unk = list(set(df_test_.user_id.values)-set(train.user_id.values))

In [946]:
df_test_.iloc[test_unk,1] = 0

In [947]:
df_test = pd.merge(df_test_,df_items,on='item_id')
df_test.head()

Unnamed: 0,id,user_id,item_id,item_feature_id
0,0,5,16835,142
1,434,189,16835,142
2,48540,25438,16835,142
3,51161,0,16835,142
4,73056,0,16835,142


In [948]:
#Running model with extra feature for predictions
users = torch.LongTensor(df_test.user_id.values) # .cuda()
items = torch.LongTensor(df_test.item_id.values) #.cuda()
features = torch.LongTensor(df_test.item_feature_id.values)
y_pred = torch.sigmoid(model(users,items,features))

In [949]:
y_pred.detach()

tensor([[0.9194],
        [0.9688],
        [0.9039],
        ...,
        [0.4718],
        [0.4944],
        [0.3682]])

In [950]:
df_test['rating'] = y_pred.detach()

In [951]:
df_test['user_id'] = df_test.user_id.values-1

In [952]:
df_test = df_test.sort_values('id')

In [953]:
df_test

Unnamed: 0,id,user_id,item_id,item_feature_id,rating
0,0,4,16835,142,0.919385
25,1,4,22590,142,0.878200
38,2,4,1978,142,0.993686
77,3,4,28916,148,0.289209
97,4,4,14427,63,0.131593
...,...,...,...,...,...
378700,381380,200151,1702,139,0.936520
31638,381381,200151,21632,130,0.999670
247881,381382,200151,30477,130,0.999192
247882,381383,200151,30477,130,0.998240


In [942]:
df_test[['id','rating']].to_csv('trial9.csv',index=False)