In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import io
import os
import json
import time
import sys
import math
import copy
import pickle
import zipfile
from textwrap import wrap
from pathlib import Path
from itertools import zip_longest
from collections import defaultdict
from urllib.error import URLError
from urllib.request import urlopen

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import torch
from torch import nn
from torch import optim
from torch import tensor
from torch.nn import functional as F 
from torch.optim.lr_scheduler import _LRScheduler

In [3]:
sys.path.append("src/")
from constants import *

### Normalize baseline features

In [11]:
# Locations of the prepared train / test interaction files (file number 1).
TRAIN_FN = os.path.join(PREPARED_DATA_DIR, 'user_train_data_1.h5')
TEST_FN = os.path.join(PREPARED_DATA_DIR, 'user_test_data_1.h5')

# Both splits are stored under the same HDF key.
train_df, test_df = [pd.read_hdf(path, key='stage')
                     for path in (TRAIN_FN, TEST_FN)]

# Shape and first rows of the train split, followed by a blank spacer.
print(train_df.shape, train_df.head(), '\n\n\n', sep='\n')

# Same summary for the test split.
print(test_df.shape, test_df.head(), sep='\n')

(22851074, 20)
     User  Rating       Date  Movie  Rating_class  \
0  161459     4.0 2004-07-17   2138             0   
1   87375     2.0 2004-03-14   3253             0   
2  191296     2.0 2005-12-23   1154             0   
3   27266     5.0 2004-09-26   1201             1   
4  175666     3.0 2004-08-03   4377             0   

   days_since_first_user_rating  sqrt_days_since_first_user_rating  \
0                            23                           4.795832   
1                            13                           3.605551   
2                           453                          21.283797   
3                            15                           3.872983   
4                           446                          21.118712   

   rating_age_days_user  rating_age_weeks_user  rating_age_months_user  \
0                   251              35.857143                8.366667   
1                   617              88.142857               20.566667   
2                   455

In [12]:
# Numeric baseline features, in the order used for fitting the scaler.
numeric_cols = [
    # user-side features
    'days_since_first_user_rating',
    'sqrt_days_since_first_user_rating',
    'rating_age_days_user',
    'rating_age_weeks_user',
    'rating_age_months_user',
    'mean_ratings_user',
    'num_ratings_user',
    # item-side features
    'days_since_first_item_rating',
    'sqrt_days_since_first_item_rating',
    'rating_age_days_item',
    'rating_age_weeks_item',
    'rating_age_months_item',
    'mean_ratings_movie',
    'weighted_mean_ratings_movie',
    'num_ratings_movie',
]

In [13]:
# Scaler whose parameters are learned from the training split only.
scaler = StandardScaler()

print('fit\n')
# IPython line magic: reports CPU and wall time of the fit.
%time scaler.fit(train_df[numeric_cols])

print('checkout the calculated parameters\n')
# Per-column means and variances, in `numeric_cols` order.
print(scaler.mean_)
print('\n\n')
print(scaler.var_)

fit



  return self.partial_fit(X, y)


CPU times: user 13.1 s, sys: 26.2 s, total: 39.3 s
Wall time: 46.8 s
checkout the calculated parameters

[2.74709960e+02 1.30171303e+01 6.01375438e+02 8.59107768e+01
 2.00458479e+01 3.60770401e+00 1.38134583e+02 1.01209640e+03
 2.99323996e+01 1.45806966e+03 2.08295666e+02 4.86023221e+01
 3.59968341e+00 3.60876007e+00 5.15705185e+04]



[1.25198956e+05 1.05264278e+02 2.34023739e+05 4.77599468e+03
 2.60026377e+02 1.94270819e-01 3.21530138e+04 4.03047266e+05
 1.16147858e+02 3.92335018e+05 8.00683711e+03 4.35927798e+02
 1.53311077e-01 1.35894022e-01 2.08705953e+09]


In [15]:
# Apply the train-fitted scaler to both splits (timed with the %time magic).
%time train_df_trans = scaler.transform(train_df[numeric_cols]) 
%time test_df_trans = scaler.transform(test_df[numeric_cols])

  """Entry point for launching an IPython kernel.


CPU times: user 6.77 s, sys: 13.6 s, total: 20.4 s
Wall time: 26.9 s
CPU times: user 85.6 ms, sys: 133 ms, total: 219 ms
Wall time: 276 ms


  """Entry point for launching an IPython kernel.


In [20]:
# First scaled row, then the raw feature values it was computed from.
print(train_df_trans[0, :], train_df.loc[0, numeric_cols], sep='\n')

[-0.71137737 -0.80130883 -0.72427534 -0.72427534 -0.72427534 -0.47948529
 -0.61420432  0.9433632   0.94689463  1.09349768  1.09349768  1.09349768
 -0.18610436 -0.21999151 -0.66435269]
days_since_first_user_rating              23
sqrt_days_since_first_user_rating    4.79583
rating_age_days_user                     251
rating_age_weeks_user                35.8571
rating_age_months_user               8.36667
mean_ratings_user                    3.39637
num_ratings_user                          28
days_since_first_item_rating            1611
sqrt_days_since_first_item_rating    40.1373
rating_age_days_item                    2143
rating_age_weeks_item                306.143
rating_age_months_item               71.4333
mean_ratings_movie                   3.52681
weighted_mean_ratings_movie          3.52766
num_ratings_movie                      21220
Name: 0, dtype: object


In [23]:
# Manual z-score for feature index 1 ('sqrt_days_since_first_user_rating') of
# the first train row: (value - mean) / sqrt(variance). 4.79583 is the rounded
# raw value printed for that row.
(4.79583 - scaler.mean_[1])/np.sqrt(scaler.var_[1])

-0.8013089747646966

In [26]:
# store the params for later use
means = scaler.mean_
stds = [np.sqrt(variance) for variance in scaler.var_]
# One {'mean', 'std'} entry per numeric column, paired positionally with the
# scaler's fitted statistics.
numeric_params_dct = {
    col: {'mean': col_mean, 'std': col_std}
    for col, col_mean, col_std in zip(numeric_cols, means, stds)
}

print(numeric_params_dct)

{'days_since_first_user_rating': {'mean': 274.7099601095336, 'std': 353.83464511490894}, 'sqrt_days_since_first_user_rating': {'mean': 13.017130344209974, 'std': 10.259838093819116}, 'rating_age_days_user': {'mean': 601.3754375396097, 'std': 483.76000175538024}, 'rating_age_weeks_user': {'mean': 85.91077679137302, 'std': 69.10857167933996}, 'rating_age_months_user': {'mean': 20.045847917987032, 'std': 16.125333391846}, 'mean_ratings_user': {'mean': 3.6077040142672616, 'std': 0.4407616348363394}, 'num_ratings_user': {'mean': 138.1345834335839, 'std': 179.3126146705238}, 'days_since_first_item_rating': {'mean': 1012.0964039589561, 'std': 634.8600365564057}, 'sqrt_days_since_first_item_rating': {'mean': 29.93239959797679, 'std': 10.777191576008672}, 'rating_age_days_item': {'mean': 1458.0696618460909, 'std': 626.3665206343762}, 'rating_age_weeks_item': {'mean': 208.2956659780133, 'std': 89.48093151919663}, 'rating_age_months_item': {'mean': 48.602322061536434, 'std': 20.8788840211459}, 'm

In [27]:
# persist
out_fn = os.path.join(METADATA_DIR, 'numeric_feats_params_dct.json')
# The context manager guarantees the file is flushed and closed once the
# dump finishes (the bare open() call left the handle to the garbage
# collector).
with open(out_fn, 'w') as out_file:
    json.dump(numeric_params_dct, out_file)

In [10]:
# utility functions
def construct_tensor(a):
    """
    Flatten a nested batch into one 2-D tensor.

    a: iterable of groups; each group is an iterable of per-column objects
    exposing ``tolist()`` (e.g. 1-D tensors), one entry per sample.
    Returns a tensor holding, group after group, one row per sample and
    one column per per-column object.
    """
    rows = []
    for group in a:
        columns = [column.tolist() for column in group]
        # zip(*columns) turns the column-major values into per-sample rows
        rows.extend(list(sample) for sample in zip(*columns))
    return tensor(rows)


def construct_tensor_test(a):
    """
    Build a 2-D tensor from a list of per-column values.

    a: iterable of per-column objects exposing ``tolist()`` (e.g. 1-D
    tensors), one entry per sample.
    Returns a tensor with one row per sample and one column per element
    of `a`; an empty `a` yields an empty tensor.
    """
    columns = [column.tolist() for column in a]
    # Transpose once after all columns are collected. The previous version
    # rebuilt the transpose on every iteration and raised UnboundLocalError
    # when `a` was empty.
    return tensor([list(sample) for sample in zip(*columns)])


def construct_tensor_y(a):
    """
    Concatenate the targets of several groups into one 1-D tensor.

    a: iterable of objects exposing ``tolist()`` that returns a list.
    Returns a tensor with all values in their original order.
    """
    return tensor([value for part in a for value in part.tolist()])


def transform_numeric_cols(numeric_params_dct, numeric_cols, x):
    """
    Standardise numeric values with stored per-column mean / std.

    numeric_params_dct: maps column name -> {'mean': ..., 'std': ...}
    numeric_cols: column names; position i names the i-th feature
    x: either a flat list of values (one per column) or a list of rows,
       each row being a list of values (one per column)
    Returns a list with the same nesting holding (value - mean) / std.
    """
    def _standardize(value, position):
        params = numeric_params_dct[numeric_cols[position]]
        return (value - params['mean']) / params['std']

    scaled = []
    scalar_position = 0
    for entry in x:
        if isinstance(entry, list):
            # a full row: column position restarts at 0 for every row
            scaled.append([_standardize(value, position)
                           for position, value in enumerate(entry)])
            continue
        # a bare value: its column is given by how many bare values came before
        scaled.append(_standardize(entry, scalar_position))
        scalar_position += 1
    return scaled

## Experiment 3.1 - NN Regression with baseline features (normalized)

In [5]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import IterableDataset
from itertools import chain, islice


class InteractionsStream(IterableDataset):
    """
    Iterable dataset that streams user/item interactions from prepared
    HDF5 files.

    For the 'train' sample every file is read in chunks of `chunksize`
    rows and one record is yielded per chunk; for any other sample the
    file is loaded in full and one record is yielded per row. With
    `baseline_feats=True` a record is `(categorical, numeric, target)`,
    otherwise it is `((user, item), target)`.
    """

    def __init__(self, prep_data_dir=PREPARED_DATA_DIR, file_num=None,
                 sample='train', user_col='User', item_col='Movie',
                 end_token='.h5', start_token='user_{}_data_',
                 baseline_feats=False, model_type='regression',
                 chunksize=10, normalize=False,
                 numeric_params_fn=NUMERIC_FEATS_PARAMS_DCT_FN):
        """
        prep_data_dir: directory holding the prepared files
        file_num: when given, only the single file named
            start_token.format(sample) + str(file_num) + end_token is
            used; otherwise the files are discovered with `_find_files`
            (defined outside this class)
        sample: 'train' enables chunked reading; any other value reads
            the whole file
        user_col, item_col: names of the categorical columns
        baseline_feats: include the numeric baseline features
        model_type: 'regression' (target 'Rating') or 'classification'
            (target 'Rating_class')
        chunksize: rows per chunk when reading the train sample
        normalize: standardise the numeric features with the parameters
            stored in `numeric_params_fn`
        Raises ValueError for any other `model_type`.
        """

        if file_num is None:
            self.files = [os.path.join(prep_data_dir, x) for x in
                          _find_files(prep_data_dir,
                                      start_token.format(sample),
                                      end_token)]
        else:
            self.files = [
                os.path.join(prep_data_dir,
                             start_token.format(sample)+str(file_num)+
                             end_token)]
        print(self.files)
        self.user_col = user_col
        self.item_col = item_col
        self.baseline_feats = baseline_feats
        self.sample = sample
        self.chunksize = chunksize
        if model_type == 'regression':
            self.dv_col = 'Rating'
        elif model_type == 'classification':
            self.dv_col = 'Rating_class'
        else:
            # Fail here instead of with an AttributeError on `dv_col`
            # much later, when the stream is first iterated.
            raise ValueError(
                "model_type must be 'regression' or 'classification', "
                "got {!r}".format(model_type))
        self.cat_cols = [self.user_col, self.item_col]
        self.normalize = normalize

        if self.normalize:
            # Close the params file deterministically after loading.
            with open(numeric_params_fn) as params_file:
                self.numeric_params_dct = json.load(params_file)

        if baseline_feats:
            self.numeric_cols = [
                'days_since_first_user_rating',
                'sqrt_days_since_first_user_rating',
                'rating_age_days_user', 'rating_age_weeks_user',
                'rating_age_months_user', 'mean_ratings_user',
                'num_ratings_user', 'days_since_first_item_rating',
                'sqrt_days_since_first_item_rating',
                'rating_age_days_item', 'rating_age_weeks_item',
                'rating_age_months_item', 'mean_ratings_movie',
                'weighted_mean_ratings_movie', 'num_ratings_movie']
        else:
            self.numeric_cols = []

    def read_file(self, fn):
        """
        Open the HDF file `fn` (key 'stage').

        Returns a chunk iterator for the train sample and a fully loaded
        DataFrame for every other sample.
        """
        if self.sample == 'train':
            df = pd.read_hdf(fn, key='stage', iterator=True,
                             chunksize=self.chunksize)
        else:
            df = pd.read_hdf(fn, key='stage')

        return df

    def transform_numeric_cols(self, numeric_params_dct, numeric_cols,
                                x):
        """
        Standardise numeric values as (value - mean) / std.

        x is either a flat list of values (one per column) or a list of
        rows (lists); the result has the same nesting. Column parameters
        are looked up by position through `numeric_cols`.
        """
        x_new = []
        count = 0
        for item in x:
            if isinstance(item, list):
                # a full row: the column position restarts for each row
                x_new_item = []
                for i, value in enumerate(item):
                    d = numeric_params_dct[numeric_cols[i]]
                    x_new_item.append((value - d['mean'])/d['std'])
                x_new.append(x_new_item)
            else:
                # a bare value: `count` tracks which column it belongs to
                d = numeric_params_dct[numeric_cols[count]]
                x_new.append((item - d['mean'])/d['std'])
                count += 1
        return x_new

    def process_data(self, fn):
        """
        Generator over the records of one file.

        Train sample: one record per chunk, with list-valued fields.
        Other samples: one record per DataFrame row.
        """

        print('read data')
        data = self.read_file(fn)

        print('create an iterable')
        if self.sample == 'train':
            if self.baseline_feats:
                # here `row` is a whole chunk (a DataFrame)
                for row in data:
                    x1 = row[self.cat_cols].values.tolist()
                    x2 = row[self.numeric_cols].values.tolist()
                    if self.normalize:
                        x2 = self.transform_numeric_cols(
                            self.numeric_params_dct, self.numeric_cols,
                            x2)
                    y = row[self.dv_col].tolist()
                    yield (x1, x2, y)
            else:
                for row in data:
                    user = row[self.user_col].tolist()
                    item = row[self.item_col].tolist()
                    y = row[self.dv_col].tolist()
                    yield (user, item), y
        else:
            if self.baseline_feats:
                for i, row in data.iterrows():
                    x1 = row[self.cat_cols].tolist()
                    x2 = row[self.numeric_cols].tolist()
                    if self.normalize:
                        x2 = self.transform_numeric_cols(
                            self.numeric_params_dct, self.numeric_cols,
                            x2)
                    y = row[self.dv_col]
                    yield (x1, x2, y)
            else:
                for i, row in data.iterrows():
                    yield (row[self.user_col],
                           row[self.item_col]), row[self.dv_col]

    def get_stream(self, files):
        """Chain the record generators of all `files` into one iterator."""
        return chain.from_iterable(map(self.process_data, files))

    def __iter__(self):
        """Start a fresh pass over all configured files."""
        return self.get_stream(self.files)

In [20]:
class TabularModel(nn.Module):
    """
    Feed-forward network for tabular data: one embedding per categorical
    column, concatenated with the continuous features and passed through
    two hidden layers (200 and 70 units) to a single output.
    """

    def __init__(self, embedding_sizes, n_cont):
        """
        embedding_sizes: list of (number of categories, embedding dim),
        one pair per categorical column
        n_cont: number of continuous input features
        """
        super().__init__()
        embedding_layers = [nn.Embedding(n_categories, emb_dim)
                            for n_categories, emb_dim in embedding_sizes]
        self.embeddings = nn.ModuleList(embedding_layers)
        self.n_emb = sum(layer.embedding_dim for layer in self.embeddings)
        self.n_cont = n_cont
        # fully connected stack
        self.lin1 = nn.Linear(self.n_emb + self.n_cont, 200)
        self.lin2 = nn.Linear(200, 70)
        self.lin3 = nn.Linear(70, 1)
        # batch norm is applied to the hidden activations only
        self.bn1 = nn.BatchNorm1d(200)
        self.bn2 = nn.BatchNorm1d(70)
        self.emb_drop = nn.Dropout(0.6)
        self.drops = nn.Dropout(0.3)

    def forward(self, x_cat, x_cont):
        """
        x_cat: 2-D integer tensor, column i indexes embedding i
        x_cont: 2-D tensor of continuous features
        Returns the output of the final linear layer (one value per row).
        """
        embedded = torch.cat(
            [layer(x_cat[:, col])
             for col, layer in enumerate(self.embeddings)], 1)
        features = torch.cat([self.emb_drop(embedded), x_cont], 1)
        # each hidden block: linear -> relu -> dropout -> batch norm
        hidden = self.bn1(self.drops(F.relu(self.lin1(features))))
        hidden = self.bn2(self.drops(F.relu(self.lin2(hidden))))
        return self.lin3(hidden)

In [38]:
import torch, time
import torch.optim as torch_optim
import torch.nn.functional as F
from torch import tensor
from tqdm import tqdm
from sklearn.metrics import mean_squared_error


class Train(object):
    """
    Builds a TabularModel together with its Adam optimizer and runs the
    train / evaluate / predict loops over batches coming from a loader.
    """

    def __init__(self, loss_fn=nn.MSELoss(reduction='sum'), file_num=1,
                 n_users=480189, n_items=17770, n_cont=15,
                 min_emb_dim=100, cat_cols=['User', 'Movie'],
                 lr=0.02, wd=0.00001):
        """
        loss_fn: loss applied to (prediction, target); the reported
            losses are its value divided by the number of samples seen
        file_num: stored on the instance only
        n_users, n_items: number of categories of the two embeddings
        n_cont: number of continuous features fed to the model
        min_emb_dim: upper bound on the embedding dimension
            (see choose_embedding_size)
        cat_cols: names of the categorical columns
        lr, wd: learning rate and weight decay of the Adam optimizer
        """
        self.loss_fn = loss_fn
        self.device = (torch.device('cuda') if torch.cuda.is_available()
                       else torch.device('cpu'))
        self.file_num = file_num
        self.n_users = n_users
        self.n_items = n_items
        self.n_cont = n_cont
        self.cat_cols = cat_cols
        self.min_emb_dim = min_emb_dim
        self.embedding_sizes = self.choose_embedding_size(
            self.cat_cols, [self.n_users, self.n_items],
            self.min_emb_dim)
        self.model = TabularModel(self.embedding_sizes, self.n_cont)
        self.model.to(self.device)
        self.lr = lr
        self.wd = wd
        self.optimizer = self.get_optimizer(self.model, lr=self.lr,
                                            wd=self.wd)

    @staticmethod
    def _num_steps(n_rows, rows_per_step):
        """Loader iterations needed to cover n_rows (ceiling division)."""
        return (n_rows + rows_per_step - 1) // rows_per_step

    def choose_embedding_size(self, cat_cols, cat_num_values,
                              min_emb_dim=100):
        """
        cat_cols: list of categorical columns
        cat_num_values: list of number of unique values for each
        categorical column
        Returns a list of (n_categories, embedding_dim) pairs where
        embedding_dim = min(min_emb_dim, (n_categories + 1) // 2).
        """
        embedded_cols = dict(zip(cat_cols, cat_num_values))
        embedding_sizes = [
            (n_categories, min(min_emb_dim, (n_categories+1)//2))
             for _, n_categories in embedded_cols.items()]
        return embedding_sizes

    def get_optimizer(self, model, lr = 0.001, wd = 0.0):
        """Adam optimizer over the trainable parameters of `model`."""
        parameters = filter(lambda p: p.requires_grad,
                            model.parameters())
        optim = torch_optim.Adam(parameters, lr=lr, weight_decay=wd)
        return optim

    def construct_tensor(self, a):
        """
        Flatten a nested batch (groups of per-column values exposing
        ``tolist()``) into one 2-D tensor with a row per sample.
        """
        final = []
        for group in a:
            columns = [column.tolist() for column in group]
            # transpose column-major values into per-sample rows
            final.extend(list(sample) for sample in zip(*columns))
        return tensor(final)

    def construct_tensor_test(self, a):
        """
        Build a 2-D tensor (row per sample) from a list of per-column
        values exposing ``tolist()``; empty input gives an empty tensor.
        """
        columns = [column.tolist() for column in a]
        # Transpose once. The previous loop rebuilt the transpose on every
        # iteration and raised UnboundLocalError on empty input.
        return tensor([list(sample) for sample in zip(*columns)])

    def construct_tensor_y(self, a):
        """Concatenate per-group target values into one 1-D tensor."""
        out = []
        for i in a:
            out += i.tolist()
        return tensor(out)

    def train(self, train_dl, train_size, chunksize, batch_size):
        """
        Run one optimisation pass over `train_dl`.

        train_size, chunksize and batch_size are only used to size the
        progress bar. Returns the accumulated loss divided by the number
        of samples processed.
        """
        self.model.train()
        total = 0
        sum_loss = 0
        # Ceiling division: the last, partially filled batch also counts
        # as a step (floor division under-reported the total by one).
        n_steps = self._num_steps(train_size, batch_size * chunksize)
        with tqdm(total=n_steps) as pbar:
            for x1, x2, y in train_dl:
                x1, x2, y = (self.construct_tensor(x1),
                             self.construct_tensor(x2),
                             self.construct_tensor_y(y))
                x1 = x1.to(self.device)
                x2 = x2.to(self.device)
                y = y.to(self.device)
                batch = y.size()[0]
                y = y.reshape((y.size()[0], 1))
                output = self.model(x1, x2)
                loss = self.loss_fn(output, y)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                total += batch
                sum_loss += loss.item()
                pbar.update(1)
        return sum_loss/total

    def evaluate(self, valid_dl, test_size, batch_size):
        """
        Compute the loss on `valid_dl` without updating the model.

        test_size and batch_size only size the progress bar. Prints and
        returns the accumulated loss divided by the number of samples.
        """
        self.model.eval()
        total = 0
        sum_loss = 0
        n_steps = self._num_steps(test_size, batch_size)
        # no_grad: no backward pass happens here, so skip building the
        # autograd graph for every batch.
        with torch.no_grad(), tqdm(total=n_steps) as pbar:
            for x1, x2, y in valid_dl:
                x1, x2 = (self.construct_tensor_test(x1),
                          self.construct_tensor_test(x2))
                x1 = x1.to(self.device)
                x2 = x2.to(self.device)
                # y is used as delivered by the loader (already a tensor)
                y = y.to(self.device)
                current_batch_size = y.size()[0]
                y = y.reshape((y.size()[0], 1))
                y = y.float()
                out = self.model(x1, x2)
                loss = self.loss_fn(out, y)
                sum_loss += loss.item()
                total += current_batch_size
                pbar.update(1)
        print("valid loss %.3f" % (sum_loss/total))

        return sum_loss/total

    def batch_fit(self, train_dl, valid_dl, epochs, train_size,
                  test_size, chunksize, batch_size):
        """
        Train for `epochs` passes, evaluating after each one.

        Returns a list with one dict per epoch holding 'epoch',
        'train_loss' and 'test_loss'.
        """
        start = time.time()
        losses = []
        for i in range(epochs):
            stats = {'epoch': i+1}
            train_loss = self.train(train_dl, train_size, chunksize,
                                    batch_size)
            print("training loss: ", train_loss)
            stats['train_loss'] = train_loss
            test_loss = self.evaluate(valid_dl, test_size, batch_size)
            print('time taken: %0.2f' % (time.time() - start))
            stats['test_loss'] = test_loss
            losses.append(stats)
        return losses

    def predict(self, test_dl):
        """
        Score every batch of `test_dl`.

        Returns (actuals, predictions, rmse), where the first two are
        lists of one-element lists in loader order.
        """
        # Make sure dropout / batch norm run in inference mode even when
        # predict is called right after a training pass.
        self.model.eval()
        preds = []
        actuals = []
        with torch.no_grad():
            for x1, x2, y in test_dl:
                x1, x2 = (self.construct_tensor_test(x1),
                          self.construct_tensor_test(x2))
                x1 = x1.to(self.device)
                x2 = x2.to(self.device)
                y = y.to(self.device)
                y = y.reshape((y.size()[0], 1))
                pred = self.model(x1, x2)
                preds.append(pred.tolist())
                actuals.append(y.tolist())
        final_preds = [item for sublist in preds for item in sublist]
        final_actuals = [item for sublist in actuals for item in sublist]
        rmse = np.sqrt(mean_squared_error(y_true=final_actuals,
                                          y_pred=final_preds))
        return final_actuals, final_preds, rmse

In [39]:
# GLOBALS
# Number of the prepared data file to use.
FILE_NUM = 1
# Model dimensions: sizes of the two embedding tables and the number of
# continuous features.
N_USERS, N_ITEMS, N_CONT = 480189, 17770, 15
# Loader settings: records per batch and rows per streamed chunk.
BATCH_SIZE, CHUNKSIZE = 50, 100
# Row counts used to size the progress bars.
TRAIN_SIZE, VAL_SIZE, TEST_SIZE = 22851074, 962152, 240538

In [40]:
# dataset

from torch.utils.data import DataLoader

# Train stream: chunked reading, baseline features standardised on the fly.
train_dataset = InteractionsStream(
    file_num=FILE_NUM,
    baseline_feats=True,
    model_type='regression',
    sample='train',
    chunksize=CHUNKSIZE,
    normalize=True,
)
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

# Test stream: same features and normalisation, default chunksize.
test_dataset = InteractionsStream(
    file_num=FILE_NUM,
    baseline_feats=True,
    model_type='regression',
    sample='test',
    normalize=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

['/Users/varunn/Documents/kaggle/netflix-prize-data/prepared_data_for_NN_modelling/user_train_data_1.h5']
['/Users/varunn/Documents/kaggle/netflix-prize-data/prepared_data_for_NN_modelling/user_test_data_1.h5']


In [14]:
from itertools import islice

# Pull a single batch from the train loader and inspect the tensors built
# from it (categorical ids, normalised numeric features, targets).
for x1, x2, y in islice(train_loader, 1):
    x1 = construct_tensor(x1)
    x2 = construct_tensor(x2)
    y = construct_tensor_y(y)
    # targets as a column vector, one row per sample
    y = y.reshape((y.shape[0], 1))
    print(x1, '\n', x2, '\n', y, x2.shape, '\n\n\n', sep='\n')

read data
create an iterable
tensor([[161459,   2138],
        [191296,   1154],
        [ 87375,   3253],
        [ 27266,   1201]])


tensor([[-0.7114, -0.8013, -0.7243, -0.7243, -0.7243, -0.4795, -0.6142,  0.9434,
          0.9469,  1.0935,  1.0935,  1.0935, -0.1861, -0.2200, -0.6644],
        [ 0.5039,  0.8057, -0.3026, -0.3026, -0.3026,  0.7880, -0.1681, -0.7956,
         -0.6881, -1.5072, -1.5072, -1.5072,  0.5598,  0.4936, -1.0917],
        [-0.7396, -0.9173,  0.0323,  0.0323,  0.0323,  1.6471,  0.1387, -0.9720,
         -0.9332, -0.6483, -0.6483, -0.6483, -1.5902, -1.7066,  0.1748],
        [-0.7340, -0.8913, -0.3563, -0.3563, -0.3563,  0.3406, -0.0788,  1.1686,
          1.1087,  1.2084,  1.2084,  1.2084,  0.4392,  0.4403,  0.5106]])


tensor([[4.],
        [2.],
        [2.],
        [5.]])
torch.Size([4, 15])






In [37]:
"""
for x1, x2, y in islice(test_loader, 1):
    x1, x2 = construct_tensor_test(x1), construct_tensor_test(x2)
    y = y.reshape((y.size()[0], 1))
    y = y.float()
    print(x1)
    print('\n')
    print(x2)
    print('\n')
    print(y)
    print(x2.shape)
    print(y.shape)
    out = model.model(x1, x2)
    print(out)
    loss = torch.nn.MSELoss(reduction='sum')(out, y)
    print(loss)
"""

"\nfor x1, x2, y in islice(test_loader, 1):\n    x1, x2 = construct_tensor_test(x1), construct_tensor_test(x2)\n    y = y.reshape((y.size()[0], 1))\n    y = y.float()\n    print(x1)\n    print('\n')\n    print(x2)\n    print('\n')\n    print(y)\n    print(x2.shape)\n    print(y.shape)\n    out = model.model(x1, x2)\n    print(out)\n    loss = torch.nn.MSELoss(reduction='sum')(out, y)\n    print(loss)\n"

In [41]:
# Instantiate train class

# Trainer wrapping the TabularModel, its optimizer and the training loops.
model = Train(
    file_num=FILE_NUM,
    n_users=N_USERS,
    n_items=N_ITEMS,
    n_cont=N_CONT,
    lr=0.02,
    wd=0.00001,
)

In [42]:
# Display the network held by the trainer (layers and their sizes).
model.model

TabularModel(
  (embeddings): ModuleList(
    (0): Embedding(480189, 100)
    (1): Embedding(17770, 100)
  )
  (lin1): Linear(in_features=215, out_features=200, bias=True)
  (lin2): Linear(in_features=200, out_features=70, bias=True)
  (lin3): Linear(in_features=70, out_features=1, bias=True)
  (bn1): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(70, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (emb_drop): Dropout(p=0.6, inplace=False)
  (drops): Dropout(p=0.3, inplace=False)
)

In [43]:
start = time.time()

# Keep the per-epoch stats returned by batch_fit; previously the return
# value was dropped, so the loss history of this long run was only
# available in the printed log.
losses = model.batch_fit(train_dl=train_loader, valid_dl=test_loader,
                         epochs=2, train_size=TRAIN_SIZE,
                         test_size=TEST_SIZE, chunksize=CHUNKSIZE,
                         batch_size=BATCH_SIZE)

print('time taken: %0.2f' % (time.time() - start))

  0%|          | 0/4570 [00:00<?, ?it/s]

read data
create an iterable


4571it [2:00:27,  1.42s/it]                            
  0%|          | 0/4810 [00:00<?, ?it/s]

training loss:  0.8844703704705356
read data
create an iterable


4811it [06:00, 13.36it/s]                          
  0%|          | 0/4570 [00:00<?, ?it/s]

valid loss 0.804
time taken: 7587.23
read data
create an iterable


4571it [1:57:31,  1.32s/it]                            
  0%|          | 0/4810 [00:00<?, ?it/s]

training loss:  0.8080374877252133
read data
create an iterable


4811it [05:59, 13.37it/s]                          

valid loss 0.791
time taken: 14998.71
time taken: 14998.71





In [44]:
# RMSE for the last printed validation loss: 0.791 is the summed squared
# error divided by the number of samples, so its square root is the RMSE.
np.sqrt(0.791)

0.8893818077743664