In [1]:
import pandas as pd
import numpy as np
import os, sys, time, joblib, json

In [2]:
sys.path.append("src/")
from constants import *

In [3]:
# read movie titles
movie_titles = pd.read_csv(os.path.join(DATA_DIR, 'movie_titles.csv'),
                           encoding = 'ISO-8859-1', 
                           header = None, 
                           names = ['Id', 'Year', 'Name'])

In [4]:
print(movie_titles.shape)
print(movie_titles.head())

(17770, 3)
   Id    Year                          Name
0   1  2003.0               Dinosaur Planet
1   2  2004.0    Isle of Man TT 2004 Review
2   3  1997.0                     Character
3   4  1994.0  Paula Abdul's Get Up & Dance
4   5  2004.0      The Rise and Fall of ECW


In [7]:
# Vocabulary
names = movie_titles['Name'].tolist()
movie_ids = movie_titles['Id'].tolist()
print(len(names), names[:3])

17770 ['Dinosaur Planet', 'Isle of Man TT 2004 Review', 'Character']


In [18]:
# remove everything except words and numbers
import re, nltk

def regex_cleaning(text, only_alphanumeric=True, only_alpha=True):
    text = text.lower()
    text = text.replace('\x01', ' ')
    text = re.sub(r'[^\x00-\x7F]+',' ', text)
    text = re.sub(r"http[s]?\S+", " ", text)
    text = re.sub(r"xx+", " ", text)
    text = re.sub(r"x{2,}", "", text)
    text = re.sub(r"`", "", text)
    if only_alpha:
        text = re.sub('[0-9]+', '', text)
    text = text.replace('"', '')
    text = text.replace("'", "")
    if only_alphanumeric:
        text = re.sub('[^A-Za-z0-9]+', ' ', text)
    return text

In [9]:
names_cleaned = [regex_cleaning(x) for x in names]

In [10]:
(len(names_cleaned), names_cleaned[:5], names_cleaned[3].split(),
 nltk.word_tokenize(names_cleaned[3]))

(17770,
 ['dinosaur planet',
  'isle of man tt review',
  'character',
  'paula abduls get up dance',
  'the rise and fall of ecw'],
 ['paula', 'abduls', 'get', 'up', 'dance'],
 ['paula', 'abduls', 'get', 'up', 'dance'])

In [11]:
# vocabulary size
from collections import Counter

words = [w for item in names_cleaned for w in nltk.word_tokenize(item)]

counter = Counter(words)

In [12]:
print(len(counter.keys()))
counter.most_common(10)

11808


[('the', 6102),
 ('of', 2149),
 ('a', 764),
 ('and', 742),
 ('in', 742),
 ('season', 742),
 ('to', 422),
 ('live', 296),
 ('vol', 292),
 ('on', 256)]

In [13]:
# remove stopwords + keep only words and numbers
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop = set(stopwords.words('english'))
words = [w for item in names_cleaned for w in nltk.word_tokenize(item)
         if w not in stop]

counter = Counter(words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/varunn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
print(len(counter.keys()))
counter.most_common(10)

11683


[('season', 742),
 ('live', 296),
 ('vol', 292),
 ('series', 218),
 ('man', 216),
 ('bonus', 216),
 ('material', 216),
 ('love', 201),
 ('best', 161),
 ('world', 158)]

### Modify dataset class to accommodate text embedding based features

In [3]:
# utility functions
from torch import tensor
def construct_tensor(a):
    final = []
    for i in a:
        out = []
        for j in i:
            out.append(j.tolist())
        out1 = []
        for item in zip(*out):
            out1.append(list(item))
        final += out1
    return tensor(final)


def construct_tensor_test(a):
    out = []
    for i in a:
        out.append(i.tolist())
        out1 = []
        for item in zip(*out):
            out1.append(list(item))
    return tensor(out1)


def construct_tensor_y(a):
    out = []
    for i in a:
        out += i.tolist()
    return tensor(out)


def transform_numeric_cols(numeric_params_dct, numeric_cols, x):
    x_new = []
    count = 0
    for item in x:
        if isinstance(item, list):
            x_new_item = [] 
            for i, value in enumerate(item): 
                d = numeric_params_dct[numeric_cols[i]] 
                x_new_item.append((value - d['mean'])/d['std']) 
            x_new.append(x_new_item)
        else:
            d = numeric_params_dct[numeric_cols[count]]
            x_new.append((item - d['mean'])/d['std'])
            count += 1
    return x_new

In [4]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import IterableDataset
from itertools import chain, islice


class InteractionsStream(IterableDataset):

    def __init__(self, prep_data_dir=PREPARED_DATA_DIR, file_num=None,
                 sample='train', user_col='User', item_col='Movie',
                 end_token='.h5', start_token='user_{}_data_',
                 baseline_feats=False, model_type='regression',
                 chunksize=10, normalize=False, title_features=False,
                 numeric_params_fn=NUMERIC_FEATS_PARAMS_DCT_FN,
                 title_feats_fn=MOVIE_TITLES_TFIDF_COMPS_FN):

        if file_num is None:
            self.files = [os.path.join(prep_data_dir, x) for x in
                          _find_files(prep_data_dir,
                                      start_token.format(sample),
                                      end_token)]
        else:
            self.files = [
                os.path.join(prep_data_dir,
                             start_token.format(sample)+str(file_num)+
                             end_token)]
        print(self.files)
        self.user_col = user_col
        self.item_col = item_col
        self.baseline_feats = baseline_feats
        self.sample = sample
        self.chunksize = chunksize
        if model_type == 'regression':
            self.dv_col = 'Rating'
        elif model_type == 'classification':
            self.dv_col = 'Rating_class'
        self.cat_cols = [self.user_col, self.item_col]
        self.normalize = normalize
        self.title_features = title_features
        
        if self.normalize:
            self.numeric_params_dct = json.load(open(numeric_params_fn))
        
        if self.title_features:
            self.title_feats_dct = json.load(open(title_feats_fn))
        
        if baseline_feats:
            self.numeric_cols = [
                'days_since_first_user_rating',
                'sqrt_days_since_first_user_rating',
                'rating_age_days_user', 'rating_age_weeks_user',
                'rating_age_months_user', 'mean_ratings_user',
                'num_ratings_user', 'days_since_first_item_rating',
                'sqrt_days_since_first_item_rating',
                'rating_age_days_item', 'rating_age_weeks_item',
                'rating_age_months_item', 'mean_ratings_movie',
                'weighted_mean_ratings_movie', 'num_ratings_movie']
        else:
            self.numeric_cols = []            

    def read_file(self, fn):
        
        if self.sample == 'train':
            df = pd.read_hdf(fn, key='stage', iterator=True,
                             chunksize=self.chunksize)
        else:
            df = pd.read_hdf(fn, key='stage')
        
        return df
    
    def transform_numeric_cols(self, numeric_params_dct, numeric_cols,
                                x):
        x_new = []
        count = 0
        for item in x:
            if isinstance(item, list):
                x_new_item = [] 
                for i, value in enumerate(item): 
                    d = numeric_params_dct[numeric_cols[i]] 
                    x_new_item.append((value - d['mean'])/d['std']) 
                x_new.append(x_new_item)
            else:
                d = numeric_params_dct[numeric_cols[count]]
                x_new.append((item - d['mean'])/d['std'])
                count += 1
        return x_new

    def process_data(self, fn):

        print('read data')
        data = self.read_file(fn)

        print('create an iterable')
        if self.sample == 'train':
            if self.baseline_feats:
                for row in data:
                    x1 = row[self.cat_cols].values.tolist()
                    x2 = row[self.numeric_cols].values.tolist()
                    if self.normalize:
                        x2 = self.transform_numeric_cols(
                            self.numeric_params_dct, self.numeric_cols,
                            x2)
                    if self.title_features:
                        item_cols = row[self.item_col].tolist()
                        x2_new = []
                        for i, item in enumerate(item_cols):
                            x2_new.append(
                                x2[i] + self.title_feats_dct[str(item)])
                        x2 = x2_new
                    y = row[self.dv_col].tolist()
                    yield (x1, x2, y)
            else:
                for row in data:
                    user = row[self.user_col].tolist()
                    item = row[self.item_col].tolist()
                    y = row[self.dv_col].tolist()
                    yield (user, item), y
        else:
            if self.baseline_feats:
                for i, row in data.iterrows():
                    x1 = row[self.cat_cols].tolist()
                    x2 = row[self.numeric_cols].tolist()
                    if self.normalize:
                        x2 = self.transform_numeric_cols(
                            self.numeric_params_dct, self.numeric_cols,
                            x2)
                    if self.title_features:
                        item = row[self.item_col]
                        x2 += self.title_feats_dct[str(item)]
                    y = row[self.dv_col]
                    yield (x1, x2, y)
            else:
                for i, row in data.iterrows():
                    yield (row[self.user_col],
                           row[self.item_col]), row[self.dv_col]

    def get_stream(self, files):
        return chain.from_iterable(map(self.process_data, files))

    def __iter__(self):
        return self.get_stream(self.files)

In [5]:
class TabularModel(nn.Module):
    """
    Defines the neural network for tabular data
    """

    def __init__(self, embedding_sizes, n_cont):
        super().__init__()
        self.embeddings = nn.ModuleList(
            [nn.Embedding(categories, size) for
             categories, size in embedding_sizes])
        n_emb = sum(e.embedding_dim for e in self.embeddings)
        self.n_emb, self.n_cont = n_emb, n_cont
        self.lin1 = nn.Linear(self.n_emb + self.n_cont, 300)
        self.lin2 = nn.Linear(300, 100)
        self.lin3 = nn.Linear(100, 1)
        self.bn1 = nn.BatchNorm1d(self.n_cont)
        self.bn2 = nn.BatchNorm1d(300)
        self.bn3 = nn.BatchNorm1d(100)
        self.emb_drop = nn.Dropout(0.6)
        self.drops = nn.Dropout(0.3)


    def forward(self, x_cat, x_cont):
        x = [e(x_cat[:, i]) for i, e in enumerate(self.embeddings)]
        x = torch.cat(x, 1)
        x = self.emb_drop(x)
        x2 = self.bn1(x_cont)
        x = torch.cat([x, x2], 1)
        x = F.relu(self.lin1(x))
        x = self.drops(x)
        x = self.bn2(x)
        x = F.relu(self.lin2(x))
        x = self.drops(x)
        x = self.bn3(x)
        x = self.lin3(x)

        return x

In [6]:
import torch, time
import torch.optim as torch_optim
import torch.nn.functional as F
from torch import tensor
from tqdm import tqdm
from sklearn.metrics import mean_squared_error


class Train(object):
    
    def __init__(self, loss_fn=nn.MSELoss(reduction='sum'), file_num=1,
                 n_users=480189, n_items=17770, n_cont=15,
                 min_emb_dim=100, cat_cols=['User', 'Movie'],
                 lr=0.02, wd=0.00001):
        self.loss_fn = loss_fn
        self.device = (torch.device('cuda') if torch.cuda.is_available()
                       else torch.device('cpu'))
        self.file_num = file_num
        self.n_users = n_users
        self.n_items = n_items
        self.n_cont = n_cont
        self.cat_cols = cat_cols
        self.min_emb_dim = min_emb_dim
        self.embedding_sizes = self.choose_embedding_size(
            self.cat_cols, [self.n_users, self.n_items],
            self.min_emb_dim)
        self.model = TabularModel(self.embedding_sizes, self.n_cont)
        self.model.to(self.device)
        self.lr = lr
        self.wd = wd
        self.optimizer = self.get_optimizer(self.model, lr=self.lr,
                                            wd=self.wd)
        
    def choose_embedding_size(self, cat_cols, cat_num_values,
                              min_emb_dim=100):
        """
        cat_cols: list of categorical columns
        cat_num_values: list of number of unique values for each
        categorical column
        """
        embedded_cols = dict(zip(cat_cols, cat_num_values))
        embedding_sizes = [
            (n_categories, min(min_emb_dim, (n_categories+1)//2))
             for _, n_categories in embedded_cols.items()]
        return embedding_sizes
    
    def get_optimizer(self, model, lr = 0.001, wd = 0.0):
        parameters = filter(lambda p: p.requires_grad,
                            model.parameters())
        optim = torch_optim.Adam(parameters, lr=lr, weight_decay=wd)
        return optim
    
    def construct_tensor(self, a):
        final = []
        for i in a:
            out = []
            for j in i:
                out.append(j.tolist())
            out1 = []
            for item in zip(*out):
                out1.append(list(item))
            final += out1
        return tensor(final)


    def construct_tensor_test(self, a):
        out = []
        for i in a:
            out.append(i.tolist())
            out1 = []
            for item in zip(*out):
                out1.append(list(item))
        return tensor(out1)


    def construct_tensor_y(self, a):
        out = []
        for i in a:
            out += i.tolist()
        return tensor(out)
    
    def train(self, train_dl, train_size, chunksize, batch_size):
        self.model.train()
        total = 0
        sum_loss = 0
        with tqdm(total=train_size // (batch_size * chunksize)) as pbar:
            for x1, x2, y in train_dl:
                x1, x2, y = (self.construct_tensor(x1),
                             self.construct_tensor(x2),
                             self.construct_tensor_y(y))
                x1 = x1.to(self.device)
                x2 = x2.to(self.device)
                y = y.to(self.device)
                batch = y.size()[0]
                y = y.reshape((y.size()[0], 1))
                output = self.model(x1, x2)
                loss = self.loss_fn(output, y)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                total += batch
                sum_loss += loss.item()
                pbar.update(1)
        return sum_loss/total
    
    def evaluate(self, valid_dl, test_size, batch_size):
        self.model.eval()
        total = 0
        sum_loss = 0
        with tqdm(total=test_size // (batch_size)) as pbar:
            for x1, x2, y in valid_dl:
                x1, x2 = (self.construct_tensor_test(x1),
                          self.construct_tensor_test(x2))
                x1 = x1.to(self.device)
                x2 = x2.to(self.device)
                y = y.to(self.device)
                current_batch_size = y.size()[0]
                y = y.reshape((y.size()[0], 1))
                y = y.float()
                out = self.model(x1, x2)
                loss = self.loss_fn(out, y)
                sum_loss += loss.item()
                total += current_batch_size
                pbar.update(1)
        print("valid loss %.3f" % (sum_loss/total))

        return sum_loss/total
    
    def batch_fit(self, train_dl, valid_dl, epochs, train_size,
                  test_size, chunksize, batch_size):
        start = time.time()
        losses = []
        for i in range(epochs):
            stats = {'epoch': i+1}
            train_loss = self.train(train_dl, train_size, chunksize,
                                    batch_size)
            print("training loss: ", train_loss)
            stats['train_loss'] = train_loss
            test_loss = self.evaluate(valid_dl, test_size, batch_size)
            print('time taken: %0.2f' % (time.time() - start))
            stats['test_loss'] = test_loss
            losses.append(stats)
        return losses
    
    def predict(self, test_dl):
        preds = []
        actuals = []
        with torch.no_grad():
            for x1, x2, y in test_dl:
                x1, x2 = (self.construct_tensor_test(x1),
                          self.construct_tensor_test(x2))
                x1 = x1.to(self.device)
                x2 = x2.to(self.device)
                y = y.to(self.device)
                y = y.reshape((y.size()[0], 1))
                pred = self.model(x1, x2)
                preds.append(pred.tolist())
                actuals.append(y.tolist())
        final_preds = [item for sublist in preds for item in sublist]
        final_actuals = [item for sublist in actuals for item in sublist]
        rmse = np.sqrt(mean_squared_error(y_true=final_actuals,
                                          y_pred=final_preds))
        return final_actuals, final_preds, rmse

### Experiments
1. Create embeddings for each movie based on tf-idf+SVD computed on titles and then use them in a NN along with other features
2. Use pretrained word embeddings for each word in the movie title and combine them using an average. Then, use them in a NN along with other features
3. Train embeddings from scratch on movie titles

### Experiment 1 - TFIDF

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer

In [14]:
def create_pipeline(train, max_df=0.5, max_features=10000,
                    min_df=20, ngram_range=(1, 3), n_components=100):

    pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_df=max_df, max_features=max_features,
                              min_df=min_df, ngram_range=ngram_range)),
    ('svd', TruncatedSVD(n_components=n_components, random_state=6062018)),
    ('normalize', Normalizer(copy=False))])

    start = time.time()
    print('fitting begins')
    X_train_trans = pipeline.fit_transform(train)
    print('time taken: %0.f' % (time.time() - start))

    print('shape of transformed train: ', X_train_trans.shape)

    print('Record pipeline metrics')
    svd = pipeline.steps[1][1]
    num_components = len(svd.components_)
    explained_var_ratio = svd.explained_variance_ratio_.sum()
    tfidf = pipeline.steps[0][1]
    feature_names = tfidf.get_feature_names()

    comp_feats = {}
    for t in range(5):
        best_features = [feature_names[i] for i in svd.components_[t].argsort()[::-1]]
        comp_feats['comp_{}'.format(t)] = best_features[:100]

    comp_feats = pd.DataFrame(comp_feats)
    print('shape of DF of top features from top 5 components: ',
          comp_feats.shape)
    comp_feats['num_components'] = num_components
    comp_feats['explained_var_ratio'] = explained_var_ratio

    print('create transformed train DF')
    column_names = ['comp_{}'.format(i) for i in range(num_components)]
    df_train_trans = pd.DataFrame(X_train_trans, columns=column_names)

    return pipeline, comp_feats, df_train_trans

In [15]:
def preprocessing(text, stop):
    text = regex_cleaning(text)
    text = " ".join([w for w in nltk.word_tokenize(text)
                     if w not in stop])
    return text

In [16]:
movie_titles.fillna({'Name': ''}, inplace=True)
stop = set(stopwords.words('english'))
movie_titles['cleaned_name'] = movie_titles['Name'].apply(
    lambda x: preprocessing(str(x), stop))

In [17]:
X_train = movie_titles['cleaned_name'].values
print(X_train.shape)

(17770,)


In [18]:
# params
max_df = 0.9
min_df = 1
max_features = 8000
n_components = 600
ngram_range=(1, 3)

In [19]:
pipeline, comp_feats, df_train_trans = create_pipeline(
    train=X_train, max_df=max_df, ngram_range=ngram_range,
    max_features=max_features, min_df=min_df,
    n_components=n_components)

fitting begins
time taken: 7
shape of transformed train:  (17770, 600)
Record pipeline metrics
shape of DF of top features from top 5 components:  (100, 5)
create transformed train DF


In [20]:
print(comp_feats.shape)
print(comp_feats.head())

(100, 7)
           comp_0          comp_1      comp_2      comp_3             comp_4  \
0          season        material        love         man                vol   
1            show  bonus material         man  spider man           twilight   
2     show season           bonus       story      spider               zone   
3  friends season            love  love story      little      twilight zone   
4         friends             man       death    thin man  twilight zone vol   

   num_components  explained_var_ratio  
0             600             0.424208  
1             600             0.424208  
2             600             0.424208  
3             600             0.424208  
4             600             0.424208  


In [21]:
df_train_trans = pd.concat([df_train_trans, movie_titles[['Id', 'Year']]],
                           axis=1)
print(df_train_trans.shape)
df_train_trans.head()

(17770, 602)


Unnamed: 0,comp_0,comp_1,comp_2,comp_3,comp_4,comp_5,comp_6,comp_7,comp_8,comp_9,...,comp_592,comp_593,comp_594,comp_595,comp_596,comp_597,comp_598,comp_599,Id,Year
0,0.00029,0.006528,0.001948,0.009814,0.001207,0.001848,0.004495,0.002681,0.00283,0.001798,...,0.018909,-0.011516,0.018872,0.006439,-0.025365,0.0082,-0.00684,-0.009207,1,2003.0
1,0.017549,0.104052,0.285407,0.936571,-0.022265,-0.024887,-0.040205,-0.011232,-0.000946,-0.004132,...,-0.006473,0.006893,-0.001241,-0.012508,-0.001735,-0.000228,0.007632,0.014738,2,2004.0
2,0.000823,-0.002925,-0.002762,-0.001771,-0.012129,-0.013147,-0.02085,-0.015118,0.014392,-0.000394,...,-0.04617,-0.011522,0.009152,0.001655,0.032514,0.001405,0.003044,0.006212,3,1997.0
3,0.000265,0.000591,0.000562,0.000784,0.001527,0.008167,0.008958,0.004294,0.004266,0.010011,...,-0.006488,0.003249,-0.003207,-0.001946,0.001793,-0.002254,-0.003513,0.000935,4,1994.0
4,0.000213,0.002631,0.00247,-0.000727,0.000769,0.00457,0.011066,0.001506,0.00177,-0.000768,...,-0.001975,0.026948,0.005072,-0.007858,0.000405,-0.002845,0.012002,0.007192,5,2004.0


In [22]:
df_train_trans.memory_usage()

Index          128
comp_0      142160
comp_1      142160
comp_2      142160
comp_3      142160
             ...  
comp_597    142160
comp_598    142160
comp_599    142160
Id          142160
Year        142160
Length: 603, dtype: int64

In [23]:
cols = [x for x in list(df_train_trans.columns) if x.startswith('comp_')]

for col in cols:
    df_train_trans[col] = df_train_trans[col].astype(np.float32)
    
print(df_train_trans[cols].dtypes)
df_train_trans.memory_usage()

comp_0      float32
comp_1      float32
comp_2      float32
comp_3      float32
comp_4      float32
             ...   
comp_595    float32
comp_596    float32
comp_597    float32
comp_598    float32
comp_599    float32
Length: 600, dtype: object


Index          128
comp_0       71080
comp_1       71080
comp_2       71080
comp_3       71080
             ...  
comp_597     71080
comp_598     71080
comp_599     71080
Id          142160
Year        142160
Length: 603, dtype: int64

In [24]:
# convert df to a dict
df_train_trans['comp_feat'] = df_train_trans[cols].values.tolist()

In [25]:
# read item2idx dct
item2idx = json.load(open(ITEM2IDX_FN))
table = df_train_trans[['Id', 'comp_feat']]
table['Id'] = table['Id'].apply(lambda x: item2idx[str(x)])
comp_dct = dict(zip(table['Id'], table['comp_feat']))
del table

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [27]:
list(comp_dct.keys())[-10:]

[17760, 17761, 17762, 17763, 17764, 17765, 17766, 17767, 17768, 17769]

In [28]:
# save
MOVIE_METADATA_DIR = os.path.join(DATA_DIR, 'movie_metadata')
MOVIE_TITLES_TFIDF_COMPS_FN = os.path.join(
    MOVIE_METADATA_DIR, 'movie_titles_tfidf_comps.json')
MOVIE_TITLES_TFIDF_FEAT_IMP_FN = os.path.join(
    MOVIE_METADATA_DIR, 'movie_titles_tfidf_feat_imp.csv')
MOVIE_TITLES_TFIDF_PIPELINE_FN = os.path.join(
    MOVIE_METADATA_DIR, 'movie_titles_tfidf_pipeline.pkl')

json.dump(comp_dct, open(MOVIE_TITLES_TFIDF_COMPS_FN, 'w'))
comp_feats.to_csv(MOVIE_TITLES_TFIDF_FEAT_IMP_FN, index=False)
joblib.dump(pipeline, open(MOVIE_TITLES_TFIDF_PIPELINE_FN, 'wb'))

### Experiment 2 - Bert Pretrained Sentence Embeddings

In [16]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Using cached https://files.pythonhosted.org/packages/b9/46/b7d6c37d92d1bd65319220beabe4df845434930e3f30e42d3cfaecb74dc4/sentence-transformers-0.2.6.1.tar.gz
Collecting transformers>=2.8.0 (from sentence-transformers)
  Using cached https://files.pythonhosted.org/packages/48/35/ad2c5b1b8f99feaaf9d7cdadaeef261f098c6e1a6a2935d4d07662a6b780/transformers-2.11.0-py3-none-any.whl
Collecting tokenizers==0.7.0 (from transformers>=2.8.0->sentence-transformers)
[?25l  Downloading https://files.pythonhosted.org/packages/e9/9b/fefc49f80e3b5cc48f0b1c8aa2c25f673735b70b0984810f5cc3c8438175/tokenizers-0.7.0-cp36-cp36m-macosx_10_10_x86_64.whl (1.2MB)
[K    100% |████████████████████████████████| 1.2MB 2.1MB/s ta 0:00:01
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25ldone
[?25h  Stored in directory: /Users/varunn/Library/Caches/pip/wheels/d7/fa/17/2b081a8cd8b0a86753fb0e9826b3cc19f0207062c0

In [17]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [19]:
movie_titles.fillna({'Name': ''}, inplace=True)
movie_titles['cleaned_name'] = movie_titles['Name'].apply(
    lambda x: regex_cleaning(str(x), only_alpha=False,
                             only_alphanumeric=False))

In [20]:
movie_titles.head()

Unnamed: 0,Id,Year,Name,cleaned_name
0,1,2003.0,Dinosaur Planet,dinosaur planet
1,2,2004.0,Isle of Man TT 2004 Review,isle of man tt 2004 review
2,3,1997.0,Character,character
3,4,1994.0,Paula Abdul's Get Up & Dance,paula abduls get up & dance
4,5,2004.0,The Rise and Fall of ECW,the rise and fall of ecw


In [21]:
inps = movie_titles['cleaned_name'].tolist()
embs = model.encode(inps)

In [22]:
print(len(inps))
print(len(embs))
print(embs[0].shape)

17770
17770
(768,)


In [24]:
embs = [x.tolist() for x in embs]
print(len(embs))
print(len(embs[0]))

17770
768


In [26]:
# read item2idx dct
item2idx = json.load(open(ITEM2IDX_FN))
table = pd.DataFrame({'Id': movie_titles['Id'].tolist(), 'emb': embs})
table['Id'] = table['Id'].apply(lambda x: item2idx[str(x)])
comp_dct = dict(zip(table['Id'], table['emb']))
del table

In [30]:
# save
MOVIE_TITLES_BERT_COMPS_FN = os.path.join(MOVIE_METADATA_DIR,
                                          'movie_titles_bert_comps.json')
json.dump(comp_dct, open(MOVIE_TITLES_BERT_COMPS_FN, 'w'))

#### Model Training

In [7]:
# GLOBALS
FILE_NUM = 1
N_USERS = 480189
N_ITEMS = 17770
N_CONT = 768+15
BATCH_SIZE = 50
CHUNKSIZE = 100
TRAIN_SIZE = 22851074
VAL_SIZE = 962152 
TEST_SIZE = 240538

In [9]:
# dataset

from torch.utils.data import DataLoader

train_dataset = InteractionsStream(
    file_num=FILE_NUM, baseline_feats=True, model_type='regression',
    sample='train', chunksize=CHUNKSIZE, normalize=False,
    title_features=True, title_feats_fn=MOVIE_TITLES_BERT_COMPS_FN)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                          shuffle=False)

test_dataset = InteractionsStream(
    file_num=FILE_NUM, baseline_feats=True, model_type='regression',
    sample='test', normalize=False, title_features=True,
    title_feats_fn=MOVIE_TITLES_BERT_COMPS_FN)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                         shuffle=False)

['/Users/varunn/Documents/kaggle/netflix-prize-data/prepared_data_for_NN_modelling/user_train_data_1.h5']
['/Users/varunn/Documents/kaggle/netflix-prize-data/prepared_data_for_NN_modelling/user_test_data_1.h5']


In [10]:
from itertools import islice

for x1, x2, y in islice(train_loader, 1):
    x1, x2, y = (construct_tensor(x1), construct_tensor(x2),
                 construct_tensor_y(y))
    y = y.reshape((y.size()[0], 1))
    print(x1)
    print('\n')
    print(x2)
    print('\n')
    print(y)
    print(x2.shape)
    print(y.shape)
    print('\n\n\n')

read data
create an iterable
tensor([[161459,   2138],
        [412629,   2861],
        [362249,   3729],
        ...,
        [265782,   3289],
        [122286,   4305],
        [398320,   1072]])


tensor([[ 2.3000e+01,  4.7958e+00,  2.5100e+02,  ...,  3.7790e-02,
         -2.9324e-01,  2.6283e-01],
        [ 0.0000e+00,  0.0000e+00,  1.2870e+03,  ...,  1.9136e+00,
          7.0776e-01, -4.5362e-01],
        [ 2.2000e+01,  4.6904e+00,  1.9290e+03,  ..., -3.3361e-01,
         -2.6666e-01, -1.7506e-01],
        ...,
        [ 4.5000e+01,  6.7082e+00,  2.4300e+02,  ..., -3.0964e-01,
          2.6775e-01, -4.6342e-01],
        [ 2.4300e+02,  1.5588e+01,  5.0700e+02,  ...,  2.4133e-01,
         -3.8084e-01, -2.6416e-01],
        [ 4.9500e+02,  2.2249e+01,  5.8000e+02,  ...,  2.5979e-01,
         -1.7485e-01, -3.9449e-01]])


tensor([[4.],
        [4.],
        [5.],
        ...,
        [2.],
        [1.],
        [5.]])
torch.Size([5000, 783])
torch.Size([5000, 1])






In [27]:
"""
for x1, x2, y in islice(test_loader, 1):
    x1, x2 = construct_tensor_test(x1), construct_tensor_test(x2)
    y = y.reshape((y.size()[0], 1))
    y = y.float()
    print(x1)
    print('\n')
    print(x2)
    print('\n')
    print(y)
    print(x2.shape)
    print(y.shape)
    out = model.model(x1, x2)
    print(out)
    loss = torch.nn.MSELoss(reduction='sum')(out, y)
    print(loss)
"""

"\nfor x1, x2, y in islice(test_loader, 1):\n    x1, x2 = construct_tensor_test(x1), construct_tensor_test(x2)\n    y = y.reshape((y.size()[0], 1))\n    y = y.float()\n    print(x1)\n    print('\n')\n    print(x2)\n    print('\n')\n    print(y)\n    print(x2.shape)\n    print(y.shape)\n    out = model.model(x1, x2)\n    print(out)\n    loss = torch.nn.MSELoss(reduction='sum')(out, y)\n    print(loss)\n"

In [11]:
# Instantiate train class

model = Train(file_num=FILE_NUM, n_users=N_USERS, n_items=N_ITEMS,
              n_cont=N_CONT, lr=0.02, wd=0.00001)

In [12]:
model.model

TabularModel(
  (embeddings): ModuleList(
    (0): Embedding(480189, 100)
    (1): Embedding(17770, 100)
  )
  (lin1): Linear(in_features=983, out_features=300, bias=True)
  (lin2): Linear(in_features=300, out_features=100, bias=True)
  (lin3): Linear(in_features=100, out_features=1, bias=True)
  (bn1): BatchNorm1d(783, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn3): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (emb_drop): Dropout(p=0.6, inplace=False)
  (drops): Dropout(p=0.3, inplace=False)
)

In [13]:
start = time.time()

losses = model.batch_fit(train_dl=train_loader, valid_dl=test_loader,
                         epochs=2, train_size=TRAIN_SIZE,
                         test_size=TEST_SIZE, chunksize=CHUNKSIZE,
                         batch_size=BATCH_SIZE)

print('time taken: %0.2f' % (time.time() - start))

  0%|          | 0/4570 [00:00<?, ?it/s]

read data
create an iterable


4571it [4:49:00,  3.15s/it]                            
  0%|          | 0/4810 [00:00<?, ?it/s]

training loss:  0.8844701523971533
read data
create an iterable


4811it [31:31,  2.12it/s]                          
  0%|          | 0/4570 [00:00<?, ?it/s]

valid loss 0.797
time taken: 19232.30
read data
create an iterable


4571it [4:42:38,  3.09s/it]                            
  0%|          | 0/4810 [00:00<?, ?it/s]

training loss:  0.7908056428095602
read data
create an iterable


4811it [33:13,  1.38it/s]                          

valid loss 0.776
time taken: 38184.52
time taken: 38184.52





In [14]:
losses

[{'epoch': 1,
  'train_loss': 0.8844701523971533,
  'test_loss': 0.7971235709575782},
 {'epoch': 2,
  'train_loss': 0.7908056428095602,
  'test_loss': 0.7762977449611403}]

In [20]:
np.sqrt(losses[-1]['test_loss'])

0.881077604391997

In [18]:
model_fn = os.path.join(MODEL_DIR,
                        "NN_DenseFFNN_FBaselineAndTitleEmbBert_E2.pt")
torch.save(model.model.state_dict(), model_fn)