In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os, sys, time, json, copy, re, joblib

from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

from collections import deque, defaultdict

# To compute similarities between vectors
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# To use recommender systems
import surprise as sp
from surprise.model_selection import cross_validate

# To create deep learning models
from keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout
from keras.models import Model

# To create sparse matrices
from scipy.sparse import coo_matrix, csr_matrix
from scipy import sparse

# To light fm
from lightfm import LightFM
from lightfm.evaluation import precision_at_k

# To stack sparse matrices
from scipy.sparse import vstack

Using TensorFlow backend.

Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.


Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.


Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.


Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.


Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.


Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.


LightFM was compiled without OpenMP support. Only a single thread will be used.

# Approach
1. Train models on each of combined_data_1.txt, combined_data_2.txt and combined_data_3.txt.
2. Use combined_data_4.txt data to combine predictions from the trained models. Include some additional features computed on this dataset

In [2]:
# GLOBALS
LOCAL_DIR = '/Users/varunn/Documents/'
PROJ_DIR = os.path.join(LOCAL_DIR, 'kaggle')
DATA_DIR = os.path.join(PROJ_DIR, 'netflix-prize-data')
OUT_DIR = os.path.join(DATA_DIR, 'interim')
FEATS_DIR = os.path.join(DATA_DIR, 'features')
USER_DATA_FN = os.path.join(DATA_DIR, 'combined_data_{}.txt')
MOVIES_DATA_FN = os.path.join(DATA_DIR, 'movie_titles.csv')
PROBE_DATA_FN = os.path.join(DATA_DIR, 'probe.txt')
QUALIFYING_DATA_FN = os.path.join(DATA_DIR, 'qualifying.txt')

## Preprocessing

In [3]:
# load movie dataset
movie_titles = pd.read_csv(MOVIES_DATA_FN, 
                           encoding = 'ISO-8859-1', 
                           header = None, 
                           names = ['Id', 'Year', 'Name'])

print('Shape Movie-Titles:\t{}'.format(movie_titles.shape))
print(movie_titles.sample(5))

Shape Movie-Titles:	(17770, 3)
          Id    Year                                      Name
16347  16348  1998.0               The Arrival / The Arrival 2
7188    7189  1968.0                          The Avengers '68
4611    4612  1994.0                                   Timecop
15742  15743  1995.0                        Jefferson in Paris
8014    8015  2005.0  Cold Case Files: The Most Infamous Cases


In [11]:
# load user data

def load_single_user_file(user_file_num):
    """
    loads a single user file into memory
    """
    print('read user data')
    df = pd.read_csv(USER_DATA_FN.format(user_file_num), header=None,
                     names=['User', 'Rating', 'Date'],
                     usecols=[0, 1, 2])
    print('Shape user-data:\t{}'.format(df.shape))
    
    print('convert Date to datetime format')
    df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')
    
    print('Find empty rows to slice dataframe for each movie')
    tmp_movies = df[df['Rating'].isnull()]['User'].reset_index()    
    movie_indices = [[index, int(movie[:-1])] for index, movie in
                     tmp_movies.values]
    
    print('Shift the movie_indices by one to get start and endpoints of all movies')
    shifted_movie_indices = deque(movie_indices)
    shifted_movie_indices.rotate(-1)
    
    print('create a dataframe with movie id and user ratings')
    user_data = []

    for [df_id_1, m_id], [df_id_2, n_m_id] in zip(movie_indices,
                                                  shifted_movie_indices):
        if df_id_1 < df_id_2:
            tmp_df = df.loc[df_id_1+1: df_id_2-1, :].copy()
        else:
            # last movie
            tmp_df = df.loc[df_id_1+1:, :].copy()

        tmp_df['Movie'] = m_id
        user_data.append(tmp_df)

    print('Combine all dataframes')
    df_1 = pd.concat(user_data)
    del (user_data, df, tmp_movies, tmp_df, shifted_movie_indices,
         movie_indices, df_id_1, m_id, df_id_2, n_m_id)
    print('Shape User-Ratings:\t{}'.format(df_1.shape))
    print('num users: ', df_1['User'].nunique())
    
    return df_1


def load_qualifying_data(inp_fn):
    """
    loads a single user file into memory
    """
    print('read user data')
    df = pd.read_csv(inp_fn, header=None,
                     names=['User', 'Date'],
                     usecols=[0, 1])
    print('Shape user-data:\t{}'.format(df.shape))
    
    print('convert Date to datetime format')
    df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')
    
    print('Find empty rows to slice dataframe for each movie')
    tmp_movies = df[df['Date'].isnull()]['User'].reset_index()    
    movie_indices = [[index, int(movie[:-1])] for index, movie in
                     tmp_movies.values]
    
    print('Shift the movie_indices by one to get start and endpoints of all movies')
    shifted_movie_indices = deque(movie_indices)
    shifted_movie_indices.rotate(-1)
    
    print('create a dataframe with movie id and user ratings')
    user_data = []

    for [df_id_1, m_id], [df_id_2, n_m_id] in zip(movie_indices,
                                                  shifted_movie_indices):
        if df_id_1 < df_id_2:
            tmp_df = df.loc[df_id_1+1: df_id_2-1, :].copy()
        else:
            # last movie
            tmp_df = df.loc[df_id_1+1:, :].copy()

        tmp_df['Movie'] = m_id
        user_data.append(tmp_df)

    print('Combine all dataframes')
    df_1 = pd.concat(user_data)
    del (user_data, df, tmp_movies, tmp_df, shifted_movie_indices,
         movie_indices, df_id_1, m_id, df_id_2, n_m_id)
    print('Shape User-Ratings:\t{}'.format(df_1.shape))
    print('num users: ', df_1['User'].nunique())
    
    return df_1


def train_test_split(data, split=0.3):
    # Shuffle DataFrame
    d = data.sample(frac=1).reset_index(drop=True)

    # 70:30 split
    n = int(split*d.shape[0])

    # Split into train & test sets
    df_train = d[:-n]
    df_test = d[-n:]
    
    return df_train, df_test

In [26]:
start = time.time()
for file_num in range(1, 5, 1):
    
    print('file num: %d' % (file_num))
    
    out_fn = os.path.join(OUT_DIR, 'user_data_{}.h5'.format(file_num))
    if not os.path.isfile(out_fn):
        out = load_single_user_file(file_num)
    else:
        out = pd.read_hdf(out_fn, key='stage')
    
    print('train and test split')
    df_train, df_test = train_test_split(out)

    if not os.path.isfile(out_fn):
        print('save out')
        out.to_hdf(out_fn, key='stage', mode='w')
    
    print('save df_train')
    out_fn = os.path.join(OUT_DIR, 'user_train_data_{}.h5'.format(
        file_num))
    if not os.path.isfile(out_fn):
        df_train.to_hdf(out_fn, key='stage', mode='w')
    
    print('save df_test')
    out_fn = os.path.join(OUT_DIR, 'user_test_data_{}.h5'.format(
        file_num))
    if not os.path.isfile(out_fn):
        df_test.to_hdf(out_fn, key='stage', mode='w')
    
    print('release memory')
    del out, df_train, df_test
    
    print('time taken for file_num %d: %0.2f' % (file_num, time.time()-start))

print('total time taken: %0.2f' % (time.time() - start))

file num: 1
train and test split
save df_train
save df_test
release memory
time taken for file_num 1: 39.71
file num: 2
train and test split
save df_train
save df_test
release memory
time taken for file_num 2: 79.75
file num: 3
train and test split
save df_train
save df_test
release memory
time taken for file_num 3: 112.73
file num: 4
train and test split
save df_train
save df_test
release memory
time taken for file_num 4: 153.09
total time taken: 153.09


In [27]:
# checks - ensure that train and test have all movie ids
for i in range(1, 5, 1):
    print('file num: ', i)
    out_fn = os.path.join(OUT_DIR, 'user_data_{}.h5'.format(i))
    df_all = pd.read_hdf(out_fn, key='stage')
    out_fn = os.path.join(OUT_DIR, 'user_train_data_{}.h5'.format(i))
    df_train = pd.read_hdf(out_fn, key='stage')
    out_fn = os.path.join(OUT_DIR, 'user_test_data_{}.h5'.format(i))
    df_test = pd.read_hdf(out_fn, key='stage')
    
    print(df_train.head())
    
    print('All: \n')
    print('shape: ', df_all.shape)
    print('num_users: ', df_all['User'].nunique())
    print('num_movies: ', df_all['Movie'].nunique())
    print('Train: \n')
    print('shape: ', df_train.shape)
    print('num_users: ', df_train['User'].nunique())
    print('num_movies: ', df_train['Movie'].nunique())
    print('Test: \n')
    print('shape: ', df_test.shape)
    print('num_users: ', df_test['User'].nunique())
    print('num_movies: ', df_test['Movie'].nunique())

file num:  1
      User  Rating       Date  Movie
0  2293175     3.0 2005-01-29   2000
1   623709     3.0 2001-11-08   2001
2  1177874     3.0 2005-06-22    758
3    93753     5.0 2004-04-24   2803
4   696079     3.0 2005-07-18   4369
All: 

shape:  (24053764, 4)
num_users:  470758
num_movies:  4499
Train: 

shape:  (16837635, 4)
num_users:  463956
num_movies:  4499
Test: 

shape:  (7216129, 4)
num_users:  435146
num_movies:  4499
file num:  2
      User  Rating       Date  Movie
0  1426869     4.0 2005-07-24   5582
1  2473986     4.0 2005-08-05   6357
2   903858     4.0 2005-05-14   8881
3  2237185     3.0 2004-07-07   5836
4  1785544     4.0 2004-03-11   5085
All: 

shape:  (26977591, 4)
num_users:  474062
num_movies:  4711
Train: 

shape:  (18884314, 4)
num_users:  470009
num_movies:  4711
Test: 

shape:  (8093277, 4)
num_users:  448406
num_movies:  4711
file num:  3
      User  Rating       Date  Movie
0  1563935     1.0 2005-08-15  10301
1  1719477     2.0 2005-06-29  12400
2  256

In [11]:
###################### NOT USED #################

### Create sparse matrix from dataframe

In [7]:
start = time.time()

for i in range(1, 5, 1):
    
    print('file num: ', i)
    
    print('load train data')
    inp_fn = os.path.join(OUT_DIR, 'user_train_data_{}.h5'.format(i))
    train_out_fn = os.path.join(OUT_DIR,
                                'user_train_sparse_matrix_{}.npz'.format(i))
    df_train = pd.read_hdf(inp_fn, key='stage')
    
    print('load test data')
    inp_fn = os.path.join(OUT_DIR, 'user_test_data_{}.h5'.format(i))
    test_out_fn = os.path.join(OUT_DIR,
                               'user_test_sparse_matrix_{}.npz'.format(i))
    df_test = pd.read_hdf(inp_fn, key='stage')
    
    print("creating sparse_matrix from the dataframe for train..")
    users = list(set(df_train['User'].unique().tolist() +
                     df_test['User'].unique().tolist()))
    movies = list(set(df_train['Movie'].unique().tolist() +
                      df_test['Movie'].unique().tolist()))
    user_id_mapping = {id: i for i, id in enumerate(users)}
    movie_id_mapping = {id: i for i, id in enumerate(movies)}
    
    user_ids = df_train['User'].map(user_id_mapping)
    movie_ids = df_train['Movie'].map(movie_id_mapping)
    shape = (len(user_id_mapping), len(movie_id_mapping))
    sparse_matrix = sparse.csr_matrix(
        (df_train['Rating'].values, (user_ids, movie_ids)), shape=shape)
    
    del df_train
    
    print('Done. It\'s shape is : (user, movie) : ', sparse_matrix.shape)
    print('Saving it into disk for furthur usage..')
    # save it into disk
    sparse.save_npz(train_out_fn, sparse_matrix)
    del sparse_matrix
    print('Done..\n')
    
    print("creating sparse_matrix from the dataframe for test..")
    user_ids = df_test['User'].map(user_id_mapping)
    movie_ids = df_test['Movie'].map(movie_id_mapping)
    sparse_matrix = sparse.csr_matrix(
        (df_test['Rating'].values, (user_ids, movie_ids)), shape=shape)
    
    del df_test
    
    print('Done. It\'s shape is : (user, movie) : ', sparse_matrix.shape)
    print('Saving it into disk for furthur usage..')
    # save it into disk
    sparse.save_npz(test_out_fn, sparse_matrix)
    del sparse_matrix
    
    user_id_mapping = {str(k): v for k, v in user_id_mapping.items()}
    movie_id_mapping = {str(k): v for k, v in movie_id_mapping.items()}
    json.dump(user_id_mapping, open(
        OUT_DIR+'/user_id_mapping_{}.json'.format(i), 'w'))
    json.dump(movie_id_mapping, open(
        OUT_DIR+'/movie_id_mapping_{}.json'.format(i), 'w'))
    print('Done..\n')
    

print('time taken: %0.2f' % (time.time() - start))

file num:  1
load train data
load test data
creating sparse_matrix from the dataframe for train..
Done. It's shape is : (user, movie) :  (470758, 4499)
Saving it into disk for furthur usage..
Done..

creating sparse_matrix from the dataframe for test..
Done. It's shape is : (user, movie) :  (470758, 4499)
Saving it into disk for furthur usage..
Done..

file num:  2
load train data
load test data
creating sparse_matrix from the dataframe for train..
Done. It's shape is : (user, movie) :  (474062, 4711)
Saving it into disk for furthur usage..
Done..

creating sparse_matrix from the dataframe for test..
Done. It's shape is : (user, movie) :  (474062, 4711)
Saving it into disk for furthur usage..
Done..

file num:  3
load train data
load test data
creating sparse_matrix from the dataframe for train..
Done. It's shape is : (user, movie) :  (474662, 4157)
Saving it into disk for furthur usage..
Done..

creating sparse_matrix from the dataframe for test..
Done. It's shape is : (user, movie) :

## Feature Engineering

In [25]:
def get_weighted_mean_ratings(df, m):

    # overall mean of all movies
    C = df['Rating'].mean()

    # mean by movies
    R = df.groupby('Movie')['Rating'].mean()

    # count by movies
    v = df.groupby('Movie')['Rating'].count().values
    
    # movie indices
    movie_ids = R.index
    R = R.values

    # weighted score calculation
    weighted_scores = (v/(v+m))*R + (m/(v+m))*C

    # rank based on weighted score
    weighted_ranking = np.argsort(weighted_scores)[::-1]
    weighted_scores = np.sort(weighted_scores)[::-1]

    # get movie ids corresponding to rankings
    weighted_movie_ids = movie_ids[weighted_ranking]
    
    weighted_scores_dct = dict(zip(weighted_movie_ids, weighted_scores))

    return weighted_scores_dct


def _update_user_dct(file_dct, global_dct, how='earliest'):
    
    if not global_dct:
        return file_dct
    
    d = {}
    if how == 'earliest':
        func = lambda x, y: min(x, y)
    elif how == 'latest':
        func = lambda x, y: max(x, y)
    elif how == 'count':
        func = lambda x, y: sum([x, y])

    for k in file_dct:
        if k in global_dct:
            d[k] = func(file_dct[k], global_dct[k])
        else:
            d[k] = file_dct[k]
    return d

In [26]:
# mean movie rating and weighted mean movie rating

mean_ratings_dct = {}
weighted_mean_ratings_dct = {}
user_earliest_date_dct = {}
movie_earliest_date_dct = {}
user_latest_date_dct = {}
movie_latest_date_dct = {}
movie_num_ratings = {}
user_num_ratings = {}
for i in range(1, 5, 1):
    
    print('file num: ', i)
    
    print('reading train data')
    out_fn = os.path.join(OUT_DIR, 'user_train_data_{}.h5'.format(i))
    df_train = pd.read_hdf(out_fn, key='stage')
    
    print('convert Date to datetime format')
    df_train['Date'] = pd.to_datetime(df_train['Date'], format='%Y-%m-%d')
    
    print('calc mean ratings')
    mean_ratings_df = df_train.groupby('Movie')['Rating'].mean().rename(
        'Mean_Rating').reset_index()
    d = dict(zip(mean_ratings_df['Movie'],
                 mean_ratings_df['Mean_Rating']))
    mean_ratings_dct.update(d)
    del mean_ratings_df, d
    
    print('calc weighted mean ratings')
    d = get_weighted_mean_ratings(df_train, 250)
    weighted_mean_ratings_dct.update(d)
    del d
    
    print('movie earliest date dct')
    earliest_date_df = df_train.groupby('Movie')['Date'].min().rename(
        'earliest_date').reset_index()
    d = dict(zip(earliest_date_df['Movie'],
                 earliest_date_df['earliest_date']))
    movie_earliest_date_dct.update(d)
    del earliest_date_df, d
    
    print('movie latest date dct')
    latest_date_df = df_train.groupby('Movie')['Date'].max().rename(
        'latest_date').reset_index()
    d = dict(zip(latest_date_df['Movie'],
                 latest_date_df['latest_date']))
    movie_latest_date_dct.update(d)
    del latest_date_df, d
    
    print('user earliest date dct')
    earliest_date_df = df_train.groupby('User')['Date'].min().rename(
        'earliest_date').reset_index()
    d = dict(zip(earliest_date_df['User'],
                 earliest_date_df['earliest_date']))
    user_earliest_date_dct = _update_user_dct(d, user_earliest_date_dct,
                                              'earliest')
    del earliest_date_df, d
    
    print('user latest date dct')
    latest_date_df = df_train.groupby('User')['Date'].max().rename(
        'latest_date').reset_index()
    d = dict(zip(latest_date_df['User'],
                 latest_date_df['latest_date']))
    user_latest_date_dct = _update_user_dct(d, user_latest_date_dct,
                                            'latest')
    del latest_date_df, d
    
    print('movie num ratings')
    count_df = df_train.groupby('Movie')['Rating'].count().rename(
        'num_ratings').reset_index()
    d = dict(zip(count_df['Movie'], count_df['num_ratings']))
    movie_num_ratings.update(d)
    del count_df, d
    
    print('user num ratings')
    count_df = df_train.groupby('User')['Rating'].count().rename(
        'num_ratings').reset_index()
    d = dict(zip(count_df['User'], count_df['num_ratings']))
    user_num_ratings = _update_user_dct(d, user_num_ratings, 'count')
    del count_df, d
    
    del df_train

file num:  1
reading train data
convert Date to datetime format
calc mean ratings
calc weighted mean ratings
movie earliest date dct
movie latest date dct
user earliest date dct
user latest date dct
movie num ratings
user num ratings
file num:  2
reading train data
convert Date to datetime format
calc mean ratings
calc weighted mean ratings
movie earliest date dct
movie latest date dct
user earliest date dct
user latest date dct
movie num ratings
user num ratings
file num:  3
reading train data
convert Date to datetime format
calc mean ratings
calc weighted mean ratings
movie earliest date dct
movie latest date dct
user earliest date dct
user latest date dct
movie num ratings
user num ratings
file num:  4
reading train data
convert Date to datetime format
calc mean ratings
calc weighted mean ratings
movie earliest date dct
movie latest date dct
user earliest date dct
user latest date dct
movie num ratings
user num ratings


In [38]:
# save outputs to disk
json.dump(mean_ratings_dct,
          open(FEATS_DIR+'/mean_ratings_movie.json', 'w'))
json.dump(weighted_mean_ratings_dct,
          open(FEATS_DIR+'/weighted_mean_ratings_movie.json', 'w'))

user_earliest_date_dct = {k: v.strftime('%Y-%m-%d') for k, v in 
                          user_earliest_date_dct.items()}
json.dump(user_earliest_date_dct,
          open(FEATS_DIR+'/earliest_rating_date_user.json', 'w'))

movie_earliest_date_dct = {k: v.strftime('%Y-%m-%d') for k, v in 
                           movie_earliest_date_dct.items()}
json.dump(movie_earliest_date_dct,
          open(FEATS_DIR+'/earliest_rating_date_movie.json', 'w'))

user_latest_date_dct = {k: v.strftime('%Y-%m-%d') for k, v in 
                        user_latest_date_dct.items()}
json.dump(user_latest_date_dct,
          open(FEATS_DIR+'/latest_rating_date_user.json', 'w'))

movie_latest_date_dct = {k: v.strftime('%Y-%m-%d') for k, v in 
                         movie_latest_date_dct.items()}
json.dump(movie_latest_date_dct,
          open(FEATS_DIR+'/latest_rating_date_movie.json', 'w'))

json.dump(movie_num_ratings,
          open(FEATS_DIR+'/num_ratings_movie.json', 'w'))
json.dump(user_num_ratings,
          open(FEATS_DIR+'/num_ratings_user.json', 'w'))

### Strategy to sample users and movies for modelling
1. Create bins of users and movies based on num_ratings
2. Train - 98%, Val - 1% and Test - 1% (stratified sampling)

3. Train models on each sample (user_bin * movie_bin)

3a. Segment = user_bin + movie_bin

3b. Train - 70%, Val -20% and Test - 10%

3c. Find similar movies on train sample

3d. Find movies seen by users and their corresponding ratings on train sample

3e. Train a regression model on Val sample to find weights for movie ratings with DV as rating for that user and movie

3f. Evaluate on test set

4. Repeat step 3 for all segments

4a. Save the similar movie dict and user dict for seen movies from the train samples of every segment

4b. Save the regression model object for each segment

5. On the global Val sample, calculate features based on all segments and train a regression model to find weights for features

5a. Evaluate on test set

In [3]:
# GLOBALS
SAMPLE_DATA_DIR = os.path.join(DATA_DIR, 'interim_sampled')
SAMPLE_FEATS_DIR = os.path.join(DATA_DIR, 'features_sampled')
BASELINE_FEATS_DIR = os.path.join(DATA_DIR, 'baseline_features')
NEIGHBOURHOOD_FEATS_DIR = os.path.join(DATA_DIR, 'neighbourhood_features')
INP_FN = os.path.join(OUT_DIR, 'user_data_{}.h5')
TRAIN_FN = os.path.join(OUT_DIR, 'user_train_data_{}.h5')
TEST_FN = os.path.join(OUT_DIR, 'user_test_data_{}.h5')
VAL_FN = os.path.join(OUT_DIR, 'user_val_data_{}.h5')
FEATS_TEST_FN = os.path.join(SAMPLE_FEATS_DIR, 'features_data_test_{}.h5')
FEATS_VAL_FN = os.path.join(SAMPLE_FEATS_DIR, 'features_data_val_{}.h5')
USER_MOVIE_RATINGS_DCT_FN = os.path.join(NEIGHBOURHOOD_FEATS_DIR,
                                         'user_movie_ratings_dct_{}.json')
SIM_MOVIE_DCT_FN = os.path.join(NEIGHBOURHOOD_FEATS_DIR,
                                'sim_movie_dct_U{}_M{}_{}.json')
SIM_MOVIE_DCT_GLOBAL_FN = os.path.join(NEIGHBOURHOOD_FEATS_DIR,
                                       'sim_movie_dct_global_{}.json')
MOVIE_SEGMENT_DCT_FN = os.path.join(NEIGHBOURHOOD_FEATS_DIR,
                                    'movie_segment_dct_{}.json')
SEGMENT_TRAIN_FN = os.path.join(NEIGHBOURHOOD_FEATS_DIR,
                                'segment_train_df_U{}_M{}_{}.csv')
SEGMENT_TEST_FN = os.path.join(NEIGHBOURHOOD_FEATS_DIR,
                               'segment_test_df_U{}_M{}_{}.csv')
TEST_RMSE_FN = os.path.join(NEIGHBOURHOOD_FEATS_DIR,
                            'segment_test_rmse_{}.csv')
MEAN_RATINGS_MOVIE_DCT_FN = os.path.join(BASELINE_FEATS_DIR,
                                         'mean_ratings_movie_dct.json')
MEAN_RATINGS_USER_DCT_FN = os.path.join(BASELINE_FEATS_DIR,
                                        'mean_ratings_user_dct.json')
NUM_RATINGS_MOVIE_DCT_FN = os.path.join(BASELINE_FEATS_DIR,
                                        'num_ratings_movie_dct.json')
NUM_RATINGS_USER_DCT_FN = os.path.join(BASELINE_FEATS_DIR,
                                       'num_ratings_user_dct.json')
WEIGHTED_MEAN_RATINGS_MOVIE_DCT_FN = os.path.join(
    BASELINE_FEATS_DIR, 'weighted_mean_ratings_movie_dct.json')
WEIGHTED_MEAN_RATINGS_USER_DCT_FN = os.path.join(
    BASELINE_FEATS_DIR, 'weighted_mean_ratings_user_dct.json')
EARLIEST_RATING_DATE_MOVIE_FN = os.path.join(
    BASELINE_FEATS_DIR, 'earliest_rating_date_movie_dct.json')
EARLIEST_RATING_DATE_USER_FN = os.path.join(
    BASELINE_FEATS_DIR, 'earliest_rating_date_user_dct.json')
LATEST_RATING_DATE_MOVIE_FN = os.path.join(
    BASELINE_FEATS_DIR, 'latest_rating_date_movie_dct.json')
LATEST_RATING_DATE_USER_FN = os.path.join(
    BASELINE_FEATS_DIR, 'latest_rating_date_user_dct.json')
FILE_NUMS = range(1, 5, 1)
MODEL_DIR = os.path.join(DATA_DIR, 'models')
PREDICTION_DIR = os.path.join(DATA_DIR, 'predictions')
REG_MODEL_IMPUTE_DCT_FN = os.path.join(MODEL_DIR,
                                       'reg_model_impute_dct_{}.json')
REG_MODEL_OBJ_FN = os.path.join(MODEL_DIR, 'reg_model_obj_{}.pkl')
REG_MODEL_PRED_VAL_FN = os.path.join(PREDICTION_DIR,
                                     'reg_model_prediction_val_{}.h5')
REG_MODEL_PRED_TEST_FN = os.path.join(PREDICTION_DIR,
                                      'reg_model_prediction_test_{}.h5')

In [4]:
# utility functions for neighbourhood feature calculation

def _initialize(file_num, num_user_bins, num_movie_bins):
    
    print('read overall data')
    inp_fn = INP_FN.format(file_num)
    print('file: %s' % (inp_fn))
    df = pd.read_hdf(inp_fn, key='stage')
    print('\n')
    
    print('shape: ', df.shape)
    print('\n')
    
    print('bins for users and movies based on num_ratings')
    
    # num_ratings_user
    user_df = df.groupby('User')['Rating'].count().rename(
        'num_rating_user').reset_index()
    user_df['num_rating_user_bins'] = pd.qcut(
        user_df['num_rating_user'], num_user_bins, labels=False)

    # num_ratings_movie
    movie_df = df.groupby('Movie')['Rating'].count().rename(
        'num_rating_movie').reset_index()
    movie_df['num_rating_movie_bins'] = pd.qcut(
        movie_df['num_rating_movie'], num_movie_bins, labels=False)
    
    # merge
    df = pd.merge(pd.merge(df, user_df, on='User'), movie_df, on='Movie')
    
    del user_df, movie_df
    
    return df


def sampling(data, split=0.01, _type='overall'):
    
    # Shuffle DataFrame
    d = data.sample(frac=1).reset_index(drop=True)
    
    test_size = int(round(split*d.shape[0]))
    
    if _type == 'overall':
        # size
        val_size = 4*test_size
        train_size = d.shape[0] - val_size - test_size

        # Split into train & test sets
        df_train = d[:train_size]
        df_val = d[train_size:train_size+val_size]
        df_test = d[train_size+val_size:]

        return df_train, df_val, df_test
    else:
        # Split into train & test sets
        df_test = d[:test_size]
        df_train = d[test_size:]

        return df_train, df_test


def define_segments_for_modelling(df, num_segments=3):
    
    print('mapping dict')
    user_bins_num_rating_dct = df['num_rating_user_bins'].value_counts().to_dict()
    movie_bins_num_rating_dct = df['num_rating_movie_bins'].value_counts().to_dict()
    
    print('compute crosstab')
    table = pd.crosstab(df['num_rating_user_bins'],
                        df['num_rating_movie_bins'], margins=True)
    
    segments = []   # [(user_bin, movie_bin),...]
    
    print('bins for movies')
    for movie_bin in list(movie_bins_num_rating_dct.keys()):
        user_bins = []
        sparsities = []
        for user_bin in list(user_bins_num_rating_dct.keys()):
            numerator = 100.*table.loc[user_bin, movie_bin]
            denominator = user_bins_num_rating_dct[user_bin]*movie_bins_num_rating_dct[movie_bin]
            sparsity = numerator/denominator
            sparsities.append(sparsity)
            user_bins.append(user_bin)
        idxs = np.argsort(sparsities)[::-1][:num_segments]
        selected_user_bins = [user_bins[idx] for idx in idxs]
        segments += list(zip(selected_user_bins,
                             [movie_bin]*len(selected_user_bins)))

    print('bins for users')
    for user_bin in list(user_bins_num_rating_dct.keys()):
        movie_bins = []
        sparsities = []
        for movie_bin in list(movie_bins_num_rating_dct.keys()):
            numerator = 100.*table.loc[user_bin, movie_bin]
            denominator = user_bins_num_rating_dct[user_bin]*movie_bins_num_rating_dct[movie_bin]
            sparsity = numerator/denominator
            sparsities.append(sparsity)
            movie_bins.append(movie_bin)
        idxs = np.argsort(sparsities)[::-1][:num_segments]
        selected_movie_bins = [movie_bins[idx] for idx in idxs]
        segments += list(zip([user_bin]*len(selected_movie_bins),
                             selected_movie_bins))

    return list(set(segments))


def get_similar_movies_dct(sim_matrix, movie_ids, top_n=20):
    
    out = {}
    movie_id_mapping_inv = {i: id for i, id in enumerate(movie_ids)}
    for i, movie_id in enumerate(movie_ids):
        
        if i % 50 == 0:
            print(i)
        # sort similar movies by index
        similar_movie_index = np.argsort(sim_matrix[i])[::-1][:top_n]
        
        similar_movie_index = [movie_id_mapping_inv[x]
                               for x in similar_movie_index]

        # sort similar movies by score
        similar_movie_score = np.sort(sim_matrix[i])[::-1][:top_n]
        
        # save
        out[movie_id] = list(zip(similar_movie_index,
                                 similar_movie_score))
        
    return out


def find_similar_movies(data, num_similar):
    
    print('Create a user-movie matrix with empty values')
    df_p = data.pivot_table(index='User', columns='Movie',
                            values='Rating')
    
    print('Shape User-Movie-Matrix:\t{}'.format(df_p.shape))
    
    print('fill in missing values with mean of each movie')
    df_p_imputed = df_p.fillna(df_p.mean(axis=0)).T
    
    del df_p
    
    print(df_p_imputed.shape)

    print('similarity between all users')
    similarity = cosine_similarity(df_p_imputed.values)

    print('remove self-similarity')
    similarity -= np.eye(similarity.shape[0])
    
    print('shape of similarity matrix: ', similarity.shape)
    
    movie_ids = df_p_imputed.index.tolist()
    print('num movies: ', len(movie_ids))
    
    print('get similar movies dct')
    sim_movie_dct = get_similar_movies_dct(similarity, movie_ids,
                                           num_similar)
    
    return sim_movie_dct


def calc_feature_per_user_movie(user_id, movie_id, sim_movie_dct,
                                user_movies_ratings_dct):
    
    user_movie_rating_tup = user_movies_ratings_dct.get(user_id, None)
    sim_movie_tup = sim_movie_dct.get(movie_id, None)
    
    if (user_movie_rating_tup) and (sim_movie_tup):
        sim_movie_ids = [x[0] for x in sim_movie_tup]
        sim_movie_scores = [x[1] for x in sim_movie_tup]
        numerator, denominator = 0, 0
        for movie_id, rating in user_movie_rating_tup:
            if movie_id in sim_movie_ids:
                idx = sim_movie_ids.index(movie_id)
                score = sim_movie_scores[idx]
                numerator += (rating*score)
                denominator += score
        
        return numerator/denominator if denominator > 0 else None
    
    
def calc_features_test_df(test_df, sim_movie_dct,
                          user_movies_ratings_dct, user_col='User',
                          movie_col='Movie'):

    scores = []
    count = 1
    for _, row in test_df.iterrows():
        if count % 1000 == 0:
            print(count)
        user_id, movie_id = row[user_col], row[movie_col]
        score = calc_feature_per_user_movie(
            str(user_id), str(movie_id), sim_movie_dct, user_movies_ratings_dct)
        scores.append(score)
        count += 1
    
    return scores
    
    
def calc_rmse(pred_df, target_col='Rating',
              score_col='predicted_rating'):
    mask = pred_df[score_col].notnull()
    if mask.sum():
        y_true = pred_df.loc[mask, target_col]
        y_pred = pred_df.loc[mask, score_col]
        rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))
        return rmse
    else:
        return None


def segment_modelling(df_train, user_bin, movie_bin, split=0.1,
                      num_similar=150):
    
    print('define segment')
    mask1 = df_train['num_rating_user_bins'] == user_bin
    mask2 = df_train['num_rating_movie_bins'] == movie_bin

    df_train_segment = df_train.loc[mask1&mask2, :]
    df_train_segment.reset_index(drop=True, inplace=True)

    print('overall shape: ', df_train_segment.shape)

    print('sampling')
    df_train_segment_train, df_train_segment_test = sampling(
        df_train_segment, split=split, _type='segment')
    
    del df_train_segment
    
    print('train shape: ', df_train_segment_train.shape)
    print('test shape: ', df_train_segment_test.shape)
    
    print('find similar movies')
    sim_movie_dct = find_similar_movies(df_train_segment_train,
                                        num_similar)
    
    print('find movies rated by users and their corresponding ratings')
    df_train_segment_train['movie_rating_tup'] = list(
        map(lambda x, y: (x, y), df_train_segment_train['Movie'],
            df_train_segment_train['Rating']))
    user_movies_ratings_df = df_train_segment_train.groupby('User')[
        'movie_rating_tup'].apply(list).rename(
        'movie_rating_tup').reset_index()
    user_movies_ratings_dct = dict(
        zip(user_movies_ratings_df['User'],
            user_movies_ratings_df['movie_rating_tup']))
    del user_movies_ratings_df
    
    print('Evaluation on test set')
    scores = calc_features_test_df(df_train_segment_test, sim_movie_dct,
                                   user_movies_ratings_dct)
    
    df_train_segment_test['predicted_rating'] = scores
    
    rmse = calc_rmse(df_train_segment_test, target_col='Rating',
                     score_col='predicted_rating')
    
    return (sim_movie_dct, user_movies_ratings_dct,
            df_train_segment_train, df_train_segment_test, rmse)

In [5]:
# utility functions for baseline feature calculation

def get_weighted_mean_ratings(df, m, group_col='Movie',
                              agg_col='Rating'):

    # overall mean of all movies
    C = df[agg_col].mean()

    # mean by movies
    R = df.groupby(group_col)[agg_col].mean()

    # count by movies
    v = df.groupby(group_col)[agg_col].count().values
    
    # movie indices
    movie_ids = R.index
    R = R.values

    # weighted score calculation
    weighted_scores = (v/(v+m))*R + (m/(v+m))*C

    # rank based on weighted score
    weighted_ranking = np.argsort(weighted_scores)[::-1]
    weighted_scores = np.sort(weighted_scores)[::-1]

    # get movie ids corresponding to rankings
    weighted_movie_ids = movie_ids[weighted_ranking]
    
    weighted_scores_dct = dict(zip(weighted_movie_ids, weighted_scores))

    return weighted_scores_dct


def _update_user_dct(file_dct, global_dct, file_dct_count=None,
                     global_dct_count=None, how='earliest'):
    
    if not global_dct:
        return file_dct
    
    d = copy.deepcopy(global_dct)
    if how == 'earliest':
        func = lambda x, y: min(x, y)
    elif how == 'latest':
        func = lambda x, y: max(x, y)
    elif how == 'count':
        func = lambda x, y: sum([x, y])

    for k in file_dct:
        if k in global_dct:
            if how == 'mean':
                numerator = sum([(file_dct[k] * file_dct_count[k]),
                                 (global_dct[k] * global_dct_count[k])])
                denominator = sum([file_dct_count[k], global_dct_count[k]])
                d[k] = 1.*numerator/denominator
            else:
                d[k] = func(file_dct[k], global_dct[k])
        else:
            d[k] = file_dct[k]
    
    return d


def init_feat_dict(fn):
    if os.path.isfile(fn):
        return json.load(open(fn))
    else:
        return {}
    
    
def baseline_feat_calc_helper(df, group_col, func_type):
    """
    func_type: mean, count, earliest, latest
    group_col: User, Movie
    """
    if func_type in ('mean', 'count'):
        agg_col = 'Rating'
    elif func_type in ('earliest', 'latest'):
        agg_col = 'Date'
        
    agg_name = '_'.join([func_type, agg_col])
        
    func_dict = {'mean': lambda x: x.mean(),
                 'count': lambda x: x.count(),
                 'earliest': lambda x: x.min(),
                 'latest': lambda x: x.max()}
    
    grouped_df = func_dict[func_type](
        df.groupby(group_col)[agg_col]).rename(agg_name).reset_index()
    
    if func_type in ('earliest', 'latest'):
        grouped_df[agg_name] = grouped_df[agg_name].apply(
            lambda x: x.strftime('%Y-%m-%d'))
    
    d = dict(zip(grouped_df[group_col], grouped_df[agg_name]))
    
    return d

In [6]:
# utility functions for feature calculation


def main_neighbourhood_feats_calc(file_num, df_train, num_segments,
                                  num_similar):
    
    start = time.time()
    print('define segments \n')
    segments = define_segments_for_modelling(df_train, num_segments)
    print('time taken: %0.2f' % (time.time()))
    
    print('number of segments: %d' % (len(segments)))
    print('\n')
    
    test_rmses = []
    user_movies_ratings_global_dct = {}
    movie_segment_dct = defaultdict(list)
    for user_bin, movie_bin in segments:
        print('User Bin: %s and Movie Bin: %s' % (str(user_bin),
                                                  str(movie_bin)))
        (sim_movie_dct, user_movies_ratings_dct,
         df_train_segment_train, df_train_segment_test,
         rmse) = segment_modelling(df_train, user_bin, movie_bin,
                                   0.1, num_similar)
        
        print('update movie_segment_dct \n')
        movie_ids = list(sim_movie_dct.keys())
        for movie_id in movie_ids:
            if movie_id not in movie_segment_dct:
                movie_segment_dct[movie_id].append((user_bin, movie_bin))
            elif (user_bin, movie_bin) not in movie_segment_dct[movie_id]:
                movie_segment_dct[movie_id].append((user_bin, movie_bin))
                
        print('update user_movies_ratings_global_dct')
        for k in user_movies_ratings_dct:
            if k in user_movies_ratings_global_dct:
                user_movies_ratings_global_dct[k] += user_movies_ratings_dct[k]
            else:
                user_movies_ratings_global_dct[k] = user_movies_ratings_dct[k]
        
        test_rmses.append(rmse)
        
        print('save df_train_segment_train \n')
        df_train_segment_train.to_csv(
            SEGMENT_TRAIN_FN.format(user_bin, movie_bin, file_num),
            index=False)
        print('save df_train_segment_test \n')
        df_train_segment_test.to_csv(
            SEGMENT_TEST_FN.format(user_bin, movie_bin, file_num),
            index=False)
        print('save sim_movie_dct \n')
        json.dump(sim_movie_dct, open(
            SIM_MOVIE_DCT_FN.format(user_bin, movie_bin, file_num), 'w'))
        
        print('time taken: %0.2f' % (time.time()))
        print('\n')
        
        
    test_rmse_df = pd.DataFrame({'segment': segments,
                                 'test_rmse': test_rmses})
    print('save test_rmse_df \n')
    test_rmse_df.to_csv(TEST_RMSE_FN.format(file_num), index=False)
    
    print('save user_movies_ratings_global_dct \n')
    json.dump(user_movies_ratings_global_dct,
              open(USER_MOVIE_RATINGS_DCT_FN.format(file_num), 'w'))
    
    print('save movie_segment_dct \n')
    json.dump(movie_segment_dct,
              open(MOVIE_SEGMENT_DCT_FN.format(file_num), 'w'))
    
    print('overall time taken: %0.2f' % (time.time()))
    print('done')

    
def main_baseline_feats_calc(df_train):
    
    print('initialize feature dicts\n')
    
    mean_ratings_movie_dct = init_feat_dict(MEAN_RATINGS_MOVIE_DCT_FN)
    mean_ratings_user_dct = init_feat_dict(MEAN_RATINGS_USER_DCT_FN)
    weighted_mean_ratings_movie_dct = init_feat_dict(
        WEIGHTED_MEAN_RATINGS_MOVIE_DCT_FN)
    user_earliest_date_dct = init_feat_dict(EARLIEST_RATING_DATE_USER_FN)
    movie_earliest_date_dct = init_feat_dict(EARLIEST_RATING_DATE_MOVIE_FN)
    user_latest_date_dct = init_feat_dict(LATEST_RATING_DATE_USER_FN)
    movie_latest_date_dct = init_feat_dict(LATEST_RATING_DATE_MOVIE_FN)
    movie_num_ratings = init_feat_dict(NUM_RATINGS_MOVIE_DCT_FN)
    user_num_ratings = init_feat_dict(NUM_RATINGS_USER_DCT_FN)

    print('convert Date to datetime format\n')
    df_train['Date'] = pd.to_datetime(df_train['Date'], format='%Y-%m-%d')

    print('mean ratings for movie\n')
    d = baseline_feat_calc_helper(df_train, 'Movie', 'mean')
    mean_ratings_movie_dct.update(d)
    del d

    print('weighted mean ratings for movie\n')
    d = get_weighted_mean_ratings(df_train, 250, 'Movie', 'Rating')
    weighted_mean_ratings_movie_dct.update(d)
    del d

    print('movie earliest date dct\n')
    d = baseline_feat_calc_helper(df_train, 'Movie', 'earliest')
    movie_earliest_date_dct.update(d)
    del d

    print('movie latest date dct\n')
    d = baseline_feat_calc_helper(df_train, 'Movie', 'latest')
    movie_latest_date_dct.update(d)
    del d

    print('user earliest date dct\n')
    d = baseline_feat_calc_helper(df_train, 'User', 'earliest')
    user_earliest_date_dct = _update_user_dct(d, user_earliest_date_dct,
                                              'earliest')
    del d

    print('user latest date dct\n')
    d = baseline_feat_calc_helper(df_train, 'User', 'latest')
    user_latest_date_dct = _update_user_dct(d, user_latest_date_dct,
                                            'latest')
    del d

    print('movie num ratings\n')
    d = baseline_feat_calc_helper(df_train, 'Movie', 'count')
    movie_num_ratings.update(d)
    del d

    print('user num ratings\n')
    user_num_ratings_local = baseline_feat_calc_helper(
        df_train, 'User', 'count')
    user_num_ratings = _update_user_dct(user_num_ratings_local,
                                        user_num_ratings, 'count')
    
    print('mean ratings for user\n')
    d = baseline_feat_calc_helper(df_train, 'User', 'mean')
    mean_ratings_user_dct = _update_user_dct(
        d, mean_ratings_user_dct, user_num_ratings_local,
        user_num_ratings, 'mean')
    del d, user_num_ratings_local, df_train
    
    print('save outputs to disk\n')
    
    print('mean_ratings_movie_dct\n')
    json.dump(mean_ratings_movie_dct,
              open(MEAN_RATINGS_MOVIE_DCT_FN, 'w'))
    
    print('mean_ratings_user_dct\n')
    json.dump(mean_ratings_user_dct,
              open(MEAN_RATINGS_USER_DCT_FN, 'w'))
    
    print('weighted_mean_ratings_movie_dct\n')
    json.dump(weighted_mean_ratings_movie_dct,
              open(WEIGHTED_MEAN_RATINGS_MOVIE_DCT_FN, 'w'))

    print('user_earliest_date_dct\n')
    json.dump(user_earliest_date_dct,
              open(EARLIEST_RATING_DATE_USER_FN, 'w'))

    print('movie_earliest_date_dct\n')
    json.dump(movie_earliest_date_dct,
              open(EARLIEST_RATING_DATE_MOVIE_FN, 'w'))

    print('user_latest_date_dct\n')
    json.dump(user_latest_date_dct,
              open(LATEST_RATING_DATE_USER_FN, 'w'))

    print('movie_latest_date_dct\n')
    json.dump(movie_latest_date_dct,
              open(LATEST_RATING_DATE_MOVIE_FN, 'w'))
    
    print('movie_num_ratings\n')
    json.dump(movie_num_ratings, open(NUM_RATINGS_MOVIE_DCT_FN, 'w'))
    
    print('user_num_ratings\n')
    json.dump(user_num_ratings, open(NUM_RATINGS_USER_DCT_FN, 'w'))

In [7]:
# utility functions for feature creation on val and test sets


def combine_sim_movie_dcts(file_num, movie_segment_dct, sim_dct,
                           top_k=150):
    
    sim_dct_global = {} 
    count = 1 
    for movie_id in movie_segment_dct: 
        if count % 50 == 0: 
            print('num completed: ', count) 
            print('\n') 
        print('Movie: ', movie_id) 
        segments = ['U{}_M{}'.format(x[0],x[1])
                    for x in movie_segment_dct[movie_id]] 
        print('num segments: ', len(segments)) 
        movie_tups = [] 
        for segment in segments: 
            movie_tups += sim_dct[segment][movie_id] 
        
        tmp = pd.DataFrame(movie_tups, columns=['movie_id', 'sim']) 
        tmp1 = tmp.groupby('movie_id')['sim'].max().rename(
            'max_sim').reset_index() 
        tmp1.sort_values('max_sim', ascending=False, inplace=True) 
        tmp1 = tmp1[:top_k] 
        sim_dct_global[movie_id] = list(zip(tmp1['movie_id'],
                                            tmp1['max_sim'])) 
        count += 1 
        print('\n')
        
    print('save sim_dct_global')
    json.dump(sim_dct_global,
              open(SIM_MOVIE_DCT_GLOBAL_FN.format(file_num), 'w'))
    
    return sim_dct_global


def main_baseline_feats_calc_val(
    df, mean_ratings_movie_dct, mean_ratings_user_dct,
    weighted_mean_ratings_movie_dct, user_earliest_date_dct,
    movie_earliest_date_dct, user_latest_date_dct, movie_latest_date_dct,
    movie_num_ratings, user_num_ratings, movie_col='Movie',
    user_col='User', date_col='Date'):
    
    df_val = df.copy()
    
    print('convert Date to datetime format\n')
    df_val[date_col] = pd.to_datetime(df_val[date_col], format='%Y-%m-%d')
    
    print('convert to datetime format in date_dcts')
    user_earliest_date_dct = {k: pd.to_datetime(v, format='%Y-%m-%d')
                              for k, v in user_earliest_date_dct.items()}
    movie_earliest_date_dct = {k: pd.to_datetime(v, format='%Y-%m-%d')
                               for k, v in movie_earliest_date_dct.items()}
    user_latest_date_dct = {k: pd.to_datetime(v, format='%Y-%m-%d')
                            for k, v in user_latest_date_dct.items()}
    movie_latest_date_dct = {k: pd.to_datetime(v, format='%Y-%m-%d')
                             for k, v in movie_latest_date_dct.items()}
    
    print('mean_ratings_movie')
    df_val['mean_ratings_movie'] = df_val[movie_col].apply(
        lambda x: mean_ratings_movie_dct.get(str(x), None))
    
    print('mean_ratings_user')
    df_val['mean_ratings_user'] = df_val[user_col].apply(
        lambda x: mean_ratings_user_dct.get(str(x), None))
    
    print('weighted_mean_ratings_movie')
    df_val['weighted_mean_ratings_movie'] = df_val[movie_col].apply(
        lambda x: weighted_mean_ratings_movie_dct.get(str(x), None))
    
    print('days_since_first_user_rating')
    df_val['days_since_first_user_rating'] = list(
        map(lambda user, date: (date -
                                user_earliest_date_dct[str(user)]).days
            if str(user) in user_earliest_date_dct else None,
            df_val[user_col], df_val[date_col]))
    
    print('sqrt_days_since_first_user_rating')
    df_val['sqrt_days_since_first_user_rating'] = df_val[
        'days_since_first_user_rating'].apply(
        lambda x: np.sqrt(x) if x is not None else None)
    mask = df_val['days_since_first_user_rating'] < 0
    df_val.loc[mask, 'sqrt_days_since_first_user_rating'] = 0
    
    print('rating_age_days_user')
    df_val['rating_age_days_user'] = list(
        map(lambda user: (user_latest_date_dct[str(user)] -
                          user_earliest_date_dct[str(user)]).days
            if (str(user) in user_earliest_date_dct) and
            (str(user) in user_latest_date_dct) else None,
            df_val[user_col]))
    
    print('rating_age_weeks_user')
    df_val['rating_age_weeks_user'] = df_val['rating_age_days_user'].apply(
        lambda x: x/7. if x is not None else None)
    
    print('rating_age_months_user')
    df_val['rating_age_months_user'] = df_val['rating_age_days_user'].apply(
        lambda x: x/30. if x is not None else None)
    
    print('days_since_first_movie_rating')
    df_val['days_since_first_movie_rating'] = list(
        map(lambda movie, date: (date -
                                 movie_earliest_date_dct[str(movie)]).days
            if str(movie) in movie_earliest_date_dct else None,
            df_val[movie_col], df_val[date_col]))
    
    print('sqrt_days_since_first_movie_rating')
    df_val['sqrt_days_since_first_movie_rating'] = df_val[
        'days_since_first_movie_rating'].apply(
        lambda x: np.sqrt(x) if x is not None else None)
    mask = df_val['days_since_first_movie_rating'] < 0
    df_val.loc[mask, 'sqrt_days_since_first_movie_rating'] = 0
    
    print('rating_age_days_movie')
    df_val['rating_age_days_movie'] = list(
        map(lambda movie: (movie_latest_date_dct[str(movie)] -
                           movie_earliest_date_dct[str(movie)]).days
            if (str(movie) in movie_earliest_date_dct) and
            (str(movie) in movie_latest_date_dct) else None,
            df_val[movie_col]))
    
    print('rating_age_weeks_movie')
    df_val['rating_age_weeks_movie'] = df_val['rating_age_days_movie'].apply(
        lambda x: x/7. if x is not None else None)
    
    print('rating_age_months_movie')
    df_val['rating_age_months_movie'] = df_val['rating_age_days_movie'].apply(
        lambda x: x/30. if x is not None else None)
    
    print('num_ratings_movie')
    df_val['num_ratings_movie'] = df_val[movie_col].apply(
        lambda x: movie_num_ratings.get(str(x), None))
    
    print('num_ratings_user')
    df_val['num_ratings_user'] = df_val[user_col].apply(
        lambda x: user_num_ratings.get(str(x), None))
    
    return df_val
    
    
def main_neighbourhood_feats_calc_val(file_num, df, movie_col='Movie',
                                      user_col='User', date_col='Date'):
    
    df_val = df.copy()
    
    if os.path.isfile(SIM_MOVIE_DCT_GLOBAL_FN.format(file_num)):
        print('sim_movie_dct_global exists. Reading from disk...')
        sim_movie_dct_global = json.load(open(
            SIM_MOVIE_DCT_GLOBAL_FN.format(file_num)))
    else:
        print('identify segments on which training was done')
        sw, ew = 'sim_movie_dct_', '_{}.json'.format(file_num)
        files = [x for x in os.listdir(NEIGHBOURHOOD_FEATS_DIR) if
                 x.startswith(sw) and x.endswith(ew)]
        segments = [x.split(sw)[-1].split(ew)[0] for x in files]

        print('number of segments: ', len(segments))
        
        print('read sim_movie_dcts')
        sim_dct = {}
        for i, file in enumerate(files): 
            segment = segments[i] 
            fn = os.path.join(NEIGHBOURHOOD_FEATS_DIR, file)
            sim_dct[segment] = json.load(open(fn))

        print('read movie_segment_dct')
        movie_segment_dct = json.load(open(
            MOVIE_SEGMENT_DCT_FN.format(file_num)))

        print('combine sim_movie_dcts')
        sim_movie_dct_global = combine_sim_movie_dcts(
            file_num, movie_segment_dct, sim_dct, top_k=150)
    
    print('read user_movie_ratings_dct')
    user_movies_ratings_dct = json.load(open(
        USER_MOVIE_RATINGS_DCT_FN.format(file_num)))
    
    print('calc weighted score based on sim_movie_dct_global')
    scores = calc_features_test_df(df_val, sim_movie_dct_global,
                                   user_movies_ratings_dct)
    feat_name = 'pred_score_item_item_neighbourhood_model'
    df_val[feat_name] = scores
    
    print('shape: ', df_val.shape)
    return df_val

In [8]:
# main function for training and validation


def main_feats_calc(file_num, task='training', feat_type=None,
                    num_user_bins=15, num_movie_bins=10,
                    num_segments=4, num_similar=150):
    """
    feat_type: neighbourhood, baseline
    task: training, validation
    """
    
    if task == 'training':
        print('Task: ', task)
        start_time = time.time()

        if not os.path.isfile(TRAIN_FN.format(file_num)):
            print('init \n')
            df = _initialize(file_num, num_user_bins, num_movie_bins)

            print('overall sampling \n')
            df_train, df_val, df_test = sampling(df, split=0.01,
                                                 _type='overall')

            print('save df_train \n')
            df_train.to_hdf(TRAIN_FN.format(file_num), key='stage',
                            mode='w')

            print('save df_val \n')
            df_val.to_hdf(VAL_FN.format(file_num), key='stage', mode='w')

            print('save df_test \n')
            df_test.to_hdf(TEST_FN.format(file_num), key='stage', mode='w')

        else:
            print('File exists. Reading from disk...\n')
            df_train = pd.read_hdf(TRAIN_FN.format(file_num), key='stage')

        if feat_type == 'neighbourhood':
            print('feature calculation begins...\n')
            main_neighbourhood_feats_calc(file_num, df_train, num_segments,
                                          num_similar)
            print('time taken for feature calculation: %0.2f' % (
                time.time() - start_time))

        elif feat_type == 'baseline':
            print('feature calculation begins...\n')
            main_baseline_feats_calc(df_train)
            print('time taken for feature calculation: %0.2f' % (
                time.time() - start_time))
    
    elif task == 'validation':
        print('Task: ', task)
        
        if os.path.isfile(FEATS_VAL_FN.format(file_num)):
            print('features data for val exist. Reading from disk')
            df_val = pd.read_hdf(FEATS_VAL_FN.format(file_num),
                                 key='stage')
        else:
            print('reading val interim data from disk')
            df_val = pd.read_hdf(VAL_FN.format(file_num), key='stage')
            
        print('baseline feature calculation on val\n')
        
        print('read feature dictionaries\n')
        
        print('mean_ratings_movie_dct\n')
        mean_ratings_movie_dct = json.load(
            open(MEAN_RATINGS_MOVIE_DCT_FN))
        
        print('mean_ratings_user_dct\n')
        mean_ratings_user_dct = json.load(open(MEAN_RATINGS_USER_DCT_FN))

        print('weighted_mean_ratings_movie_dct\n')
        weighted_mean_ratings_movie_dct = json.load(
            open(WEIGHTED_MEAN_RATINGS_MOVIE_DCT_FN))

        print('user_earliest_date_dct\n')
        user_earliest_date_dct = json.load(
            open(EARLIEST_RATING_DATE_USER_FN))

        print('movie_earliest_date_dct\n')
        movie_earliest_date_dct = json.load(
            open(EARLIEST_RATING_DATE_MOVIE_FN))

        print('user_latest_date_dct\n')
        user_latest_date_dct = json.load(
            open(LATEST_RATING_DATE_USER_FN))

        print('movie_latest_date_dct\n')
        movie_latest_date_dct = json.load(
            open(LATEST_RATING_DATE_MOVIE_FN))

        print('movie_num_ratings\n')
        movie_num_ratings = json.load(open(NUM_RATINGS_MOVIE_DCT_FN))

        print('user_num_ratings\n')
        user_num_ratings = json.load(open(NUM_RATINGS_USER_DCT_FN))
        
        print('feature calculation begins...\n')
        df_val = main_baseline_feats_calc_val(
            df_val, mean_ratings_movie_dct, mean_ratings_user_dct,
            weighted_mean_ratings_movie_dct, user_earliest_date_dct,
            movie_earliest_date_dct, user_latest_date_dct,
            movie_latest_date_dct, movie_num_ratings, user_num_ratings,
            movie_col='Movie', user_col='User', date_col='Date')
        
        print('xxxxxxxxxxxxxxxx\n\n')
        
        print('neighbourhood feature calculation on val\n')
        df_val = main_neighbourhood_feats_calc_val(
            file_num, df_val, movie_col='Movie', user_col='User',
            date_col='Date')
        
        print('xxxxxxxxxxxxxxxx\n\n')
        
        print('save df_val\n')
        df_val.to_hdf(FEATS_VAL_FN.format(file_num), key='stage',
                      mode='w')
        del df_val
        
        print('xxxxxxxxxxxxxxxx\n\n')
            
        if os.path.isfile(FEATS_TEST_FN.format(file_num)):
            print('features data for test exist. Reading from disk')
            df_test = pd.read_hdf(FEATS_TEST_FN.format(file_num),
                                  key='stage')
        else:
            print('reading test interim data from disk')
            df_test = pd.read_hdf(TEST_FN.format(file_num), key='stage')
        
        print('baseline feature calculation on test\n')
        
        df_test = main_baseline_feats_calc_val(
            df_test, mean_ratings_movie_dct, mean_ratings_user_dct,
            weighted_mean_ratings_movie_dct, user_earliest_date_dct,
            movie_earliest_date_dct, user_latest_date_dct,
            movie_latest_date_dct, movie_num_ratings, user_num_ratings,
            movie_col='Movie', user_col='User', date_col='Date')
        
        print('xxxxxxxxxxxxxxxx\n\n')
        
        print('neighbourhood feature calculation on test\n')
        df_test = main_neighbourhood_feats_calc_val(
            file_num, df_test, movie_col='Movie', user_col='User',
            date_col='Date')
        
        print('xxxxxxxxxxxxxxxx\n\n')
        
        print('save df_test\n')
        df_test.to_hdf(FEATS_TEST_FN.format(file_num), key='stage',
                       mode='w')

In [46]:
# neighbourhood features
main_feats_calc(file_num=1, feat_type='neighbourhood', num_segments=5)

File exists. Reading from disk...

feature calculation begins...

define segments 

mapping dict
compute crosstab
bins for movies
bins for users
time taken: 1588790246.65
number of segments: 94


User Bin: 7 and Movie Bin: 3
define segment
overall shape:  (4413, 8)
sampling
train shape:  (3972, 8)
test shape:  (441, 8)
find similar movies
Create a user-movie matrix with empty values
Shape User-Movie-Matrix:	(3236, 444)
fill in missing values with mean of each movie
(444, 3236)
similarity between all users
remove self-similarity
shape of similarity matrix:  (444, 444)
num movies:  444
get similar movies dct
0
50
100
150
200
250
300
350
400
find movies rated by users and their corresponding ratings
Evaluation on test set
update movie_segment_dct 

update user_movies_ratings_global_dct
save df_train_segment_train 

save df_train_segment_test 

save sim_movie_dct 

time taken: 1588790248.43


User Bin: 6 and Movie Bin: 9
define segment
overall shape:  (460724, 8)
sampling
train shape:  (41

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
update movie_segment_dct 

update user_movies_ratings_global_dct
save df_train_segment_train 

save df_train_segment_test 

save sim_movie_dct 

time taken: 1588790315.14


User Bin: 1 and Movie Bin: 1
define segment
overall shape:  (353, 8)
sampling
train shape:  (318, 8)
test shape:  (35, 8)
find similar movies
Create a user-movie matrix with empty values
Shape User-Movie-Matrix:	(309, 223)
fill in missing values with mean of each movie
(223, 309)
similarity between all users
remove self-similarity
shape of similarity matrix:  (223, 223)
num movies:  223
get similar movies dct
0
50
100
150
200
find movies rated by users and their corresponding ratings
Evaluation on test set
update movie_segment_dct 

update user_movies_ratings_global_dct
save df_train_segment_train 

save df_train_segment_test 

save sim_movie_dct 

time taken: 1588790315.78


User Bin: 3 and Movie Bin: 2
define segment
overall sha

69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
update movie_segment_dct 

update user_movies_ratings_global_dct
save df_train_segment_train 

save df_train_segment_test 

save sim_movie_dct 

time taken: 1588790375.79


User Bin: 14 and Movie Bin: 8
define segment
overall shape:  (1398314, 8)
sampling
train shape:  (1258483, 8)
test shape:  (139831, 8)
find similar movies
Create a user-movie matrix with empty values
Shape User-Movie-Matrix:	(31154, 450)
fill in missing values with mean of each movie
(450, 31154)
similarity between all users
remove self-similarity
shape of similarity matrix:  (450, 450)
num movies:  450
get similar movies dct
0
50
100
150
200
250
300
350
400
find movies rated by users and their corresponding ratings
Evaluation on test set
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
210

overall shape:  (135296, 8)
sampling
train shape:  (121766, 8)
test shape:  (13530, 8)
find similar movies
Create a user-movie matrix with empty values
Shape User-Movie-Matrix:	(23856, 449)
fill in missing values with mean of each movie
(449, 23856)
similarity between all users
remove self-similarity
shape of similarity matrix:  (449, 449)
num movies:  449
get similar movies dct
0
50
100
150
200
250
300
350
400
find movies rated by users and their corresponding ratings
Evaluation on test set
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
update movie_segment_dct 

update user_movies_ratings_global_dct
save df_train_segment_train 

save df_train_segment_test 

save sim_movie_dct 

time taken: 1588790517.34


User Bin: 2 and Movie Bin: 2
define segment
overall shape:  (1069, 8)
sampling
train shape:  (962, 8)
test shape:  (107, 8)
find similar movies
Create a user-movie matrix with empty values
Shape User-Movie-Matrix:	(911, 374)
fill in missing values with mean of 

save sim_movie_dct 

time taken: 1588790667.95


User Bin: 14 and Movie Bin: 6
define segment
overall shape:  (268935, 8)
sampling
train shape:  (242041, 8)
test shape:  (26894, 8)
find similar movies
Create a user-movie matrix with empty values
Shape User-Movie-Matrix:	(29051, 450)
fill in missing values with mean of each movie
(450, 29051)
similarity between all users
remove self-similarity
shape of similarity matrix:  (450, 450)
num movies:  450
get similar movies dct
0
50
100
150
200
250
300
350
400
find movies rated by users and their corresponding ratings
Evaluation on test set
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
update movie_segment_dct 

update user_movies_ratings_global_dct
save df_train_segment_train 

save df_train_segment_test 

save sim_movie_dct 

time taken: 1588790683.69


User Bin: 13 and Movie Bin: 6
define segment
overall shape:  (77709, 8)
sampling
train sh

(447, 14355)
similarity between all users
remove self-similarity
shape of similarity matrix:  (447, 447)
num movies:  447
get similar movies dct
0
50
100
150
200
250
300
350
400
find movies rated by users and their corresponding ratings
Evaluation on test set
1000
2000
3000
4000
5000
update movie_segment_dct 

update user_movies_ratings_global_dct
save df_train_segment_train 

save df_train_segment_test 

save sim_movie_dct 

time taken: 1588790707.99


User Bin: 13 and Movie Bin: 9
define segment
overall shape:  (3174854, 8)
sampling
train shape:  (2857369, 8)
test shape:  (317485, 8)
find similar movies
Create a user-movie matrix with empty values
Shape User-Movie-Matrix:	(31477, 450)
fill in missing values with mean of each movie
(450, 31477)
similarity between all users
remove self-similarity
shape of similarity matrix:  (450, 450)
num movies:  450
get similar movies dct
0
50
100
150
200
250
300
350
400
find movies rated by users and their corresponding ratings
Evaluation on test s

145000
146000
147000
148000
149000
150000
151000
152000
153000
154000
155000
156000
157000
158000
159000
160000
161000
162000
163000
164000
165000
166000
167000
168000
169000
170000
171000
172000
173000
174000
175000
176000
177000
178000
179000
180000
181000
182000
183000
184000
185000
186000
187000
188000
189000
190000
191000
192000
193000
194000
195000
196000
197000
198000
199000
200000
201000
202000
203000
204000
205000
206000
207000
208000
209000
210000
211000
212000
213000
214000
215000
216000
217000
218000
219000
220000
221000
222000
223000
224000
225000
226000
update movie_segment_dct 

update user_movies_ratings_global_dct
save df_train_segment_train 

save df_train_segment_test 

save sim_movie_dct 

time taken: 1588791011.59


User Bin: 12 and Movie Bin: 3
define segment
overall shape:  (11369, 8)
sampling
train shape:  (10232, 8)
test shape:  (1137, 8)
find similar movies
Create a user-movie matrix with empty values
Shape User-Movie-Matrix:	(6445, 447)
fill in missing values

Shape User-Movie-Matrix:	(30973, 450)
fill in missing values with mean of each movie
(450, 30973)
similarity between all users
remove self-similarity
shape of similarity matrix:  (450, 450)
num movies:  450
get similar movies dct
0
50
100
150
200
250
300
350
400
find movies rated by users and their corresponding ratings
Evaluation on test set
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
update movie_segment_dct 

update user_movies_ratings_global_dct
save df_train_segment_train 

save df_train_segment_test 

save sim_movie_dct 

time taken: 1588792204.37


User Bin: 13 and Movie Bin: 5
define segment
overall shape:  (39377, 8)
sampling
train shape:  (35439, 8)
test shape:  (3938, 8)
find similar movies
Create

save df_train_segment_test 

save sim_movie_dct 

time taken: 1588793996.10


User Bin: 1 and Movie Bin: 2
define segment
overall shape:  (489, 8)
sampling
train shape:  (440, 8)
test shape:  (49, 8)
find similar movies
Create a user-movie matrix with empty values
Shape User-Movie-Matrix:	(423, 269)
fill in missing values with mean of each movie
(269, 423)
similarity between all users
remove self-similarity
shape of similarity matrix:  (269, 269)
num movies:  269
get similar movies dct
0
50
100
150
200
250
find movies rated by users and their corresponding ratings
Evaluation on test set
update movie_segment_dct 

update user_movies_ratings_global_dct
save df_train_segment_train 

save df_train_segment_test 

save sim_movie_dct 

time taken: 1588793996.77


User Bin: 4 and Movie Bin: 9
define segment
overall shape:  (252461, 8)
sampling
train shape:  (227215, 8)
test shape:  (25246, 8)
find similar movies
Create a user-movie matrix with empty values
Shape User-Movie-Matrix:	(29297, 450)

time taken: 1588795215.48


User Bin: 4 and Movie Bin: 3
define segment
overall shape:  (1769, 8)
sampling
train shape:  (1592, 8)
test shape:  (177, 8)
find similar movies
Create a user-movie matrix with empty values
Shape User-Movie-Matrix:	(1430, 420)
fill in missing values with mean of each movie
(420, 1430)
similarity between all users
remove self-similarity
shape of similarity matrix:  (420, 420)
num movies:  420
get similar movies dct
0
50
100
150
200
250
300
350
400
find movies rated by users and their corresponding ratings
Evaluation on test set
update movie_segment_dct 

update user_movies_ratings_global_dct
save df_train_segment_train 

save df_train_segment_test 

save sim_movie_dct 

time taken: 1588795216.50


User Bin: 5 and Movie Bin: 2
define segment
overall shape:  (1758, 8)
sampling
train shape:  (1582, 8)
test shape:  (176, 8)
find similar movies
Create a user-movie matrix with empty values
Shape User-Movie-Matrix:	(1425, 413)
fill in missing values with mean of eac

In [11]:
# baseline features
main_feats_calc(file_num=4, feat_type='baseline')

Task:  training
init 

read overall data
file: /Users/varunn/Documents/kaggle/netflix-prize-data/interim/user_data_4.h5


shape:  (26847523, 4)


bins for users and movies based on num_ratings
overall sampling 

save df_train 

save df_val 

save df_test 

feature calculation begins...

initialize feature dicts

convert Date to datetime format

mean ratings for movie

weighted mean ratings for movie

movie earliest date dct

movie latest date dct

user earliest date dct

user latest date dct

movie num ratings

user num ratings

mean ratings for user

save outputs to disk

mean_ratings_movie_dct

mean_ratings_user_dct

weighted_mean_ratings_movie_dct

user_earliest_date_dct

movie_earliest_date_dct

user_latest_date_dct

movie_latest_date_dct

movie_num_ratings

user_num_ratings

time taken for feature calculation: 160.54


In [20]:
# calc features on val and test sets
main_feats_calc(file_num=1, task='validation')

Task:  validation
reading val interim data from disk
baseline feature calculation on val

read feature dictionaries

mean_ratings_movie_dct

mean_ratings_user_dct

weighted_mean_ratings_movie_dct

user_earliest_date_dct

movie_earliest_date_dct

user_latest_date_dct

movie_latest_date_dct

movie_num_ratings

user_num_ratings

feature calculation begins...

convert Date to datetime format

convert to datetime format in date_dcts
mean_ratings_movie
mean_ratings_user
weighted_mean_ratings_movie
days_since_first_user_rating
sqrt_days_since_first_user_rating
rating_age_days_user
rating_age_weeks_user
rating_age_months_user
days_since_first_movie_rating
sqrt_days_since_first_movie_rating
rating_age_days_movie
rating_age_weeks_movie
rating_age_months_movie
num_ratings_movie
num_ratings_user
xxxxxxxxxxxxxxxx


neighbourhood feature calculation on val

identify segments on which training was done
number of segments:  94
read sim_movie_dcts
read movie_segment_dct
combine sim_movie_dcts
Movie:  1



Movie:  2459
num segments:  13


Movie:  2461
num segments:  11


Movie:  2474
num segments:  12


Movie:  2483
num segments:  13


Movie:  2488
num segments:  14


Movie:  2491
num segments:  14


Movie:  2492
num segments:  12


Movie:  2541
num segments:  12


num completed:  250


Movie:  2546
num segments:  12


Movie:  2556
num segments:  14


Movie:  2557
num segments:  13


Movie:  2570
num segments:  13


Movie:  2581
num segments:  14


Movie:  2583
num segments:  13


Movie:  2584
num segments:  13


Movie:  2586
num segments:  11


Movie:  2600
num segments:  12


Movie:  2605
num segments:  12


Movie:  2630
num segments:  14


Movie:  2652
num segments:  14


Movie:  2664
num segments:  12


Movie:  2669
num segments:  13


Movie:  2671
num segments:  13


Movie:  2679
num segments:  14


Movie:  2696
num segments:  14


Movie:  2700
num segments:  14


Movie:  2715
num segments:  14


Movie:  2721
num segments:  12


Movie:  2729
num segments:  14


Movie:  2750
num se



Movie:  811
num segments:  12


Movie:  818
num segments:  13


Movie:  819
num segments:  13


Movie:  831
num segments:  13


Movie:  851
num segments:  13


Movie:  862
num segments:  13


Movie:  872
num segments:  13


Movie:  886
num segments:  13


Movie:  896
num segments:  13


Movie:  897
num segments:  13


Movie:  937
num segments:  13


Movie:  940
num segments:  13


Movie:  962
num segments:  13


Movie:  963
num segments:  13


Movie:  985
num segments:  13


Movie:  989
num segments:  13


Movie:  994
num segments:  13


Movie:  996
num segments:  13


Movie:  1020
num segments:  13


Movie:  1027
num segments:  13


Movie:  1035
num segments:  13


Movie:  1046
num segments:  13


Movie:  1050
num segments:  13


Movie:  1066
num segments:  13


num completed:  550


Movie:  1073
num segments:  13


Movie:  1096
num segments:  13


Movie:  1102
num segments:  13


Movie:  1110
num segments:  13


Movie:  1138
num segments:  13


Movie:  1144
num segments:  13


Movi

Movie:  3385
num segments:  13


Movie:  3414
num segments:  13


Movie:  3418
num segments:  13


Movie:  3427
num segments:  13


Movie:  3433
num segments:  13


Movie:  3434
num segments:  13


Movie:  3437
num segments:  13


Movie:  3446
num segments:  13


Movie:  3463
num segments:  13


Movie:  3466
num segments:  13


Movie:  3478
num segments:  13


Movie:  3489
num segments:  13


Movie:  3522
num segments:  13


Movie:  3526
num segments:  13


Movie:  3535
num segments:  13


Movie:  3538
num segments:  13


Movie:  3541
num segments:  13


Movie:  3544
num segments:  13


num completed:  800


Movie:  3551
num segments:  13


Movie:  3579
num segments:  13


Movie:  3605
num segments:  13


Movie:  3610
num segments:  13


Movie:  3611
num segments:  13


Movie:  3612
num segments:  13


Movie:  3617
num segments:  13


Movie:  3624
num segments:  13


Movie:  3626
num segments:  13


Movie:  3638
num segments:  13


Movie:  3648
num segments:  13


Movie:  3650
num segm



Movie:  1442
num segments:  12


Movie:  1449
num segments:  10


Movie:  1463
num segments:  8


Movie:  1472
num segments:  11


Movie:  1474
num segments:  11


Movie:  1493
num segments:  12


Movie:  1512
num segments:  13


Movie:  1516
num segments:  8


Movie:  1522
num segments:  11


Movie:  1534
num segments:  11


Movie:  1537
num segments:  11


Movie:  1545
num segments:  10


Movie:  1568
num segments:  12


num completed:  1050


Movie:  1584
num segments:  11


Movie:  1587
num segments:  12


Movie:  1603
num segments:  11


Movie:  1624
num segments:  7


Movie:  1643
num segments:  11


Movie:  1653
num segments:  12


Movie:  1662
num segments:  8


Movie:  1663
num segments:  11


Movie:  1675
num segments:  12


Movie:  1680
num segments:  12


Movie:  1688
num segments:  8


Movie:  1701
num segments:  9


Movie:  1742
num segments:  12


Movie:  1747
num segments:  11


Movie:  1755
num segments:  11


Movie:  1763
num segments:  11


Movie:  1813
num segment



Movie:  4000
num segments:  10


Movie:  4003
num segments:  8


num completed:  1300


Movie:  4006
num segments:  11


Movie:  4022
num segments:  8


Movie:  4023
num segments:  12


Movie:  4024
num segments:  9


Movie:  4058
num segments:  12


Movie:  4059
num segments:  12


Movie:  4065
num segments:  11


Movie:  4075
num segments:  11


Movie:  4083
num segments:  12


Movie:  4090
num segments:  13


Movie:  4107
num segments:  12


Movie:  4113
num segments:  12


Movie:  4114
num segments:  9


Movie:  4142
num segments:  11


Movie:  4158
num segments:  12


Movie:  4180
num segments:  10


Movie:  4183
num segments:  9


Movie:  4188
num segments:  13


Movie:  4190
num segments:  11


Movie:  4191
num segments:  10


Movie:  4195
num segments:  11


Movie:  4202
num segments:  9


Movie:  4205
num segments:  13


Movie:  4235
num segments:  12


Movie:  4243
num segments:  13


Movie:  4277
num segments:  10


Movie:  4303
num segments:  8


Movie:  4307
num segments



Movie:  2429
num segments:  8


Movie:  2445
num segments:  7


Movie:  2468
num segments:  8


Movie:  2469
num segments:  8


Movie:  2477
num segments:  7


Movie:  2480
num segments:  7


Movie:  2490
num segments:  8


Movie:  2494
num segments:  8


Movie:  2508
num segments:  8


Movie:  2511
num segments:  8


Movie:  2515
num segments:  8


Movie:  2517
num segments:  8


Movie:  2522
num segments:  8


Movie:  2526
num segments:  8


Movie:  2538
num segments:  8


Movie:  2544
num segments:  7


Movie:  2547
num segments:  7


Movie:  2567
num segments:  8


Movie:  2569
num segments:  8


Movie:  2579
num segments:  8


Movie:  2582
num segments:  8


Movie:  2597
num segments:  7


Movie:  2621
num segments:  8


Movie:  2631
num segments:  8


Movie:  2650
num segments:  8


Movie:  2661
num segments:  8


Movie:  2678
num segments:  8


Movie:  2682
num segments:  8


Movie:  2691
num segments:  8


Movie:  2706
num segments:  8


Movie:  2713
num segments:  8


Movie:



Movie:  470
num segments:  6


Movie:  490
num segments:  5


Movie:  497
num segments:  5


Movie:  514
num segments:  6


Movie:  526
num segments:  5


Movie:  547
num segments:  5


Movie:  552
num segments:  5


Movie:  553
num segments:  6


Movie:  556
num segments:  6


Movie:  558
num segments:  6


Movie:  566
num segments:  6


Movie:  572
num segments:  5


Movie:  576
num segments:  5


Movie:  589
num segments:  6


Movie:  606
num segments:  6


Movie:  609
num segments:  5


Movie:  613
num segments:  5


num completed:  1850


Movie:  617
num segments:  6


Movie:  620
num segments:  5


Movie:  625
num segments:  6


Movie:  628
num segments:  6


Movie:  632
num segments:  5


Movie:  637
num segments:  5


Movie:  647
num segments:  5


Movie:  655
num segments:  6


Movie:  663
num segments:  6


Movie:  694
num segments:  6


Movie:  712
num segments:  6


Movie:  719
num segments:  6


Movie:  738
num segments:  5


Movie:  764
num segments:  6


Movie:  768
nu



Movie:  3101
num segments:  5


Movie:  3112
num segments:  6


Movie:  3119
num segments:  6


Movie:  3127
num segments:  5


Movie:  3133
num segments:  6


Movie:  3149
num segments:  6


Movie:  3155
num segments:  6


Movie:  3157
num segments:  6


Movie:  3172
num segments:  6


Movie:  3176
num segments:  6


Movie:  3177
num segments:  5


Movie:  3187
num segments:  5


Movie:  3188
num segments:  6


Movie:  3209
num segments:  6


Movie:  3217
num segments:  5


Movie:  3231
num segments:  6


Movie:  3236
num segments:  6


Movie:  3240
num segments:  6


Movie:  3247
num segments:  6


Movie:  3249
num segments:  5


Movie:  3255
num segments:  5


Movie:  3258
num segments:  6


Movie:  3263
num segments:  6


Movie:  3273
num segments:  5


Movie:  3286
num segments:  6


Movie:  3287
num segments:  5


Movie:  3289
num segments:  5


Movie:  3303
num segments:  5


Movie:  3317
num segments:  5


Movie:  3325
num segments:  5


Movie:  3330
num segments:  6


Movie:



Movie:  1950
num segments:  5


Movie:  1952
num segments:  8


Movie:  1958
num segments:  7


Movie:  1979
num segments:  9


Movie:  1984
num segments:  7


Movie:  2005
num segments:  5


Movie:  2036
num segments:  9


Movie:  2038
num segments:  6


Movie:  2041
num segments:  6


Movie:  2065
num segments:  8


Movie:  2070
num segments:  8


Movie:  2082
num segments:  6


Movie:  2092
num segments:  8


Movie:  2115
num segments:  10


Movie:  2124
num segments:  9


Movie:  2157
num segments:  6


Movie:  2166
num segments:  8


Movie:  2179
num segments:  8


Movie:  2191
num segments:  7


Movie:  2201
num segments:  8


Movie:  2227
num segments:  7


Movie:  2245
num segments:  9


Movie:  2274
num segments:  9


Movie:  2280
num segments:  6


Movie:  2291
num segments:  6


Movie:  2313
num segments:  7


Movie:  2320
num segments:  6


Movie:  2336
num segments:  8


num completed:  2400


Movie:  2343
num segments:  8


Movie:  2345
num segments:  4


Movie:  2354
n



Movie:  907
num segments:  5


Movie:  908
num segments:  5


Movie:  918
num segments:  5


Movie:  919
num segments:  5


Movie:  953
num segments:  5


Movie:  954
num segments:  5


Movie:  992
num segments:  5


Movie:  1001
num segments:  5


Movie:  1011
num segments:  5


Movie:  1012
num segments:  5


Movie:  1026
num segments:  5


num completed:  2650


Movie:  1043
num segments:  5


Movie:  1058
num segments:  5


Movie:  1060
num segments:  5


Movie:  1068
num segments:  5


Movie:  1075
num segments:  5


Movie:  1080
num segments:  5


Movie:  1092
num segments:  5


Movie:  1094
num segments:  5


Movie:  1098
num segments:  5


Movie:  1100
num segments:  5


Movie:  1104
num segments:  5


Movie:  1123
num segments:  5


Movie:  1129
num segments:  5


Movie:  1134
num segments:  5


Movie:  1140
num segments:  5


Movie:  1151
num segments:  5


Movie:  1163
num segments:  5


Movie:  1176
num segments:  5


Movie:  1200
num segments:  5


Movie:  1221
num segme



Movie:  3513
num segments:  5


Movie:  3515
num segments:  5


num completed:  2900


Movie:  3521
num segments:  5


Movie:  3523
num segments:  5


Movie:  3542
num segments:  5


Movie:  3563
num segments:  5


Movie:  3567
num segments:  5


Movie:  3571
num segments:  5


Movie:  3573
num segments:  5


Movie:  3581
num segments:  5


Movie:  3582
num segments:  5


Movie:  3583
num segments:  5


Movie:  3590
num segments:  5


Movie:  3593
num segments:  5


Movie:  3608
num segments:  5


Movie:  3623
num segments:  5


Movie:  3640
num segments:  5


Movie:  3649
num segments:  5


Movie:  3668
num segments:  5


Movie:  3701
num segments:  5


Movie:  3703
num segments:  5


Movie:  3725
num segments:  5


Movie:  3736
num segments:  5


Movie:  3740
num segments:  5


Movie:  3743
num segments:  5


Movie:  3758
num segments:  5


Movie:  3777
num segments:  5


Movie:  3796
num segments:  5


Movie:  3801
num segments:  5


Movie:  3814
num segments:  5


Movie:  3816
nu



Movie:  398
num segments:  5


Movie:  400
num segments:  5


Movie:  401
num segments:  5


Movie:  408
num segments:  5


Movie:  418
num segments:  5


Movie:  427
num segments:  5


Movie:  431
num segments:  5


Movie:  432
num segments:  5


Movie:  436
num segments:  5


Movie:  438
num segments:  5


Movie:  452
num segments:  5


Movie:  454
num segments:  5


Movie:  456
num segments:  5


Movie:  463
num segments:  5


Movie:  474
num segments:  5


Movie:  476
num segments:  5


Movie:  477
num segments:  5


Movie:  488
num segments:  5


Movie:  504
num segments:  5


Movie:  505
num segments:  5


Movie:  518
num segments:  5


Movie:  561
num segments:  5


Movie:  575
num segments:  5


Movie:  577
num segments:  5


Movie:  579
num segments:  5


Movie:  582
num segments:  5


Movie:  585
num segments:  5


num completed:  3200


Movie:  586
num segments:  5


Movie:  588
num segments:  5


Movie:  600
num segments:  5


Movie:  643
num segments:  5


Movie:  657
nu



Movie:  3159
num segments:  5


Movie:  3181
num segments:  5


Movie:  3183
num segments:  5


num completed:  3450


Movie:  3190
num segments:  5


Movie:  3207
num segments:  5


Movie:  3208
num segments:  5


Movie:  3210
num segments:  5


Movie:  3212
num segments:  5


Movie:  3220
num segments:  5


Movie:  3229
num segments:  5


Movie:  3237
num segments:  5


Movie:  3242
num segments:  5


Movie:  3264
num segments:  5


Movie:  3266
num segments:  5


Movie:  3268
num segments:  5


Movie:  3295
num segments:  5


Movie:  3301
num segments:  5


Movie:  3313
num segments:  5


Movie:  3314
num segments:  5


Movie:  3324
num segments:  5


Movie:  3348
num segments:  5


Movie:  3376
num segments:  5


Movie:  3394
num segments:  5


Movie:  3398
num segments:  5


Movie:  3403
num segments:  5


Movie:  3404
num segments:  5


Movie:  3405
num segments:  5


Movie:  3410
num segments:  5


Movie:  3416
num segments:  5


Movie:  3422
num segments:  5


Movie:  3451
nu



Movie:  1762
num segments:  14


Movie:  1815
num segments:  14


Movie:  1820
num segments:  14


Movie:  1824
num segments:  14


Movie:  1826
num segments:  12


Movie:  1846
num segments:  12


Movie:  1863
num segments:  11


Movie:  1879
num segments:  13


Movie:  1898
num segments:  12


Movie:  1904
num segments:  13


Movie:  1927
num segments:  11


Movie:  1929
num segments:  11


Movie:  1930
num segments:  12


Movie:  1936
num segments:  12


Movie:  1954
num segments:  14


Movie:  1961
num segments:  12


Movie:  1966
num segments:  14


Movie:  1980
num segments:  10


Movie:  1992
num segments:  13


Movie:  2002
num segments:  12


Movie:  2003
num segments:  14


Movie:  2042
num segments:  13


Movie:  2049
num segments:  11


Movie:  2066
num segments:  14


Movie:  2084
num segments:  13


Movie:  2094
num segments:  12


Movie:  2105
num segments:  12


Movie:  2123
num segments:  13


Movie:  2134
num segments:  12


Movie:  2147
num segments:  13


Movie:  



Movie:  602
num segments:  6


Movie:  624
num segments:  6


Movie:  634
num segments:  6


Movie:  641
num segments:  6


Movie:  642
num segments:  6


Movie:  667
num segments:  6


Movie:  673
num segments:  6


Movie:  685
num segments:  6


Movie:  690
num segments:  6


Movie:  718
num segments:  6


Movie:  724
num segments:  6


Movie:  739
num segments:  6


Movie:  755
num segments:  6


Movie:  776
num segments:  6


Movie:  783
num segments:  6


Movie:  799
num segments:  6


Movie:  806
num segments:  6


Movie:  824
num segments:  6


Movie:  837
num segments:  6


Movie:  839
num segments:  6


Movie:  849
num segments:  6


Movie:  857
num segments:  6


Movie:  871
num segments:  6


Movie:  873
num segments:  6


Movie:  880
num segments:  6


Movie:  881
num segments:  6


Movie:  898
num segments:  6


Movie:  929
num segments:  6


Movie:  934
num segments:  6


Movie:  943
num segments:  6


Movie:  947
num segments:  6


Movie:  949
num segments:  6


Movie:



Movie:  3334
num segments:  6


Movie:  3354
num segments:  6


Movie:  3357
num segments:  6


Movie:  3367
num segments:  6


Movie:  3374
num segments:  6


Movie:  3375
num segments:  6


Movie:  3384
num segments:  6


Movie:  3390
num segments:  6


Movie:  3402
num segments:  6


Movie:  3419
num segments:  6


Movie:  3421
num segments:  6


Movie:  3438
num segments:  6


Movie:  3442
num segments:  6


Movie:  3449
num segments:  6


Movie:  3455
num segments:  6


Movie:  3461
num segments:  6


Movie:  3464
num segments:  6


Movie:  3486
num segments:  6


Movie:  3524
num segments:  6


Movie:  3530
num segments:  6


num completed:  4250


Movie:  3533
num segments:  6


Movie:  3546
num segments:  6


Movie:  3569
num segments:  5


Movie:  3591
num segments:  6


Movie:  3613
num segments:  6


Movie:  3622
num segments:  6


Movie:  3625
num segments:  6


Movie:  3634
num segments:  6


Movie:  3659
num segments:  6


Movie:  3666
num segments:  6


Movie:  3667
nu



Movie:  4308
num segments:  5


Movie:  4468
num segments:  5


Movie:  2709
num segments:  4


Movie:  3363
num segments:  3


save sim_dct_global
read user_movie_ratings_dct
calc weighted score based on sim_movie_dct_global
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
108000
109000
110000
111000
112000
113000
114000
115000
116000
117000
118000
119000
120000
121000
122000
123000
124000
125000
12600

97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
108000
109000
110000
111000
112000
113000
114000
115000
116000
117000
118000
119000
120000
121000
122000
123000
124000
125000
126000
127000
128000
129000
130000
131000
132000
133000
134000
135000
136000
137000
138000
139000
140000
141000
142000
143000
144000
145000
146000
147000
148000
149000
150000
151000
152000
153000
154000
155000
156000
157000
158000
159000
160000
161000
162000
163000
164000
165000
166000
167000
168000
169000
170000
171000
172000
173000
174000
175000
176000
177000
178000
179000
180000
181000
182000
183000
184000
185000
186000
187000
188000
189000
190000
191000
192000
193000
194000
195000
196000
197000
198000
199000
200000
201000
202000
203000
204000
205000
206000
207000
208000
209000
210000
211000
212000
213000
214000
215000
216000
217000
218000
219000
220000
221000
222000
223000
224000
225000
226000
227000
228000
229000
230000
231000
232000
233000
234000
235000
236000
237000
238000
239000
24

### Regression model to combine features from different approaches

In [10]:
file_num = 1
df_val = pd.read_hdf(FEATS_VAL_FN.format(file_num), key='stage')

df_test = pd.read_hdf(FEATS_TEST_FN.format(file_num), key='stage')

df_val.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

In [11]:
print(df_val.shape)
print(df_test.shape)

(962152, 119)
(240538, 119)


In [13]:
neighbourhood_cols = [x for x in df_val.columns if
                      x.startswith('pred_score_item_item_')]
base_cols = ['User', 'Rating', 'Date', 'Movie', 'num_rating_user_bins',
             'num_rating_movie_bins', 'num_rating_user',
             'num_rating_movie']
baseline_cols = [x for x in df_val.columns if (not
                 x.startswith('pred_score_item_item_')) and
                 (x not in base_cols) and (x.find('surpriseSVD') == -1)
                 and (x.find('lightfm') == -1)]
surprise_cols = [x for x in df_val.columns if x.find('surpriseSVD') != -1]
lightfm_cols = [x for x in df_val.columns if x.find('lightfm') != -1]
print(len(neighbourhood_cols))
print(neighbourhood_cols)
print(len(baseline_cols))
print(baseline_cols)
print(len(surprise_cols))
print(len(lightfm_cols))

1
['pred_score_item_item_neighbourhood_model']
15
['mean_ratings_movie', 'mean_ratings_user', 'weighted_mean_ratings_movie', 'days_since_first_user_rating', 'sqrt_days_since_first_user_rating', 'rating_age_days_user', 'rating_age_weeks_user', 'rating_age_months_user', 'days_since_first_movie_rating', 'sqrt_days_since_first_movie_rating', 'rating_age_days_movie', 'rating_age_weeks_movie', 'rating_age_months_movie', 'num_ratings_movie', 'num_ratings_user']
94
1


In [16]:
# Imputation with median
FEATS = neighbourhood_cols + baseline_cols + surprise_cols + lightfm_cols
print('number of features: ', len(FEATS))

df_val_imputed = df_val.copy()
df_test_imputed = df_test.copy()

medians = []
for feat in FEATS:
    print('Feature: ', feat)
    mask = df_val_imputed[feat].notnull()
    medians.append(df_val_imputed.loc[mask, feat].median())
    
impute_dct = dict(zip(FEATS, medians))
print(len(impute_dct))

print('save impute_dct')
json.dump(impute_dct, open(
    REG_MODEL_IMPUTE_DCT_FN.format(file_num), 'w'))

df_val_imputed.fillna(value=impute_dct, inplace=True)
df_test_imputed.fillna(value=impute_dct, inplace=True)

number of features:  111
Feature:  pred_score_item_item_neighbourhood_model
Feature:  mean_ratings_movie
Feature:  mean_ratings_user
Feature:  weighted_mean_ratings_movie
Feature:  days_since_first_user_rating
Feature:  sqrt_days_since_first_user_rating
Feature:  rating_age_days_user
Feature:  rating_age_weeks_user
Feature:  rating_age_months_user
Feature:  days_since_first_movie_rating
Feature:  sqrt_days_since_first_movie_rating
Feature:  rating_age_days_movie
Feature:  rating_age_weeks_movie
Feature:  rating_age_months_movie
Feature:  num_ratings_movie
Feature:  num_ratings_user
Feature:  pred_rating_surpriseSVD_U14_M8
Feature:  pred_rating_surpriseSVD_U0_M0
Feature:  pred_rating_surpriseSVD_U0_M1
Feature:  pred_rating_surpriseSVD_U0_M2
Feature:  pred_rating_surpriseSVD_U0_M3
Feature:  pred_rating_surpriseSVD_U0_M5
Feature:  pred_rating_surpriseSVD_U0_M9
Feature:  pred_rating_surpriseSVD_U10_M1
Feature:  pred_rating_surpriseSVD_U10_M2
Feature:  pred_rating_surpriseSVD_U10_M3
Feature

In [17]:
# get train and test data for training and validation
x_train, y_train = df_val_imputed[FEATS], df_val_imputed['Rating']
x_test, y_test = df_test_imputed[FEATS], df_test_imputed['Rating']

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(962152, 111) (962152,)
(240538, 111) (240538,)


In [18]:
from sklearn import linear_model, metrics

In [19]:
alphas=[x/10. for x in range(1, 41)]
ridgecv_model = linear_model.RidgeCV(alphas=alphas,
                                     scoring='neg_mean_squared_error',
                                     cv=4)

In [20]:
%time ridgecv_model.fit(x_train, y_train)


Ill-conditioned matrix (rcond=5.8782e-17): result may not be accurate.


Ill-conditioned matrix (rcond=5.87383e-17): result may not be accurate.


Ill-conditioned matrix (rcond=5.86958e-17): result may not be accurate.


Ill-conditioned matrix (rcond=5.86836e-17): result may not be accurate.



CPU times: user 8min 50s, sys: 5min 11s, total: 14min 2s
Wall time: 8min 59s


RidgeCV(alphas=array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3,
       1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2, 2.3, 2.4, 2.5, 2.6,
       2.7, 2.8, 2.9, 3. , 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9,
       4. ]),
    cv=4, fit_intercept=True, gcv_mode=None, normalize=False,
    scoring='neg_mean_squared_error', store_cv_values=False)

In [21]:
pred_train = ridgecv_model.predict(x_train)
pred_test = ridgecv_model.predict(x_test)

print('Train RMSE: ', np.sqrt(mean_squared_error(
    y_true=y_train.values, y_pred=pred_train)))
print('Test RMSE: ', np.sqrt(mean_squared_error(
    y_true=y_test.values, y_pred=pred_test)))

Train RMSE:  0.9284441802623734
Test RMSE:  0.9255941701895232


In [26]:
print('Feature Importance')
feat_imp_df = pd.DataFrame({'feature': FEATS,
                            'coef': ridgecv_model.coef_})
feat_imp_df['abs_coef'] = feat_imp_df['coef'].apply(lambda x: abs(x))
feat_imp_df.sort_values('abs_coef', ascending=False, inplace=True)
feat_imp_df.reset_index(drop=True, inplace=True)
print(feat_imp_df.head(30))

Feature Importance
                           feature      coef  abs_coef
0   pred_rating_surpriseSVD_U14_M6  0.937429  0.937429
1   pred_rating_surpriseSVD_U14_M5  0.858041  0.858041
2   pred_rating_surpriseSVD_U14_M7  0.822948  0.822948
3                mean_ratings_user  0.801861  0.801861
4   pred_rating_surpriseSVD_U14_M3  0.676849  0.676849
5   pred_rating_surpriseSVD_U14_M4  0.623685  0.623685
6    pred_rating_surpriseSVD_U9_M0  0.606978  0.606978
7    pred_rating_surpriseSVD_U7_M0  0.498802  0.498802
8    pred_rating_surpriseSVD_U6_M0  0.453924  0.453924
9   pred_rating_surpriseSVD_U10_M1  0.379677  0.379677
10  pred_rating_surpriseSVD_U14_M2  0.377082  0.377082
11   pred_rating_surpriseSVD_U0_M0  0.335080  0.335080
12  pred_rating_surpriseSVD_U14_M8  0.330335  0.330335
13  pred_rating_surpriseSVD_U14_M0  0.306431  0.306431
14  pred_rating_surpriseSVD_U12_M9  0.297545  0.297545
15   pred_rating_surpriseSVD_U8_M0  0.283201  0.283201
16   pred_rating_surpriseSVD_U6_M2  0.272279  

In [27]:
# Save
if not os.path.isfile(REG_MODEL_PRED_VAL_FN.format(file_num)):
    df_val['pred_model1'] = pred_train
    df_test['pred_model1'] = pred_test
    needed_cols = ['User', 'Movie', 'Date', 'Rating', 'pred_model1']

    print('save')
    joblib.dump(ridgecv_model, open(REG_MODEL_OBJ_FN.format(file_num),
                                    'wb'))
    df_val[needed_cols].to_hdf(
        REG_MODEL_PRED_VAL_FN.format(file_num), key='stage', mode='w')
    df_test[needed_cols].to_hdf(
        REG_MODEL_PRED_TEST_FN.format(file_num), key='stage', mode='w')

else:
    print('File exists. Reading from disk...')
    df_val_pred = pd.read_hdf(REG_MODEL_PRED_VAL_FN.format(file_num),
                              key='stage')
    df_test_pred = pd.read_hdf(REG_MODEL_PRED_TEST_FN.format(file_num),
                               key='stage')
    
    df_val['pred_model2'] = pred_train
    df_test['pred_model2'] = pred_test
    needed_cols = ['User', 'Movie', 'Date', 'Rating', 'pred_model2']
    
    df_val_pred = pd.merge(df_val_pred, df_val[needed_cols],
                           on=['User', 'Movie', 'Date', 'Rating'])
    df_test_pred = pd.merge(df_test_pred, df_test[needed_cols],
                            on=['User', 'Movie', 'Date', 'Rating'])
    
    df_val_pred.to_hdf(
        REG_MODEL_PRED_VAL_FN.format(file_num), key='stage', mode='w')
    df_test_pred.to_hdf(
        REG_MODEL_PRED_TEST_FN.format(file_num), key='stage', mode='w')

File exists. Reading from disk...


In [30]:
pred_scores = [('model1', 'pred_model1'), ('model2', 'pred_model2')]

for model, pred_score in pred_scores:
    print('%s Test RMSE: %0.4f' % (model, np.sqrt(mean_squared_error(
        y_true=df_test_pred['Rating'].values,
        y_pred=df_test_pred[pred_score].values))))

model1 Test RMSE: 0.9353
model2 Test RMSE: 0.9256


### Things done so far
1. Exploratory analysis to get an idea of what baseline features would be useful
2. custom item-item neighbourhood model
3. MF using SVD - surprise library
4. MF using lightfm
5. Regression model to combine features from the above approaches

### Next steps
1. Using content data - movie titles
2. Jointly train embeddings for users and movies (by including titles data)
3. Hyperparameter tuning of lightfm model
4. Training on all the datasets
5. Deep learning approaches
6. Algorithms that can be used on streaming data - not related to netflix problem