In [1]:
import pandas as pd
import numpy as np
import sys
import os
import pickle as pkl
import os

# 02C - Train, CV, and Test Sets
In this notebook, we merge the externally collected item data with the ratings data so that every row of the resulting dataset contains a user-item pair, the star rating for that pair, and relevant features for that item.

### Import combined item data

In [11]:
# Relative locations of the two data directories used throughout this notebook.
data_path_1 = os.path.join('..', 'data')
data_path_2 = os.path.join('..', 'data-2')

# The combined item data is stored as a pickled object; only unpickle files
# from a trusted source, since unpickling can execute arbitrary code.
item_data_file = os.path.join(data_path_2, 'item_data.df')
with open(item_data_file, 'rb') as pickled_items:
    item_data = pkl.load(pickled_items)

# Show a handful of random rows as a sanity check.
item_data.sample(5)

Unnamed: 0,asin,title,box_office,country,language,metascore,mpaa_rating,runtime,type,year,vfx,genres_imdb,studios_imdb,imdb_rating,imdb_votes,directors,genres_amazon,actors,studios_amazon,sales_rank
4173,B000059MQ3,The Yards,,[usa],"[english, spanish]",58.0,R,115.0,movie,2000,True,"[crime, drama, romance, thriller]","[miramax, industry, entertainment]",6.4,17308.0,[james gray],[],"[mark wahlberg, joaquin phoenix, charlize ther...",[miramax],190215.0
3713,B001UV4XGU,Sunshine Cleaning,12033702.0,[usa],"[english, spanish]",61.0,R,91.0,movie,2008,True,"[comedy, drama]","[overture, film, big, beach, film, back, lot, ...",6.9,63888.0,[christine jeffs],[drama],"[amy adams, emily blunt]","[overture, film, anchor, bay, entertainment]",29713.0
2580,6303139108,Drugstore Cowboy,,[usa],[english],82.0,R,102.0,movie,1989,False,"[crime, drama]","[avenue, picture]",7.4,27162.0,[gus van sant],[],"[matt dillon, kelly lynch, james le gros, heat...","[live, artisan]",338967.0
2699,076781505X,Les Miserables,,[uk],[english],,,150.0,movie,1978,False,"[drama, history]","[incorporated, television, company, norman, ro...",7.4,1274.0,[glenn jordan],[],"[liam neeson, geoffrey rush, uma thurman, clai...","[sony, picture, home, entertainment]",9759.0
2349,B0000C8AVT,Too Late for Tears,,[usa],[english],,,99.0,movie,1949,True,"[crime, drama, film-noir, mystery, thriller]","[hunt, stromberg, production, republic, pictur...",7.4,2376.0,[byron haskin],[],"[lizabeth scott, don defore, dan duryea, arthu...","[alpha, video]",97091.0


### Import ratings data

In [8]:
# Read the sampled reviews and keep only the item id, reviewer id, and star
# rating, in that order (the merge step below relies on these three columns).
ratings_csv = os.path.join(data_path_1, 'reviews_sample_100.csv')
rating_columns = ['asin', 'reviewerID', 'overall']
ratings = pd.read_csv(ratings_csv)[rating_columns]

### Merge item data with ratings data
Because we perform an inner merge between the ratings dataset and the items dataset, any user-item pair whose item could not be successfully queried on OMDb is dropped. This results in approximately 27,000 of the roughly 440,000 ratings being lost.

In [27]:
# Inner merge on 'asin': a rating is kept only if its item also appears in
# item_data; ratings for items missing from item_data are dropped.
#
# The three ratings columns are renamed by NAME rather than by position, so the
# labels stay correct even if the column order of either input ever changes.
all_data = (
    pd.merge(ratings, item_data, how='inner', on='asin')
    .rename(columns={'asin': 'item', 'reviewerID': 'user', 'overall': 'rating'})
)

# Show a handful of random merged rows as a sanity check.
all_data.sample(5)

Unnamed: 0,item,user,rating,title,box_office,country,language,metascore,mpaa_rating,runtime,...,vfx,genres_imdb,studios_imdb,imdb_rating,imdb_votes,directors,genres_amazon,actors,studios_amazon,sales_rank
48360,B00H7KJRVY,A128DUUMZLGUFB,5.0,The Secret Life of Walter Mitty,33223430.0,"[usa, uk]","[english, spanish, icelandic]",54.0,PG,113.0,...,True,"[adventure, comedy, drama, fantasy, romance]","[twentieth, century, fox, tsg, entertainment, ...",7.3,257181.0,[ben stiller],"[action, adventure]","[ben stiller, kristen wiig, adam scott]","[20th, century, fox]",3035.0
39964,B0059XTUVI,A2809U2S14N0K1,4.0,The Iron Lady,29959436.0,"[uk, france]",[english],54.0,PG-13,105.0,...,True,"[biography, drama]","[pathé, film4, uk, film, council, the, weinste...",6.4,89595.0,[phyllida lloyd],[],"[meryl streep, jim broadbent]","[the, weinstein, company]",20864.0
160027,B00005JOC9,A1L9E4OA926W9B,3.0,The Da Vinci Code,217536138.0,"[usa, malta, france, uk]","[english, french, latin, spanish]",46.0,PG-13,149.0,...,True,"[mystery, thriller]","[columbia, picture, imagine, entertainment, sk...",6.6,347847.0,[ron howard],[],"[paul bettany, jürgen prochnow, jean reno, tom...","[sony, picture, home, entertainment]",12805.0
261163,B00005V1WW,A82LIVYSX6WZ9,2.0,The One,,[usa],[english],25.0,PG-13,87.0,...,True,"[action, sci-fi, thriller]","[revolution, studio, hard, eight, picture, won...",5.9,82150.0,[james wong],"[science, fiction]","[jason statham, delroy lindo, carla gugino, je...","[sony, picture, home, entertainment]",65673.0
150554,630518299X,AGLMZJVUGSOD5,4.0,The Truman Show,,[usa],[english],90.0,PG,103.0,...,True,"[comedy, drama, sci-fi]","[paramount, picture, scott, rudin, production]",8.1,749948.0,[peter weir],[],"[jim carrey, ed harris, laura linney, noah emm...",[paramount],82659.0


### Split data into train, cross-validation, and test sets
While common practice is to split data into 60-20-20 or 70-15-15 train/cv/test sets, given the large amount of data available (over 400,000 ratings), an 80-10-10 split still provides enough data for cross-validation and testing to be just as effective. We also maintain a combined train+cross-validation set for faster algorithms, where it remains feasible to perform k-fold cross-validation.

In [42]:
# Fixed seed so that re-running the notebook regenerates exactly the same
# train/dev/test membership; without it every run yields different splits and
# results computed against previously saved splits cannot be reproduced.
SPLIT_SEED = 42

# shuffle data (frac=1 returns every row, in random order)
n_ratings = all_data.shape[0]
all_data_shuffled = all_data.sample(frac=1, random_state=SPLIT_SEED)

# Row positions at which the shuffled data is cut: 80% train / 10% dev / 10% test.
train_end = int(0.8 * n_ratings)
dev_end = int(0.9 * n_ratings)

train_df = all_data_shuffled.iloc[:train_end, :]
dev_df = all_data_shuffled.iloc[train_end:dev_end, :]
# train+dev is the union of the two splits above (first 90% of the shuffle).
train_dev_df = all_data_shuffled.iloc[:dev_end, :]
test_df = all_data_shuffled.iloc[dev_end:, :]

# The splits must partition the data: nothing lost, nothing duplicated.
assert train_df.shape[0] + dev_df.shape[0] + test_df.shape[0] == n_ratings
assert train_dev_df.shape[0] == train_df.shape[0] + dev_df.shape[0]

print('train set: {} ratings'.format(train_df.shape[0]))
print('dev set: {} ratings'.format(dev_df.shape[0]))
print('train+dev set: {} ratings'.format(train_dev_df.shape[0]))
print('test set: {} ratings'.format(test_df.shape[0]))

train set: 331561 ratings
dev set: 41445 ratings
train+dev set: 373006 ratings
test set: 41446 ratings


### Save splits

In [44]:
# Make sure the output directory exists; on a fresh checkout open() would
# otherwise fail with FileNotFoundError.
splits_dir = os.path.join(data_path_2, 'splits')
os.makedirs(splits_dir, exist_ok=True)

# File name -> DataFrame to pickle under that name inside splits_dir.
splits_to_save = {
    'train.df': train_df,
    'dev.df': dev_df,
    'train_dev.df': train_dev_df,
    'test.df': test_df,
}

for split_filename, split_df in splits_to_save.items():
    with open(os.path.join(splits_dir, split_filename), 'wb') as file_out:
        pkl.dump(split_df, file_out)