In [7]:
import pandas as pd
import numpy as np
import sys
import os
import pickle as pkl
import os

# 02C - Train, CV, and Test Sets
In this notebook, we merge the externally collected item data with the ratings data so that every row of the resulting dataset contains a user-item pair, the star rating for that pair, and relevant features for that item. The merged data is then shuffled, split into train, cross-validation (dev), and test sets, and saved to disk.

### Import combined item data

In [8]:
# Two relative data directories used throughout this notebook.
data_path_1 = os.path.join('..', '..', 'data')
data_path_2 = os.path.join('..', '..', 'data-2')

# Unpickle the combined item table stored under data_path_2.
# NOTE(review): pickle can execute arbitrary code on load; this presumably reads
# a locally generated file -- confirm item_data.df comes from a trusted source.
item_data_file = os.path.join(data_path_2, 'item_data.df')
with open(item_data_file, 'rb') as pickled_items:
    item_data = pkl.load(pickled_items)

# Preview five randomly chosen rows.
item_data.sample(5)

Unnamed: 0,asin,title,box_office,country,language,metascore,mpaa_rating,runtime,type,year,vfx,genres_imdb,studios_imdb,imdb_rating,imdb_votes,directors,genres_amazon,actors,studios_amazon,sales_rank
1313,B0009PW4D2,H.G. Wells' War of the Worlds: Behind the Scenes,,[usa],[english],,,,movie,2005,False,[sci-fi],"[asylum, the]",6.0,17.0,[lee mclaughlin],[],"[anthony piana, jack clay, james lathrop, darl...",[allumination],133759.0
6,B0007989Y8,L'Eclisse,,"[italy, france]","[italian, english]",,,126.0,movie,1962,False,"[drama, romance]","[cineriz, interopa, film, paris, film]",7.9,13239.0,[michelangelo antonioni],[],"[alain delon, monica vitti, francisco rabal, l...",[criterion],112515.0
1347,6300247392,Taps,,[usa],[english],49.0,PG,126.0,movie,1981,True,[drama],"[twentieth, century, fox]",6.7,14140.0,[harold becker],"[military, war]","[george c. scott, timothy hutton, ronny cox, s...","[fox, home, entertainme]",258195.0
5166,630518299X,The Truman Show,,[usa],[english],90.0,PG,103.0,movie,1998,True,"[comedy, drama, sci-fi]","[paramount, picture, scott, rudin, production]",8.1,749948.0,[peter weir],[],"[jim carrey, ed harris, laura linney, noah emm...",[paramount],82659.0
921,B00008972O,Killing Me Softly,,"[usa, uk]",[english],,R,100.0,movie,2002,True,"[drama, mystery, romance, thriller]","[metrogoldwynmayer, (, mgm, ), montecito, pict...",5.5,15287.0,[kaige chen],[drama],"[heather graham, joseph fiennes, natascha mcel...","[mgm, video, dvd]",43625.0


### Import ratings data

In [9]:
# Read the reviews CSV and keep only the three columns used downstream:
# the item identifier, the reviewer identifier, and the rating value.
rating_columns = ['asin', 'reviewerID', 'overall']
ratings_csv = os.path.join(data_path_1, 'reviews_sample_100.csv')
ratings = pd.read_csv(ratings_csv)[rating_columns]

### Merge item data with ratings data
Because we perform an inner merge between the ratings dataset and the items dataset, user-item pairs whose item could not be successfully queried on OMDb are dropped. As a result, approximately 27,000 of the 440,000 ratings are lost.

In [10]:
# Inner-join ratings with item features on the shared 'asin' key; ratings whose
# item has no row in item_data are dropped by the inner merge.
# The three rating columns are renamed by NAME rather than by position, so the
# labels stay correct even if the column order of the merged frame changes.
all_data = (
    pd.merge(ratings, item_data, how='inner', on='asin')
    .rename(columns={'asin': 'item', 'reviewerID': 'user', 'overall': 'rating'})
)

# Kept for backward compatibility: list of the merged frame's column names.
columns = list(all_data.columns)

# Preview five randomly chosen merged rows.
all_data.sample(5)

Unnamed: 0,item,user,rating,title,box_office,country,language,metascore,mpaa_rating,runtime,...,vfx,genres_imdb,studios_imdb,imdb_rating,imdb_votes,directors,genres_amazon,actors,studios_amazon,sales_rank
16999,0790729385,A2DWWZ6QIWBJ8D,4.0,The Exorcist,,[usa],"[english, latin, greek, french, german, arabic...",82.0,R,122.0,...,True,[horror],"[warner, bro, hoya, production]",8.0,307435.0,[william friedkin],[],"[ellen burstyn, max von sydow, linda blair, le...","[warner, home, video]",205897.0
240099,B00008LUNW,AAMUCZO7DPXSO,3.0,Darkness Falls,32032215.0,"[usa, australia]",[english],23.0,PG-13,86.0,...,True,"[horror, mystery, thriller]","[revolution, studio, distant, corner, entertai...",5.0,27407.0,[jonathan liebesman],[],"[grant piro, chaney kley, emma caulfield]","[sony, picture, home, entertainment]",71743.0
15437,B00003CWQ2,A3BXS0A7CMJ1CJ,2.0,Fight Club,,"[usa, germany]",[english],66.0,R,139.0,...,True,[drama],"[fox, 2000, picture, regency, enterprise, lins...",8.8,1508906.0,[david fincher],[],[],[],425241.0
39639,B005LAIH2W,A1DK5AZMXS1QA3,5.0,John Carter,73058679.0,[usa],[english],51.0,PG-13,132.0,...,True,"[action, adventure, sci-fi]","[walt, disney, picture, bot, vfx]",6.6,225714.0,[andrew stanton],[],"[taylor kitsch, lynn collins]","[buena, vista]",4471.0
355036,B00BUADSMQ,A3956O40ZZYFMY,5.0,A Good Day to Die Hard,67344392.0,[usa],"[english, russian, hindi]",28.0,R,98.0,...,True,"[action, thriller]","[twentieth, century, fox, tsg, entertainment, ...",5.3,176973.0,[john moore],[],"[bruce willis, jai courtney]","[20th, century, fox]",8992.0


### Split data into train, cross-validation, and test sets
While common practice is to split data into 60-20-20 or 70-15-15 train/cv/test sets, given the large amount of data available (over 400,000 ratings), an 80-10-10 split provides enough data for cross-validation and testing to be just as effective. We also maintain a combined train+cross-validation set for faster algorithms where it remains possible to perform k-fold cross-validation.

In [11]:
# Shuffle all rows with a fixed seed so the splits are reproducible when the
# notebook is re-run; an unseeded shuffle yields different train/dev/test sets
# on every run.
RANDOM_SEED = 42
n_ratings = all_data.shape[0]
all_data_shuffled = all_data.sample(n_ratings, random_state=RANDOM_SEED)

# 80-10-10 cut points, computed once so adjacent slices share exact boundaries.
train_end = int(0.8 * n_ratings)
dev_end = int(0.9 * n_ratings)

train_df = all_data_shuffled.iloc[:train_end, :]
dev_df = all_data_shuffled.iloc[train_end:dev_end, :]
train_dev_df = all_data_shuffled.iloc[:dev_end, :]  # train and dev combined
test_df = all_data_shuffled.iloc[dev_end:, :]

# Sanity checks: the three splits partition the data, and train+dev is exactly
# the concatenation of the train and dev slices.
assert train_df.shape[0] + dev_df.shape[0] + test_df.shape[0] == n_ratings
assert train_df.shape[0] + dev_df.shape[0] == train_dev_df.shape[0]

print('train set: {} ratings'.format(train_df.shape[0]))
print('dev set: {} ratings'.format(dev_df.shape[0]))
print('train+dev set: {} ratings'.format(train_dev_df.shape[0]))
print('test set: {} ratings'.format(test_df.shape[0]))

train set: 331561 ratings
dev set: 41445 ratings
train+dev set: 373006 ratings
test set: 41446 ratings


### Save splits

In [12]:
# Pickle each split to <data_path_2>/splits/<file name>.
splits_dir = os.path.join(data_path_2, 'splits')

# Create the output directory if it is missing; opening a file for writing in a
# non-existent directory would otherwise raise FileNotFoundError.
os.makedirs(splits_dir, exist_ok=True)

# (file name, DataFrame) pairs; one pickle file is written per pair.
splits_to_save = [
    ('train.df', train_df),
    ('dev.df', dev_df),
    ('train_dev.df', train_dev_df),
    ('test.df', test_df),
]

for split_file_name, split_df in splits_to_save:
    with open(os.path.join(splits_dir, split_file_name), 'wb') as file_out:
        pkl.dump(split_df, file_out)