In [1]:
#This version leaves more of the original dataset intact.  Instead of removing all users who have watched multiple movies on a single day 
#(this makes resolving state, next state order impossible given only date information and not datetime information), we remove just the affected data.
#This means we remove tuples from all movies watched by the user on the day they binge-watched, as well as the tuple from the prior viewing session 
#(for which we are unsure about the next state).

import datetime
import os
import pandas as pd
import sys
import numpy as np
import matplotlib.pyplot as plt
from collections import deque
import io


In [2]:
#download raw data from here: https://www.kaggle.com/netflix-inc/netflix-prize-data#README
#thanks to https://github.com/matthewkparker/Netflix_Recommender/blob/master/Code/01_Preprocessing.ipynb for help with data processing

#some resources below for enriching movie data
#most hopeful genre data https://www.igvita.com/2007/01/27/correlating-netflix-and-imdb-datasets/
#http://cns.bu.edu/~gsc/MovieGenre.html genre data
#alternatively can download data here: https://github.com/hadley/data-movies

def formatting(path):
    #Step 1
    df_raw = pd.read_csv(path, header=None, names=['user_id', 'rating', 'date'], usecols=[0, 1, 2])
    #Step 2
    tmp_movies = df_raw[df_raw['rating'].isna()]['user_id'].reset_index()
    movie_indices = [[index, int(movie[:-1])] for index, movie in tmp_movies.values]
    #Step 3
    shifted_movie_indices = deque(movie_indices)
    shifted_movie_indices.rotate(-1)
    #Step 4
    user_data = []
    for [df_id_1, movie_id], [df_id_2, next_movie_id] in zip(movie_indices, shifted_movie_indices):
        if df_id_1<df_id_2:
            tmp_df = df_raw.loc[df_id_1+1:df_id_2-1].copy()
        else:
            tmp_df = df_raw.loc[df_id_1+1:].copy()
        tmp_df['movie'] = movie_id
        user_data.append(tmp_df)
    #Step 5
    df = pd.concat(user_data)
    print('done formatting')
    return df

In [3]:
path2file1 = r'C:/Users/fxi/Documents/Decision_Making_Under_Uncertainty/final_project/combined_data_1.txt'
path2file2 = r'C:/Users/fxi/Documents/Decision_Making_Under_Uncertainty/final_project/combined_data_2.txt'
path2file3 = r'C:/Users/fxi/Documents/Decision_Making_Under_Uncertainty/final_project/combined_data_3.txt'
path2file4 = r'C:/Users/fxi/Documents/Decision_Making_Under_Uncertainty/final_project/combined_data_4.txt'

df1 = formatting(path2file1)
df2 = formatting(path2file2)
df3 = formatting(path2file3)
df4 = formatting(path2file4)

done formatting
done formatting
done formatting
done formatting


In [4]:
all_data=df1.append(df2.append(df3.append(df4)))
all_data.shape[0]

100480507

In [207]:
testing_data=all_data.loc[all_data['user_id'] == '1001129' ]
testing_data.to_csv('testing_data.csv')

#testing_data_dupes= pd.DataFrame({'count' : testing_data.groupby( [ 'user_id', 'date'] ).size()}).reset_index()
#to_delete=testing_data_dupes.loc[testing_data_dupes['count'] >= 2].user_id.unique()

#testing_data_dupes.loc[testing_data_dupes['count'] >= 2].sort_values(['user_id','date'], ascending=[True,True])

In [5]:
#remove data from days where one user watches multiple movies on the same day.  We can't tell which order movies were watched
#on these days because we only have dates and not datetimes.  Also keep track of how many movies were watched the next day and if we deleted that next days data.
#As we build the dataset later, we'll need to remember that days with the next_day_deleted flag won't have a next state or reward that we can use.
def remove_binge_dupes(testing_data):
    
    testing_data_dupes= pd.DataFrame({'daily_movie_count' : testing_data.groupby( [ 'user_id', 'date'] ).size()}).reset_index().sort_values(['user_id','date'], ascending=[True,True])
    daily_labeled_data = pd.merge(testing_data, testing_data_dupes,  how='left', left_on=['user_id','date'], right_on = ['user_id','date']).sort_values(['user_id','date'], ascending=[True,True])
    daily_labeled_data["next_day_movie_count"]=daily_labeled_data["daily_movie_count"].shift(-1)

    daily_labeled_data.loc[daily_labeled_data['next_day_movie_count'] >1.0, 'next_day_deleted'] = True
    daily_labeled_data.next_day_deleted.fillna(False, inplace=True)

    unique_daily_data= daily_labeled_data.loc[daily_labeled_data['daily_movie_count'] ==1]
    return unique_daily_data

binge_data_deduped=remove_binge_dupes(all_data)
binge_data_deduped.shape[0]
binge_data_deduped.to_csv('binge_data_deduped.csv')

In [81]:
#remove users that have multiple movies watched on the same day.  We can't tell which order movies were watched
#on these days because we only have dates and not datetimes
def remove_user_dupes(data):
    user_date_dupes=pd.DataFrame({'count' : data.groupby( [ 'user_id', 'date'] ).size()}).reset_index()
    deletions=user_date_dupes.loc[user_date_dupes['count'] >= 2].user_id.unique()
    data = data[~data['user_id'].isin(list(deletions))]
    return data

all_data_deduped=remove_user_dupes(all_data)
all_data_deduped.shape[0]

In [84]:
all_data_deduped.to_csv('all_data_deduped.csv')

In [8]:
#Note here we need to select whether to use data deduped by user/day or by user.  Here I'm using the bigger dataset, which dedupes by user/day.
all_data_deduped=binge_data_deduped

In [9]:
len(all_data_deduped.user_id.unique())

430595

In [10]:
user_df= all_data_deduped
movie_data = pd.read_csv("movie_titles.csv", header=None, encoding = "ISO-8859-1")
movie_data.columns = ['movie_id', 'release_year','movie_name']
netflix_data_cleaned=pd.merge(user_df, movie_data, left_on = 'movie', right_on = 'movie_id')

In [11]:
import scipy.io
import numpy as np

mat_path = r'C:/Users/fxi/Downloads/movieGenreBenchmark.mat'
#movieGenreData.mat'
genre_data = scipy.io.loadmat(mat_path)

movie_genre_mapping = pd.DataFrame(
    {'genre_id': genre_data['dataStructMovie'][0][0][1][:,0].tolist(),
     'movie_id':genre_data['dataStructMovie'][0][0][6][0].tolist()
    })    

k = [j[0] for j in genre_data['dataStructMovie'][0][0][8]] 

i=(list(range(1, len(k)+1)))

genre_names = pd.DataFrame(
    {'genre_id': i,
     'genre_name': k
    }) 

genre_mapping_cleaned=pd.merge(movie_genre_mapping, genre_names, left_on = 'genre_id', right_on = 'genre_id')
data=pd.merge(netflix_data_cleaned, genre_mapping_cleaned, left_on = 'movie_id', right_on = 'movie_id')

In [12]:
genre_names

Unnamed: 0,genre_id,genre_name
0,1,[Action & Adventure]
1,2,[Anime & Animation]
2,3,[Children & Family]
3,4,[Classics]
4,5,[Comedy]
5,6,[Documentary]
6,7,[Drama]
7,8,[Foreign]
8,9,[Horror]
9,10,[Music & Musicals]


In [89]:
data.to_csv('netflix_data_with_genre.csv')

In [21]:
sorted_data=data.sort_values(['user_id', 'date'], ascending=[True, True])

In [22]:
sorted_data["next_user_id"]=sorted_data["user_id"].shift(-1)
sorted_data['next_date']=sorted_data['date'].shift(-1)
sorted_data['next_movie']=sorted_data['movie_name'].shift(-1)
sorted_data['next_movie_id']=sorted_data['movie_id'].shift(-1)
sorted_data['next_rating']=sorted_data['rating'].shift(-1)
sorted_data['next_genre_id']=sorted_data['genre_id'].shift(-1)
sorted_data['next_genre']=sorted_data['genre_name'].shift(-1)

In [23]:
#Only keep movies where the next viewing day data has not been deleted.  This is the only data that is useful for seeing the next state and reward.
sorted_data= sorted_data.loc[sorted_data['next_day_deleted'] ==False]

sorted_data=sorted_data.assign(same_user=sorted_data.user_id == sorted_data.next_user_id)

In [24]:
sorted_data['movie_num'] = sorted_data.sort_values(['user_id','date'], ascending=[True,True]).groupby(['user_id']).cumcount() + 1


In [25]:
user_actions=sorted_data.groupby('user_id')['movie_num'].max()

In [26]:
sorted_data_with_actions=pd.merge(sorted_data, user_actions.to_frame(), left_on = 'user_id', right_on = 'user_id')

In [27]:
sorted_data_with_actions.rename(columns={'movie_num_x': 'movie_num', 'movie_num_y': 'total_movies_watched'}, inplace=True)
sorted_data_with_actions.head(10)

Unnamed: 0,user_id,rating,date,movie,daily_movie_count,next_day_movie_count,next_day_deleted,movie_id,release_year,movie_name,...,next_user_id,next_date,next_movie,next_movie_id,next_rating,next_genre_id,next_genre,same_user,movie_num,total_movies_watched
0,10,3.0,2003-02-14,1145,1,1.0,False,1145,2001.0,The Wedding Planner,...,10,2003-03-08,Brotherhood of the Wolf,8808.0,3.0,9.0,[Horror],True,1,10
1,10,2.0,2003-05-14,4683,1,1.0,False,4683,2000.0,Proof of Life,...,10,2003-05-21,Bridget Jones's Diary,299.0,2.0,11.0,[Romance],True,2,10
2,10,2.0,2003-05-21,299,1,1.0,False,299,2001.0,Bridget Jones's Diary,...,10,2003-05-30,Man Bites Dog,2986.0,1.0,8.0,[Foreign],True,3,10
3,10,1.0,2003-05-30,2986,1,1.0,False,2986,1992.0,Man Bites Dog,...,10,2003-06-15,My Big Fat Greek Wedding,6206.0,3.0,11.0,[Romance],True,4,10
4,10,3.0,2003-06-15,6206,1,1.0,False,6206,2002.0,My Big Fat Greek Wedding,...,10,2003-06-30,Divine Secrets of the Ya-Ya Sisterhood,6760.0,3.0,5.0,[Comedy],True,5,10
5,10,3.0,2003-06-30,6760,1,1.0,False,6760,2002.0,Divine Secrets of the Ya-Ya Sisterhood,...,10,2003-07-09,Sex and Lucia,16002.0,4.0,8.0,[Foreign],True,6,10
6,10,4.0,2003-07-09,16002,1,1.0,False,16002,2002.0,Sex and Lucia,...,10,2003-07-12,Knockaround Guys,16459.0,3.0,1.0,[Action & Adventure],True,7,10
7,10,3.0,2003-07-12,16459,1,1.0,False,16459,2002.0,Knockaround Guys,...,10,2003-07-29,Princess Mononoke,473.0,5.0,2.0,[Anime & Animation],True,8,10
8,10,5.0,2003-07-29,473,1,1.0,False,473,1997.0,Princess Mononoke,...,10,2003-10-15,Tully,4641.0,2.0,7.0,[Drama],True,9,10
9,10,2.0,2003-10-15,4641,1,1.0,False,4641,2002.0,Tully,...,10,2003-10-21,My First Mister,16862.0,4.0,5.0,[Comedy],True,10,10


In [28]:
#Delete last movie watched (next movie in the ordering is for a different user)
labeled_data=sorted_data_with_actions.loc[sorted_data_with_actions['same_user'] == True]

In [29]:
labeled_data.head()

Unnamed: 0,user_id,rating,date,movie,daily_movie_count,next_day_movie_count,next_day_deleted,movie_id,release_year,movie_name,...,next_user_id,next_date,next_movie,next_movie_id,next_rating,next_genre_id,next_genre,same_user,movie_num,total_movies_watched
0,10,3.0,2003-02-14,1145,1,1.0,False,1145,2001.0,The Wedding Planner,...,10,2003-03-08,Brotherhood of the Wolf,8808.0,3.0,9.0,[Horror],True,1,10
1,10,2.0,2003-05-14,4683,1,1.0,False,4683,2000.0,Proof of Life,...,10,2003-05-21,Bridget Jones's Diary,299.0,2.0,11.0,[Romance],True,2,10
2,10,2.0,2003-05-21,299,1,1.0,False,299,2001.0,Bridget Jones's Diary,...,10,2003-05-30,Man Bites Dog,2986.0,1.0,8.0,[Foreign],True,3,10
3,10,1.0,2003-05-30,2986,1,1.0,False,2986,1992.0,Man Bites Dog,...,10,2003-06-15,My Big Fat Greek Wedding,6206.0,3.0,11.0,[Romance],True,4,10
4,10,3.0,2003-06-15,6206,1,1.0,False,6206,2002.0,My Big Fat Greek Wedding,...,10,2003-06-30,Divine Secrets of the Ya-Ya Sisterhood,6760.0,3.0,5.0,[Comedy],True,5,10


In [42]:
labeled_data.to_csv('labeled_data.csv')

In [30]:
small_data = labeled_data[['genre_id','next_genre_id','next_rating']]

In [31]:
small_data.next_genre_id=small_data.next_genre_id.astype(int)
small_data.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


Unnamed: 0,genre_id,next_genre_id,next_rating
0,11,9,3.0
1,14,11,2.0
2,11,8,1.0
3,8,11,3.0
4,11,5,3.0
5,5,8,4.0
6,8,1,3.0
7,1,2,5.0
8,2,7,2.0
9,7,5,4.0


In [33]:
genre_list=list(range(1, 15))
genres=pd.DataFrame({'recommendation':genre_list})

states_and_actions = pd.merge(small_data.assign(key=0), genres.assign(key=0), on='key').drop('key', axis=1)

In [34]:
states_and_actions.loc[states_and_actions['next_genre_id'] == states_and_actions['recommendation'], 'fixed_rating'] = states_and_actions['next_rating']

In [35]:
states_and_actions.fixed_rating.fillna(-5, inplace=True)

In [36]:
final_labeled_data= states_and_actions[['genre_id','recommendation','next_genre_id','fixed_rating']]

In [39]:
final_labeled_data.rename(columns={'fixed_rating': 'next_rating'}, inplace=True)
final_labeled_data.head(25)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


Unnamed: 0,genre_id,recommendation,next_genre_id,next_rating
0,11,1,9,-5.0
1,11,2,9,-5.0
2,11,3,9,-5.0
3,11,4,9,-5.0
4,11,5,9,-5.0
5,11,6,9,-5.0
6,11,7,9,-5.0
7,11,8,9,-5.0
8,11,9,9,3.0
9,11,10,9,-5.0


In [40]:
final_labeled_data.shape[0]

43799238

In [None]:
final_labeled_data.to_csv('final_labeled_data.csv')

In [170]:
final_labeled_data.rename(columns={'genre_id': 's', 'recommendation': 'a', 'next_genre_id': 'sp', 'next_rating': 'r'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


Unnamed: 0,s,a,sp,r
0,7,1,1,3.0
1,7,2,1,-5.0
2,7,3,1,-5.0
3,7,4,1,-5.0
4,7,5,1,-5.0


In [184]:
from sklearn.model_selection import train_test_split

a=list(range(1, 15))
s=list(range(1, 15))


#predict R
x = final_labeled_data.drop('r', axis=1)
x = x.drop('sp', axis=1)
y = final_labeled_data['r']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20)

from sklearn.tree import DecisionTreeClassifier  
classifier = DecisionTreeClassifier(class_weight='balanced')  
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)  

from sklearn.metrics import classification_report, confusion_matrix  
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred))



from sklearn.utils.extmath import cartesian
ra=cartesian([s, a])
y_pred = classifier.predict(ra)

r=np.column_stack((ra, y_pred))

#print(r)
r_pd = pd.DataFrame(r)
#t_pd.groupby(0)[0].mean()
r_pd.columns = ['s', 'a','r_pred']
print(r_pd.columns)
r_f=r_pd[['s','r_pred']].groupby([ 's']).max()

len(list(r_f['r_pred']))

[[12064  4078  3317   806  1660  2844]
 [   25    37    26     5    16    12]
 [   32    58    28    13    48    34]
 [   78   125   108     9    85    76]
 [   95   142   123    16   134   124]
 [   89    92    73    11    61   140]]
             precision    recall  f1-score   support

       -5.0       0.97      0.49      0.65     24769
        1.0       0.01      0.31      0.02       121
        2.0       0.01      0.13      0.01       213
        3.0       0.01      0.02      0.01       481
        4.0       0.07      0.21      0.10       634
        5.0       0.04      0.30      0.08       466

avg / total       0.91      0.47      0.61     26684

Index(['s', 'a', 'r_pred'], dtype='object')


14

In [179]:
#### random forest for T
# Use numpy to convert to arrays
import numpy as np
# Labels are the values we want to predict
x = final_labeled_data.drop('r', axis=1)
x = x.drop('sp', axis=1)
y = final_labeled_data['sp']

labels = np.array(y)
# Remove the labels from the features
# axis 1 refers to the columns
features= x
# Saving feature names for later use
feature_list = list(features.columns)
# Convert to numpy array
features = np.array(features)

train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 100, random_state = 50)
# Train the model on training data
rf.fit(train_features, train_labels)

# Use the forest's predict method on the test data
predictions = rf.predict(test_features)
# Calculate the absolute errors
errors = abs(predictions - test_labels)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

predictions = rf.predict(ra)
t=np.column_stack((ra, predictions))
t=np.rint(t)



Mean Absolute Error: 2.85 degrees.
