# Import packages

In [1]:
import seaborn as sns
import numpy as np
import pandas as pd

# Load dataset

In [2]:
# Cap on how many catalogue rows are kept (keeps the ratings load below fast while
# prototyping). Set to None to keep every movie in the CSV.
# NOTE(review): the recorded output of the filtering cell further down shows
# 17770 rows, so the saved outputs were apparently produced without this cap.
MAX_MOVIES = 4000

movies_dataframe = pd.read_csv('../dataset/movies_with_features.csv', index_col='Id', encoding='utf-8')
if MAX_MOVIES is not None:
    movies_dataframe = movies_dataframe.head(MAX_MOVIES)

# The preview has to be the last expression of the cell to be rendered.
movies_dataframe.head(10)

Unnamed: 0_level_0,MovieID,YearOfRelease,Title,Budget,Genre,Original_language,Runtime
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1,2003.0,Dinosaur Planet,0,,,0.0
1,2,2004.0,Isle of Man TT 2004 Review,0,,,0.0
2,3,1997.0,Character,4500000,History,nl,122.0
3,4,1994.0,Paula Abdul's Get Up & Dance,0,,,0.0
4,5,2004.0,The Rise and Fall of ECW,0,,,0.0
5,6,1997.0,Sick,0,,,0.0
6,7,1992.0,8 Man,0,,,0.0
7,8,2004.0,What the #$*! Do We Know!?,0,Documentary,en,109.0
8,9,1991.0,Class of Nuke 'Em High 2,0,Comedy,en,90.0
9,10,2001.0,Fighter,0,,,0.0


In [3]:
def getRatingsPathFromMovieID(id):
    """Return the path of the ratings text file for a movie id.

    The file name is the id left-padded with zeros to seven characters,
    e.g. ``7`` -> ``'../dataset/training_set/mv_0000007.txt'``.

    Parameters
    ----------
    id : int
        Movie id. Ids from 0 up to 99999 are accepted.

    Returns
    -------
    str or int
        The relative path of the ratings file, or the sentinel ``-1`` when
        the id is negative or has more than five digits. The sentinel is kept
        (instead of raising) so existing callers keep working.
    """
    basepath = '../dataset/training_set/mv_'
    fileEnding = '.txt'
    # Reject ids that cannot be rendered as a well-formed seven-digit name;
    # negative ids previously produced names like 'mv_000000-5.txt'.
    if not (0 <= id < 100000):
        return -1
    return basepath + str(id).zfill(7) + fileEnding

In [4]:
# Load one ratings frame per movie, trying ids 1..N where N is the number of
# catalogue rows. The first line of every ratings file is skipped (presumably a
# per-file header line -- confirm against the dataset); the remaining rows are
# read as CustomerID, Rating, Date.
movie_rankings = list()
for i in range(1,len(movies_dataframe.index)+1):
    path = getRatingsPathFromMovieID(i)
    try:
        ratings = pd.read_csv(path, skiprows=[0], header=None)
        ratings.columns = ['CustomerID', 'Rating', 'Date']
    except (OSError, ValueError):
        # Best effort: report the unreadable/malformed file and move on.
        # OSError covers missing files; ValueError covers pandas' parser and
        # empty-file errors as well as the -1 "no path" sentinel.
        print('Could not read txt file: ', path)
        continue
    # Label the frame through the local variable rather than through
    # movie_rankings[i-1]: once a file has been skipped the list is shorter
    # than i, and positional access would fail for every later movie.
    ratings['MovieID'] = i
    movie_rankings.append(ratings)

In [5]:
# Spot-check one loaded frame: list position 6 holds the ratings tagged
# MovieID 7 (positions equal MovieID - 1 as long as every file was read).
movie_rankings[6]

Unnamed: 0,CustomerID,Rating,Date,MovieID
0,951709,2,2001-11-04,7
1,585247,1,2003-12-19,7
2,2625420,2,2004-06-03,7
3,2322468,3,2003-11-12,7
4,2056324,2,2002-11-10,7
5,1969230,4,2003-06-01,7
6,90021,2,2001-05-02,7
7,2082629,4,2004-09-29,7
8,1876156,1,2001-05-17,7
9,1359575,2,2003-11-25,7


In [6]:
#Remove movies with fewer than 100 rankings
MIN_RATINGS_PER_MOVIE = 100

print(movies_dataframe.shape)
# The list position of each ratings frame is used as the catalogue's `Id` index
# label (Id == MovieID - 1 in the preview above). This assumes no ratings file
# was skipped while loading -- verify if the loader printed any failures.
sparse_movie_labels = [
    position
    for position, ratings in enumerate(movie_rankings)
    if len(ratings) < MIN_RATINGS_PER_MOVIE
]
# Drop all labels in one call instead of copying the frame once per movie.
# errors='ignore' makes the cell safe to re-run after the rows are already gone.
movies_dataframe = movies_dataframe.drop(index=sparse_movie_labels, errors='ignore')
print(movies_dataframe.shape)

(17770, 7)
(16795, 7)


In [7]:
# Preview the catalogue after filtering; the Ids of dropped movies are now
# missing from the index.
movies_dataframe.head(10)

Unnamed: 0_level_0,MovieID,YearOfRelease,Title,Budget,Genre,Original_language,Runtime
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1,2003.0,Dinosaur Planet,0,,,0.0
1,2,2004.0,Isle of Man TT 2004 Review,0,,,0.0
2,3,1997.0,Character,4500000,History,nl,122.0
3,4,1994.0,Paula Abdul's Get Up & Dance,0,,,0.0
4,5,2004.0,The Rise and Fall of ECW,0,,,0.0
5,6,1997.0,Sick,0,,,0.0
7,8,2004.0,What the #$*! Do We Know!?,0,Documentary,en,109.0
9,10,2001.0,Fighter,0,,,0.0
10,11,1999.0,Full Frame: Documentary Shorts,0,,,0.0
11,12,1947.0,My Favorite Brunette,0,Comedy,en,87.0


In [8]:
# Stack the per-movie frames into one long ratings table. The column order is
# the one of the individual frames. Each frame keeps its own row index, so
# index labels repeat across movies in the result.
customer_movies = pd.concat(movie_rankings)
customer_movies.head()

Unnamed: 0,CustomerID,Rating,Date,MovieID
0,1488844,3,2005-09-06,1
1,822109,5,2005-05-13,1
2,885013,4,2005-10-19,1
3,30878,4,2005-12-26,1
4,823519,3,2004-05-03,1


In [9]:
#Remove users with fewer than 15 ratings as we do not have a foundation to build our recommendations on.
MIN_RATINGS_PER_CUSTOMER = 15

print(customer_movies.shape)
# transform('size') broadcasts every customer's rating count back onto that
# customer's rows, which avoids calling a Python lambda once per customer.
ratings_per_customer = customer_movies.groupby('CustomerID')['CustomerID'].transform('size')
# Filter with a plain boolean array: the index labels of customer_movies repeat
# (they come from the concatenated per-movie frames), so positional masking
# sidesteps any label alignment. Row order is unchanged.
customer_movies = customer_movies[(ratings_per_customer >= MIN_RATINGS_PER_CUSTOMER).values]
print(customer_movies.shape)

(100480507, 4)
(100218205, 4)


In [None]:
# Attach the catalogue columns to every rating. The join is an inner join on
# MovieID, so ratings of movies that are no longer in movies_dataframe drop out.
movies_with_customer_ratings_and_all_features = customer_movies.merge(
    movies_dataframe,
    how="inner",
    on="MovieID",
)

In [None]:
#Split Date ('-'-separated, e.g. 2005-09-06) into separate year / month / day columns
# expand=True returns the parts as columns directly, without building a Python
# list per row first. The parts stay strings (e.g. '09'), exactly as before.
rating_date_parts = movies_with_customer_ratings_and_all_features.Date.str.split('-', expand=True)
movies_with_customer_ratings_and_all_features[['year_of_rating','month_of_rating','day_of_rating']] = rating_date_parts
movies_with_customer_ratings_and_all_features = movies_with_customer_ratings_and_all_features.drop(['Date'], axis=1)
# downcast='integer' asks pandas for the smallest integer dtype, applied only
# where the values allow it -- presumably the column stays float if any release
# year is missing; confirm on the real data.
movies_with_customer_ratings_and_all_features["YearOfRelease"] = pd.to_numeric(movies_with_customer_ratings_and_all_features["YearOfRelease"],downcast='integer')

In [None]:
# Preview the first rows of the merged table after the date split.
movies_with_customer_ratings_and_all_features.head()

In [None]:
#Populate empty fields and NaN values
#Start by dropping, and add later
#movies_with_customer_ratings_and_all_features = movies_with_customer_ratings_and_all_features.drop(['Budget','Genre','Original_language','Runtime'], axis=1)

In [None]:
# Preview the last rows of the merged table.
movies_with_customer_ratings_and_all_features.tail()

In [None]:
# Persist the prepared table as UTF-8 CSV. No `index` argument is passed, so
# the row index is written as well (to_csv default) and shows up as a leading
# unnamed column when the file is read back.
movies_with_customer_ratings_and_all_features.to_csv('../dataset/prepared/movies_with_customer_ratings_and_all_features.csv', encoding='utf-8')


https://blog.echen.me/2011/10/24/winning-the-netflix-prize-a-summary/

http://www.cs.carleton.edu/cs_comps/0607/recommend/recommender/results.html