In [0]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from copy import deepcopy
import warnings

warnings.filterwarnings('ignore')
np.set_printoptions(suppress = True)

# Trainng Samples loaded from Github Repo
ratings_path = 'https://raw.githubusercontent.com/usef-kh/EC503Project/master/Datasets/ml-1m/ratings.dat'
titles_path = 'https://raw.githubusercontent.com/usef-kh/EC503Project/master/Datasets/ml-1m/movies.dat'
users_path = 'https://raw.githubusercontent.com/usef-kh/EC503Project/master/Datasets/ml-1m/users.dat'

ratings = pd.read_csv(ratings_path, sep  = "::", names = ['UserID', 'MovieID', 'Rating', 'Timestamp'])
titles = pd.read_csv(titles_path, sep = "::", names = ['MovieID', 'Title', 'Genres'], encoding ='latin-1')
users = pd.read_csv(users_path, sep = "::", names = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip Code'], encoding ='latin-1')

users.dropna(inplace=True)

In [4]:
all_genres = []
for index,row in titles.iterrows():
  #  separate the genres and put into a list
  temp = row.Genres.split('|')
   #  store all the genres into a list
  all_genres =  all_genres + temp
  # unique genres list
genres = list(set(all_genres))
lenGeneres = len(genres)
zeroVec = np.zeros(lenGeneres) 
#  store the genres and corresponding index into a dictionary
genreDict0 = dict(enumerate(genres))
#  store genres as key and index as value
genreDict = {value:key for key, value in genreDict0.items()}
print(genreDict)


{'Fantasy': 0, "Children's": 1, 'Drama': 2, 'Film-Noir': 3, 'Musical': 4, 'Western': 5, 'Comedy': 6, 'Animation': 7, 'Mystery': 8, 'Romance': 9, 'Action': 10, 'Documentary': 11, 'Sci-Fi': 12, 'Adventure': 13, 'War': 14, 'Horror': 15, 'Crime': 16, 'Thriller': 17}


In [5]:
genreArray = np.empty([titles.shape[0],lenGeneres])
oldidx = -1
for index,row in titles.iterrows():
  try:
    # create a zeros list
    genresVec = np.zeros(lenGeneres)
    # get genres
    genreTmp = row.Genres.split('|')

    for gen in genreTmp: 
        # get genres index
      genID = genreDict[gen]
      # assign the corresponding genres index to 1
      genresVec[genID] = 1

    # put the 0 and 1 vector into a whole matrix representing the genres
    genreArray[index] = genresVec

  except:  
    print("found a bad movie")
print(genreArray.shape)
print(genreArray)

(3883, 18)
[[0. 1. 0. ... 0. 0. 0.]
 [1. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 1.]]


In [6]:
print(len(genreArray),titles.shape)

df2 = pd.DataFrame(genreArray, dtype=int, columns=list(genreDict.keys()))
titles = pd.concat([titles, df2], axis=1)

3883 (3883, 3)


In [7]:
titles.head()

Unnamed: 0,MovieID,Title,Genres,Fantasy,Children's,Drama,Film-Noir,Musical,Western,Comedy,Animation,Mystery,Romance,Action,Documentary,Sci-Fi,Adventure,War,Horror,Crime,Thriller
0,1,Toy Story (1995),Animation|Children's|Comedy,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [8]:
np.random.seed(42)
movies = pd.merge(ratings, titles, on='MovieID')
shuffled_movies = movies.reindex(np.random.permutation(movies.index))

ntotal = movies.shape[0]
ntrain = int(np.floor(ntotal*0.85))
ntest = ntotal - ntrain

train_movies = deepcopy(shuffled_movies)
test_movies = deepcopy(shuffled_movies)

train_movies['Rating'][ntrain:] = np.nan
test_movies['Rating'][:ntrain] = np.nan

train_movies.dropna(inplace = True)
test_movies.dropna(inplace = True)

train = pd.merge(train_movies, users, on='UserID')
test = pd.merge(test_movies, users, on='UserID')

print(f"ntotal: {ntrain}\nntest: {ntest}")
train.head()

ntotal: 850177
ntest: 150032


Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genres,Fantasy,Children's,Drama,Film-Noir,Musical,Western,Comedy,Animation,Mystery,Romance,Action,Documentary,Sci-Fi,Adventure,War,Horror,Crime,Thriller,Gender,Age,Occupation,Zip Code
0,5755,184,3.0,958280246,Nadja (1994),Drama,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,F,35,2,78744
1,5755,456,5.0,958266803,Fresh (1994),Drama,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,F,35,2,78744
2,5755,3468,5.0,958264955,"Hustler, The (1961)",Drama,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,F,35,2,78744
3,5755,319,5.0,958279642,Shallow Grave (1994),Thriller,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,F,35,2,78744
4,5755,1733,4.0,958281189,Afterglow (1997),Drama|Romance,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,F,35,2,78744


In [0]:
train['Male'] = 0 
train['Female'] = 0

train['Male'][train.Gender == 'M'] = 1
train['Female'][train.Gender == 'F'] = 1

test['Male'] = 0 
test['Female'] = 0

test['Male'][test.Gender == 'M'] = 1
test['Female'][test.Gender == 'F'] = 1


In [10]:
train.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genres,Fantasy,Children's,Drama,Film-Noir,Musical,Western,Comedy,Animation,Mystery,Romance,Action,Documentary,Sci-Fi,Adventure,War,Horror,Crime,Thriller,Gender,Age,Occupation,Zip Code,Male,Female
0,5755,184,3.0,958280246,Nadja (1994),Drama,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,F,35,2,78744,0,1
1,5755,456,5.0,958266803,Fresh (1994),Drama,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,F,35,2,78744,0,1
2,5755,3468,5.0,958264955,"Hustler, The (1961)",Drama,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,F,35,2,78744,0,1
3,5755,319,5.0,958279642,Shallow Grave (1994),Thriller,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,F,35,2,78744,0,1
4,5755,1733,4.0,958281189,Afterglow (1997),Drama|Romance,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,F,35,2,78744,0,1


In [0]:
#Create train and test data 
from sklearn.metrics import mean_absolute_error as MAE

Features = ['UserID', 'MovieID', 'Age', 'Occupation', 'Male', 'Female'] + list(genreDict.keys())
xtrain = train[Features]
ytrain = train['Rating']

xtest = test[Features]
ytest = test['Rating']

In [19]:
# OLS prediction
from sklearn.linear_model  import LinearRegression
ols = LinearRegression(normalize = True, fit_intercept = True)
ols.fit(xtrain, ytrain)

print("Normal")
pred_train = ols.predict(xtrain)
print("Train MAE:", MAE(pred_train, ytrain))
pred_test = ols.predict(xtest)
print("Train MAE:", MAE(pred_test, ytest))


print("\nRounded")
pred_train = np.round(2*ols.predict(xtrain))/2 
print("Train MAE:", MAE(pred_train, ytrain))
pred_test = np.round(2*ols.predict(xtest))/2 
print("Train MAE:", MAE(pred_test, ytest))

Normal
Train MAE: 0.8987286895780108
Train MAE: 0.8992659259263861

Rounded
Train MAE: 0.8960381191210771
Train MAE: 0.8963321158152927


In [20]:
# Ridge prediction
from sklearn.linear_model import Ridge
ridge = Ridge(alpha = 0.1, normalize = True, fit_intercept = True)
ridge.fit(xtrain, ytrain) 

print("Normal")
pred_train = ridge.predict(xtrain)
print("Train MAE:", MAE(pred_train, ytrain))
pred_test = ridge.predict(xtest)
print("Train MAE:", MAE(pred_test, ytest))


print("\nRounded")
pred_train = np.round(2*ridge.predict(xtrain))/2 
print("Train MAE:", MAE(pred_train, ytrain))
pred_test = np.round(2*ridge.predict(xtest))/2 
print("Train MAE:", MAE(pred_test, ytest))

Normal
Train MAE: 0.9009153376382154
Train MAE: 0.9015414073838328

Rounded
Train MAE: 0.8994409399454466
Train MAE: 0.9002179535032526


In [21]:
# Lasso prediction
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.1, normalize = True, fit_intercept = True)
lasso.fit(xtrain, ytrain)  

print("Normal")
pred_train = lasso.predict(xtrain)
print("Train MAE:", MAE(pred_train, ytrain))
pred_test = lasso.predict(xtest)
print("Train MAE:", MAE(pred_test, ytest))


print("\nRounded")
pred_train = np.round(2*lasso.predict(xtrain))/2 
print("Train MAE:", MAE(pred_train, ytrain))
pred_test = np.round(2*lasso.predict(xtest))/2 
print("Train MAE:", MAE(pred_test, ytest))

Normal
Train MAE: 0.9336290712841323
Train MAE: 0.9349936665773908

Rounded
Train MAE: 0.9459589003231091
Train MAE: 0.9470446304788311


In [24]:
from sklearn.svm import SVR
SVRKernels = ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']

svr = SVR(kernel=SVRKernels[2], gamma='scale', C=1.0, epsilon=0.2, max_iter = 150)
svr.fit(xtrain, ytrain) 

print("Normal")
pred_train = svr.predict(xtrain)
print("Train MAE:", MAE(pred_train, ytrain))
pred_test = svr.predict(xtest)
print("Train MAE:", MAE(pred_test, ytest))


print("\nRounded")
pred_train = np.round(2*svr.predict(xtrain))/2 
print("Train MAE:", MAE(pred_train, ytrain))
pred_test = np.round(2*svr.predict(xtest))/2 
print("Train MAE:", MAE(pred_test, ytest))

Normal
Train MAE: 1.0259406705475185
Train MAE: 1.0253336231886574

Rounded
Train MAE: 1.021375548856297
Train MAE: 1.020755572144609
