In [0]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from copy import deepcopy
import warnings

warnings.filterwarnings('ignore')
np.set_printoptions(suppress = True)

# Trainng Samples loaded from Github Repo
ratings_path = 'https://raw.githubusercontent.com/usef-kh/EC503Project/master/Datasets/ml-1m/ratings.dat'
titles_path = 'https://raw.githubusercontent.com/usef-kh/EC503Project/master/Datasets/ml-1m/movies.dat'
users_path = 'https://raw.githubusercontent.com/usef-kh/EC503Project/master/Datasets/ml-1m/users.dat'

ratings = pd.read_csv(ratings_path, sep  = "::", names = ['UserID', 'MovieID', 'Rating', 'Timestamp'])
titles = pd.read_csv(titles_path, sep = "::", names = ['MovieID', 'Title', 'Genres'], encoding ='latin-1')
users = pd.read_csv(users_path, sep = "::", names = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip Code'], encoding ='latin-1')

users.dropna(inplace=True)

In [2]:
all_genres = []
for index,row in titles.iterrows():
  #  separate the genres and put into a list
  temp = row.Genres.split('|')
   #  store all the genres into a list
  all_genres =  all_genres + temp
  # unique genres list
genres = list(set(all_genres))
lenGeneres = len(genres)
zeroVec = np.zeros(lenGeneres) 
#  store the genres and corresponding index into a dictionary
genreDict0 = dict(enumerate(genres))
#  store genres as key and index as value
genreDict = {value:key for key, value in genreDict0.items()}
print(genreDict)


{'Documentary': 0, 'Thriller': 1, 'Animation': 2, 'Action': 3, 'Crime': 4, 'War': 5, 'Mystery': 6, 'Comedy': 7, "Children's": 8, 'Romance': 9, 'Drama': 10, 'Film-Noir': 11, 'Sci-Fi': 12, 'Adventure': 13, 'Western': 14, 'Horror': 15, 'Musical': 16, 'Fantasy': 17}


In [0]:
genreArray = np.empty([titles.shape[0],lenGeneres])
oldidx = -1
for index,row in titles.iterrows():
  try:
    # create a zeros list
    genresVec = np.zeros(lenGeneres)
    # get genres
    genreTmp = row.Genres.split('|')

    for gen in genreTmp: 
        # get genres index
      genID = genreDict[gen]
      # assign the corresponding genres index to 1
      genresVec[genID] = 1

    # put the 0 and 1 vector into a whole matrix representing the genres
    genreArray[index] = genresVec

  except:  
    print("found a bad movie")

df2 = pd.DataFrame(genreArray, dtype=int, columns=list(genreDict.keys()))
titles = pd.concat([titles, df2], axis=1)

In [4]:
np.random.seed(42)
movies = pd.merge(ratings, titles, on='MovieID')
shuffled_movies = movies.reindex(np.random.permutation(movies.index))

ntotal = movies.shape[0]
ntrain = int(np.floor(ntotal*0.85))
ntest = ntotal - ntrain

train_movies = deepcopy(shuffled_movies)
test_movies = deepcopy(shuffled_movies)

train_movies['Rating'][ntrain:] = np.nan
test_movies['Rating'][:ntrain] = np.nan

train_movies.dropna(inplace = True)
test_movies.dropna(inplace = True)

train = pd.merge(train_movies, users, on='UserID')
test = pd.merge(test_movies, users, on='UserID')

print(f"ntrain: {ntrain}\nntest: {ntest}")

ntrain: 850177
ntest: 150032


In [5]:
train['Male'] = 0 
train['Female'] = 0

train['Male'][train.Gender == 'M'] = 1
train['Female'][train.Gender == 'F'] = 1

test['Male'] = 0 
test['Female'] = 0

test['Male'][test.Gender == 'M'] = 1
test['Female'][test.Gender == 'F'] = 1
train.head()


Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genres,Documentary,Thriller,Animation,Action,Crime,War,Mystery,Comedy,Children's,Romance,Drama,Film-Noir,Sci-Fi,Adventure,Western,Horror,Musical,Fantasy,Gender,Age,Occupation,Zip Code,Male,Female
0,5755,184,3.0,958280246,Nadja (1994),Drama,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,F,35,2,78744,0,1
1,5755,456,5.0,958266803,Fresh (1994),Drama,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,F,35,2,78744,0,1
2,5755,3468,5.0,958264955,"Hustler, The (1961)",Drama,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,F,35,2,78744,0,1
3,5755,319,5.0,958279642,Shallow Grave (1994),Thriller,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,F,35,2,78744,0,1
4,5755,1733,4.0,958281189,Afterglow (1997),Drama|Romance,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,F,35,2,78744,0,1


In [0]:
total_mtx = train.append(test)
user_means = total_mtx.groupby('UserID')['Rating'].mean()
movie_means = total_mtx.groupby('MovieID')['Rating'].mean()

In [0]:
train['user_mean'] = pd.DataFrame(user_means[train['UserID']]).reset_index().drop('UserID',axis=1)
train['movie_mean'] = pd.DataFrame(movie_means[train['MovieID']]).reset_index().drop('MovieID',axis=1)

test['user_mean'] = pd.DataFrame(user_means[test['UserID']]).reset_index().drop('UserID',axis=1)
test['movie_mean'] = pd.DataFrame(movie_means[test['MovieID']]).reset_index().drop('MovieID',axis=1)

In [0]:
# OLS prediction
from sklearn.linear_model import Ridge,LinearRegression

def run_ridge(xtrain,ytrain,xtest,ytest):
  ridge = Ridge(alpha = 0.1, normalize = True, fit_intercept = True)
  ridge.fit(xtrain, ytrain) 
  print("RIDGE")
  pred_train = ridge.predict(xtrain)
  print("Train MAE:", MAE(pred_train, ytrain))
  pred_test = ridge.predict(xtest)
  print("Test MAE:", MAE(pred_test, ytest))


  print("\nRounded")
  pred_train = np.round(2*ridge.predict(xtrain))/2 
  print("Train MAE:", MAE(pred_train, ytrain))
  pred_test = np.round(2*ridge.predict(xtest))/2 
  print("Test MAE:", MAE(pred_test, ytest))
  print()
  
def run_ols(xtrain,ytrain,xtest,ytest):
  ols = LinearRegression(normalize = True, fit_intercept = True)
  ols.fit(xtrain, ytrain)
  print("OLS")
  pred_train = ols.predict(xtrain)
  print("Train MAE:", MAE(pred_train, ytrain))
  pred_test = ols.predict(xtest)
  print("Test MAE:", MAE(pred_test, ytest))


  print("\nRounded")
  pred_train = np.round(2*ols.predict(xtrain))/2 
  print("Train MAE:", MAE(pred_train, ytrain))
  pred_test = np.round(2*ols.predict(xtest))/2 
  print("Test MAE:", MAE(pred_test, ytest))
  print()

In [9]:
#Create train and test data 
from sklearn.metrics import mean_absolute_error as MAE

ytrain = train['Rating']
ytest = test['Rating']

print("Without adding means to the features")
Features = ['Age', 'Occupation', 'Male', 'Female'] + list(genreDict.keys())
xtrain = train[Features]
xtest = test[Features]
run_ols(xtrain,ytrain,xtest,ytest)
run_ridge(xtrain,ytrain,xtest,ytest)
print()

print("Adding user means to the features")
Features_user = deepcopy(Features)
Features_user.append('user_mean')
xtrain = train[Features_user]
xtest = test[Features_user]
run_ols(xtrain,ytrain,xtest,ytest)
run_ridge(xtrain,ytrain,xtest,ytest)
print()

print("Adding both user and movie means to the features")
Features_both = deepcopy(Features_user)
Features_both.append('movie_mean')
xtrain = train[Features_both]
xtest = test[Features_both]
run_ols(xtrain,ytrain,xtest,ytest)
run_ridge(xtrain,ytrain,xtest,ytest)
print()

Without adding means to the features
OLS
Train MAE: 0.9011547384269929
Test MAE: 0.9017393494828677

Rounded
Train MAE: 0.9007783085169323
Test MAE: 0.900584541964381

RIDGE
Train MAE: 0.9032150875631959
Test MAE: 0.9038759117253814

Rounded
Train MAE: 0.905066239147848
Test MAE: 0.9053535245814226


Adding user means to the features
OLS
Train MAE: 0.8112472834332177
Test MAE: 0.8116118713366862

Rounded
Train MAE: 0.803043366263731
Test MAE: 0.8030320198357683

RIDGE
Train MAE: 0.8139945052047675
Test MAE: 0.8143887378573755

Rounded
Train MAE: 0.8061274299351782
Test MAE: 0.80623466993708


Adding both user and movie means to the features
OLS
Train MAE: 0.7250001949394018
Test MAE: 0.7243181791542669

Rounded
Train MAE: 0.7157203735222195
Test MAE: 0.7155040258078277

RIDGE
Train MAE: 0.7286748887210349
Test MAE: 0.7282678999504184

Rounded
Train MAE: 0.7192537553944649
Test MAE: 0.7193165458035619




In [0]:
# from sklearn.svm import SVR
# SVRKernels = ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']

# svr = SVR(kernel=SVRKernels[2], gamma='scale', C=1.0, epsilon=0.2, max_iter = 150)
# svr.fit(xtrain, ytrain) 

# print("Normal")
# pred_train = svr.predict(xtrain)
# print("Train MAE:", MAE(pred_train, ytrain))
# pred_test = svr.predict(xtest)
# print("Test MAE:", MAE(pred_test, ytest))


# print("\nRounded")
# pred_train = np.round(2*svr.predict(xtrain))/2 
# print("Train MAE:", MAE(pred_train, ytrain))
# pred_test = np.round(2*svr.predict(xtest))/2 
# print("Test MAE:", MAE(pred_test, ytest))