In [0]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import warnings

warnings.filterwarnings('ignore')
np.set_printoptions(suppress = True)

# Trainng Samples loaded from Github Repo
ratings_path = 'https://raw.githubusercontent.com/usef-kh/EC503Project/master/Datasets/ml-1m/ratings.dat'
titles_path = 'https://raw.githubusercontent.com/usef-kh/EC503Project/master/Datasets/ml-1m/movies.dat'
users_path = 'https://raw.githubusercontent.com/usef-kh/EC503Project/master/Datasets/ml-1m/users.dat'

ratings = pd.read_csv(ratings_path, sep  = "::", names = ['UserID', 'MovieID', 'Rating', 'Timestamp'])
titles = pd.read_csv(titles_path, sep = "::", names = ['MovieID', 'Title', 'Genres'], encoding ='latin-1')
users = pd.read_csv(users_path, sep = "::", names = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip Code'], encoding ='latin-1')

users.dropna(inplace=True)

In [14]:
all_genres = []
for index,row in titles.iterrows():
  #  separate the genres and put into a list
  temp = row.Genres.split('|')
   #  store all the genres into a list
  all_genres =  all_genres + temp
  # unique genres list
genres = list(set(all_genres))
lenGeneres = len(genres)
zeroVec = np.zeros(lenGeneres) 
#  store the genres and corresponding index into a dictionary
genreDict0 = dict(enumerate(genres))
#  store genres as key and index as value
genreDict = {value:key for key, value in genreDict0.items()}
print(genreDict)


{'Romance': 0, 'Musical': 1, 'Western': 2, 'Mystery': 3, 'Animation': 4, 'War': 5, "Children's": 6, 'Film-Noir': 7, 'Fantasy': 8, 'Sci-Fi': 9, 'Adventure': 10, 'Horror': 11, 'Crime': 12, 'Documentary': 13, 'Action': 14, 'Thriller': 15, 'Drama': 16, 'Comedy': 17}


In [15]:
genreArray = np.empty([titles.shape[0],lenGeneres])
oldidx = -1
for index,row in titles.iterrows():
  try:
    # create a zeros list
    genresVec = np.zeros(lenGeneres)
    # get genres
    genreTmp = row.Genres.split('|')

    for gen in genreTmp: 
        # get genres index
      genID = genreDict[gen]
      # assign the corresponding genres index to 1
      genresVec[genID] = 1

    # put the 0 and 1 vector into a whole matrix representing the genres
    genreArray[index] = genresVec

  except:  
    print("found a bad movie")
print(genreArray.shape)
print(genreArray)

(3883, 18)
[[0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 1.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 1. 1. 0.]]


In [16]:
print(len(genreArray),titles.shape)

df2 = pd.DataFrame(genreArray, dtype=int, columns=list(genreDict.keys()))
titles = pd.concat([titles, df2], axis=1)

3883 (3883, 3)


In [17]:
titles.head()

Unnamed: 0,MovieID,Title,Genres,Romance,Musical,Western,Mystery,Animation,War,Children's,Film-Noir,Fantasy,Sci-Fi,Adventure,Horror,Crime,Documentary,Action,Thriller,Drama,Comedy
0,1,Toy Story (1995),Animation|Children's|Comedy,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,4,Waiting to Exhale (1995),Comedy|Drama,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [18]:
movies = pd.merge(ratings, titles, on='MovieID')
data = pd.merge(movies, users, on='UserID')

ntotal = data.shape[0]
ntv = int(np.floor(ntotal*0.85))
ntest = ntotal - ntv

print(f"ntotal: {ntotal}\nntrain: {ntv}\nntest: {ntest}")
data.head()

ntotal: 1000209
ntrain: 850177
ntest: 150032


Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genres,Romance,Musical,Western,Mystery,Animation,War,Children's,Film-Noir,Fantasy,Sci-Fi,Adventure,Horror,Crime,Documentary,Action,Thriller,Drama,Comedy,Gender,Age,Occupation,Zip Code
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,F,1,10,48067
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,F,1,10,48067
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,F,1,10,48067
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,F,1,10,48067
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,F,1,10,48067


In [0]:
data['Male'] = 0 
data['Female'] = 0

data['Male'][data.Gender == 'M'] = 1
data['Female'][data.Gender == 'F'] = 1


In [20]:
data.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genres,Romance,Musical,Western,Mystery,Animation,War,Children's,Film-Noir,Fantasy,Sci-Fi,Adventure,Horror,Crime,Documentary,Action,Thriller,Drama,Comedy,Gender,Age,Occupation,Zip Code,Male,Female
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,F,1,10,48067,0,1
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,F,1,10,48067,0,1
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,F,1,10,48067,0,1
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,F,1,10,48067,0,1
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,F,1,10,48067,0,1


In [84]:
Features = ['UserID', 'MovieID', 'Age', 'Occupation', 'Male', 'Female'] + list(genreDict.keys())
X = data[Features]
Y = data['Rating']

X.head()

Unnamed: 0,UserID,MovieID,Age,Occupation,Male,Female,Romance,Musical,Western,Mystery,Animation,War,Children's,Film-Noir,Fantasy,Sci-Fi,Adventure,Horror,Crime,Documentary,Action,Thriller,Drama,Comedy
0,1,1193,1,10,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,1,661,1,10,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
2,1,914,1,10,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,3408,1,10,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,1,2355,1,10,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1


In [0]:
#Create train and test data set
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X.values,Y.values,test_size=0.33,random_state=42)

In [89]:
# OLS prediction
from sklearn.linear_model  import LinearRegression
ols = LinearRegression(normalize = True, fit_intercept = True).fit(xtrain,ytrain)
ols.fit(xtrain, ytrain)

print("Normal")
pred_train = ols.predict(xtrain)
print("Train MSE:", MSE(pred_train, ytrain))
pred_test = ols.predict(xtest)
print("Train MSE:", MSE(pred_test, ytest))


print("\nRounded")
pred_train = np.round(2*ols.predict(xtrain))/2 
print("Train MSE:", MSE(pred_train, ytrain))
pred_test = np.round(2*ols.predict(xtest))/2 
print("Train MSE:", MSE(pred_test, ytest))

Normal
Train MSE: 1.1958139168359008
Train MSE: 1.199183810314298

Rounded
Train MSE: 1.2132237293699824
Train MSE: 1.215725348336257


In [92]:
# Ridge prediction
from sklearn.linear_model import Ridge
ridge = Ridge(alpha = 0.1, normalize = True, fit_intercept = True)
ridge.fit(xtrain, ytrain) 

print("Normal")
pred_train = ridge.predict(xtrain)
print("Train MSE:", MSE(pred_train, ytrain))
pred_test = ridge.predict(xtest)
print("Train MSE:", MSE(pred_test, ytest))


print("\nRounded")
pred_train = np.round(2*ridge.predict(xtrain))/2 
print("Train MSE:", MSE(pred_train, ytrain))
pred_test = np.round(2*ridge.predict(xtest))/2 
print("Train MSE:", MSE(pred_test, ytest))

Normal
Train MSE: 1.1963366341605424
Train MSE: 1.1998847371602253

Rounded
Train MSE: 0.8992829856447906
Train MSE: 0.9002784266320073


In [93]:
# Lasso prediction
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.1, normalize = True, fit_intercept = True)
lasso.fit(xtrain, ytrain)  

print("Normal")
pred_train = lasso.predict(xtrain)
print("Train MSE:", MSE(pred_train, ytrain))
pred_test = lasso.predict(xtest)
print("Train MSE:", MSE(pred_test, ytest))


print("\nRounded")
pred_train = np.round(2*lasso.predict(xtrain))/2 
print("Train MSE:", MSE(pred_train, ytrain))
pred_test = np.round(2*lasso.predict(xtest))/2 
print("Train MSE:", MSE(pred_test, ytest))

Normal
Train MSE: 1.246356294824031
Train MSE: 1.25108279662561

Rounded
Train MSE: 0.9456039036619214
Train MSE: 0.9471731668226947
