In [0]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import warnings

warnings.filterwarnings('ignore')
np.set_printoptions(suppress = True)

# Trainng Samples loaded from Github Repo
ratings_path = 'https://raw.githubusercontent.com/usef-kh/EC503Project/master/Datasets/ml-1m/ratings.dat'
titles_path = 'https://raw.githubusercontent.com/usef-kh/EC503Project/master/Datasets/ml-1m/movies.dat'
users_path = 'https://raw.githubusercontent.com/usef-kh/EC503Project/master/Datasets/ml-1m/users.dat'

ratings = pd.read_csv(ratings_path, sep  = "::", names = ['UserID', 'MovieID', 'Rating', 'Timestamp'])
titles = pd.read_csv(titles_path, sep = "::", names = ['MovieID', 'Title', 'Genres'], encoding ='latin-1')
users = pd.read_csv(users_path, sep = "::", names = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip Code'], encoding ='latin-1')

users.dropna(inplace=True)

In [0]:
all_genres = []

for index,row in titles.iterrows():
  #  separate the genres and put into a list
  temp = row.Genres.split('|')
   #  store all the genres into a list
  all_genres =  all_genres + temp
  # unique genres list
genres = list(set(all_genres))
lenGeneres = len(genres)
zeroVec = np.zeros(lenGeneres) 
#  store the genres and corresponding index into a dictionary
genreDict0 = dict(enumerate(genres))
#  store genres as key and index as value
genreDict = {value:key for key, value in genreDict0.items()}
print(genreDict)


{"Children's": 0, 'Adventure': 1, 'Comedy': 2, 'Action': 3, 'Documentary': 4, 'Western': 5, 'Film-Noir': 6, 'Fantasy': 7, 'Animation': 8, 'Romance': 9, 'Crime': 10, 'Sci-Fi': 11, 'Thriller': 12, 'Horror': 13, 'Musical': 14, 'War': 15, 'Mystery': 16, 'Drama': 17}


In [0]:
genreArray = np.empty([titles.shape[0],lenGeneres])
oldidx = -1
for index,row in titles.iterrows():
  try:
    # create a zeros list
    genresVec = np.zeros(lenGeneres)
    # get genres
    genreTmp = row.Genres.split('|')

    for gen in genreTmp: 
        # get genres index
      genID = genreDict[gen]
      # assign the corresponding genres index to 1
      genresVec[genID] = 1

    
    # put the 0 and 1 vector into a whole matrix representing the genres
    genreArray[index] = genresVec

  except:  
    print("found a bad movie")
print(genreArray.shape)
print(genreArray)

(3883, 18)
[[1. 0. 1. ... 0. 0. 0.]
 [1. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]


In [0]:
print(len(genreArray),titles.shape)

df2 = pd.DataFrame(genreArray, dtype=int, columns=list('abcdefghijklmnopqr'))
titles = pd.concat([titles, df2], axis=1)

3883 (3883, 3)


In [0]:
titles.head()

Unnamed: 0,MovieID,Title,Genres,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r
0,1,Toy Story (1995),Animation|Children's|Comedy,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,5,Father of the Bride Part II (1995),Comedy,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [0]:
movies = pd.merge(ratings, titles, on='MovieID')
data = pd.merge(movies, users, on='UserID')

ntotal = data.shape[0]
ntv = int(np.floor(ntotal*0.85))
ntest = ntotal - ntv

print(f"ntotal: {ntotal}\nntrain: {ntv}\nntest: {ntest}")


data.head()

ntotal: 1000209
ntrain: 850177
ntest: 150032


Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genres,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,Gender,Age,Occupation,Zip Code
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,F,1,10,48067
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,F,1,10,48067
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,F,1,10,48067
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,F,1,10,48067
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,F,1,10,48067


In [0]:
data['Encoded Gender'] = 1
data['Encoded Gender'][data.Gender == 'F'] = 0


In [0]:
data.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genres,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,Gender,Age,Occupation,Zip Code,Encoded Gender
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,F,1,10,48067,0
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,F,1,10,48067,0
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,F,1,10,48067,0
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,F,1,10,48067,0
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,F,1,10,48067,0


In [0]:
lol = ['UserID', 'MovieID', 'Age', 'Occupation', 'Encoded Gender']#, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r']
X = data[lol]
Y = data['Rating']


In [0]:
#Create train and test data set
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X.values,Y.values,test_size=0.33,random_state=42)

logreg = LogisticRegression()
logreg.fit(xtrain, ytrain)
Y_pred = logreg.predict(xtest)
acc_log = round(logreg.score(xtrain, ytrain) * 100, 2)
acc_log

34.92

In [0]:
from sklearn.svm import SVC, LinearSVC
svc = LinearSVC()
svc.fit(xtrain, ytrain)
Y_pred = svc.predict(xtest)
acc_svc = round(svc.score(xtrain, ytrain) * 100, 2)
acc_svc

34.92