In [19]:
# Formating
import os 
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
from pathlib import Path  
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm
from numpy import mean
from numpy import std



In [20]:
cleanPath = os.path.dirname(os.getcwd() + '/cleaned/')
users = pd.read_csv(cleanPath + '/users.csv')
ratings = pd.read_csv(cleanPath + '/ratings.csv')
movies = pd.read_csv(cleanPath + '/movies.csv')
print(users.columns)
print(ratings.columns)
print(movies.columns)

Index(['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], dtype='object')
Index(['UserID', 'MovieID', 'Rating', 'Timestamp'], dtype='object')
Index(['MovieID', 'Title', 'year', 'Action', 'Adventure', 'Animation',
       'Children's', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
       'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
       'Thriller', 'War', 'Western'],
      dtype='object')


In [21]:
# Check distribution of # reviews by person
ratings['num_movies_watched'] = ratings.groupby('UserID')['MovieID'].transform('count')
ratings.sort_values(by = 'num_movies_watched', ascending = True, inplace = True)
print(ratings['num_movies_watched'].describe())
print(ratings.shape)
print(ratings['Rating'].unique())

count    1.000209e+06
mean     3.899080e+02
std      3.247402e+02
min      2.000000e+01
25%      1.470000e+02
50%      3.020000e+02
75%      5.440000e+02
max      2.314000e+03
Name: num_movies_watched, dtype: float64
(1000209, 5)
[5 4 2 3 1]


In [22]:
# Encode genders and occupation
users['Sex'] = np.where(users['Gender'] == 'M',1,0)
users['Occupation_encoded'] = users['Occupation'].map({0:"other", 
	1:  "academic/educator",
	2:  "artist",
	3:  "clerical/admin",
	4:  "college/grad student",
	5:  "customer service",
	6:  "doctor/health care",
	7:  "executive/managerial",
	8:  "farmer",
	9:  "homemaker",
	10:  "K-12 student",
	11:  "lawyer",
	12:  "programmer",
	13:  "retired",
	14:  "sales/marketing",
	15:  "scientist",
	16:  "self-employed",
	17:  "technician/engineer",
	18:  "tradesman/craftsman",
	19:  "unemployed",
	20:  "writer"})
pd.get_dummies(users['Occupation_encoded'], prefix = 'Occ')
users = pd.concat([users,pd.get_dummies(users['Occupation_encoded'], prefix = 'Occ')], axis=1)
print(users.columns)

Index(['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code', 'Sex',
       'Occupation_encoded', 'Occ_K-12 student', 'Occ_academic/educator',
       'Occ_artist', 'Occ_clerical/admin', 'Occ_college/grad student',
       'Occ_customer service', 'Occ_doctor/health care',
       'Occ_executive/managerial', 'Occ_farmer', 'Occ_homemaker', 'Occ_lawyer',
       'Occ_other', 'Occ_programmer', 'Occ_retired', 'Occ_sales/marketing',
       'Occ_scientist', 'Occ_self-employed', 'Occ_technician/engineer',
       'Occ_tradesman/craftsman', 'Occ_unemployed', 'Occ_writer'],
      dtype='object')


In [23]:
# merge user and ratings
userRating = users.merge(ratings, how = 'left', on = 'UserID')
userRating.drop(['Gender','Occupation','Occupation_encoded'], inplace = True, axis = 1)
print(userRating.shape)
print(userRating.columns)
# merge userRatings with movies
df = userRating.merge(movies, how = 'left' , on = 'MovieID')
print(df.shape)
print(df.columns)


(1000209, 29)
Index(['UserID', 'Age', 'Zip-code', 'Sex', 'Occ_K-12 student',
       'Occ_academic/educator', 'Occ_artist', 'Occ_clerical/admin',
       'Occ_college/grad student', 'Occ_customer service',
       'Occ_doctor/health care', 'Occ_executive/managerial', 'Occ_farmer',
       'Occ_homemaker', 'Occ_lawyer', 'Occ_other', 'Occ_programmer',
       'Occ_retired', 'Occ_sales/marketing', 'Occ_scientist',
       'Occ_self-employed', 'Occ_technician/engineer',
       'Occ_tradesman/craftsman', 'Occ_unemployed', 'Occ_writer', 'MovieID',
       'Rating', 'Timestamp', 'num_movies_watched'],
      dtype='object')
(1000209, 49)
Index(['UserID', 'Age', 'Zip-code', 'Sex', 'Occ_K-12 student',
       'Occ_academic/educator', 'Occ_artist', 'Occ_clerical/admin',
       'Occ_college/grad student', 'Occ_customer service',
       'Occ_doctor/health care', 'Occ_executive/managerial', 'Occ_farmer',
       'Occ_homemaker', 'Occ_lawyer', 'Occ_other', 'Occ_programmer',
       'Occ_retired', 'Occ_sales/ma

In [24]:
print(users['Zip-code'].unique())
print(users['Age'].unique())
print(users['Occupation_encoded'].unique())
print(users['Sex'].unique())
# df['ageCat'] = pd.cut(x=df['Age'], bins=[1,18,25,35,45,50,56], right=False)
# print(df.loc[df.Age==18,:].ageCat)

['48067' '70072' '55117' ... '78734' '76006' '14706']
[ 1 56 25 45 50 35 18]
['K-12 student' 'self-employed' 'scientist' 'executive/managerial'
 'writer' 'homemaker' 'academic/educator' 'programmer'
 'technician/engineer' 'other' 'clerical/admin' 'sales/marketing'
 'college/grad student' 'lawyer' 'farmer' 'unemployed' 'artist'
 'tradesman/craftsman' 'customer service' 'retired' 'doctor/health care']
[0 1]


In [25]:
df['Rating_dmn'] = df.Rating-df.groupby('UserID')['Rating'].transform('mean')
print(df.loc[df.UserID == 3,:].Rating_dmn.unique())
print(df.loc[df.UserID == 3,:].Rating.unique())

[-0.90196078  1.09803922  0.09803922 -1.90196078 -2.90196078]
[3 5 4 2 1]


In [26]:
# X: Gender, Age, Occupation, Zip-code, All the genres
# Need to encode Zip-code; drop it for now. Gonna work on it later on
# y: Rating
columnToDrop = ['UserID','MovieID','Timestamp','num_movies_watched','Title','Zip-code','Rating']
X = df.drop(labels=columnToDrop, axis=1)
y = df.Rating
y_demean = df.Rating_dmn
print(X.columns)

Index(['Age', 'Sex', 'Occ_K-12 student', 'Occ_academic/educator', 'Occ_artist',
       'Occ_clerical/admin', 'Occ_college/grad student',
       'Occ_customer service', 'Occ_doctor/health care',
       'Occ_executive/managerial', 'Occ_farmer', 'Occ_homemaker', 'Occ_lawyer',
       'Occ_other', 'Occ_programmer', 'Occ_retired', 'Occ_sales/marketing',
       'Occ_scientist', 'Occ_self-employed', 'Occ_technician/engineer',
       'Occ_tradesman/craftsman', 'Occ_unemployed', 'Occ_writer', 'year',
       'Action', 'Adventure', 'Animation', 'Children's', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
       'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
       'Rating_dmn'],
      dtype='object')


In [49]:
# define the multinomial logistic regression model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=200)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

# below: https://machinelearningmastery.com/multinomial-logistic-regression-with-python/
# define the model evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate the model and collect the scores
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print('Mean Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Mean Accuracy: 0.347 (0.001)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [61]:
index = 5555
row = X.iloc[index]
yhat = model.predict([row])
yactual = y.iloc[index]
print('Predicted Rating: %d. Actual Rating: %d' % (yhat[0], yactual))

Predicted Rating: 4. Actual Rating: 5


