In [12]:
# Formating
import os 
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
from pathlib import Path  
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm
from numpy import mean
from numpy import std

In [13]:
def merge(user_df, rating_df, movie_df): 
    """merge users, rating, and movies and print out shape. Rating_df change based on whether we are doing training, testing or validation   """
    print(rating_df.shape)
    userRating_training = user_df.merge(rating_df, how='right', on='UserID')
    print(userRating_training.shape)
    userRating_training.drop(['Gender','Occupation','Occupation_encoded'], inplace = True, axis = 1)
    df = userRating_training.merge(movie_df, how='left', on='MovieID')

    return df

In [14]:
cleanPath = os.path.dirname(os.getcwd() + '/cleaned/')
users = pd.read_csv(cleanPath + '/users.csv')
ratings = pd.read_csv(cleanPath + '/ratings.csv')
movies = pd.read_csv(cleanPath + '/movies.csv')
training_rating = pd.read_csv(cleanPath + '/training_rating.csv')
testing_rating = pd.read_csv(cleanPath + '/testing_rating.csv')
validation_rating = pd.read_csv(cleanPath + '/validation_rating.csv')

Index(['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], dtype='object')
Index(['UserID', 'MovieID', 'Rating', 'Timestamp', 'Rating_dmn'], dtype='object')
Index(['MovieID', 'Title', 'year', 'Action', 'Adventure', 'Animation',
       'Children's', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
       'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
       'Thriller', 'War', 'Western'],
      dtype='object')


In [23]:
print(users.columns)
print(users.dtypes)

print(ratings.columns)
print(ratings.dtypes)

print(movies.columns)
print(movies.dtypes)

Index(['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code', 'Sex',
       'Occupation_encoded', 'Occ_K-12 student', 'Occ_academic/educator',
       'Occ_artist', 'Occ_clerical/admin', 'Occ_college/grad student',
       'Occ_customer service', 'Occ_doctor/health care',
       'Occ_executive/managerial', 'Occ_farmer', 'Occ_homemaker', 'Occ_lawyer',
       'Occ_other', 'Occ_programmer', 'Occ_retired', 'Occ_sales/marketing',
       'Occ_scientist', 'Occ_self-employed', 'Occ_technician/engineer',
       'Occ_tradesman/craftsman', 'Occ_unemployed', 'Occ_writer'],
      dtype='object')
UserID                       int64
Gender                      object
Age                          int64
Occupation                   int64
Zip-code                    object
Sex                          int64
Occupation_encoded          object
Occ_K-12 student             uint8
Occ_academic/educator        uint8
Occ_artist                   uint8
Occ_clerical/admin           uint8
Occ_college/grad student     

In [15]:
# Encode genders and occupation
users['Sex'] = np.where(users['Gender'] == 'M',1,0)
users['Occupation_encoded'] = users['Occupation'].map({0:"other", 
	1:  "academic/educator",
	2:  "artist",
	3:  "clerical/admin",
	4:  "college/grad student",
	5:  "customer service",
	6:  "doctor/health care",
	7:  "executive/managerial",
	8:  "farmer",
	9:  "homemaker",
	10:  "K-12 student",
	11:  "lawyer",
	12:  "programmer",
	13:  "retired",
	14:  "sales/marketing",
	15:  "scientist",
	16:  "self-employed",
	17:  "technician/engineer",
	18:  "tradesman/craftsman",
	19:  "unemployed",
	20:  "writer"})
pd.get_dummies(users['Occupation_encoded'], prefix = 'Occ')
users = pd.concat([users,pd.get_dummies(users['Occupation_encoded'], prefix = 'Occ')], axis=1)
print(users.columns)

Index(['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code', 'Sex',
       'Occupation_encoded', 'Occ_K-12 student', 'Occ_academic/educator',
       'Occ_artist', 'Occ_clerical/admin', 'Occ_college/grad student',
       'Occ_customer service', 'Occ_doctor/health care',
       'Occ_executive/managerial', 'Occ_farmer', 'Occ_homemaker', 'Occ_lawyer',
       'Occ_other', 'Occ_programmer', 'Occ_retired', 'Occ_sales/marketing',
       'Occ_scientist', 'Occ_self-employed', 'Occ_technician/engineer',
       'Occ_tradesman/craftsman', 'Occ_unemployed', 'Occ_writer'],
      dtype='object')


In [16]:
# merge user, training_rating, and movie
df_training = merge(users,training_rating,movies)
# merge user, testing_rating, and movie 
df_testing = merge(users,testing_rating,movies)
# merge user, validation_rating and movie
df_validation = merge(users,validation_rating,movies)

(969163, 5)
(969163, 32)
(28346, 5)
(28346, 32)
(2700, 5)
(2700, 32)


In [22]:
columnToDrop = ['UserID','MovieID','Timestamp','Rating_dmn','Title','Zip-code','Rating']
X_train = df_training.drop(labels=columnToDrop, axis=1)
X_test = df_testing.drop(labels=columnToDrop, axis=1)
X_validation = df_validation.drop(labels=columnToDrop, axis=1)

y_train = df_training.Rating
y_test = df_testing.Rating
y_validation = df_validation.Rating

y_train_dmn = df_training.Rating_dmn
y_test_dmn = df_testing.Rating_dmn
y_validation_dmn = df_validation.Rating_dmn

0    0.811321
1   -1.188679
Name: Rating_dmn, dtype: float64
0    5
1    3
Name: Rating, dtype: int64


In [None]:
# define the linear regression model
linearReg_model = LinearRegression()
linearReg_model.fit(X_train,y_train_dmn)
y_pred_dmn = linearReg_model.predict(X_test)

In [10]:
# merge user and ratings
userRating = users.merge(ratings, how='left', on='UserID')
userRating.drop(['Gender','Occupation','Occupation_encoded'], inplace = True, axis = 1)
print(userRating.shape)
print(userRating.columns)
# merge userRatings with movies
df = userRating.merge(movies, how='left', on='MovieID')
print(df.shape)
print(df.columns)


(1000209, 29)
Index(['UserID', 'Age', 'Zip-code', 'Sex', 'Occ_K-12 student',
       'Occ_academic/educator', 'Occ_artist', 'Occ_clerical/admin',
       'Occ_college/grad student', 'Occ_customer service',
       'Occ_doctor/health care', 'Occ_executive/managerial', 'Occ_farmer',
       'Occ_homemaker', 'Occ_lawyer', 'Occ_other', 'Occ_programmer',
       'Occ_retired', 'Occ_sales/marketing', 'Occ_scientist',
       'Occ_self-employed', 'Occ_technician/engineer',
       'Occ_tradesman/craftsman', 'Occ_unemployed', 'Occ_writer', 'MovieID',
       'Rating', 'Timestamp', 'num_movies_watched'],
      dtype='object')
(1000209, 49)
Index(['UserID', 'Age', 'Zip-code', 'Sex', 'Occ_K-12 student',
       'Occ_academic/educator', 'Occ_artist', 'Occ_clerical/admin',
       'Occ_college/grad student', 'Occ_customer service',
       'Occ_doctor/health care', 'Occ_executive/managerial', 'Occ_farmer',
       'Occ_homemaker', 'Occ_lawyer', 'Occ_other', 'Occ_programmer',
       'Occ_retired', 'Occ_sales/ma

In [12]:
print(df.loc[df.UserID == 3,:].Rating_dmn.unique())
print(df.loc[df.UserID == 3,:].Rating.unique())

[-0.90196078  1.09803922  0.09803922 -1.90196078 -2.90196078]
[3 5 4 2 1]


In [13]:
# X: Gender, Age, Occupation, Zip-code, All the genres
# Need to encode Zip-code; drop it for now. Gonna work on it later on
# y: Rating
X = df.drop(labels=columnToDrop, axis=1)
y = df.Rating
y_demean = df.Rating_dmn
print(X.columns)

Index(['Age', 'Sex', 'Occ_K-12 student', 'Occ_academic/educator', 'Occ_artist',
       'Occ_clerical/admin', 'Occ_college/grad student',
       'Occ_customer service', 'Occ_doctor/health care',
       'Occ_executive/managerial', 'Occ_farmer', 'Occ_homemaker', 'Occ_lawyer',
       'Occ_other', 'Occ_programmer', 'Occ_retired', 'Occ_sales/marketing',
       'Occ_scientist', 'Occ_self-employed', 'Occ_technician/engineer',
       'Occ_tradesman/craftsman', 'Occ_unemployed', 'Occ_writer', 'year',
       'Action', 'Adventure', 'Animation', 'Children's', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
       'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
       'Rating_dmn'],
      dtype='object')


In [None]:
# define the linear regression model
# linearReg_model = LinearRegression()
# merge with the testing set 
print(training_rating.shape)
X_train = training_rating.merge(df, how='left', on='UserID') 
print(X_train.shape)

(969163, 4)


In [17]:
# define the linear regression model
linearReg_model = LinearRegression()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
linearReg_model.fit(X_train,y_train)
y_pred = linearReg_model.predict(X_test)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(linearReg_model, X, y, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
print('Mean RMSE: %.3f (%.3f)' % (-mean(n_scores), std(n_scores)))

# Compute and print R^2 and RMSE
print("R^2: {}".format(linearReg_model.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test,y_pred))
print("Root Mean Squared Error: {}".format(rmse))

Mean Accuracy: 0.181 (0.001)
R^2: 0.8540634490182133
Root Mean Squared Error: 0.42636508362223186


In [49]:
# define the multinomial logistic regression model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=200)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

# below: https://machinelearningmastery.com/multinomial-logistic-regression-with-python/
# define the model evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate the model and collect the scores
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print('Mean Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Mean Accuracy: 0.347 (0.001)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [61]:
index = 5555
row = X.iloc[index]
yhat = model.predict([row])
yactual = y.iloc[index]
print('Predicted Rating: %d. Actual Rating: %d' % (yhat[0], yactual))

Predicted Rating: 4. Actual Rating: 5


