In [1]:
# Formating
import os 
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
from pathlib import Path  
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm
from numpy import mean
from numpy import std

In [2]:
def saveData(name,df):
    """Create a 'cleaned' folder under the directory to save the csv files"""
    parentPathName = os.getcwd() + '/cleaned/' 
    csvName = name + '.csv'
    os.makedirs(parentPathName, exist_ok = True)
    filepath = os.path.join(parentPathName, csvName)    
    df.to_csv(filepath, index = False) 
    
    return None

In [3]:
def merge(user_df, rating_df, movie_df): 
    """merge users, rating, and movies and print out shape. Rating_df change based on whether we are doing training, testing or validation   """
    print(rating_df.shape)
    userRating_training = user_df.merge(rating_df, how='right', on='UserID')
    print(userRating_training.shape)
    userRating_training.drop(['Gender','Occupation','Occupation_encoded'], inplace = True, axis = 1)
    df = userRating_training.merge(movie_df, how='left', on='MovieID')
    print(df.shape)
    return df

In [4]:
cleanPath = os.path.dirname(os.getcwd() + '/cleaned/')
users = pd.read_csv(cleanPath + '/users.csv')
ratings = pd.read_csv(cleanPath + '/ratings.csv')
movies = pd.read_csv(cleanPath + '/movies.csv')
training_rating = pd.read_csv(cleanPath + '/training_rating.csv')
testing_rating = pd.read_csv(cleanPath + '/testing_rating.csv')
validation_rating = pd.read_csv(cleanPath + '/validation_rating.csv')

In [5]:
print(users.columns)
print(users.dtypes)

print(ratings.columns)
print(ratings.dtypes)

print(movies.columns)
print(movies.dtypes)

Index(['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], dtype='object')
UserID         int64
Gender        object
Age            int64
Occupation     int64
Zip-code      object
dtype: object
Index(['UserID', 'MovieID', 'Rating', 'Timestamp', 'Rating_mn', 'Rating_dmn'], dtype='object')
UserID          int64
MovieID         int64
Rating          int64
Timestamp      object
Rating_mn     float64
Rating_dmn    float64
dtype: object
Index(['MovieID', 'Title', 'year', 'Action', 'Adventure', 'Animation',
       'Children's', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
       'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
       'Thriller', 'War', 'Western'],
      dtype='object')
MovieID         int64
Title          object
year            int64
Action          int64
Adventure       int64
Animation       int64
Children's      int64
Comedy          int64
Crime           int64
Documentary     int64
Drama           int64
Fantasy         int64
Film-Noir       int6

## Necessary features encoding

In [6]:
# drop year
movies = movies.drop(['year'], axis=1)
print(movies.columns)
# encode age
users['Age'].replace({1: 1, 18: 2, 25: 3, 35: 4, 45: 5, 50: 6, 56:  7}, inplace=True)

Index(['MovieID', 'Title', 'Action', 'Adventure', 'Animation', 'Children's',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
       'Western'],
      dtype='object')


In [7]:
# Encode genders and occupation
users['Sex'] = np.where(users['Gender'] == 'M',1,0)
users['Occupation_encoded'] = users['Occupation'].map({0:"other", 
	1:  "academic/educator",
	2:  "artist",
	3:  "clerical/admin",
	4:  "college/grad student",
	5:  "customer service",
	6:  "doctor/health care",
	7:  "executive/managerial",
	8:  "farmer",
	9:  "homemaker",
	10:  "K-12 student",
	11:  "lawyer",
	12:  "programmer",
	13:  "retired",
	14:  "sales/marketing",
	15:  "scientist",
	16:  "self-employed",
	17:  "technician/engineer",
	18:  "tradesman/craftsman",
	19:  "unemployed",
	20:  "writer"})
pd.get_dummies(users['Occupation_encoded'], prefix = 'Occ')
users = pd.concat([users,pd.get_dummies(users['Occupation_encoded'], prefix = 'Occ')], axis=1)
print(users.columns)

Index(['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code', 'Sex',
       'Occupation_encoded', 'Occ_K-12 student', 'Occ_academic/educator',
       'Occ_artist', 'Occ_clerical/admin', 'Occ_college/grad student',
       'Occ_customer service', 'Occ_doctor/health care',
       'Occ_executive/managerial', 'Occ_farmer', 'Occ_homemaker', 'Occ_lawyer',
       'Occ_other', 'Occ_programmer', 'Occ_retired', 'Occ_sales/marketing',
       'Occ_scientist', 'Occ_self-employed', 'Occ_technician/engineer',
       'Occ_tradesman/craftsman', 'Occ_unemployed', 'Occ_writer'],
      dtype='object')


## Merge users, ratings and movies

In [8]:
# merge user, training_rating, and movie
df_training = merge(users,training_rating,movies)
# merge user, testing_rating, and movie 
df_testing = merge(users,testing_rating,movies)
# merge user, validation_rating and movie
df_validation = merge(users,validation_rating,movies)

(971839, 6)
(971839, 33)
(971839, 49)
(28346, 6)
(28346, 33)
(28346, 49)
(2700, 6)
(2700, 33)
(2700, 49)


In [9]:
columnToDrop = ['UserID','MovieID','Timestamp','Rating_mn','Rating_dmn','Title','Zip-code','Rating']
X_train = df_training.drop(labels=columnToDrop, axis=1)
X_test = df_testing.drop(labels=columnToDrop, axis=1)
X = pd.concat([X_train,X_test])
print(X.shape)
X_validation = df_validation.drop(labels=columnToDrop, axis=1)

y_train = df_training.Rating
y_test = df_testing.Rating
y = pd.concat([y_train,y_test])
y_validation = df_validation.Rating

y_train_dmn = df_training.Rating_dmn
y_test_dmn = df_testing.Rating_dmn
y_dmn = pd.concat([y_train_dmn,y_test_dmn])
print(X_train.columns)
y_validation_dmn = df_validation.Rating_dmn

(1000185, 41)
Index(['Age', 'Sex', 'Occ_K-12 student', 'Occ_academic/educator', 'Occ_artist',
       'Occ_clerical/admin', 'Occ_college/grad student',
       'Occ_customer service', 'Occ_doctor/health care',
       'Occ_executive/managerial', 'Occ_farmer', 'Occ_homemaker', 'Occ_lawyer',
       'Occ_other', 'Occ_programmer', 'Occ_retired', 'Occ_sales/marketing',
       'Occ_scientist', 'Occ_self-employed', 'Occ_technician/engineer',
       'Occ_tradesman/craftsman', 'Occ_unemployed', 'Occ_writer', 'Action',
       'Adventure', 'Animation', 'Children's', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
       'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'],
      dtype='object')


## Employ model 

In [10]:
# define the linear regression model
linearReg_model = LinearRegression()
linearReg_model.fit(X_train,y_train_dmn)
y_pred_dmn = linearReg_model.predict(X_test)
# Compute and print R^2 and RMSE
rmse = np.sqrt(mean_squared_error(y_test_dmn,y_pred_dmn))
print("Root Mean Squared Error on Testing Set: {}".format(rmse))
print("R^2 for training set: {}".format(linearReg_model.score(X_train, y_train_dmn)))
print("R^2 for test set: {}".format(linearReg_model.score(X_test, y_test_dmn)))


linearReg_RMSE = cross_val_score(linearReg_model, X, y_dmn, scoring='neg_mean_squared_error', cv=10, n_jobs=-1)
print('Mean RMSE: %.3f (%.3f)' % (-mean(linearReg_RMSE), std(linearReg_RMSE)))
linearReg_r2 = cross_val_score(linearReg_model, X, y_dmn, scoring='r2', cv=10, n_jobs=-1)
print('Mean R^2: %.3f (%.3f)' % (mean(linearReg_r2), std(linearReg_r2)))


Root Mean Squared Error on Testing Set: 1.5138885423877277
R^2 for training set: 0.028548990435729893
R^2 for test set: -0.14460391639527126
Mean RMSE: 1.028 (0.110)
Mean R^2: 0.026 (0.008)


In [11]:
# define the multinomial logit model 
multiLogit_model = LogisticRegression(multi_class='multinomial', solver='sag', max_iter=150)
multiLogit_model.fit(X_train,y_train)
y_pred = multiLogit_model.predict(X_test)

In [12]:
# Compute RMSE on testing set
rmse = np.sqrt(mean_squared_error(y_test,y_pred))
print("Root Mean Squared Error on Testing Set: {}".format(rmse))
# Compute mean RMSE
multiLogit_RMSE = cross_val_score(multiLogit_model, X, y, scoring='neg_mean_squared_error', cv=10, n_jobs=-1)
print('Mean RMSE: %.3f (%.3f)' % (-mean(multiLogit_RMSE), std(multiLogit_RMSE)))
# below: https://machinelearningmastery.com/multinomial-logistic-regression-with-python/
# define the model evaluation procedure
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

Root Mean Squared Error on Testing Set: 1.6342636727480004
Mean RMSE: 1.428 (0.008)


## Generate recommendation on all movies

In [50]:
# Get df_all with validation users charactersitics and all movie info
df = df_validation.groupby('UserID', as_index=False).nth(0)
print(df.columns)
print(df.shape)
df_user = df[['UserID','Age','Sex','Occ_K-12 student',
       'Occ_academic/educator', 'Occ_artist', 'Occ_clerical/admin',
       'Occ_college/grad student', 'Occ_customer service',
       'Occ_doctor/health care', 'Occ_executive/managerial', 'Occ_farmer',
       'Occ_homemaker', 'Occ_lawyer', 'Occ_other', 'Occ_programmer',
       'Occ_retired', 'Occ_sales/marketing', 'Occ_scientist',
       'Occ_self-employed', 'Occ_technician/engineer',
       'Occ_tradesman/craftsman', 'Occ_unemployed', 'Occ_writer']]
df_all = df_user.merge(movies, how='cross')
print(df_all.columns)
print(df_all.shape)

Index(['UserID', 'Age', 'Zip-code', 'Sex', 'Occ_K-12 student',
       'Occ_academic/educator', 'Occ_artist', 'Occ_clerical/admin',
       'Occ_college/grad student', 'Occ_customer service',
       'Occ_doctor/health care', 'Occ_executive/managerial', 'Occ_farmer',
       'Occ_homemaker', 'Occ_lawyer', 'Occ_other', 'Occ_programmer',
       'Occ_retired', 'Occ_sales/marketing', 'Occ_scientist',
       'Occ_self-employed', 'Occ_technician/engineer',
       'Occ_tradesman/craftsman', 'Occ_unemployed', 'Occ_writer', 'MovieID',
       'Rating', 'Timestamp', 'Rating_mn', 'Rating_dmn', 'Title', 'Action',
       'Adventure', 'Animation', 'Children's', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
       'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'],
      dtype='object')
(5, 49)
Index(['UserID', 'Age', 'Sex', 'Occ_K-12 student', 'Occ_academic/educator',
       'Occ_artist', 'Occ_clerical/admin', 'Occ_college/grad student',
       

In [51]:
def getData(datAddress, columns):
    """Read in .dat file with '::' as sep"""
    with open(datAddress, encoding="ISO-8859-1") as f: 
        lists = [line.strip().split('::') for line in f.readlines()]
        df = pd.DataFrame(lists, columns = columns)
    return df 

In [52]:
movieRaw = getData('./ml-1m/movies.dat', ['MovieID','Title','Genres'])
movieRaw['MovieID'] = movieRaw['MovieID'].astype(int)

In [54]:
# logit predict on X_wholeValidation
X_wholeValidation = df_all.drop(labels=['MovieID','Title','UserID'], axis=1)
y_logit_wholeValidation = multiLogit_model.predict(X_wholeValidation)
finalOutput = df_all.merge(ratings[['UserID','MovieID','Rating']], how="left", on=['UserID','MovieID'])
print(finalOutput.shape)
print(y_logit_wholeValidation.shape)
finalOutput['moviePredict'] = y_logit_wholeValidation
finalOutput['moviePredict_excl'] = finalOutput['Rating'].isnull()*finalOutput['moviePredict']   
finalOutput = movieRaw.merge(finalOutput[['UserID','MovieID','moviePredict_excl']], on = 'MovieID', how = 'right')
finalOutput.rename(columns = {'moviePredict_excl':'moviePredict'}, inplace=True)
print(finalOutput.columns)
logitTop = finalOutput.sort_values(by=['UserID','moviePredict'], ascending=[True,False]).groupby('UserID').head(5)
logitTop['Model'] = 'Logit'
logitTop = logitTop[['UserID','Title','MovieID','Genres']]
print(logitTop)
saveData('logit_recommendation',logitTop)

(19415, 45)
(19415,)
Index(['MovieID', 'Title', 'Genres', 'UserID', 'moviePredict'], dtype='object')
       UserID                                            Title  MovieID  \
37       1018                              Patriot, The (2000)     3753   
41       1018    Butterfly (La Lengua de las Mariposas) (2000)     3746   
167      1018  Messenger: The Story of Joan of Arc, The (1999)     3053   
202      1018                               Three Kings (1999)     2890   
286      1018                            One Man's Hero (1999)     2235   
3924     2106    Butterfly (La Lengua de las Mariposas) (2000)     3746   
4050     2106  Messenger: The Story of Joan of Arc, The (1999)     3053   
4169     2106                            One Man's Hero (1999)     2235   
4435     2106                           Land Girls, The (1998)     1898   
4461     2106                           Kurt & Courtney (1998)     1856   
7807     2484    Butterfly (La Lengua de las Mariposas) (2000)     3746   

## Generate user persona 

In [62]:
userPersona = validation_rating[['UserID']].merge(users, how='left', on='UserID')
userPersona.drop_duplicates(keep='first', inplace=True)
saveData('userPersona',userPersona)

In [None]:
# Linear regression on validation set
y_validation_dmn = linearReg_model.predict(X_validation)
df_validation['y_validation_dmn'] = pd.Series(y_validation_dmn)
df_validation['y_linear'] = df_validation['y_validation_dmn']+df_validation['Rating_mn']
print(df_validation[['y_linear','Rating']])


In [61]:
index = 5555
row = X.iloc[index]
yhat = model.predict([row])
yactual = y.iloc[index]
print('Predicted Rating: %d. Actual Rating: %d' % (yhat[0], yactual))

Predicted Rating: 4. Actual Rating: 5


