In [69]:
# Formating
import os 
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
from pathlib import Path  
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm
from numpy import mean
from numpy import std

In [70]:
def merge(user_df, rating_df, movie_df): 
    """merge users, rating, and movies and print out shape. Rating_df change based on whether we are doing training, testing or validation   """
    print(rating_df.shape)
    userRating_training = user_df.merge(rating_df, how='right', on='UserID')
    print(userRating_training.shape)
    userRating_training.drop(['Gender','Occupation','Occupation_encoded'], inplace = True, axis = 1)
    df = userRating_training.merge(movie_df, how='left', on='MovieID')
    print(df.shape)
    return df

In [71]:
cleanPath = os.path.dirname(os.getcwd() + '/cleaned/')
users = pd.read_csv(cleanPath + '/users.csv')
ratings = pd.read_csv(cleanPath + '/ratings.csv')
movies = pd.read_csv(cleanPath + '/movies.csv')
training_rating = pd.read_csv(cleanPath + '/training_rating.csv')
testing_rating = pd.read_csv(cleanPath + '/testing_rating.csv')
validation_rating = pd.read_csv(cleanPath + '/validation_rating.csv')

In [72]:
print(users.columns)
print(users.dtypes)

print(ratings.columns)
print(ratings.dtypes)

print(movies.columns)
print(movies.dtypes)

Index(['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], dtype='object')
UserID         int64
Gender        object
Age            int64
Occupation     int64
Zip-code      object
dtype: object
Index(['UserID', 'MovieID', 'Rating', 'Timestamp', 'Rating_mn', 'Rating_dmn'], dtype='object')
UserID          int64
MovieID         int64
Rating          int64
Timestamp      object
Rating_mn     float64
Rating_dmn    float64
dtype: object
Index(['MovieID', 'Title', 'year', 'Action', 'Adventure', 'Animation',
       'Children's', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
       'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
       'Thriller', 'War', 'Western'],
      dtype='object')
MovieID         int64
Title          object
year            int64
Action          int64
Adventure       int64
Animation       int64
Children's      int64
Comedy          int64
Crime           int64
Documentary     int64
Drama           int64
Fantasy         int64
Film-Noir       int6

In [73]:
movies['year'] = movies['year']-1919
print(movies.year.unique())

[2000 1999 1998 1997 1996 1995 1994 1993 1992 1991 1990 1989 1988 1987
 1986 1985 1984 1983 1982 1981 1980 1979 1978 1977 1976 1975 1974 1973
 1972 1971 1970 1969 1968 1967 1966 1965 1964 1963 1962 1961 1960 1959
 1958 1957 1956 1955 1954 1953 1952 1951 1950 1949 1948 1947 1946 1945
 1944 1943 1942 1941 1940 1939 1938 1937 1936 1935 1934 1933 1932 1931
 1930 1929 1928 1927 1926 1925 1923 1922 1921 1920 1919]
[81 80 79 78 77 76 75 74 73 72 71 70 69 68 67 66 65 64 63 62 61 60 59 58
 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34
 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10
  9  8  7  6  4  3  2  1  0]


In [74]:
# Encode genders and occupation
users['Sex'] = np.where(users['Gender'] == 'M',1,0)
users['Occupation_encoded'] = users['Occupation'].map({0:"other", 
	1:  "academic/educator",
	2:  "artist",
	3:  "clerical/admin",
	4:  "college/grad student",
	5:  "customer service",
	6:  "doctor/health care",
	7:  "executive/managerial",
	8:  "farmer",
	9:  "homemaker",
	10:  "K-12 student",
	11:  "lawyer",
	12:  "programmer",
	13:  "retired",
	14:  "sales/marketing",
	15:  "scientist",
	16:  "self-employed",
	17:  "technician/engineer",
	18:  "tradesman/craftsman",
	19:  "unemployed",
	20:  "writer"})
pd.get_dummies(users['Occupation_encoded'], prefix = 'Occ')
users = pd.concat([users,pd.get_dummies(users['Occupation_encoded'], prefix = 'Occ')], axis=1)
print(users.columns)

Index(['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code', 'Sex',
       'Occupation_encoded', 'Occ_K-12 student', 'Occ_academic/educator',
       'Occ_artist', 'Occ_clerical/admin', 'Occ_college/grad student',
       'Occ_customer service', 'Occ_doctor/health care',
       'Occ_executive/managerial', 'Occ_farmer', 'Occ_homemaker', 'Occ_lawyer',
       'Occ_other', 'Occ_programmer', 'Occ_retired', 'Occ_sales/marketing',
       'Occ_scientist', 'Occ_self-employed', 'Occ_technician/engineer',
       'Occ_tradesman/craftsman', 'Occ_unemployed', 'Occ_writer'],
      dtype='object')


In [75]:
# merge user, training_rating, and movie
df_training = merge(users,training_rating,movies)
# merge user, testing_rating, and movie 
df_testing = merge(users,testing_rating,movies)
# merge user, validation_rating and movie
df_validation = merge(users,validation_rating,movies)

(969163, 6)
(969163, 33)
(969163, 50)
(28346, 6)
(28346, 33)
(28346, 50)
(2700, 6)
(2700, 33)
(2700, 50)


In [78]:
columnToDrop = ['UserID','MovieID','Timestamp','Rating_mn','Rating_dmn','Title','Zip-code','Rating']
X_train = df_training.drop(labels=columnToDrop, axis=1)
X_test = df_testing.drop(labels=columnToDrop, axis=1)
X = pd.concat([X_train,X_test])
print(X.shape)
X_validation = df_validation.drop(labels=columnToDrop, axis=1)

y_train = df_training.Rating
y_test = df_testing.Rating
y = pd.concat([y_train,y_test])
y_validation = df_validation.Rating

y_train_dmn = df_training.Rating_dmn
y_test_dmn = df_testing.Rating_dmn
y_dmn = pd.concat([y_train_dmn,y_test_dmn])
print(X_train.columns)
y_validation_dmn = df_validation.Rating_dmn

(997509, 43)
[80 66 78 31 79 81 64 43 56 77 58 20 69 71 39 46 70 23 76 45 63 40 67 74
 61 22 68 18 28 72 73 53 44 21 48 35  6 11 62 54 49 51 57 60 47 75 55 65
 36 29 27 25 38 42 32 34 15 30 37 52 59 33 14 50  1 24 26 41  0 12 16  3
 17 19  7 13  8  2  9  4 10]


In [83]:
# define the linear regression model
linearReg_model = LinearRegression()
linearReg_model.fit(X_train,y_train_dmn)
y_pred_dmn = linearReg_model.predict(X_test)
# Compute and print R^2 and RMSE
rmse = np.sqrt(mean_squared_error(y_test_dmn,y_pred_dmn))
print("Root Mean Squared Error on Testing Set: {}".format(rmse))
print("R^2 for training set: {}".format(linearReg_model.score(X_train, y_train_dmn)))
print("R^2 for test set: {}".format(linearReg_model.score(X_test, y_test_dmn)))


linearReg_RMSE = cross_val_score(linearReg_model, X, y_dmn, scoring='neg_mean_squared_error', cv=10, n_jobs=-1)
print('Mean RMSE: %.3f (%.3f)' % (-mean(linearReg_RMSE), std(linearReg_RMSE)))
linearReg_r2 = cross_val_score(linearReg_model, X, y_dmn, scoring='r2', cv=10, n_jobs=-1)
print('Mean R^2: %.3f (%.3f)' % (mean(linearReg_r2), std(linearReg_r2)))


Root Mean Squared Error: 1.4906735319215227
R^2 for training set: 0.0470700328012551
R^2 for test set: -0.10976878218259678
Mean RMSE: 1.008 (0.105)
Mean R^2: 0.045 (0.007)


In [85]:
# define the multinomial logit model 
multiLogit_model = LogisticRegression(multi_class='multinomial', solver='sag', max_iter=200)
multiLogit_model.fit(X_train,y_train)
y_pred = multiLogit_model.predict(X_test)

Mean RMSE: 1.182 (0.072)


In [86]:
# Compute RMSE on testing set
rmse = np.sqrt(mean_squared_error(y_test,y_pred))
print("Root Mean Squared Error on Testing Set: {}".format(rmse))
# Compute mean RMSE
multiLogit_RMSE = cross_val_score(multiLogit_model, X, y, scoring='neg_mean_squared_error', cv=10, n_jobs=-1)
print('Mean RMSE: %.3f (%.3f)' % (-mean(multiLogit_RMSE), std(multiLogit_RMSE)))
# below: https://machinelearningmastery.com/multinomial-logistic-regression-with-python/
# define the model evaluation procedure
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

Root Mean Squared Error on Testing Set: 1.6396622223755417
Mean RMSE: 1.182 (0.072)


In [87]:
print(X_validation.columns)

Index(['Age', 'Sex', 'Occ_K-12 student', 'Occ_academic/educator', 'Occ_artist',
       'Occ_clerical/admin', 'Occ_college/grad student',
       'Occ_customer service', 'Occ_doctor/health care',
       'Occ_executive/managerial', 'Occ_farmer', 'Occ_homemaker', 'Occ_lawyer',
       'Occ_other', 'Occ_programmer', 'Occ_retired', 'Occ_sales/marketing',
       'Occ_scientist', 'Occ_self-employed', 'Occ_technician/engineer',
       'Occ_tradesman/craftsman', 'Occ_unemployed', 'Occ_writer', 'Rating_mn',
       'year', 'Action', 'Adventure', 'Animation', 'Children's', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
       'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
       'Western'],
      dtype='object')


In [111]:
# Logit on validation set
y_validation = multiLogit_model.predict(X_validation)
df_validation['y_logit'] = pd.Series(y_validation)
# print(df_validation[['y_logit','Rating']])
logitTop = df_validation.sort_values(by=['UserID','y_logit'], ascending=[True,False]).groupby('UserID').head(5)
columnToKeep = ['UserID','y_logit','MovieID','Rating','Action', 'Adventure', 'Animation', "Children's", 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
       'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
       'Western']
logitTop = logitTop[columnToKeep]
logitTop = logitTop.merge(movies, how='left', on='MovieID')

    UserID                                              Title
0     1018  Star Wars: Episode VI - Return of the Jedi (1983)
1     1018                            Fatal Attraction (1987)
2     1018                   Silence of the Lambs, The (1991)
3     1018                  Terminator 2: Judgment Day (1991)
4     1018          Star Wars: Episode IV - A New Hope (1977)
5     2106                             Say Anything... (1989)
6     2106                       2001: A Space Odyssey (1968)
7     2106                               Roman Holiday (1953)
8     2106                          Christmas Story, A (1983)
9     2106                              Bodyguard, The (1992)
10    2484                                  Casablanca (1942)
11    2484                          Gone with the Wind (1939)
12    2484                               Flying Tigers (1942)
13    2484                              Mister Roberts (1955)
14    2484              All Quiet on the Western Front (1930)
15    44

In [None]:
# Linear regression on validation set
y_validation_dmn = linearReg_model.predict(X_validation)
df_validation['y_validation_dmn'] = pd.Series(y_validation_dmn)
df_validation['y_linear'] = df_validation['y_validation_dmn']+df_validation['Rating_mn']
print(df_validation[['y_linear','Rating']])


In [61]:
index = 5555
row = X.iloc[index]
yhat = model.predict([row])
yactual = y.iloc[index]
print('Predicted Rating: %d. Actual Rating: %d' % (yhat[0], yactual))

Predicted Rating: 4. Actual Rating: 5


