In [1]:
import os
import csv
import pandas as pd
import numpy as np
import scipy.stats as stats
import sklearn.linear_model as linear_model
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LassoCV


#os.getcwd()
os.chdir('/Users/yajwang/Data')

In [2]:
train = pd.read_csv('reviews.csv')
test = pd.read_csv('test_reviews.csv')

In [3]:
print(train.shape)
print(test.shape)
print(train.dtypes.head(10))

(179345, 5)
(1387, 5)
location_id     object
review_id       object
source          object
date            object
rating         float64
dtype: object


In [4]:
train.head()

Unnamed: 0,location_id,review_id,source,date,rating
0,4962_221,test_1895327047,GOOGLE_PLACES,10/20/15,1.0
1,4962_221,test_18961550118,FACEBOOK,11/27/17,1.0
2,4962_221,test_2044312531,FACEBOOK,11/15/17,1.0
3,4962_221,test_21399832280,FACEBOOK,7/17/17,1.0
4,4962_221,test_21516225526,FACEBOOK,1/11/16,1.0


In [5]:
test.date.describe()

count       1124
unique       714
top       4/2/18
freq          10
Name: date, dtype: object

In [6]:
test['date'].fillna('4/2/18', inplace=True)

In [7]:
from datetime import datetime

train['datetime']=pd.to_datetime(train['date'])
test['datetime']=pd.to_datetime(test['date'])

In [8]:
train['year'] = [d.year for d in train['datetime']]
test['year'] = [d.year for d in test['datetime']]

train['month'] = [d.month for d in train['datetime']]
test['month'] = [d.month for d in test['datetime']]

In [9]:
test.date.isnull().sum()

0

In [10]:
train = pd.DataFrame.dropna(train)
train.head()

Unnamed: 0,location_id,review_id,source,date,rating,datetime,year,month
0,4962_221,test_1895327047,GOOGLE_PLACES,10/20/15,1.0,2015-10-20,2015,10
1,4962_221,test_18961550118,FACEBOOK,11/27/17,1.0,2017-11-27,2017,11
2,4962_221,test_2044312531,FACEBOOK,11/15/17,1.0,2017-11-15,2017,11
3,4962_221,test_21399832280,FACEBOOK,7/17/17,1.0,2017-07-17,2017,7
4,4962_221,test_21516225526,FACEBOOK,1/11/16,1.0,2016-01-11,2016,1


In [24]:
y_train = train.rating.values / 5

In [12]:
ntrain = train.shape[0]
ntest = test.shape[0]
dfall = pd.concat((train.drop(['rating'], axis=1), test)).reset_index(drop=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


In [13]:
dfall['year'] = dfall['year'].astype(str)
dfall['month'] = dfall['month'].astype(str)

In [14]:
dfall = pd.get_dummies(dfall[['location_id', 'source', 'year', 'month']])

In [15]:
dfall.head()

Unnamed: 0,location_id_4962_10,location_id_4962_1000,location_id_4962_1005,location_id_4962_101,location_id_4962_103,location_id_4962_104,location_id_4962_105,location_id_4962_107,location_id_4962_108,location_id_4962_109,...,month_11,month_12,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
x_train = dfall[:ntrain]
x_test = dfall[ntrain:]

In [30]:
rf_model = RandomForestRegressor()
rf_model.fit(x_train, y_train)




RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [31]:
rf_model_pred = np.expm1(rf_model.predict(x_test))

In [39]:
test['prediction']= ((rf_model_pred - min(rf_model_pred))/(max(rf_model_pred) - min(rf_model_pred)))*4+1

In [49]:
# Use LassoCV to find out the best alpha first.
lasso = LassoCV(alphas = [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 
                          0.3, 0.6, 1, 2, 5, 10, 100], 
                max_iter = 500000, cv = 10)
lasso.fit(x_train, y_train)
alpha = lasso.alpha_

lasso2 = LassoCV(alphas = [alpha * .6, alpha * .7, alpha * .8, 
                          alpha * .9, alpha, 
                          alpha * 1.1, alpha * 1.2, alpha * 1.3, 
                          alpha * 1.4], 
                max_iter = 500000, cv = 10)
lasso2.fit(x_train, y_train)
alpha2 = lasso2.alpha_

print("Best alpha :", alpha2)

Best alpha : 140.0


In [50]:
lasso_model = Lasso(alpha=alpha2, max_iter=500000).fit(x_train, y_train)
lasso_pred = np.expm1(lasso_model.predict(x_test))

In [58]:
test['prediction2'] = lasso_pred / max(lasso_pred)

In [59]:
test.head()

Unnamed: 0,location_id,review_id,source,date,rating,datetime,year,month,prediction,prediction2
0,4962_147,test_10368849285,FACEBOOK,12/4/15,,2015-12-04,2015,12,5.0,1.0
1,4962_147,test_1042394214,FACEBOOK,11/29/16,,2016-11-29,2016,11,4.234846,1.0
2,4962_185,test_11322162891,GOOGLE_PLACES,8/3/16,,2016-08-03,2016,8,4.847204,1.0
3,4962_185,test_11324660949,GOOGLE_PLACES,2/3/18,,2018-02-03,2018,2,4.54789,1.0
4,4962_185,test_11348252095,GOOGLE_PLACES,2/12/16,,2016-02-12,2016,2,4.835567,1.0


Lasso regression does not work well for this case.