In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt

%matplotlib inline

In [2]:
# Load the pre-split Yelp data. Both CSV files are read from the current working
# directory; the 'stars' column is used below as the regression target.
yelp_train = pd.read_csv('yelp242_train.csv')
yelp_test = pd.read_csv('yelp242_test.csv')

## Linear Regression Model

In [3]:
import statsmodels.formula.api as smf

# OLS baseline: regress the star rating on review_count plus the categorical
# business attributes listed below. Every attribute is treatment-coded with its
# '(Missing)' level as the reference category, so each reported coefficient is
# the shift relative to that level.
categorical_attributes = [
    'GoodForKids',
    'Alcohol',
    'BusinessAcceptsCreditCards',
    'WiFi',
    'BikeParking',
    'ByAppointmentOnly',
    'WheelechairAccessible',  # (sic) presumably matches the CSV header; check the data before "correcting"
    'OutdoorSeating',
    'RestaurantsReservations',
    'DogsAllowed',
    'Caters',
]

# One identical C(...) term per attribute, joined into a single formula string.
categorical_terms = [
    "C({}, Treatment(reference='(Missing)'))".format(attribute)
    for attribute in categorical_attributes
]
ols_formula = "stars ~ review_count + " + " + ".join(categorical_terms)

mod = smf.ols(formula=ols_formula, data=yelp_train).fit()
print(mod.summary())

                            OLS Regression Results                            
Dep. Variable:                  stars   R-squared:                       0.176
Model:                            OLS   Adj. R-squared:                  0.172
Method:                 Least Squares   F-statistic:                     53.24
Date:                Wed, 01 Mar 2023   Prob (F-statistic):          2.48e-239
Time:                        16:17:32   Log-Likelihood:                -7305.6
No. Observations:                6272   AIC:                         1.466e+04
Df Residuals:                    6246   BIC:                         1.484e+04
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
                                                                               coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------

## OSR2 (out-of-sample R-squared)

In [4]:
# compute out-of-sample R-squared using the test set
def OSR2(model, df_train, df_test, dependent_var):
    """Return the out-of-sample R-squared of ``model`` on ``df_test``.

    The score is ``1 - SSE / SST`` where SSE is the sum of squared prediction
    errors on the test rows and SST is the sum of squared deviations of the
    test targets from the *training* mean of the target (the baseline a
    constant predictor fitted on the training data would achieve).

    Parameters
    ----------
    model : object exposing ``predict(df)``; called once with ``df_test``.
    df_train : frame indexed by ``dependent_var``; only supplies the baseline mean.
    df_test : frame passed to ``model.predict`` and indexed by ``dependent_var``.
    dependent_var : name of the target column in both frames.
    """
    baseline = np.mean(df_train[dependent_var])
    actual = df_test[dependent_var]

    squared_errors = (actual - model.predict(df_test)) ** 2
    squared_deviations = (actual - baseline) ** 2

    return 1 - np.sum(squared_errors) / np.sum(squared_deviations)

In [5]:
# Out-of-sample R-squared of the fitted OLS model, scored on the test frame
# against the training-mean baseline.
print(OSR2(model=mod, df_train=yelp_train, df_test=yelp_test, dependent_var='stars'))

0.14346594233013754


## MAE (mean absolute error)

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder


# Split target and features for the scikit-learn model.
y_train = yelp_train['stars']
y_test = yelp_test['stars']
X_train = yelp_train.drop(['stars'], axis=1)
X_test = yelp_test.drop(['stars'], axis=1)

# Integer-encode the string-valued columns. One encoder per column is fitted on
# the TRAINING frame only and then reused to transform the test frame, so a
# given category always maps to the same integer in both. Re-fitting on the
# test frame would silently assign different codes whenever the two frames do
# not contain exactly the same set of categories in a column.
# NOTE: LabelEncoder.transform raises ValueError for a test category that never
# appears in training, which surfaces the mismatch instead of mis-coding it.
categorical_cols = X_train.select_dtypes(include='object').columns
label_encoders = {
    col: LabelEncoder().fit(X_train[col]) for col in categorical_cols
}

X_train_enc = X_train[categorical_cols].apply(
    lambda column: label_encoders[column.name].transform(column)
)
X_test_enc = X_test[categorical_cols].apply(
    lambda column: label_encoders[column.name].transform(column)
)

# Non-categorical columns first, then the encoded ones; same layout for both frames.
X_train_new = pd.concat([X_train.drop(categorical_cols, axis=1), X_train_enc], axis=1)
X_test_new = pd.concat([X_test.drop(categorical_cols, axis=1), X_test_enc], axis=1)

# Fit on the training frame and report the mean absolute error on the test frame.
lr_model = LinearRegression()
lr_model.fit(X_train_new, y_train)
lr_predictions = lr_model.predict(X_test_new)
lr_mae = mean_absolute_error(y_test, lr_predictions)
print(lr_mae)

0.6327833639990151


## Predictions of fourOrAbove

In [7]:
def convert_to_binary(predictions, threshold=4):
    """Map numeric ratings to 0/1 labels: 1 where the value is >= ``threshold``.

    Parameters
    ----------
    predictions : array-like of numbers (NumPy array, pandas Series, list, ...).
    threshold : cut-off for the positive class; defaults to 4 ("four or above").

    Returns
    -------
    numpy.ndarray of ints with the same shape as ``predictions``. NaN entries
    compare as False and therefore map to 0.
    """
    # np.asarray lets plain lists through and strips any pandas index, so the
    # comparison is purely positional.
    values = np.asarray(predictions)
    return (values >= threshold).astype(int)

# Star-rating predictions from the statsmodels OLS fit (`mod`) on the test
# features; the final expression displays them thresholded to 0/1 labels.
lr_pred = mod.predict(exog=X_test)
convert_to_binary(predictions=lr_pred)

array([0, 0, 0, ..., 0, 0, 0])

In [8]:
# Binarise once and count both classes with vectorised comparisons instead of
# looping over the array element by element in Python.
binary_pred = convert_to_binary(lr_pred)

count_of_zeros = int(np.sum(binary_pred == 0))
print(count_of_zeros)

count_of_ones = int(np.sum(binary_pred == 1))
print(count_of_ones)

2389
299


In [9]:
from sklearn.metrics import confusion_matrix

# Binarise the actual and the predicted ratings with the same rule, then tabulate.
y_test2 = convert_to_binary(y_test)
y_pred = convert_to_binary(mod.predict(X_test))

# labels=[0, 1] pins the matrix to 2x2 (rows = actual, columns = predicted)
# even when one class never occurs; without it the matrix can collapse to 1x1
# and the four-way unpacking below would fail.
cm = confusion_matrix(y_test2, y_pred, labels=[0, 1])

print ("Confusion Matrix : \n", cm) 

# Row-major flattening of [[TN, FP], [FN, TP]].
tn, fp, fn, tp = cm.ravel()

acc = (tn + tp) / cm.sum()
TPR = tp / (tp + fn)   # share of actual positives predicted positive
FPR = fp / (fp + tn)   # share of actual negatives predicted positive
print('Accuracy is: %.4f' %acc)
print('TPR is: %.4f' % TPR)
print('FPR is: %.4f' % FPR)

Confusion Matrix : 
 [[1453   67]
 [ 936  232]]
Accuracy is: 0.6269
TPR is: 0.1986
FPR is: 0.0441
