<h1> Install a new library </h1>

In [1]:
# From Shmueli et al., Data Mining for Business Analytics
# https://www.dataminingbook.com/book/python-edition
!pip install dmba

Defaulting to user installation because normal site-packages is not writeable

You should consider upgrading via the 'c:\program files\python38\python.exe -m pip install --upgrade pip' command.



Collecting dmba
  Downloading dmba-0.0.13-py3-none-any.whl (11 kB)
Installing collected packages: dmba
Successfully installed dmba-0.0.13


<h1> Import necessary libraries </h1>

In [1]:
import pandas as pd
from dmba import regressionSummary, exhaustive_search
from dmba import backward_elimination, forward_selection, stepwise_selection
from dmba import adjusted_r2_score, AIC_score, BIC_score
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge, LassoCV, BayesianRidge
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

print("Finished!")

Finished!


<h1> Ingesting data </h1>

In [2]:
# Load the company reviews CSV into a data frame named "reviews"
reviews = pd.read_csv(
    filepath_or_buffer="company_reviews.csv",
)

<h1> Variable selection </h1>

<h2> -- Data prep </h2>

In [3]:
# List every column name, one per line, for easy copying and pasting
column_names = list(reviews.columns)
for col in column_names:
    print(col)

month
year
state
status
summary
score
likes


In [4]:
# Convert status into binary numerical format (0 = former, 1 = current employee).
# Labels are whitespace-stripped before the lookup, so variants such as
# 'Former Employee ' are handled without listing each spelling explicitly.
status_dict = {'Former Employee': 0, 'Current Employee': 1}
status_mapped = reviews['status'].str.strip().map(status_dict)

# Fail loudly (as the plain dict lookup did) when a label is not recognised,
# instead of silently leaving NaN in the model input.
if status_mapped.isna().any():
    unknown_labels = reviews.loc[status_mapped.isna(), 'status'].unique().tolist()
    raise KeyError(f"Unrecognized status value(s): {unknown_labels}")

# int64 matches the dtype pandas infers from a list of Python ints.
reviews['status_num'] = status_mapped.astype('int64')

In [5]:
# Name the outcome (y) and the predictor columns (X's)
dependent = 'score'
independent = [
    'month',
    'year',
    'state',
    'status_num',
    'likes',
]

In [6]:
# Build the design matrix and outcome, then hold out 20% of rows as a test set
# (80% training; 20% test). Categorical predictors are dummy-coded with the
# first level of each dropped.
y = reviews[dependent]
X = pd.get_dummies(reviews[independent], drop_first=True)
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=200,
)

In [None]:
# The code below for exhaustive search, backward elimination, forward selection, and stepwise selection 
# draws from the Shmueli et al. book

<h2> -- Exhaustive search </h2>

In [None]:
# CAUTION: This code takes a *really* long time to run, like multiple days.
# Code is here for your reference but not required for homework or your team projects.

def train_model(variables):
    """Fit a LinearRegression on the given training columns and return it."""
    columns = list(variables)
    return LinearRegression().fit(train_X[columns], train_y)

def score_model(model, variables):
    """Return the negated adjusted R^2 of `model` on the training data."""
    columns = list(variables)
    fitted = model.predict(train_X[columns])
    adj_r2 = adjusted_r2_score(train_y, fitted, model)
    return -adj_r2

allVariables = train_X.columns
results = exhaustive_search(allVariables, train_model, score_model)

# One summary row per search result: size, adjusted R^2, AIC, and a
# True/False flag for every candidate variable.
data = []
for result in results:
    chosen = list(result['variables'])
    fitted = result['model'].predict(train_X[chosen])
    row = {
        'n': result['n'],
        'r2adj': -result['score'],  # score_model returned the negated value
        'AIC': AIC_score(train_y, fitted, result['model']),
    }
    for name in allVariables:
        row[name] = name in result['variables']
    data.append(row)

leading_columns = ('n', 'r2adj', 'AIC')
pd.DataFrame(data, columns=leading_columns + tuple(sorted(allVariables)))

<h2> -- Backward elimination </h2>

In [8]:
def train_model(variables):
    """Fit a LinearRegression on the given training columns and return it."""
    return LinearRegression().fit(train_X[variables], train_y)

def score_model(model, variables):
    """Return the AIC of `model` evaluated on the training data."""
    fitted = model.predict(train_X[variables])
    return AIC_score(train_y, fitted, model)

# Start from every column and let backward elimination drop variables,
# scoring each candidate model with score_model.
allVariables = train_X.columns
best_model, best_variables = backward_elimination(
    allVariables,
    train_model,
    score_model,
    verbose=True,
)

print(best_variables)

# Evaluate the selected model on the held-out test set
test_predictions = best_model.predict(test_X[best_variables])
regressionSummary(test_y, test_predictions)

Variables: year, status_num, likes, month_Aug, month_Dec, month_Feb, month_Jan, month_Jul, month_Jun, month_Mar, month_May, month_Nov, month_Oct, month_Sep, state_CA, state_CO, state_CT, state_DC, state_DE, state_FL, state_GA, state_IL, state_IN, state_KS, state_KY, state_LA, state_MA, state_MD, state_ME, state_MI, state_MN, state_MO, state_NC, state_ND, state_NH, state_NJ, state_NM, state_NV, state_NY, state_OH, state_OK, state_OR, state_PA, state_SC, state_SD, state_TN, state_TX, state_UT, state_VA, state_WA, state_WI, state_WV
Start: score=2264.34
Step: score=2262.34, remove month_Sep
Step: score=2260.35, remove state_ND
Step: score=2258.37, remove state_OH
Step: score=2256.40, remove state_KS
Step: score=2254.43, remove state_NC
Step: score=2252.48, remove state_CT
Step: score=2250.53, remove state_KY
Step: score=2248.57, remove state_TN
Step: score=2246.62, remove state_UT
Step: score=2244.75, remove state_CO
Step: score=2242.90, remove state_FL
Step: score=2241.17, remove state_W

In [None]:
# Note which x's the model produces and where the regression statistics fall in comparison to later methods.

<h2> -- Forward selection </h2>

In [10]:
def train_model(variables):
    """Fit a LinearRegression on the given columns; return None when no columns are given."""
    if len(variables) > 0:
        return LinearRegression().fit(train_X[variables], train_y)
    return None

def score_model(model, variables):
    """Return the training-data AIC for `model`.

    With no variables, the score is computed for a constant prediction equal
    to the mean of train_y (passed to AIC_score with df=1).
    """
    if len(variables) > 0:
        fitted = model.predict(train_X[variables])
        return AIC_score(train_y, fitted, model)
    constant_prediction = [train_y.mean()] * len(train_y)
    return AIC_score(train_y, constant_prediction, model, df=1)

# Grow the model one variable at a time from the constant-only baseline
candidate_columns = train_X.columns
best_model, best_variables = forward_selection(
    candidate_columns,
    train_model,
    score_model,
    verbose=True,
)

print(best_variables)

# Evaluate the selected model on the held-out test set
test_predictions = best_model.predict(test_X[best_variables])
regressionSummary(test_y, test_predictions)

Variables: year, status_num, likes, month_Aug, month_Dec, month_Feb, month_Jan, month_Jul, month_Jun, month_Mar, month_May, month_Nov, month_Oct, month_Sep, state_CA, state_CO, state_CT, state_DC, state_DE, state_FL, state_GA, state_IL, state_IN, state_KS, state_KY, state_LA, state_MA, state_MD, state_ME, state_MI, state_MN, state_MO, state_NC, state_ND, state_NH, state_NJ, state_NM, state_NV, state_NY, state_OH, state_OK, state_OR, state_PA, state_SC, state_SD, state_TN, state_TX, state_UT, state_VA, state_WA, state_WI, state_WV
Start: score=2344.82, constant
Step: score=2264.54, add likes
Step: score=2250.79, add status_num
Step: score=2235.51, add state_CA
Step: score=2228.84, add state_GA
Step: score=2225.79, add state_TX
Step: score=2223.29, add month_Dec
Step: score=2220.85, add state_PA
Step: score=2218.31, add state_OR
Step: score=2216.17, add state_SC
Step: score=2214.94, add month_Aug
Step: score=2214.14, add month_Jan
Step: score=2213.93, add state_MI
Step: score=2213.76, ad

In [None]:
# Same assessment as above.

<h2> -- Stepwise selection </h2>

In [12]:
# This cell defines no train_model / score_model of its own; it uses whichever
# definitions are currently in the kernel (in notebook order, the ones from the
# forward-selection cell, which handle an empty variable list).
candidate_columns = train_X.columns
best_model, best_variables = stepwise_selection(
    candidate_columns,
    train_model,
    score_model,
    verbose=True,
)

print(best_variables)

# Evaluate the selected model on the held-out test set
test_predictions = best_model.predict(test_X[best_variables])
regressionSummary(test_y, test_predictions)

Variables: year, status_num, likes, month_Aug, month_Dec, month_Feb, month_Jan, month_Jul, month_Jun, month_Mar, month_May, month_Nov, month_Oct, month_Sep, state_CA, state_CO, state_CT, state_DC, state_DE, state_FL, state_GA, state_IL, state_IN, state_KS, state_KY, state_LA, state_MA, state_MD, state_ME, state_MI, state_MN, state_MO, state_NC, state_ND, state_NH, state_NJ, state_NM, state_NV, state_NY, state_OH, state_OK, state_OR, state_PA, state_SC, state_SD, state_TN, state_TX, state_UT, state_VA, state_WA, state_WI, state_WV
Start: score=2344.82, constant
Step: score=2264.54, add likes
Step: score=2250.79, add status_num
Step: score=2235.51, add state_CA
Step: score=2228.84, add state_GA
Step: score=2225.79, add state_TX
Step: score=2223.29, add month_Dec
Step: score=2220.85, add state_PA
Step: score=2218.31, add state_OR
Step: score=2216.17, add state_SC
Step: score=2214.94, add month_Aug
Step: score=2214.14, add month_Jan
Step: score=2213.93, add state_MI
Step: score=2213.76, ad

In [None]:
# Same assessment as above.

<h2> -- Ridge </h2>

In [None]:
# The implementations below are from scikit-learn (sklearn)

In [13]:
# Run the model using the same training and test objects as above.
# The `normalize=True` argument of Ridge was deprecated in scikit-learn 1.0 and
# removed in 1.2, so the scaling is done explicitly in a pipeline. Following the
# scikit-learn deprecation guidance, StandardScaler(with_mean=False) combined
# with alpha * n_samples reproduces the former Ridge(normalize=True, alpha=1).
ridge_alpha = 1
ridge = make_pipeline(
    StandardScaler(with_mean=False),
    Ridge(alpha=ridge_alpha * len(train_X)),
)
ridge.fit(train_X, train_y)
regressionSummary(test_y, ridge.predict(test_X))


Regression statistics

                      Mean Error (ME) : 0.0226
       Root Mean Squared Error (RMSE) : 1.0553
            Mean Absolute Error (MAE) : 0.8819
          Mean Percentage Error (MPE) : -12.9479
Mean Absolute Percentage Error (MAPE) : 32.7378


In [43]:
# Determine which X's were selected
# NOTE(review): this fits a fresh Ridge() with default settings rather than
# reusing the `ridge` model evaluated in the previous cell - confirm intended.
selector = SelectFromModel(Ridge())
selector = selector.fit(train_X, train_y)
selected_mask = selector.get_support()
train_X.columns[selected_mask]

Index(['status_num', 'month_Dec', 'state_CA', 'state_DC', 'state_DE',
       'state_GA', 'state_IL', 'state_IN', 'state_LA', 'state_ME', 'state_MI',
       'state_MN', 'state_MO', 'state_NH', 'state_NJ', 'state_NM', 'state_NY',
       'state_OK', 'state_OR', 'state_PA', 'state_SC', 'state_SD', 'state_TX',
       'state_VA'],
      dtype='object')

In [None]:
# Same assessment as above.

<h2> -- Lasso </h2>

In [44]:
# The `normalize=True` argument of Lasso was deprecated in scikit-learn 1.0 and
# removed in 1.2, so the scaling is done explicitly in a pipeline. Following the
# scikit-learn deprecation guidance, StandardScaler(with_mean=False) combined
# with alpha * sqrt(n_samples) reproduces the former Lasso(normalize=True, alpha=1).
lasso_alpha = 1
lasso = make_pipeline(
    StandardScaler(with_mean=False),
    Lasso(alpha=lasso_alpha * len(train_X) ** 0.5),
)
lasso.fit(train_X, train_y)
regressionSummary(test_y, lasso.predict(test_X))


Regression statistics

                      Mean Error (ME) : 0.0163
       Root Mean Squared Error (RMSE) : 1.0626
            Mean Absolute Error (MAE) : 0.9031
          Mean Percentage Error (MPE) : -14.1541
Mean Absolute Percentage Error (MAPE) : 34.2501


In [45]:
# Determine which X's were selected
# NOTE(review): this fits a fresh Lasso() with default settings rather than
# reusing the `lasso` model evaluated in the previous cell - confirm intended.
selector = SelectFromModel(Lasso())
selector = selector.fit(train_X, train_y)
selected_mask = selector.get_support()
train_X.columns[selected_mask]

Index(['likes'], dtype='object')

In [None]:
# Same assessment as above.

<h2> -- Next steps + extensions </h2>

In [None]:
# Application to logistic regression:
# NOTE: running this cell rebinds `independent` and `dependent`, which the
# data-prep cells earlier in the notebook also define.

# Change your X's and y so that "dependent" captures your binary dependent variable
independent = ['month', 'year', 'state', 'likes']
dependent = 'status_num'

# Then change references to LinearRegression to LogisticRegression throughout the code.
# Note that if you get errors about iterations when you switch from linear to logistic,
# add a max_iter option where the code calls LogisticRegression, as follows:
model = LogisticRegression(max_iter=400)

# Forward and stepwise selection may generate errors in logistic regression applications.

In [None]:
# Once you have selected your variables, 
# go back to regression for interpretation or prediction with your new set of x's

# Can you beat your earlier prediction performance?

In [None]:
# Extensions within sklearn: LassoCV, BayesianRidge