# Modeling

# Imports

In [1]:
#export
import pkg_resources
from pkg_resources import DistributionNotFound, VersionConflict
from platform import python_version
import numpy as np
import pandas as pd
import time
import gc
import random
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from scipy import stats
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix

# Data Import
Now for the fun stuff. Let’s import some data!

Clean Dirty Data

In [2]:
path1 = 'final_data.csv'
dirty_data = pd.read_csv(path1)
hours_traveled = dirty_data[~dirty_data['hours_traveled'].isna()]
no_nulls = hours_traveled.fillna(0)
less_dirty_data = pd.get_dummies(no_nulls, columns=['contracttypelocale','buildingtype',"Max Type"])
less_dirty_data.to_csv('cleaned_final_data.csv')

In [3]:
len(less_dirty_data)

660

Create Funnel

In [2]:
path2 = 'cleaned_final_data.csv'
# pd.read_csv(path2).columns
features = [
    'Has Max',
    # 'Max Type_MAX Pro',
    # 'Max Type_MAX Digital Subscription for NIM',
    'hours_traveled',
    'User_Gen_Alerts',
    'Non_User_Gen_Alerts',
    'tickets',
    'ticket_hours',
    'contracttypelocale_Bronze',
    'contracttypelocale_Bronze w/ Phone Monitoring',
    'contracttypelocale_Gold',
    'contracttypelocale_Gold w/ Phone Monitoring', 'contracttypelocale_NIM',
    'contracttypelocale_Platinum',
    'contracttypelocale_Platinum Premiere w/ Phone Monitoring',
    'contracttypelocale_Platinum w/ Phone Monitoring', 'buildingtype_0',
    'buildingtype_Education / Religion',
    'buildingtype_Entertainment / Leisure',
    'buildingtype_Hospital / Healthcare', 'buildingtype_Hotel / Restaurant',
    'buildingtype_Industrial', 'buildingtype_Mixed-Use Buildings',
    'buildingtype_Office', 'buildingtype_Parking Garage',
    'buildingtype_Private Residential - Comfort *',
    'buildingtype_Private Residential - Premium *', 'buildingtype_Retail']
    # 'ContractTypeLocale_Bronze',
    # 'ContractTypeLocale_Bronze w/ Phone Monitoring',
    # 'ContractTypeLocale_Gold',
    # 'ContractTypeLocale_Gold w/ Phone Monitoring',
    # 'ContractTypeLocale_NIM',
    # 'ContractTypeLocale_Platinum',
    # 'ContractTypeLocale_Platinum Premiere w/ Phone Monitoring',
    # 'ContractTypeLocale_Platinum w/ Phone Monitoring',


In [3]:
#export
class Data():
    def __init__(self):
        self.x_data = None
        self.y_data = None
        self.x_train = None
        self.y_train = None
        self.x_test = None
        self.y_test = None

    def dataAllocation(self,path):
        # TODO: Separate out the x_data and y_data and return each
        # args: string path for .csv file
        # return: pandas dataframe, pandas series
        # -------------------------------
        all_data = pd.read_csv(path)
        x_data = all_data[features]
        y_data = all_data['renewed']
        # -------------------------------

        self.x_data = x_data
        self.y_data = y_data
        print('Allocation complete')

    def trainSets(self):
        # TODO: Split 75% of the data into training and 25% into test sets. Call them x_train, x_test, y_train and y_test.
        # Use the train_test_split method in sklearn with the parameter 'shuffle' set to true and the 'random_state' set to 614.
        # args: pandas dataframe, pandas dataframe
        # return: pandas dataframe, pandas dataframe, pandas series, pandas series
        # -------------------------------
        os = SMOTE(random_state=0)
        x_train, x_test, y_train, y_test = train_test_split(self.x_data,self.y_data, test_size=0.25, train_size=None, random_state=614, shuffle=True, stratify=None)
        os_data_x,os_data_y=os.fit_resample(x_train, y_train)
        os_data_x = pd.DataFrame(data=os_data_x,columns=features)
        os_data_y = pd.DataFrame(data=os_data_y,columns=['renewed'])['renewed']
        # Check that data was oversampled correctly
        print("length of actual training data is ",len(y_train))
        print("length of oversampled data is ",len(os_data_x))
        print("Number of 'not renewed' in oversampled data",len(os_data_y[os_data_y==0]))
        print("Number of 'not renewed' in actual training data",len(y_train[y_train==0]))
        print("Proportion of 'not renewed' data in oversampled data is ",len(os_data_y[os_data_y==0])/len(os_data_x))
        print("Proportion of 'not renewed' data in actual training data is ",len(y_train[y_train==0])/len(y_train))
        # -------------------------------
        self.x_train = os_data_x
        self.x_test = x_test
        self.y_train = os_data_y
        self.y_test = y_test
        print('Train/Test split complete')
##################################################
##### Do not add anything below this line ########
data = Data()
data.dataAllocation(path2)
data.trainSets()
##################################################

Allocation complete
length of actual training data is  495
length of oversampled data is  750
Number of 'not renewed' in oversampled data 375
Number of 'not renewed' in actual training data 120
Proportion of 'not renewed' data in oversampled data is  0.5
Proportion of 'not renewed' data in actual training data is  0.24242424242424243
Train/Test split complete


In [5]:
# data.x_train.to_csv('train_x.csv')
# data.y_train.to_csv('train_y.csv')
# data.x_test.to_csv('test_x.csv')
# data.y_test.to_csv('test_y.csv')
corr = data.x_train[
    ['Has Max',
    'hours_traveled',
    'User_Gen_Alerts',
    'Non_User_Gen_Alerts',
    'tickets',
    'ticket_hours',
    'buildingtype_Office']
].corr()
corr.style.background_gradient(cmap='coolwarm').set_precision(2)

Unnamed: 0,Has Max,hours_traveled,User_Gen_Alerts,Non_User_Gen_Alerts,tickets,ticket_hours,buildingtype_Office
Has Max,1.0,-0.02,-0.09,-0.03,-0.07,-0.03,0.13
hours_traveled,-0.02,1.0,0.13,0.12,0.12,0.03,-0.04
User_Gen_Alerts,-0.09,0.13,1.0,0.55,0.38,0.19,0.0
Non_User_Gen_Alerts,-0.03,0.12,0.55,1.0,0.51,0.28,-0.08
tickets,-0.07,0.12,0.38,0.51,1.0,0.55,-0.08
ticket_hours,-0.03,0.03,0.19,0.28,0.55,1.0,0.01
buildingtype_Office,0.13,-0.04,0.0,-0.08,-0.08,0.01,1.0


In [16]:
import matplotlib.pyplot as plt
import plotly.express as px
df = data.x_train[
    ['Has Max',
    'hours_traveled',
    'User_Gen_Alerts',
    'Non_User_Gen_Alerts',
    'tickets',
    'ticket_hours',
    'buildingtype_Office']
]
# f = plt.figure(figsize=(19, 15))
# my_dpi = 45
# # f = plt.figure(figsize=(800/my_dpi, 800/my_dpi), dpi=my_dpi)
# plt.matshow(corr, fignum=f.number)
# plt.xticks(range(df.select_dtypes(['number']).shape[1]), df.select_dtypes(['number']).columns, fontsize=24, rotation=45)
# plt.yticks(range(df.select_dtypes(['number']).shape[1]), df.select_dtypes(['number']).columns, fontsize=24)
# cb = plt.colorbar()
# cb.ax.tick_params(labelsize=18)
# plt.title('Correlation Matrix', fontsize=20)
fig = px.imshow(corr)
fig.show()

# Logistic Regression Classifier

In [17]:
class LogClassifier():

    def logisticClassifier(self,x_train,x_test, y_train):
        # TODO: Create a RandomForestClassifier and train it. Set Random state to 614.
        # args: pandas dataframe, pandas dataframe, pandas series
        # return: RandomForestClassifier object, numpy array, numpy array
        # -------------------------------
        clf = LogisticRegression(random_state=614)
        clf.fit(x_train, y_train)
        y_predict_train = clf.predict(x_train)
        y_predict_test = clf.predict(x_test)
        # -------------------------------
        return clf, y_predict_train, y_predict_test

    def rfTrainAccuracy(self,y_train,y_predict_train):
        # TODO: Return accuracy on the training set using the accuracy_score method.
        # args: pandas series, numpy array
        # return: float
        # -------------------------------
        train_accuracy = accuracy_score(y_train, y_predict_train)
        # -------------------------------
        return train_accuracy

    def rfTestAccuracy(self,y_test,y_predict_test):
        # TODO: Return accuracy on the test set using the accuracy_score method.
        # args: pandas series, numpy array
        # return: float
        # -------------------------------
        test_accuracy = accuracy_score(y_test, y_predict_test)
        # -------------------------------
        return test_accuracy

#Feature Importance

    def rfGetParams(self,clf):
        # TODO: Determine the feature importance as evaluated by the Random Forest Classifier.
        # args: RandomForestClassifier object
        # return: float array
        # -------------------------------
        params = clf.get_params()
        # -------------------------------
        return params
##################################################
##### Do not add anything below this line ########
log_model = LogClassifier()
clf, y_predict_train, y_predict_test = log_model.logisticClassifier(x_train = data.x_train, x_test = data.x_test, y_train = data.y_train)
train_accuracy = log_model.rfTrainAccuracy(data.y_train, y_predict_train)
print('Basic Model')
print(f'Train Accuracy: {train_accuracy}')
test_accuracy = log_model.rfTestAccuracy(data.y_test, y_predict_test)
print(f'Test Accuracy: {test_accuracy}')

# est = sm.OLS(data.y_train, data.x_train[['Has Max']])
# est2 = est.fit()
# print(est2.summary())

Basic Model
Train Accuracy: 0.7573333333333333
Test Accuracy: 0.6909090909090909



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



# Random Forest Classifier

In [18]:
#export
class RFClassifier():

    def randomForestClassifier(self,x_train,x_test, y_train):
        # TODO: Create a RandomForestClassifier and train it. Set Random state to 614.
        # args: pandas dataframe, pandas dataframe, pandas series
        # return: RandomForestClassifier object, numpy array, numpy array
        # -------------------------------
        rf_clf = RandomForestClassifier(random_state=614)
        rf_clf.fit(x_train, y_train)
        y_predict_train = rf_clf.predict(x_train)
        y_predict_test = rf_clf.predict(x_test)
        # -------------------------------
        return rf_clf, y_predict_train, y_predict_test

    def rfTrainAccuracy(self,y_train,y_predict_train):
        # TODO: Return accuracy on the training set using the accuracy_score method.
        # args: pandas series, numpy array
        # return: float
        # -------------------------------
        train_accuracy = accuracy_score(y_train, y_predict_train)
        # -------------------------------
        return train_accuracy

    def rfTestAccuracy(self,y_test,y_predict_test):
        # TODO: Return accuracy on the test set using the accuracy_score method.
        # args: pandas series, numpy array
        # return: float
        # -------------------------------
        test_accuracy = accuracy_score(y_test, y_predict_test)
        # -------------------------------
        return test_accuracy

#Feature Importance

    def rfFeatureImportance(self,rf_clf):
        # TODO: Determine the feature importance as evaluated by the Random Forest Classifier.
        # args: RandomForestClassifier object
        # return: float array
        # -------------------------------
        feature_importance = rf_clf.feature_importances_
        # sorted_indices = np.argsort(importances)[::-1]
        # feature_importance = importances[sorted_indices]
        # -------------------------------
        return feature_importance

    def sortedRFFeatureImportanceIndicies(self,rf_clf):
        # TODO: Sort them in the descending order and return the feature numbers[0 to ...].
        #       Hint: There is a direct function available in sklearn to achieve this. Also checkout argsort() function in Python.
        # args: RandomForestClassifier object
        # return: int array
        # -------------------------------
        feature_importance = rf_clf.feature_importances_
        sorted_indices = np.argsort(feature_importance)[::-1]
        # -------------------------------
        return sorted_indices

#Hyper-parameter Tuning

    def hyperParameterTuning(self,rf_clf,x_train,y_train):
        # TODO: Tune the hyper-parameters 'n_estimators' and 'max_depth'.
        # Define param_grid for GridSearchCV as a dictionary
        # args: RandomForestClassifier object, pandas dataframe, pandas series
        # return: GridSearchCV object
        # -------------------------------
        param_grid = {'n_estimators':[4,16,256], 'max_depth':[2, 8, 16]}
        gscv_rfc = GridSearchCV(rf_clf, param_grid)
        gscv_rfc.fit(x_train, y_train)
        # -------------------------------
        return gscv_rfc

    def bestParams(self,gscv_rfc):
        # TODO: Get the best params, using .best_params_
        # args:  GridSearchCV object
        # return: parameter dict
        # -------------------------------
        best_params = gscv_rfc.best_params_
        # -------------------------------
        return best_params

    def bestScore(self,gscv_rfc):
        # TODO: Get the best score, using .best_score_.
        # args: GridSearchCV object
        # return: float
        # -------------------------------
        best_score = gscv_rfc.best_score_
        # -------------------------------
        return best_score

    def randomForestClassifier_Tuned(self,x_train,x_test, y_train):
        # TODO: Create a RandomForestClassifier and train it. Set Random state to 614.
        # args: pandas dataframe, pandas dataframe, pandas series
        # return: RandomForestClassifier object, numpy array, numpy array
        # -------------------------------
        rf_clf = RandomForestClassifier(random_state=614, max_depth=8, n_estimators=4)
        rf_clf.fit(x_train, y_train)
        y_predict_train = rf_clf.predict(x_train)
        y_predict_test = rf_clf.predict(x_test)
        # -------------------------------
        return rf_clf, y_predict_train, y_predict_test

##################################################
##### Do not add anything below this line ########
forest = RFClassifier()
rf_clf, y_predict_train, y_predict_test = forest.randomForestClassifier(x_train = data.x_train, x_test = data.x_test, y_train = data.y_train)
train_accuracy = forest.rfTrainAccuracy(data.y_train, y_predict_train)
print('Basic Model')
print(f'Train Accuracy: {train_accuracy}')
test_accuracy = forest.rfTestAccuracy(data.y_test, y_predict_test)
print(f'Test Accuracy: {test_accuracy}')
feature_importance = forest.rfFeatureImportance(rf_clf)
sorted_indices = forest.sortedRFFeatureImportanceIndicies(rf_clf)
print("Sorted Features (by importance):")
n = 1
for ind in sorted_indices:
    print(f'...{n}. {feature_importance[ind]}  {features[ind]}')
    n+=1
    if n ==16:
        break
gscv_rfc = forest.hyperParameterTuning(rf_clf, data.x_train, data.y_train)
best_params = forest.bestParams(gscv_rfc)
print('Tuning:')
print(f'Best Parameters: {best_params}')
bestScore = forest.bestScore(gscv_rfc)
print(f'bestScore: {bestScore}')
tuned_rf_clf, tuned_y_predict_train, tuned_y_predict_test = forest.randomForestClassifier_Tuned(x_train = data.x_train, x_test = data.x_test, y_train = data.y_train)
print('Tuned Model')
tuned_train_accuracy = forest.rfTrainAccuracy(data.y_train, tuned_y_predict_train)
print(f'Tuned Train Accuracy: {tuned_train_accuracy}')
tuned_test_accuracy = forest.rfTestAccuracy(data.y_test, tuned_y_predict_test)
# print(f'Tuned Test Accuracy: {tuned_test_accuracy}')
##################################################

Basic Model
Train Accuracy: 1.0
Test Accuracy: 0.793939393939394
Sorted Features (by importance):
...1. 0.287249072216247  hours_traveled
...2. 0.09665018539769779  User_Gen_Alerts
...3. 0.08139427975691284  Non_User_Gen_Alerts
...4. 0.07266944948851302  buildingtype_Office
...5. 0.0721680477657909  ticket_hours
...6. 0.06552022576753447  tickets
...7. 0.05311429427722605  buildingtype_Hospital / Healthcare
...8. 0.03671733898412571  buildingtype_0
...9. 0.032617994923565666  Has Max
...10. 0.024472772566230535  contracttypelocale_NIM
...11. 0.022787626562461933  buildingtype_Parking Garage
...12. 0.02148343585257382  contracttypelocale_Gold w/ Phone Monitoring
...13. 0.017490356257876465  buildingtype_Hotel / Restaurant
...14. 0.016588286475109577  buildingtype_Private Residential - Comfort *
...15. 0.01608332497612114  buildingtype_Education / Religion
Tuning:
Best Parameters: {'max_depth': 16, 'n_estimators': 16}
bestScore: 0.8573333333333333
Tuned Model
Tuned Train Accuracy: 0.868


# SVM Classifier

In [19]:
#export
class SupportVectorMachine():

    def dataPreProcess(self,x_train,x_test):
        # TODO: Pre-process the data to standardize it, otherwise the grid search will take much longer.
        # args: pandas dataframe, pandas dataframe
        # return: pandas dataframe, pandas dataframe
        # -------------------------------
        scaler = StandardScaler()
        scaler.fit(x_train)
        scaled_x_train = scaler.transform(x_train)
        scaled_x_test = scaler.transform(x_test)
        # -------------------------------
        return scaled_x_train, scaled_x_test

    def SVCClassifier(self,scaled_x_train,scaled_x_test, y_train):
        # TODO: Create a SVC classifier and train it. Set gamma = 'auto'
        # args: pandas dataframe, pandas dataframe, pandas series
        # return: numpy array, numpy array
        # -------------------------------
        clf = SVC(gamma='auto')
        clf.fit(scaled_x_train, y_train)
        y_predict_train = clf.predict(scaled_x_train)
        y_predict_test = clf.predict(scaled_x_test)
        # -------------------------------
        return y_predict_train,y_predict_test

    def SVCTrainAccuracy(self,y_train,y_predict_train):
        # TODO: Return accuracy on the training set using the accuracy_score method.
        # args: pandas series, numpy array
        # return: float
        # -------------------------------
        train_accuracy = accuracy_score(y_train, y_predict_train)
        # -------------------------------
        return train_accuracy

    def SVCTestAccuracy(self,y_test,y_predict_test):
        # TODO: Return accuracy on the test set using the accuracy_score method.
        # args: pandas series, numpy array
        # return: float
        # -------------------------------
        test_accuracy = accuracy_score(y_test, y_predict_test)
        # -------------------------------
        return test_accuracy

    def SVMBestScore(self, scaled_x_train, y_train):
        # TODO: Tune the hyper-parameters 'C' and 'kernel' (use rbf and linear).
        # Note: Set n_jobs = -1 and return_train_score = True and gamma = 'auto'
        # args: pandas dataframe, pandas series
        # return: GridSearchCV object, float
        # -------------------------------
        svm_parameters = {'kernel':('linear', 'rbf'), 'C':[0.01, 0.1, 1.0]}
        clf = SVC(gamma='auto')
        svm_cv = GridSearchCV(
            clf, svm_parameters, scoring=None, n_jobs=-1, refit=True, return_train_score=True)
        svm_cv.fit(scaled_x_train, y_train)
        best_score = svm_cv.best_score_
        # -------------------------------
        return svm_cv, best_score

    def SVCClassifierParam(self,svm_cv,scaled_x_train,scaled_x_test,y_train):
        # TODO: Calculate the training and test set predicted values after hyperparameter tuning and standardization.
        # args: GridSearchCV object, pandas dataframe, pandas dataframe, pandas series
        # return: numpy series, numpy series
        # -------------------------------
        y_predict_train = svm_cv.predict(scaled_x_train)
        y_predict_test = svm_cv.predict(scaled_x_test)
        # -------------------------------
        return y_predict_train,y_predict_test

    def svcTrainAccuracy(self,y_train,y_predict_train):
        # TODO: Return accuracy (on the training set) using the accuracy_score method.
        # args: pandas series, numpy array
        # return: float
        # -------------------------------
        train_accuracy = accuracy_score(y_train, y_predict_train)
        # -------------------------------
        return train_accuracy

    def svcTestAccuracy(self,y_test,y_predict_test):
        # TODO: Return accuracy (on the test set) using the accuracy_score method.
        # args: pandas series, numpy array
        # return: float
        # -------------------------------
        test_accuracy = accuracy_score(y_test, y_predict_test)
        # -------------------------------
        return test_accuracy

#Cross Validation Results

    def SVMRankTestScore(self,svm_cv):
        # TODO: Return the rank test score for all hyperparameter values that you obtained in Q3.4.3. The
        # GridSearchCV class holds a 'cv_results_' dictionary that should help you report these metrics easily.
        # args: GridSearchCV object
        # return: int array
        # -------------------------------
        rank_test_score = svm_cv.cv_results_['rank_test_score']
        # -------------------------------
        return rank_test_score

    def SVMMeanTestScore(self,svm_cv):
        # TODO: Return mean test score for all of hyperparameter values that you obtained in Q3.4.3. The
        # GridSearchCV class holds a 'cv_results_' dictionary that should help you report these metrics easily.
        # args: GridSearchCV object
        # return: float array
        # -------------------------------
        mean_test_score = svm_cv.cv_results_['mean_test_score']
        # -------------------------------
        return mean_test_score

##################################################
##### Do not add anything below this line ########
svm = SupportVectorMachine()
scaled_x_train, scaled_x_test = svm.dataPreProcess(data.x_train,data.x_test)
y_predict_train,y_predict_test = svm.SVCClassifier(scaled_x_train,scaled_x_test, data.y_train)
test_accuracy = svm.SVCTestAccuracy(data.y_test, y_predict_test)
train_accuracy = svm.SVCTestAccuracy(data.y_train, y_predict_train)
print('Basic Model')
print(f'Train Accuracy: {train_accuracy}')
print(f'Test Accuracy: {test_accuracy}')
tune_svm, best_accuracy = svm.SVMBestScore(scaled_x_train, data.y_train)
print(f'Best Accuracy: {best_accuracy}')

Basic Model
Train Accuracy: 0.8466666666666667
Test Accuracy: 0.7696969696969697
Best Accuracy: 0.8066666666666666


radial basis function kernel