# CS 559: Final Project - Individual Part
Zheng Ke Xu

## 1. Subgroup 2379
Goal is to apply 3 different supervised learning models. We're going to rely on gridsearch CV to perform cross validation.

In [43]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

#we can actually apply this notebook to any subgroup. All subgroups are already preprocessed
df = pd.read_csv('./subgroup_2379.csv')

#Splitting data into 
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

x_train = train_set.drop(columns='Bankrupt?')
y_train = train_set['Bankrupt?']

x_test = test_set.drop(columns='Bankrupt?')
y_test = test_set['Bankrupt?']

print(df)

      Bankrupt?   ROA(C) before interest and depreciation before interest  \
0           0.0                                           0.404409          
1           0.0                                           1.081097          
2           0.0                                           0.143034          
3           0.0                                           0.284946          
4           0.0                                           1.392983          
...         ...                                                ...          
2374        0.0                                           1.114771          
2375        0.0                                           0.018761          
2376        0.0                                           0.795670          
2377        0.0                                           0.135016          
2378        0.0                                           0.532691          

       ROA(A) before interest and % after tax  \
0                         

### Part 1.1: SVM

In [44]:
from sklearn.svm import SVC

#Training model and cross validation
svc = SVC(kernel='rbf', gamma='auto')

param_grid_svc = {
    'C': range(1, 10, 1)
}

grid_search_svc = GridSearchCV(
    estimator=svc,
    param_grid=param_grid_svc,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

start_time = time.time()
grid_search_svc.fit(x_train, y_train)
end_time = time.time()
tuning_time = end_time - start_time

svc = grid_search_svc.best_estimator_
svc_param = grid_search_svc.best_params_
svc_results = pd.DataFrame(grid_search_svc.cv_results_)

print("Tuning time: " + str(tuning_time) + "\n")
print("R2 values: \n" + str(svc_results[['mean_score_time', 'mean_test_score', 'rank_test_score']]) + "\n")
print("Best SVM parameters: " + str(svc_param))

Fitting 3 folds for each of 9 candidates, totalling 27 fits
Tuning time: 0.027377605438232422

R2 values: 
   mean_score_time  mean_test_score  rank_test_score
0         0.001966         0.998424                1
1         0.002059         0.998424                1
2         0.002194         0.998424                1
3         0.002024         0.998424                1
4         0.001867         0.998424                1
5         0.002068         0.998424                1
6         0.001964         0.998424                1
7         0.001871         0.998424                1
8         0.001376         0.998424                1

Best SVM parameters: {'C': 1}


In [45]:
#Testing model SV
svc_pred = svc.predict(x_test)
svc_acc = accuracy_score(y_test, svc_pred)

print(f"SVC accuracy score {svc_acc}")

SVC accuracy score 1.0


### Part 1.2: Logistic Regression

In [46]:
from sklearn.linear_model import LogisticRegression

#Training model and cross validation
logreg = LogisticRegression(solver='liblinear', random_state=42)

param_grid_logreg = {
    'penalty': ['l1', 'l2'],
    'C': [0.0001, 0.001, 0.01, 0.1, 1, 10]
}

grid_search_logreg = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid_logreg,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

start_time = time.time()
grid_search_logreg.fit(x_train, y_train)
end_time = time.time()
tuning_time = end_time - start_time

logreg = grid_search_logreg.best_estimator_
logreg_param = grid_search_logreg.best_params_
logreg_results = pd.DataFrame(grid_search_logreg.cv_results_)

print("Tuning time: " + str(tuning_time) + "\n")
print("R2 values: \n" + str(logreg_results[['mean_score_time', 'mean_test_score', 'rank_test_score']]) + "\n")
print("Best logistic regression parameters: " + str(logreg_param))

Fitting 3 folds for each of 12 candidates, totalling 36 fits
Tuning time: 0.04812049865722656

R2 values: 
    mean_score_time  mean_test_score  rank_test_score
0          0.001084         0.998424                1
1          0.001072         0.998424                1
2          0.001051         0.998424                1
3          0.000976         0.998424                1
4          0.001253         0.998424                1
5          0.001256         0.998424                1
6          0.001086         0.998424                1
7          0.001097         0.998424                1
8          0.001214         0.998424                1
9          0.001181         0.998424                1
10         0.000885         0.996847               12
11         0.000831         0.997898               11

Best logistic regression parameters: {'C': 0.0001, 'penalty': 'l1'}


In [47]:
#Testing model Logistic Regression
logreg_pred = logreg.predict(x_test)
logreg_acc = accuracy_score(y_test, logreg_pred)

print(f"Logistic Regression accuracy score {logreg_acc}")

Logistic Regression accuracy score 1.0


### Part 1.3: Naive Bayes

In [48]:
from sklearn.naive_bayes import GaussianNB

#Training model and cross validation
naiveB = GaussianNB()

param_grid_naiveB = {
    'var_smoothing': [0.0001, 0.001, 0.01, 0.1, 1, 10]
}

grid_search_naiveB = GridSearchCV(
    estimator=naiveB,
    param_grid=param_grid_naiveB,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

start_time = time.time()
grid_search_naiveB.fit(x_train, y_train)
end_time = time.time()
tuning_time = end_time - start_time

naiveB = grid_search_naiveB.best_estimator_
naiveB_param = grid_search_naiveB.best_params_
naiveB_results = pd.DataFrame(grid_search_naiveB.cv_results_)

print("Tuning time: " + str(tuning_time) + "\n")
print("R2 values: \n" + str(naiveB_results[['mean_score_time', 'mean_test_score', 'rank_test_score']]) + "\n")
print("Best Naive Bayes parameters: " + str(naiveB_param))

Fitting 3 folds for each of 6 candidates, totalling 18 fits
Tuning time: 0.024627685546875

R2 values: 
   mean_score_time  mean_test_score  rank_test_score
0         0.001116         0.997898                3
1         0.001116         0.995795                4
2         0.001150         0.990540                5
3         0.001009         0.968995                6
4         0.000866         0.998424                1
5         0.001149         0.998424                1

Best Naive Bayes parameters: {'var_smoothing': 1}


In [49]:
#Testing model Naive Bayes
naiveB_pred = naiveB.predict(x_test)
naiveB_acc = accuracy_score(y_test, naiveB_pred)

print(f"Naive Bayes accuracy score {naiveB_acc}")

Naive Bayes accuracy score 1.0


## 2. Stacking Model
Combine our 3 base models into a stacking model. Our meta learner is Logistic Regression.

In [50]:
from sklearn.ensemble import StackingClassifier

#Training stacked models
estimators = [
    ('svc', svc),
    ('logreg', logreg),
    ('naiveB', naiveB)
]

meta_learner = LogisticRegression(solver='liblinear')

stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_learner,
    cv=3,
    passthrough=False
)

stacking_model.fit(x_train, y_train)
stack_pred = stacking_model.predict(x_test)
stack_acc = accuracy_score(y_test, stack_pred)

print(f"Stacked model score {naiveB_acc}")

Stacked model score 1.0


This data set is very sparse and may not necessarily be a good representative of the final model.

In [51]:
import joblib

joblib.dump(stacking_model.predict, 'stacking_model_zx.joblib')

['stacking_model_zx.joblib']