In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import chi2, SelectFromModel
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from xgboost import XGBClassifier

In [2]:
train = pd.read_csv("CSI4810_projectVehicleData/slectedFeaturesTrain.csv")
target = train.pop("loan_default")
test = pd.read_csv("CSI4810_projectVehicleData/slectedFeaturesTest.csv")

---

#### <b>Modeling</b>

---

##### Testing different LogReg model parameters

In [3]:
normalizedTrain = pd.DataFrame(StandardScaler().fit_transform(train), columns=train.columns)
X, Xtest, y, ytest = train_test_split(normalizedTrain, target, test_size=0.2, random_state=42)

In [None]:
logParams = {
    'solver': ['lbfgs','sag'],
    'class_weight': ['balanced',None],
    'C': [10,1,.01,.001],
}

grid = GridSearchCV(LogisticRegression(max_iter=1000,random_state=42,penalty='l2'), logParams, scoring='f1',verbose=2,n_jobs=-1) 
grid.fit(X,y)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [166]:
print(grid.best_params_)
logMdl = LogisticRegression(max_iter=200,solver='lbfgs',C=.01,random_state=42,class_weight='balanced').fit(X,y)
print(classification_report(y, logMdl.predict(X)))
print("Validation\n",classification_report(ytest,logMdl.predict(Xtest)))

              precision    recall  f1-score   support

           0       0.84      0.54      0.66    145863
           1       0.28      0.64      0.39     40530

    accuracy                           0.56    186393
   macro avg       0.56      0.59      0.52    186393
weighted avg       0.72      0.56      0.60    186393

Validation
               precision    recall  f1-score   support

           0       0.85      0.53      0.66     36550
           1       0.28      0.65      0.39     10049

    accuracy                           0.56     46599
   macro avg       0.56      0.59      0.52     46599
weighted avg       0.72      0.56      0.60     46599



##### Testing different random forest model parameters

In [80]:
treeParams = {
    'max_depth': [None,10,20,25],
    'min_samples_leaf': [1,3,5],
    'min_impurity_decrease': [0,.01],
}
grid = GridSearchCV(RandomForestClassifier(random_state=42,class_weight='balanced',criterion='entropy'), treeParams, scoring='f1',verbose=2,n_jobs=-1) 
grid.fit(X,y)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [161]:
print(grid.best_params_)
tree = RandomForestClassifier(max_depth=25,min_samples_leaf=3, class_weight='balanced',random_state=42,criterion='entropy').fit(X,y)
print(classification_report(y, tree.predict(X)))
print("Validation\n",classification_report(ytest,tree.predict(Xtest)))

              precision    recall  f1-score   support

           0       0.96      0.89      0.92    145863
           1       0.68      0.88      0.77     40530

    accuracy                           0.89    186393
   macro avg       0.82      0.88      0.85    186393
weighted avg       0.90      0.89      0.89    186393

Validation
               precision    recall  f1-score   support

           0       0.82      0.82      0.82     36550
           1       0.33      0.33      0.33     10049

    accuracy                           0.71     46599
   macro avg       0.57      0.57      0.57     46599
weighted avg       0.71      0.71      0.71     46599



##### Testing different XGBoot model parameters

In [133]:
xgbParams = {
    'max_depth': [3,5,7,10,15,20],
    'min_child_weight': [1,3,5],
    'learning_rate': [.1,.05,.01],
    'n_estimators':[50,100,250,500],
    'scale_pos_weight': [3.25,3.5,3.75]
}
grid = GridSearchCV(XGBClassifier(booster='gbtree',
                    scale_pos_weight=3.5,
                    objective='binary:logistic',
                    seed=42,
                    ), xgbParams, scoring='f1',verbose=2,n_jobs=-1) 
grid.fit(X,y)

Fitting 5 folds for each of 648 candidates, totalling 3240 fits


In [4]:
print(grid.best_params_) 
xgb = XGBClassifier(booster='gbtree',
                    scale_pos_weight=3.75,
                    objective='binary:logistic',
                    seed=42,
                    n_estimators=250,
                    min_child_weight=1,
                    max_depth=5,
                    learning_rate=.1
                    ).fit(X,y)
print(classification_report(y, xgb.predict(X)))
print("Validation\n",classification_report(ytest,xgb.predict(Xtest)))

              precision    recall  f1-score   support

           0       0.88      0.59      0.70    145863
           1       0.32      0.71      0.45     40530

    accuracy                           0.61    186393
   macro avg       0.60      0.65      0.57    186393
weighted avg       0.76      0.61      0.65    186393

Validation
               precision    recall  f1-score   support

           0       0.86      0.58      0.69     36550
           1       0.30      0.67      0.42     10049

    accuracy                           0.60     46599
   macro avg       0.58      0.62      0.55     46599
weighted avg       0.74      0.60      0.63     46599



---

### Scoring & Validation

In [136]:
def score_dataset(xx, yy,model):
    score = cross_validate(
        model, xx, yy, cv=5, scoring=['f1_weighted','roc_auc','accuracy'],verbose=2
    )
    return (("f1_weighted",score['test_f1_weighted'].mean(), score['test_f1_weighted'].std()),
             ("accuracy",score['test_accuracy'].mean(),score['test_accuracy'].std()),
               ("roc_auc",score['test_roc_auc'].mean(), score['test_roc_auc'].std()))

In [137]:
score_dataset(X, y,
                model=LogisticRegression(max_iter=200,solver='lbfgs',C=.01,class_weight='balanced'))

[CV] END .................................................... total time=   0.3s
[CV] END .................................................... total time=   0.3s
[CV] END .................................................... total time=   0.3s
[CV] END .................................................... total time=   0.4s
[CV] END .................................................... total time=   0.4s


(('f1_weighted', 0.597426128478317, 0.0009105944286623746),
 ('accuracy', 0.5592162751301037, 0.001020744253287091),
 ('roc_auc', 0.6211699121345514, 0.0015551296441321878))

In [24]:
score_dataset(X, y,
                model=RandomForestClassifier(max_depth=25,min_samples_leaf=3, class_weight='balanced',criterion='entropy'))

[CV] END .................................................... total time= 1.2min
[CV] END .................................................... total time= 1.3min
[CV] END .................................................... total time= 1.2min
[CV] END .................................................... total time= 1.1min
[CV] END .................................................... total time= 1.2min


(('f1_weighted', 0.711231883945589, 0.0012461234138114502),
 ('accuracy', 0.714774718675016, 0.0022056298241717297),
 ('roc_auc', 0.6381499568035489, 0.001586434134728966))

In [179]:
score_dataset(X,y,
              model=XGBClassifier(booster='gbtree',
                    scale_pos_weight=3.75,
                    objective='binary:logistic',
                    n_estimators=500,
                    min_child_weight=1,
                    max_depth=5,
                    learning_rate=.1
                    ))

[CV] END .................................................... total time=   4.3s
[CV] END .................................................... total time=   6.0s
[CV] END .................................................... total time=   4.5s
[CV] END .................................................... total time=   5.6s
[CV] END .................................................... total time=   4.8s


(('f1_weighted', 0.6371772523054796, 0.001731894995835357),
 ('accuracy', 0.6024743422682675, 0.0019207160360762393),
 ('roc_auc', 0.6597397480320076, 0.0016509529823657716))

In [139]:
confusion_matrix(ytest, tree.predict(Xtest))

array([[29906,  6644],
       [ 6759,  3290]], dtype=int64)

In [140]:
confusion_matrix(ytest,logMdl.predict(Xtest))

array([[19545, 17005],
       [ 3565,  6484]], dtype=int64)

In [180]:
confusion_matrix(ytest,xgb.predict(Xtest))

array([[21019, 15531],
       [ 3298,  6751]], dtype=int64)

---
### Output Model

---

In [5]:
xgb.fit(normalizedTrain,target)

In [6]:
import joblib
joblib.dump(xgb, 'xgb.pkl')

['xgb.pkl']

In [1]:
! pip freeze > requirements.txt