In [1]:
from IPython.display import clear_output
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import sklearn
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from pathlib import Path
import pickle

In [2]:
upstream = ['section-05-data-preparation']
product = None

In [3]:
# Parameters
upstream = {
    "section-05-data-preparation": {
        "X_val": "/content/Reproducible-Data-Analysis-With-Jupyter-Notebooks-Standard-Bank-Virtual-Internship/output/section-05-data-preparation-X_val.pkl",
        "X_train": "/content/Reproducible-Data-Analysis-With-Jupyter-Notebooks-Standard-Bank-Virtual-Internship/output/section-05-data-preparation-X_train.pkl",
        "y_train": "/content/Reproducible-Data-Analysis-With-Jupyter-Notebooks-Standard-Bank-Virtual-Internship/output/section-05-data-preparation-y_train.pkl",
        "y_val": "/content/Reproducible-Data-Analysis-With-Jupyter-Notebooks-Standard-Bank-Virtual-Internship/output/section-05-data-preparation-y_val.pkl",
        "nb": "/content/Reproducible-Data-Analysis-With-Jupyter-Notebooks-Standard-Bank-Virtual-Internship/output/section-05-data-preparation.ipynb",
    }
}
product = {
    "X_val": "/content/Reproducible-Data-Analysis-With-Jupyter-Notebooks-Standard-Bank-Virtual-Internship/output/section-06-scikit-learn-bespoke-ml-X_val.pkl",
    "X_train": "/content/Reproducible-Data-Analysis-With-Jupyter-Notebooks-Standard-Bank-Virtual-Internship/output/section-06-scikit-learn-bespoke-ml-X_train.pkl",
    "nb": "/content/Reproducible-Data-Analysis-With-Jupyter-Notebooks-Standard-Bank-Virtual-Internship/output/section-06-scikit-learn-bespoke-ml.ipynb",
}


In [4]:
X_train = pickle.loads(Path(upstream['section-05-data-preparation']['X_train']).read_bytes())
X_val = pickle.loads(Path(upstream['section-05-data-preparation']['X_val']).read_bytes())
y_train = pickle.loads(Path(upstream['section-05-data-preparation']['y_train']).read_bytes())
y_val = pickle.loads(Path(upstream['section-05-data-preparation']['y_val']).read_bytes())

## 06) Scikit-learn Bespoke ML

In [5]:

xgb = XGBClassifier()
xgb.fit(X_train, y_train)
print("Accuracy on the Train Dataset : ", accuracy_score(xgb.predict(X_train), y_train))
print("Accuracy on the Train Dataset : ", accuracy_score(xgb.predict(X_val), y_val))
print("\nConfusion matrix : \n", confusion_matrix(xgb.predict(X_val), y_val))

Accuracy on the Train Dataset :  1.0
Accuracy on the Train Dataset :  0.7922077922077922

Confusion matrix : 
 [[29 14]
 [18 93]]


In [6]:
!pip install catboost
clear_output()

In [7]:

cbc = CatBoostClassifier(verbose=0)
cbc.fit(X_train, y_train)

print("Accuracy on the Train Dataset : ", accuracy_score(cbc.predict(X_train), y_train))
print("Accuracy on the Train Dataset : ", accuracy_score(cbc.predict(X_val), y_val))
print("\nConfusion matrix : \n", confusion_matrix(cbc.predict(X_val), y_val))

Accuracy on the Train Dataset :  0.9326086956521739
Accuracy on the Train Dataset :  0.8116883116883117

Confusion matrix : 
 [[ 25   7]
 [ 22 100]]


In [8]:

lgbm = LGBMClassifier()
lgbm.fit(X_train, y_train)
print("Accuracy on the Train Dataset : ", accuracy_score(lgbm.predict(X_train), y_train))
print("Accuracy on the Train Dataset : ", accuracy_score(lgbm.predict(X_val), y_val))
print("\nConfusion matrix : \n", confusion_matrix(lgbm.predict(X_val), y_val))

Accuracy on the Train Dataset :  0.9847826086956522
Accuracy on the Train Dataset :  0.7857142857142857

Confusion matrix : 
 [[28 14]
 [19 93]]


In [9]:
## Some sklearn classifiers we can try

try:


  classifiers = sklearn.utils.all_estimators(type_filter='classifier')
  for name, class_ in classifiers:
      if hasattr(class_, 'predict_proba'):
          print(name, class_)
  
except:
  pass

In [10]:
## Scale the data to a range of [0-1]
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [11]:

lr = LogisticRegression() 
lr.fit(X_train, y_train)

print("Accuracy on the Train Dataset : ", accuracy_score(lr.predict(X_train), y_train))
print("Accuracy on the Validation Dataset : ", accuracy_score(lr.predict(X_val), y_val))
print("\nConfusion matrix : \n", confusion_matrix(lr.predict(X_val), y_val))

Accuracy on the Train Dataset :  0.8108695652173913
Accuracy on the Validation Dataset :  0.8311688311688312

Confusion matrix : 
 [[ 22   1]
 [ 25 106]]


In [12]:

svm = SVC()
svm.fit(X_train, y_train)
print("Accuracy on the Train Dataset : ", svm.score(X_train, y_train))
print("Accuracy on the Validation Dataset : ", svm.score(X_val, y_val)) 
print("\nConfusion matrix : \n", confusion_matrix(svm.predict(X_val), y_val))

Accuracy on the Train Dataset :  0.8152173913043478
Accuracy on the Validation Dataset :  0.8311688311688312

Confusion matrix : 
 [[ 21   0]
 [ 26 107]]


In [13]:

adb = AdaBoostClassifier()
adb.fit(X_train, y_train)
adb.score(X_train, y_train), adb.score(X_val, y_val)
print("Accuracy on the Train Dataset : ", adb.score(X_train, y_train))
print("Accuracy on the Validation Dataset : ", adb.score(X_val, y_val)) 
print("\nConfusion matrix : \n", confusion_matrix(adb.predict(X_val), y_val))

Accuracy on the Train Dataset :  0.8304347826086956
Accuracy on the Validation Dataset :  0.8116883116883117

Confusion matrix : 
 [[27  9]
 [20 98]]


In [14]:

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
print("Accuracy on the Train Dataset : ", dt.score(X_train, y_train))
print("Accuracy on the Validation Dataset : ", dt.score(X_val, y_val)) 
print("\nConfusion matrix : \n", confusion_matrix(dt.predict(X_val), y_val))

Accuracy on the Train Dataset :  1.0
Accuracy on the Validation Dataset :  0.6493506493506493

Confusion matrix : 
 [[30 37]
 [17 70]]


The AutoML model has an accuracy of 78.9%. The bespoke classifier models tried have accuracies ranging from 71% to 80%. The models are tried right out of the box, which means that with parameter fine tuning, we can expect to get higher accuracies. Thus, we can make a case for bespoke modelling since accuracy is very important in this application case and any slight improvement in the expected level of accuracy should be pursued.

In [15]:
Path(product['X_train']).parent.mkdir(exist_ok=True, parents=True)
Path(product['X_train']).write_bytes(pickle.dumps(X_train))

Path(product['X_val']).parent.mkdir(exist_ok=True, parents=True)
Path(product['X_val']).write_bytes(pickle.dumps(X_val))

24792