In [None]:
import pickle
from pathlib import Path

import sklearn
from catboost import CatBoostClassifier
from IPython.display import clear_output
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [None]:
# Pipeline parameters placeholder (looks like a Ploomber-style "parameters" cell — TODO confirm).
# As written, `upstream` is a list and `product` is None, but later cells index
# both by string key (upstream['section-05-data-preparation']['X_train'],
# product['X_train']), so these values must be replaced with mappings of file
# paths before the rest of the notebook can run.
upstream = ['section-05-data-preparation']
product = None

In [None]:
def _unpickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE(review): pickle executes arbitrary code on load; presumably these
    # files are written by the upstream data-preparation task — confirm they
    # never come from an untrusted source.
    return pickle.loads(Path(path).read_bytes())


# Paths of the artifacts produced by the data-preparation step.
prep_outputs = upstream['section-05-data-preparation']

# Feature matrices and label vectors for the train and validation splits.
X_train = _unpickle(prep_outputs['X_train'])
X_val = _unpickle(prep_outputs['X_val'])
y_train = _unpickle(prep_outputs['y_train'])
y_val = _unpickle(prep_outputs['y_val'])

## 06) Scikit-learn Bespoke ML

In [None]:

# XGBoost classifier with default hyper-parameters as an out-of-the-box baseline.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

# Predict once per split and reuse the result for every metric.
xgb_train_pred = xgb.predict(X_train)
xgb_val_pred = xgb.predict(X_val)

# scikit-learn metrics take (y_true, y_pred); with that order the confusion
# matrix has the actual classes on the rows and the predicted classes on the columns.
print("Accuracy on the Train Dataset : ", accuracy_score(y_train, xgb_train_pred))
print("Accuracy on the Validation Dataset : ", accuracy_score(y_val, xgb_val_pred))
print("\nConfusion matrix : \n", confusion_matrix(y_val, xgb_val_pred))

Accuracy on the Train Dataset :  1.0
Accuracy on the Validation Dataset :  0.7402597402597403

Confusion matrix : 
 [[20 20]
 [20 94]]


In [None]:
!pip install catboost
clear_output()

In [None]:

# CatBoost classifier with default hyper-parameters; verbose=0 silences the
# per-iteration training log.
cbc = CatBoostClassifier(verbose=0)
cbc.fit(X_train, y_train)

# Predict once per split and reuse the result for every metric.
cbc_train_pred = cbc.predict(X_train)
cbc_val_pred = cbc.predict(X_val)

# scikit-learn metrics take (y_true, y_pred); with that order the confusion
# matrix has the actual classes on the rows and the predicted classes on the columns.
print("Accuracy on the Train Dataset : ", accuracy_score(y_train, cbc_train_pred))
print("Accuracy on the Validation Dataset : ", accuracy_score(y_val, cbc_val_pred))
print("\nConfusion matrix : \n", confusion_matrix(y_val, cbc_val_pred))

Accuracy on the Train Dataset :  0.9173913043478261
Accuracy on the Validation Dataset :  0.8181818181818182

Confusion matrix : 
 [[ 19   7]
 [ 21 107]]


In [None]:

# LightGBM classifier with default hyper-parameters as an out-of-the-box baseline.
lgbm = LGBMClassifier()
lgbm.fit(X_train, y_train)

# Predict once per split and reuse the result for every metric.
lgbm_train_pred = lgbm.predict(X_train)
lgbm_val_pred = lgbm.predict(X_val)

# scikit-learn metrics take (y_true, y_pred); with that order the confusion
# matrix has the actual classes on the rows and the predicted classes on the columns.
print("Accuracy on the Train Dataset : ", accuracy_score(y_train, lgbm_train_pred))
print("Accuracy on the Validation Dataset : ", accuracy_score(y_val, lgbm_val_pred))
print("\nConfusion matrix : \n", confusion_matrix(y_val, lgbm_val_pred))

Accuracy on the Train Dataset :  0.9978260869565218
Accuracy on the Validation Dataset :  0.7467532467532467

Confusion matrix : 
 [[20 19]
 [20 95]]


In [None]:
## Some sklearn classifiers we can try: every classifier that exposes predict_proba
try:
    classifiers = sklearn.utils.all_estimators(type_filter='classifier')
    for name, class_ in classifiers:
        if hasattr(class_, 'predict_proba'):
            print(name, class_)
except Exception as exc:
    # The listing is informational only, so keep the notebook running, but
    # report the failure instead of hiding it (a bare `except:` would also
    # swallow KeyboardInterrupt and SystemExit).
    print("Could not list scikit-learn classifiers:", repr(exc))

In [None]:
## Standardize the features: zero mean and unit variance per column (not a [0, 1] range).
## The scaler is fitted on the training split only and then applied to the
## validation split, so no validation statistics enter the fit.
## NOTE: X_train / X_val are overwritten here, so the models in the cells below
## train on the standardized data; reload the splits before re-running this cell.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [None]:

# Logistic regression with default hyper-parameters as an out-of-the-box baseline.
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Predict once per split and reuse the result for every metric.
lr_train_pred = lr.predict(X_train)
lr_val_pred = lr.predict(X_val)

# scikit-learn metrics take (y_true, y_pred); with that order the confusion
# matrix has the actual classes on the rows and the predicted classes on the columns.
print("Accuracy on the Train Dataset : ", accuracy_score(y_train, lr_train_pred))
print("Accuracy on the Validation Dataset : ", accuracy_score(y_val, lr_val_pred))
print("\nConfusion matrix : \n", confusion_matrix(y_val, lr_val_pred))

Accuracy on the Train Dataset :  0.8065217391304348
Accuracy on the Validation Dataset :  0.8311688311688312

Confusion matrix : 
 [[ 17   3]
 [ 23 111]]


In [None]:

# Support-vector classifier with default hyper-parameters as an out-of-the-box baseline.
svm = SVC()
svm.fit(X_train, y_train)

# .score() on a classifier returns the mean accuracy on the given split.
print("Accuracy on the Train Dataset : ", svm.score(X_train, y_train))
print("Accuracy on the Validation Dataset : ", svm.score(X_val, y_val))

# confusion_matrix takes (y_true, y_pred): actual classes on the rows,
# predicted classes on the columns.
print("\nConfusion matrix : \n", confusion_matrix(y_val, svm.predict(X_val)))

Accuracy on the Train Dataset :  0.8173913043478261
Accuracy on the Validation Dataset :  0.8311688311688312

Confusion matrix : 
 [[ 17   3]
 [ 23 111]]


In [None]:

# AdaBoost with default hyper-parameters; the seed is fixed so the reported
# metrics are reproducible across runs.
adb = AdaBoostClassifier(random_state=42)
adb.fit(X_train, y_train)

# .score() on a classifier returns the mean accuracy on the given split.
print("Accuracy on the Train Dataset : ", adb.score(X_train, y_train))
print("Accuracy on the Validation Dataset : ", adb.score(X_val, y_val))

# confusion_matrix takes (y_true, y_pred): actual classes on the rows,
# predicted classes on the columns.
print("\nConfusion matrix : \n", confusion_matrix(y_val, adb.predict(X_val)))

Accuracy on the Train Dataset :  0.8608695652173913
Accuracy on the Validation Dataset :  0.7662337662337663

Confusion matrix : 
 [[19 15]
 [21 99]]


In [None]:

# Single decision tree with default hyper-parameters; the seed is fixed so the
# reported metrics are reproducible across runs.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# .score() on a classifier returns the mean accuracy on the given split.
print("Accuracy on the Train Dataset : ", dt.score(X_train, y_train))
print("Accuracy on the Validation Dataset : ", dt.score(X_val, y_val))

# confusion_matrix takes (y_true, y_pred): actual classes on the rows,
# predicted classes on the columns.
print("\nConfusion matrix : \n", confusion_matrix(y_val, dt.predict(X_val)))

Accuracy on the Train Dataset :  1.0
Accuracy on the Validation Dataset :  0.6688311688311688

Confusion matrix : 
 [[22 33]
 [18 81]]


The AutoML model has an accuracy of 78.9%. The bespoke classifier models tried above have validation accuracies ranging from about 67% (decision tree) to about 83% (logistic regression and SVM). The models are tried right out of the box, which means that with hyper-parameter fine-tuning we can expect to get higher accuracies. Thus, we can make a case for bespoke modelling, since accuracy is very important in this application and any slight improvement in the expected level of accuracy should be pursued.

In [None]:
# Persist the feature matrices as they stand at this point in the notebook.
# `product` is indexed by name to obtain each output file path; the parent
# directory is created first if it does not exist yet.
train_path = Path(product['X_train'])
train_path.parent.mkdir(parents=True, exist_ok=True)
train_path.write_bytes(pickle.dumps(X_train))

val_path = Path(product['X_val'])
val_path.parent.mkdir(parents=True, exist_ok=True)
val_path.write_bytes(pickle.dumps(X_val))