<a href="https://colab.research.google.com/github/utukJ/Graded_quiz/blob/master/quiz_c_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
# importing relevant libraries

import numpy as np
import pandas as pd

In [37]:
# Load the dataset straight from its UCI repository URL into a DataFrame.

DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00471/Data_for_UCI_named.csv"
df = pd.read_csv(DATA_URL)

In [38]:
# How many rows fall into each class of the target column `stabf`?

df["stabf"].value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

In [39]:
# Drop the `stab` column so it is not used as a predictor
# (presumably the numeric counterpart of the `stabf` label -- TODO confirm).
# The frame is rebound rather than mutated in place, and a missing column is
# tolerated, so re-running this cell no longer raises KeyError.

df = df.drop(columns=['stab'], errors='ignore')

In [40]:
# data preprocessing

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate the label column from the feature columns.
target = df['stabf']
predictors = df.drop(columns=['stabf'])

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    predictors,
    target,
    test_size=0.2,
    random_state=1,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [42]:
# training a random forest and extra trees classifier 

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, confusion_matrix

# Fit both tree ensembles on the scaled training data with the same seed.
rf_clf = RandomForestClassifier(random_state=1).fit(x_train, y_train)
ex_clf = ExtraTreesClassifier(random_state=1).fit(x_train, y_train)

# Predict on the held-out test set.
rf_preds = rf_clf.predict(x_test)
ex_preds = ex_clf.predict(x_test)

# Print one classification report per model, each under its own heading.
for heading, predictions in (("RF report: ", rf_preds),
                             ("Ex Trees report: ", ex_preds)):
    print(heading)
    print(classification_report(y_test, predictions, digits=5))

RF report: 
              precision    recall  f1-score   support

      stable    0.91912   0.87781   0.89799       712
    unstable    0.93409   0.95730   0.94555      1288

    accuracy                        0.92900      2000
   macro avg    0.92660   0.91755   0.92177      2000
weighted avg    0.92876   0.92900   0.92862      2000

Ex Trees report: 
              precision    recall  f1-score   support

      stable    0.94099   0.85112   0.89381       712
    unstable    0.92183   0.97050   0.94554      1288

    accuracy                        0.92800      2000
   macro avg    0.93141   0.91081   0.91967      2000
weighted avg    0.92865   0.92800   0.92712      2000



In [54]:
# training an extreme gradient boosting model

from xgboost import XGBClassifier

# Newer XGBoost releases reject string class labels and expect integer
# classes 0..n-1, so the labels are encoded before fitting and the
# predictions are decoded back to the original 'stable'/'unstable' strings.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder().fit(y_train)

xgb_clf = XGBClassifier(random_state=1).fit(x_train, label_encoder.transform(y_train))

# Decode so the report is keyed by the same class names as the other models.
xgb_preds = label_encoder.inverse_transform(xgb_clf.predict(x_test))

print("XGB classification report: ")
print(classification_report(y_test, xgb_preds, digits = 5))

XGB classification report: 
              precision    recall  f1-score   support

      stable    0.92061   0.84691   0.88222       712
    unstable    0.91896   0.95963   0.93885      1288

    accuracy                        0.91950      2000
   macro avg    0.91978   0.90327   0.91054      2000
weighted avg    0.91955   0.91950   0.91869      2000



In [55]:
# training a light gradient boosting model

from lightgbm import LGBMClassifier

# Build the LightGBM classifier with a fixed seed, then fit it.
lbm_clf = LGBMClassifier(random_state=1)
lbm_clf.fit(x_train, y_train)

# Score the held-out split and render the report before printing it.
lbm_preds = lbm_clf.predict(x_test)
lbm_report = classification_report(y_test, lbm_preds, digits=5)

print("LightGBM classification report")
print(lbm_report)

LightGBM classification report
              precision    recall  f1-score   support

      stable    0.92972   0.89185   0.91039       712
    unstable    0.94153   0.96273   0.95202      1288

    accuracy                        0.93750      2000
   macro avg    0.93563   0.92729   0.93120      2000
weighted avg    0.93733   0.93750   0.93720      2000



In [56]:
# improving extra trees using randomized hyperparameter search

from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each Extra Trees hyperparameter explored by the search.
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
# NOTE(review): 'auto' looks deprecated/removed for tree ensembles in newer
# scikit-learn releases (there it behaved like 'sqrt' for classifiers) --
# confirm against the installed version. It is kept here so the sampled
# candidates stay the same as in the original search space.
max_features = ['auto', 'sqrt', 'log2', None]

hyperparameter_grid = {'n_estimators': n_estimators,
                       'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split,
                       'max_features': max_features}

# Seed the base estimator as well as the candidate sampler: without a seed on
# ExtraTreesClassifier the cross-validation scores (and therefore
# best_params_) can change from run to run.
search_clf = RandomizedSearchCV(
    ExtraTreesClassifier(random_state=1),
    hyperparameter_grid,
    cv=5,
    n_iter=10,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    random_state=1,
)
search_clf.fit(x_train, y_train)

# Test-set predictions from the fitted search object.
preds = search_clf.predict(x_test)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.5min finished


In [57]:
# Show the hyperparameter combination the randomized search selected.
search_clf.best_params_

{'max_features': None,
 'min_samples_leaf': 8,
 'min_samples_split': 2,
 'n_estimators': 1000}

In [66]:
# Refit Extra Trees with the hyperparameters reported by the randomized
# search, using a fixed seed, and evaluate it on the held-out test set.
optim_params = {
    "max_features": None,
    "min_samples_leaf": 8,
    "min_samples_split": 2,
    "n_estimators": 1000,
}
optim_clf = ExtraTreesClassifier(random_state=1, **optim_params)
optim_clf.fit(x_train, y_train)

optim_preds = optim_clf.predict(x_test)

print("Optimized Extra trees report: ")
print(classification_report(y_test, optim_preds, digits=5))

Optimized Extra trees report: 
              precision    recall  f1-score   support

      stable    0.92113   0.86938   0.89451       712
    unstable    0.92997   0.95885   0.94419      1288

    accuracy                        0.92700      2000
   macro avg    0.92555   0.91412   0.91935      2000
weighted avg    0.92682   0.92700   0.92650      2000



In [61]:
# Read the importances once, then print the full vector followed by the
# positions of the most and least important features.
importances = optim_clf.feature_importances_
print(importances)
print(importances.argmax())
print(importances.argmin())

[0.13723975 0.1405075  0.13468029 0.13541676 0.00368342 0.00533686
 0.00542927 0.00496249 0.10256244 0.10757765 0.11306268 0.10954089]
1
4


In [63]:
# Preview the first rows of the feature frame; its column order is what the
# positional indices printed for the feature importances refer to.
predictors.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923


In [65]:
# code for calculating f1_score given True positive, false positive, etc


def precision_recall_f1(tp, fp, fn):
    """Return ``(precision, recall, f1)`` for the given confusion-matrix counts.

    Parameters
    ----------
    tp, fp, fn : int or float
        True-positive, false-positive and false-negative counts.

    Returns
    -------
    tuple of float
        Precision ``tp / (tp + fp)``, recall ``tp / (tp + fn)`` and their
        harmonic mean (F1). Any ratio whose denominator is zero is reported
        as 0.0 instead of raising ``ZeroDivisionError``.
    """
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    if precision + recall == 0:
        # No true positives at all: F1 is defined here as 0.
        return precision, recall, 0.0
    return precision, recall, 2*precision*recall/(precision+recall)


tp = 355
fp = 1480
tn = 120  # not needed for F1; kept alongside the other counts
fn = 45

precision, recall, f1 = precision_recall_f1(tp, fp, fn)

print("f1_score: ", f1)

f1_score:  0.3176733780760626
