In [27]:
# For data wrangling
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# For data preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# For modelling
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# For model evaluation
from sklearn.metrics import accuracy_score, recall_score, f1_score

In [28]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/HAMOYE/Stage C/Data_for_UCI_named.csv')
data.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable
3,0.716415,7.6696,4.486641,2.340563,3.963791,-1.027473,-1.938944,-0.997374,0.446209,0.976744,0.929381,0.362718,0.028871,unstable
4,3.134112,7.608772,4.943759,9.857573,3.525811,-1.125531,-1.845975,-0.554305,0.79711,0.45545,0.656947,0.820923,0.04986,unstable


In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


In [30]:
data.describe()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5.25,5.250001,5.250004,5.249997,3.75,-1.25,-1.25,-1.25,0.525,0.525,0.525,0.525,0.015731
std,2.742548,2.742549,2.742549,2.742556,0.75216,0.433035,0.433035,0.433035,0.274256,0.274255,0.274255,0.274255,0.036919
min,0.500793,0.500141,0.500788,0.500473,1.58259,-1.999891,-1.999945,-1.999926,0.050009,0.050053,0.050054,0.050028,-0.08076
25%,2.874892,2.87514,2.875522,2.87495,3.2183,-1.624901,-1.625025,-1.62496,0.287521,0.287552,0.287514,0.287494,-0.015557
50%,5.250004,5.249981,5.249979,5.249734,3.751025,-1.249966,-1.249974,-1.250007,0.525009,0.525003,0.525015,0.525002,0.017142
75%,7.62469,7.624893,7.624948,7.624838,4.28242,-0.874977,-0.875043,-0.875065,0.762435,0.76249,0.76244,0.762433,0.044878
max,9.999469,9.999837,9.99945,9.999443,5.864418,-0.500108,-0.500072,-0.500025,0.999937,0.999944,0.999982,0.99993,0.109403


In [31]:
data.shape

(10000, 14)

In [32]:
data.isnull().sum()

tau1     0
tau2     0
tau3     0
tau4     0
p1       0
p2       0
p3       0
p4       0
g1       0
g2       0
g3       0
g4       0
stab     0
stabf    0
dtype: int64

In [33]:
data.stabf.value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

In [34]:
x = data.drop(columns=['stab', 'stabf'])
y = data['stabf']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

In [35]:
x_train.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
2694,6.255995,2.542401,7.024714,9.476518,3.529888,-1.224881,-0.688228,-1.61678,0.568221,0.618403,0.685739,0.660088
5140,5.070581,5.490253,8.075688,0.761075,4.220888,-1.280596,-1.902185,-1.038107,0.443515,0.097244,0.916955,0.129254
2568,1.220072,8.804028,3.874283,8.433949,3.614027,-1.039236,-0.953566,-1.621224,0.908353,0.923594,0.238881,0.660156
3671,7.498402,6.697603,8.798626,2.126236,3.134585,-1.581906,-0.589386,-0.963293,0.260826,0.899003,0.964752,0.600598
7427,7.074006,1.337511,6.100756,7.759156,2.526922,-0.92254,-0.6326,-0.971782,0.98458,0.716082,0.836928,0.165162


In [36]:
y_train.head()

2694    unstable
5140    unstable
2568    unstable
3671    unstable
7427    unstable
Name: stabf, dtype: object

In [37]:
y_train.value_counts()
# The distribution is still imbalance. SMOTE will be used to balance the training data.

unstable    5092
stable      2908
Name: stabf, dtype: int64

In [38]:
# Instantiating the Standard Scaler

scaler = StandardScaler()

# Applying Standard Scaler on the training set
x_train_scaled = pd.DataFrame(scaler.fit_transform(x_train), columns=x_train.columns)
x_train_scaled.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
0,0.367327,-0.986042,0.650447,1.547527,-0.29149,0.061535,1.293862,-0.845074,0.160918,0.339859,0.585568,0.492239
1,-0.064659,0.089437,1.035079,-1.641494,0.619865,-0.067235,-1.502925,0.486613,-0.293143,-1.558488,1.429649,-1.443521
2,-1.46785,1.298418,-0.502536,1.166046,-0.180521,0.490603,0.68256,-0.855302,1.39935,1.451534,-1.045743,0.492489
3,0.820081,0.52992,1.299657,-1.141975,-0.812854,-0.763632,1.521579,0.65878,-0.958319,1.361958,1.60414,0.275303
4,0.665424,-1.425627,0.3123,0.919137,-1.614296,0.760315,1.422019,0.639243,1.676895,0.69566,1.137504,-1.312575


In [39]:
# Applying Standard Scaler on the testing set

x_test_scaled = pd.DataFrame(scaler.transform(x_test), columns=x_test.columns)
x_test_scaled.head()

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4
0,0.593951,-0.412733,1.503924,1.116943,0.403423,-1.492971,-0.785033,1.566781,-0.901007,1.167203,-1.50733,1.084726
1,0.20219,0.374416,-0.1888,-0.522268,-0.225967,-1.058483,0.420047,1.028627,-1.625721,-0.39566,1.414651,1.226011
2,-1.079044,-0.313745,-0.884634,0.01708,-0.943122,0.112653,0.801335,0.733004,1.457108,-1.438495,0.651821,-1.682168
3,-0.08312,-1.107327,0.372805,-1.708152,0.75399,-1.637972,0.403805,-0.088036,0.083322,-1.672322,-0.357714,1.055865
4,0.873921,1.438466,0.086662,1.715037,-0.15388,-0.007015,-0.197053,0.472315,0.136549,-1.469731,0.956396,-0.819727


In [40]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

In [41]:
y_train

array([1, 1, 1, ..., 1, 1, 0])

In [42]:
y_test

array([1, 1, 0, ..., 0, 1, 1])

Q1 What is the accuracy on the test set using the XGboost classifier? In 4 decimal places.

In [43]:
xgclassif = XGBClassifier(random_state=1)
xgclassif.fit(x_train,y_train)
xgb_preds = xgclassif.predict(x_test)
# calculate accuracy score
Accuracy = accuracy_score(xgb_preds,y_test)
print("Accuracy Score:",Accuracy)

Accuracy Score: 0.9195


Q9 What is the accuracy on the test set using the random forest classifier? In 4 decimal places.

In [44]:
rand_forest = RandomForestClassifier(random_state = 1)
rand_forest.fit(x_train_scaled, y_train)
rand_forest_pred = rand_forest.predict(x_test_scaled)
accuracy = round(accuracy_score(y_test, rand_forest_pred), 4)
accuracy

0.929

Q11 According to a use-case, in a certain ML task, a false positive is six times costlier than a false negative. You, as a Data Scientist, trained 4 models, to solve the use case.
Keep the following evaluation criteria in mind:
1) Must have a recall rate of at least 80% 
2) Must have a false positive rate of 8% or less 
3) Must minimize business costs
 After creating each binary classification model, you generated the corresponding confusion matrix. Which confusion matrix represents the model that satisfies the requirements? 


In [45]:
# Recall = TP/(TP + FN)
# False Positive Rate (FPR) = FP/(FP + TN)
# Cost = 5*FP + FN

def value_calc(TN,FP,FN,TP):
    recall = TP/(TP + FN)
    fpr = FP/(FP + TN)
    cost = 5*FP + FN
    print(f'Recall = {recall}\n')
    print(f'False Positive Rate = {fpr}\n')
    print(f'Cost = {cost}\n')

# For option A:
# TN = 98, FP = 2, TP = 18, TP = 82

value_calc(98,2,18,82)

Recall = 0.82

False Positive Rate = 0.02

Cost = 28



Q13 What is the accuracy on the test set using the LGBM classifier? In 4 decimal places.

In [46]:
lgbm_cl = LGBMClassifier(random_state=1)
lgbm_cl.fit(x_train_scaled, y_train)
lgbm_pred = lgbm_cl.predict(x_test_scaled)
acc_score = accuracy_score(y_test, lgbm_pred)
acc_score

0.9375

Q14 Using the ExtraTreesClassifier as your estimator with cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1 and random_state = 1. What are the best hyperparameters from the randomized search CV?

In [47]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
distributions = dict(C=uniform(loc=0, scale=4),
                    penalty=['l2', 'l1'])
parameters = {
    "max_depth": [2,3, 5, 10, None,"auto"],
    "n_estimators": [100, 200, 300, 400, 500,1000],
    "min_samples_split":[5,6,7,8],
    "min_samples_leaf":[5,6,7,8]
}
clf = RandomizedSearchCV( ExtraTreesClassifier(),parameters, cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1, random_state = 1)
search = clf.fit(x_train,y_train)
search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_forest.py", line 467, in fit
    for i, t in enumerate(trees)
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3.7/d

{'n_estimators': 300,
 'min_samples_split': 5,
 'min_samples_leaf': 6,
 'max_depth': None}

Q19 You are working on a spam classification system using regularized logistic regression. “Spam” is a positive class (y = 1) and “not spam” is the negative class (y = 0). You have trained your classifier and there are n = 1700 examples in the test set. The confusion matrix of predicted class vs. actual class is:

In [48]:
"""
confusion matrix given data:
TP = 256
TN = 20
FP = 45
FN = 1380
"""

# Precision = TP/(TP + FP)

precision = 256/(256 + 45)

# Recall = TP/(TP + FN)

recall = 256/(256 + 1380)

# F1 score
F1_score = 2*(precision*recall)/(precision + recall)

F1_score = round(F1_score, 4)

print(F1_score)

0.2643
