In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.optimize import differential_evolution

In [4]:
org_df = pd.read_csv('waterDataTraining.csv')
df = org_df.copy()
df.dropna(inplace = True)
df.head()

Unnamed: 0,Time,Tp,pH,Cond,Turb,SAC,PFM,Event
0,2017-07-01 00:00:00,6.94,8.60774,0.020954,0.125931,3.58683,43.7559,False
1,2017-07-01 00:01:00,6.93,8.60589,0.020965,0.127219,3.59025,43.4366,False
2,2017-07-01 00:02:00,6.94,8.6022,0.020968,0.126482,3.58318,43.5994,False
3,2017-07-01 00:03:00,6.94,8.6022,0.020972,0.126184,3.58769,43.3704,False
4,2017-07-01 00:04:00,6.94,8.60405,0.020973,0.127908,3.58287,43.1656,False


In [5]:
org_df['Event'].value_counts()

False    132268
True        212
Name: Event, dtype: int64

In [6]:
X = df.drop(['Time', 'Event'], axis = 1)
y = df['Event']

In [7]:
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

print(" X_train dataset: ", X_train.shape)
print(" y_train dataset: ", y_train.shape)
print(" X_test dataset: ", X_test.shape)
print(" y_test dataset: ", y_test.shape)

 X_train dataset:  (92548, 6)
 y_train dataset:  (92548,)
 X_test dataset:  (39664, 6)
 y_test dataset:  (39664,)


In [8]:
sm = SMOTE(random_state=2)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.ravel())

In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
from sklearn.metrics import f1_score

def get_model_f1(x):
    model = LogisticRegression( C = x[0], max_iter=1000, random_state = 123)
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    f1 = 2 * (precision * recall) / (precision + recall)
    print(f"In Func, 1-f1:{1- f1_score(y_test, y_pred)}, f1:{f1}, c: {x[0]}")
    return 1-f1
    # DE tries to minimize the objective function. I couldn't find a way to change it to be able to maximize the obj func.
    # Hence, I'm returning 1-f1 score so that it minimises 1-f1 -> same as maximising f1

In [13]:
# bounds = [(0, 1e-1)]
bounds = [(0, 1)]
result = differential_evolution(func = get_model_f1, bounds = bounds, maxiter = 10, popsize = 10)
print(result.x, result.fun)

In Func, 1-f1:0.6917293233082706, f1:0.30827067669172936, c: 0.259983906691968
In Func, 1-f1:0.7663043478260869, f1:0.23369565217391305, c: 0.4457339483282138
In Func, 1-f1:0.9219330855018587, f1:0.07806691449814125, c: 0.5830018064818248
In Func, 1-f1:0.7448071216617211, f1:0.2551928783382789, c: 0.366609601655287
In Func, 1-f1:0.8117154811715481, f1:0.18828451882845187, c: 0.9397666602619179
In Func, 1-f1:0.924187725631769, f1:0.07581227436823104, c: 0.6226566122719084
In Func, 1-f1:0.5897435897435896, f1:0.4102564102564103, c: 0.16181200814970903
In Func, 1-f1:0.8146551724137931, f1:0.18534482758620688, c: 0.808302574503762
In Func, 1-f1:0.7491039426523297, f1:0.2508960573476703, c: 0.09089125141294063
In Func, 1-f1:0.9286321155480034, f1:0.0713678844519966, c: 0.7799484535702818
In Func, 1-f1:0.9236363636363636, f1:0.07636363636363636, c: 0.6120746085863746
In Func, 1-f1:0.7985948477751756, f1:0.20140515222482433, c: 0.647409662685031
In Func, 1-f1:0.9236363636363636, f1:0.07636363

In Func, 1-f1:0.5102040816326532, f1:0.48979591836734687, c: 0.10523040746357298
In Func, 1-f1:0.5102040816326532, f1:0.48979591836734687, c: 0.10501202325932402
In Func, 1-f1:0.7211155378486056, f1:0.2788844621513944, c: 0.07328692147836846
In Func, 1-f1:0.6813186813186813, f1:0.31868131868131866, c: 0.038840108792902395
In Func, 1-f1:0.7407407407407407, f1:0.25925925925925924, c: 0.08694174106363428
In Func, 1-f1:0.6585365853658536, f1:0.34146341463414637, c: 0.03142394298771972
In Func, 1-f1:0.45599999999999996, f1:0.544, c: 0.061490252340794416
In Func, 1-f1:0.45599999999999996, f1:0.544, c: 0.06149026234079442
[0.06149025] 0.45599999999999996


In [14]:
result.x, 1-result.fun

(array([0.06149025]), 0.544)

In [16]:
model = LogisticRegression( C = 0.06149025, max_iter=1000, random_state = 123)
model.fit(X_train_res, y_train_res)
y_pred = model.predict(X_test)
get_metrics(y_pred, y_test)
# return (1 - f1)

(0.998562928600242, 0.5396825396825397, 0.5483870967741935, 0.544)

In [1]:
from sklearn.metrics import confusion_matrix
def get_metrics(Y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(Y_true, y_pred).ravel()
    acc = (tp + tn) / (tp + fp + tn + fn)
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    f1 = 2 * (precision * recall) / (precision + recall)
    return acc, precision, recall, f1

In [None]:
0.06149025