In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.optimize import differential_evolution

In [2]:
org_df = pd.read_csv('waterDataTraining.csv')
df = org_df.copy()
df.dropna(inplace = True)
df.head()

Unnamed: 0,Time,Tp,pH,Cond,Turb,SAC,PFM,Event
0,2017-07-01 00:00:00,6.94,8.60774,0.020954,0.125931,3.58683,43.7559,False
1,2017-07-01 00:01:00,6.93,8.60589,0.020965,0.127219,3.59025,43.4366,False
2,2017-07-01 00:02:00,6.94,8.6022,0.020968,0.126482,3.58318,43.5994,False
3,2017-07-01 00:03:00,6.94,8.6022,0.020972,0.126184,3.58769,43.3704,False
4,2017-07-01 00:04:00,6.94,8.60405,0.020973,0.127908,3.58287,43.1656,False


In [3]:
org_df['Event'].value_counts()

False    132268
True        212
Name: Event, dtype: int64

In [4]:
X = df.drop(['Time', 'Event'], axis = 1)
y = df['Event']

In [5]:
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

print(" X_train dataset: ", X_train.shape)
print(" y_train dataset: ", y_train.shape)
print(" X_test dataset: ", X_test.shape)
print(" y_test dataset: ", y_test.shape)

 X_train dataset:  (92548, 6)
 y_train dataset:  (92548,)
 X_test dataset:  (39664, 6)
 y_test dataset:  (39664,)


In [6]:
sm = SMOTE(random_state=2)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.ravel())

In [7]:
df.describe()

Unnamed: 0,Tp,pH,Cond,Turb,SAC,PFM
count,132212.0,132212.0,132212.0,132212.0,132212.0,132212.0
mean,8.041941,8.533165,0.021095,0.128367,4.10036,68.660148
std,0.893679,0.052653,0.000347,0.005929,0.304115,11.420527
min,0.0,0.0,0.0,0.0,0.0,42.1631
25%,7.25,8.50278,0.020871,0.12716,3.816362,60.4849
50%,7.88,8.53963,0.021102,0.127754,4.12492,71.4407
75%,8.71,8.56349,0.021327,0.128686,4.395708,79.0769
max,10.3,8.66676,0.023214,1.78285,6.102071,84.8151


In [8]:
from sklearn.linear_model import LogisticRegression

In [17]:
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix

def get_model_f1(x):
    model = MLPClassifier(activation='tanh', solver='adam', alpha=x[0], 
                          hidden_layer_sizes=(int(x[1]), int(x[2])), random_state=1)
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    f1 = 2 * (precision * recall) / (precision + recall)
    print(f"In Func, 1-f1:{1- f1_score(y_test, y_pred)}, f1:{f1},  {x[0]}, {x[1]}, {x[2]}")
    return 1-f1
    # DE tries to minimize the objective function. I couldn't find a way to change it to be able to maximize the obj func.
    # Hence, I'm returning 1-f1 score so that it minimises 1-f1 -> same as maximising f1

In [18]:
# bounds = [(0, 1e-1)]
bounds = [(0, 1e-1), (1, 100), (1, 10)]
result = differential_evolution(func = get_model_f1, bounds = bounds, maxiter = 5, popsize = 5)
print(result.x, result.fun)

In Func, 1-f1:0.995170861469204, f1:0.004829138530795989,  0.09129333386122918, 48.626527842439415, 1.6002050862044594
In Func, 1-f1:0.7153024911032029, f1:0.2846975088967972,  0.024899898178777136, 78.00857336845313, 4.066452356551857
In Func, 1-f1:0.9929694727104533, f1:0.0070305272895467156,  0.05231396645128266, 68.82346123715865, 6.692725385256161
In Func, 1-f1:0.6677966101694914, f1:0.3322033898305085,  0.04488923911583259, 22.25941130845669, 7.754899568851045
In Func, 1-f1:0.7466307277628033, f1:0.25336927223719674,  0.017189041384017653, 37.194115576662995, 3.2322612112725424
In Func, 1-f1:0.9838474778233814, f1:0.016152522176618563,  0.056549188654935814, 31.60680261096781, 9.73013863510888
In Func, 1-f1:0.7351351351351352, f1:0.2648648648648648,  0.0333963864584313, 55.42222889199596, 3.799521263500344
In Func, 1-f1:0.6235294117647059, f1:0.3764705882352941,  0.03202531540010131, 93.36365854524024, 6.213825201791724
In Func, 1-f1:0.987844493345908, f1:0.012155506654092004,  0

In Func, 1-f1:0.4265734265734267, f1:0.5734265734265733,  0.05311289139216085, 82.0285788021744, 9.451703048658452
In Func, 1-f1:0.9884485162318263, f1:0.01155148376817367,  0.05069909362801646, 41.034018741090534, 9.215179161229722
In Func, 1-f1:0.9229181004817619, f1:0.07708189951823811,  0.06772834484400882, 91.46864904784897, 9.335094343338032
In Func, 1-f1:0.9571750563486101, f1:0.04282494365138993,  0.03999102545396302, 97.70903919415986, 8.274244528456814
In Func, 1-f1:0.980275463356572, f1:0.019724536643427985,  0.05249739629810993, 67.42599576557254, 8.934433016345649
In Func, 1-f1:0.9454722492697176, f1:0.054527750730282376,  0.06752906494906503, 42.45745880611026, 7.755829879220524
In Func, 1-f1:0.9806044932923873, f1:0.019395506707612736,  0.023939560946036467, 55.542622760789875, 4.535768461933405
In Func, 1-f1:0.9766596518079205, f1:0.023340348192079585,  0.06505614532480244, 22.25941130845669, 8.802719301047615
In Func, 1-f1:0.9922994382377075, f1:0.007700561762292495,  

In Func, 1-f1:0.31092436974789917, f1:0.6890756302521008,  0.048616393195598046, 91.58430859220678, 8.691923797215694
In Func, 1-f1:0.30000000000000004, f1:0.7,  0.048616383195598044, 91.58430860220678, 8.691923797215694
In Func, 1-f1:0.30000000000000004, f1:0.7,  0.048616383195598044, 91.58430859220678, 8.691923807215694
In Func, 1-f1:0.30000000000000004, f1:0.7,  0.048616536200420335, 91.58430859220678, 8.691923797215694
In Func, 1-f1:0.31092436974789917, f1:0.6890756302521008,  0.04861654620042034, 91.58430859220678, 8.691923797215694
In Func, 1-f1:0.30000000000000004, f1:0.7,  0.048616536200420335, 91.58430860220678, 8.691923797215694
In Func, 1-f1:0.30000000000000004, f1:0.7,  0.048616536200420335, 91.58430859220678, 8.691923807215694
In Func, 1-f1:0.30000000000000004, f1:0.7,  0.048616573530991795, 91.58430859220678, 8.691923797215694
In Func, 1-f1:0.30645161290322576, f1:0.6935483870967742,  0.0486165835309918, 91.58430859220678, 8.691923797215694
In Func, 1-f1:0.300000000000000

In Func, 1-f1:0.3984962406015038, f1:0.6015037593984962,  0.04858705631651927, 91.58430860220678, 8.691923797215694
In Func, 1-f1:0.3984962406015038, f1:0.6015037593984962,  0.04858705631651927, 91.58430859220678, 8.691923807215694
In Func, 1-f1:0.2941176470588235, f1:0.7058823529411765,  0.04861179974830572, 91.58430859220678, 8.691923797215694
In Func, 1-f1:0.3015873015873016, f1:0.6984126984126984,  0.04861180974830572, 91.58430859220678, 8.691923797215694
In Func, 1-f1:0.2941176470588235, f1:0.7058823529411765,  0.04861179974830572, 91.58430860220678, 8.691923797215694
In Func, 1-f1:0.2941176470588235, f1:0.7058823529411765,  0.04861179974830572, 91.58430859220678, 8.691923807215694
In Func, 1-f1:0.6412213740458015, f1:0.35877862595419846,  0.048615294531307814, 91.58430859220678, 8.691923797215694
In Func, 1-f1:0.9428279734558448, f1:0.05717202654415519,  0.048615304531307815, 91.58430859220678, 8.691923797215694
In Func, 1-f1:0.6412213740458015, f1:0.35877862595419846,  0.0486152

In [21]:
params = result.x
result.x, 1-result.fun


(array([4.86097057e-02, 9.15843086e+01, 8.69192380e+00]), 0.6612903225806451)

In [23]:
# params = result.x
# model =MLPClassifier(activation='tanh', solver='adam', alpha=params[0], 
#                           hidden_layer_sizes=(int(params[1]), int(params[2])), random_state=1)
# model.fit(X_train_res, y_train_res)
# y_pred = model.predict(X_test)
print(get_metrics(y_pred, y_test))

(0.9989411052843888, 0.6507936507936508, 0.6721311475409836, 0.6612903225806451)


In [20]:
from sklearn.metrics import confusion_matrix
def get_metrics(Y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(Y_true, y_pred).ravel()
    acc = (tp + tn) / (tp + fp + tn + fn)
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    f1 = 2 * (precision * recall) / (precision + recall)
    return acc, precision, recall, f1