In [1]:
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import (GridSearchCV, cross_val_score,
                                     train_test_split)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from utils import evaluate


In [2]:
df = pd.read_csv('database/final.csv')
df


Unnamed: 0.1,Unnamed: 0,ID,pH,BDecf,pCO2,BE,Apgar1,Apgar5,Gest. Weeks,Weight(g),...,Median UC,Std FHR,Std UC,RMS FHR,RMS UC,Peak to RMS FHR,Peak to RMS UC,Peak FHR,Peak UC,target
0,0,1220.000000,7.300000,3.520000,6.000000,-4.700000,9.000000,10.000000,42.000000,3100.000000,...,23.000000,59.376698,21.970835,122.357020,33.143752,70.642980,93.856248,193.000000,127.000000,1.0
1,1,1234.000000,7.290000,2.500000,6.500000,-4.200000,8.000000,9.000000,41.000000,3200.000000,...,20.500000,59.311329,24.685834,135.710988,34.975993,88.289012,92.024007,224.000000,127.000000,1.0
2,2,1208.000000,7.230000,5.840000,6.600000,-7.400000,9.000000,9.000000,40.000000,3900.000000,...,3.000000,51.554942,21.705502,109.963878,26.323443,64.536122,73.676557,174.500000,100.000000,1.0
3,3,1038.000000,7.330000,2.720000,5.700000,-4.000000,10.000000,10.000000,39.000000,2740.000000,...,6.000000,26.070361,30.393657,135.560556,37.570082,31.189444,62.429918,166.750000,100.000000,1.0
4,4,1004.000000,7.300000,5.190000,5.500000,-6.400000,8.000000,9.000000,41.000000,3370.000000,...,14.000000,51.581668,16.210997,105.951334,22.315540,123.048666,100.684460,229.000000,123.000000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
970,970,1612.415502,7.277141,4.170062,6.214293,-5.257210,8.285658,8.714244,39.571415,4014.282925,...,13.642683,66.893660,15.123756,112.954032,21.790416,87.188871,69.994656,200.142902,91.785073,0.0
971,971,1767.337796,7.293675,3.440918,6.131623,-4.463584,8.947013,9.210260,39.736753,4047.350650,...,15.709416,62.683887,17.396235,103.389556,25.123933,96.215996,74.266717,199.605552,99.390650,0.0
972,972,1481.788323,7.230459,4.115599,7.287158,-5.830279,9.440366,10.000000,40.559634,4038.440290,...,15.922014,64.601882,18.569339,109.909654,26.431828,77.227946,77.513116,187.137600,103.944945,0.0
973,973,1199.005821,7.233021,6.115778,6.434896,-7.374997,6.520836,7.390627,39.130209,3926.041797,...,8.127612,78.127428,9.059652,138.476805,12.894938,63.100015,58.594676,201.576821,71.489613,0.0


In [3]:
df.drop(['Unnamed: 0', 'ID'], axis=1, inplace=True)
df.shape


(975, 42)

In [4]:
df.isnull().sum()


pH                 0
BDecf              0
pCO2               0
BE                 0
Apgar1             0
Apgar5             0
Gest. Weeks        0
Weight(g)          0
Sex                0
Age                0
Gravidity          0
Parity             0
Diabetes           0
Hypertension       0
Preeclampsia       0
Liq.               0
Pyrexia            0
Meconium           0
Presentation       0
Induced            0
I.stage            0
NoProgress         0
CK/KP              0
II.stage           0
Deliv. type        0
dbID               0
Rec. type          0
Pos. II.st.        0
Sig2Birth          0
Mean FHR           0
Mean UC            0
Median FHR         0
Median UC          0
Std FHR            0
Std UC             0
RMS FHR            0
RMS UC             0
Peak to RMS FHR    0
Peak to RMS UC     0
Peak FHR           0
Peak UC            0
target             0
dtype: int64

In [5]:
df.columns


Index(['pH', 'BDecf', 'pCO2', 'BE', 'Apgar1', 'Apgar5', 'Gest. Weeks',
       'Weight(g)', 'Sex', 'Age', 'Gravidity', 'Parity', 'Diabetes',
       'Hypertension', 'Preeclampsia', 'Liq.', 'Pyrexia', 'Meconium',
       'Presentation', 'Induced', 'I.stage', 'NoProgress', 'CK/KP', 'II.stage',
       'Deliv. type', 'dbID', 'Rec. type', 'Pos. II.st.', 'Sig2Birth',
       'Mean FHR', 'Mean UC', 'Median FHR', 'Median UC', 'Std FHR', 'Std UC',
       'RMS FHR', 'RMS UC', 'Peak to RMS FHR', 'Peak to RMS UC', 'Peak FHR',
       'Peak UC', 'target'],
      dtype='object')

In [6]:
df.shape

(975, 42)

# Feature Engineering

In [7]:
def f(x):
    if 6.5 < x <= 10:
        return 'Excellent'
    elif 4 < x <= 6.5:
        return 'ModeratelyAbnormal'
    elif x <= 4:
        return 'Attention'


In [8]:
df['Apgar1'] = df['Apgar1'].apply(lambda x: f(x))
df['Apgar5'] = df['Apgar5'].apply(lambda x: f(x))


In [9]:
df.Apgar1.value_counts()


Excellent             901
ModeratelyAbnormal     47
Attention              27
Name: Apgar1, dtype: int64

In [10]:
df.Apgar5.value_counts()


Excellent             957
ModeratelyAbnormal     15
Attention               3
Name: Apgar5, dtype: int64

In [11]:
df.target.value_counts()


1.0    488
0.0    487
Name: target, dtype: int64

In [12]:
df['pH'] = df['pH'].apply(lambda x: 'Pathological' if x <= 7.15 else 'Normal')

df.pH.value_counts()


Normal          842
Pathological    133
Name: pH, dtype: int64

In [13]:
df.head()

Unnamed: 0,pH,BDecf,pCO2,BE,Apgar1,Apgar5,Gest. Weeks,Weight(g),Sex,Age,...,Median UC,Std FHR,Std UC,RMS FHR,RMS UC,Peak to RMS FHR,Peak to RMS UC,Peak FHR,Peak UC,target
0,Normal,3.52,6.0,-4.7,Excellent,Excellent,42.0,3100.0,2.0,34.0,...,23.0,59.376698,21.970835,122.35702,33.143752,70.64298,93.856248,193.0,127.0,1.0
1,Normal,2.5,6.5,-4.2,Excellent,Excellent,41.0,3200.0,1.0,29.0,...,20.5,59.311329,24.685834,135.710988,34.975993,88.289012,92.024007,224.0,127.0,1.0
2,Normal,5.84,6.6,-7.4,Excellent,Excellent,40.0,3900.0,2.0,26.0,...,3.0,51.554942,21.705502,109.963878,26.323443,64.536122,73.676557,174.5,100.0,1.0
3,Normal,2.72,5.7,-4.0,Excellent,Excellent,39.0,2740.0,1.0,24.0,...,6.0,26.070361,30.393657,135.560556,37.570082,31.189444,62.429918,166.75,100.0,1.0
4,Normal,5.19,5.5,-6.4,Excellent,Excellent,41.0,3370.0,1.0,36.0,...,14.0,51.581668,16.210997,105.951334,22.31554,123.048666,100.68446,229.0,123.0,1.0


In [14]:
from sklearn.preprocessing import LabelEncoder

In [15]:
le = LabelEncoder()
p = df['pH']
df['pH']=le.fit_transform(p)

In [16]:
le1 = LabelEncoder()
p1 = df['Apgar1']
df['Apgar1']=le.fit_transform(p1)

In [17]:

le2 = LabelEncoder()
p2 = df['Apgar5']
df['Apgar5']=le.fit_transform(p2)



In [18]:
df.head()


Unnamed: 0,pH,BDecf,pCO2,BE,Apgar1,Apgar5,Gest. Weeks,Weight(g),Sex,Age,...,Median UC,Std FHR,Std UC,RMS FHR,RMS UC,Peak to RMS FHR,Peak to RMS UC,Peak FHR,Peak UC,target
0,0,3.52,6.0,-4.7,1,1,42.0,3100.0,2.0,34.0,...,23.0,59.376698,21.970835,122.35702,33.143752,70.64298,93.856248,193.0,127.0,1.0
1,0,2.5,6.5,-4.2,1,1,41.0,3200.0,1.0,29.0,...,20.5,59.311329,24.685834,135.710988,34.975993,88.289012,92.024007,224.0,127.0,1.0
2,0,5.84,6.6,-7.4,1,1,40.0,3900.0,2.0,26.0,...,3.0,51.554942,21.705502,109.963878,26.323443,64.536122,73.676557,174.5,100.0,1.0
3,0,2.72,5.7,-4.0,1,1,39.0,2740.0,1.0,24.0,...,6.0,26.070361,30.393657,135.560556,37.570082,31.189444,62.429918,166.75,100.0,1.0
4,0,5.19,5.5,-6.4,1,1,41.0,3370.0,1.0,36.0,...,14.0,51.581668,16.210997,105.951334,22.31554,123.048666,100.68446,229.0,123.0,1.0


In [125]:
dts.columns
dts.shape

(975, 47)

In [19]:
X = df.drop('target', axis=1)
y = df['target']


# Test  Train Split

In [20]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)
print('Training dataset len: {}, Test data len: {}'.format(
    X_train.shape[0], y_test.shape[0]))


Training dataset len: 780, Test data len: 195


In [21]:
X_train

Unnamed: 0,pH,BDecf,pCO2,BE,Apgar1,Apgar5,Gest. Weeks,Weight(g),Sex,Age,...,Median FHR,Median UC,Std FHR,Std UC,RMS FHR,RMS UC,Peak to RMS FHR,Peak to RMS UC,Peak FHR,Peak UC
969,0,2.612753,6.010114,-3.584335,1,1,40.000000,3980.653772,1.101141,33.494296,...,100.818452,17.786310,57.834869,21.715525,92.480546,30.958192,105.485613,79.378688,197.966158,110.336880
678,1,6.837973,9.474283,-9.542830,1,1,40.752390,3558.384346,2.000000,29.000000,...,95.820981,10.728489,69.065626,17.960338,103.187749,22.947317,88.921956,80.728589,192.109705,103.675906
894,0,2.156223,6.876223,-4.142657,1,1,38.391608,3148.391562,1.097902,25.489510,...,114.361887,2.723774,72.876839,26.634918,107.791989,32.289982,71.034938,94.710018,178.826927,127.000000
33,0,4.660000,6.400000,-5.900000,1,1,41.000000,3000.000000,2.000000,27.000000,...,145.500000,9.500000,32.279106,19.826447,145.040152,25.606502,56.209848,101.393498,201.250000,127.000000
31,0,2.380000,8.800000,-5.100000,1,1,40.000000,3980.000000,1.000000,42.000000,...,147.750000,5.000000,39.468608,14.650049,142.435276,18.228178,48.314724,80.771822,190.750000,99.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,0,6.840000,7.300000,-9.100000,2,1,40.000000,3320.000000,1.000000,31.000000,...,144.750000,25.000000,54.664243,22.512355,138.009633,35.796044,89.990367,91.203956,228.000000,127.000000
270,0,4.380000,6.900000,-5.600000,1,1,41.000000,3330.000000,2.000000,32.000000,...,130.000000,44.500000,54.582694,41.303604,115.272453,64.658057,83.727547,62.341943,199.000000,127.000000
860,0,6.059280,6.274743,-7.436857,1,1,39.563143,3348.119891,1.563143,26.815715,...,161.958141,6.781571,77.425196,18.913860,137.481620,24.556502,58.323807,60.371930,195.805427,84.928432
435,0,3.440000,6.800000,-4.800000,1,1,41.000000,4280.000000,2.000000,25.000000,...,110.750000,15.000000,59.965075,32.265959,91.444381,42.630124,53.555619,57.369876,145.000000,100.000000


# Training Different Models

## KNN

In [22]:
# KNN
model_knn = KNeighborsClassifier(n_neighbors=5)
model_knn.fit(X_train, y_train)


KNeighborsClassifier()

In [23]:
y_pred = model_knn.predict(X_test)
#predictions = [round(value) for value in y_pred]

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))


Accuracy: 82.56%


In [24]:
evaluate.evaluate_model(model_knn, X_test, y_test)


Accuracy:- 82.56%
Confusion matrix :- 
 [[97  0]
 [34 64]]
Classification Report:-
               precision    recall  f1-score   support

         0.0       0.74      1.00      0.85        97
         1.0       1.00      0.65      0.79        98

    accuracy                           0.83       195
   macro avg       0.87      0.83      0.82       195
weighted avg       0.87      0.83      0.82       195



## Support Vector Machines

In [25]:
model = SVC(kernel='linear')
model.fit(X_train, y_train)

evaluate.evaluate_model(model, X_test, y_test)


Accuracy:- 88.21%
Confusion matrix :- 
 [[94  3]
 [20 78]]
Classification Report:-
               precision    recall  f1-score   support

         0.0       0.82      0.97      0.89        97
         1.0       0.96      0.80      0.87        98

    accuracy                           0.88       195
   macro avg       0.89      0.88      0.88       195
weighted avg       0.89      0.88      0.88       195



## Random Forest Classifier

In [22]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

evaluate.evaluate_model(model, X_test, y_test)


Accuracy:- 97.44%
Confusion matrix :- 
 [[95  2]
 [ 3 95]]
Classification Report:-
               precision    recall  f1-score   support

         0.0       0.97      0.98      0.97        97
         1.0       0.98      0.97      0.97        98

    accuracy                           0.97       195
   macro avg       0.97      0.97      0.97       195
weighted avg       0.97      0.97      0.97       195



In [131]:
import pickle
from sklearn.ensemble import RandomForestClassifier
with open('random_forest_model_2.pkl', 'wb') as file:
    pickle.dump(model, file)

In [None]:
pred = model.predict()

In [31]:
import joblib
model = RandomForestClassifier()
joblib.dump(model, 'random_forest_model.joblib')

['random_forest_model.joblib']

In [32]:
model = joblib.load('random_forest_model.joblib')  # Load the trained model
