## Klasifikacija zdravlja fetusa na temelju kardiotokografije

### Projekt u sklopu predmeta Strojno učenje

Učitavanje podataka:

In [1]:
import pandas as pd

In [2]:
#imena stupaca u CTG.xls koje ćemo koristiti
column_names = ['LB', 'AC', 'FM', 'UC', 'DL', 'DS', 'DP', 'ASTV', 'MSTV', 'ALTV', 'MLTV', 'Width', 'Min', 'Max',
                   'Nmax', 'Nzeros', 'Mode', 'Mean', 'Median', 'Variance', 'Tendency', 'NSP']

#stupci koji sadrže int vrijednosti, samo za ljepši ispis
int_columns = ['LB' ,'ASTV' ,'ALTV' ,'Width' ,'Min' ,'Max' ,'Nmax' ,'Nzeros' ,'Mode' ,'Mean' ,'Median' ,'Variance',
               'Tendency', 'NSP']

In [3]:
data = pd.read_excel('CTG.xls', sheet_name = 'Data', skiprows = 1, 
                     usecols = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 45]
                     , names = column_names)

#izbacujemo sve retke u kojima je barem jedna varijabla nepoznata (NaN)
data = data.dropna()

for col in int_columns:
    data[col] = (data[col]).astype(int)

In [4]:
data.head(15)

Unnamed: 0,LB,AC,FM,UC,DL,DS,DP,ASTV,MSTV,ALTV,...,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency,NSP
0,120,0.0,0.0,0.0,0.0,0.0,0.0,73,0.5,43,...,62,126,2,0,120,137,121,73,1,2
1,132,0.00638,0.0,0.00638,0.00319,0.0,0.0,17,2.1,0,...,68,198,6,1,141,136,140,12,0,1
2,133,0.003322,0.0,0.008306,0.003322,0.0,0.0,16,2.1,0,...,68,198,5,1,141,135,138,13,0,1
3,134,0.002561,0.0,0.007682,0.002561,0.0,0.0,16,2.4,0,...,53,170,11,0,137,134,137,13,1,1
4,132,0.006515,0.0,0.008143,0.0,0.0,0.0,16,2.4,0,...,53,170,9,0,137,136,138,11,1,1
5,134,0.001049,0.0,0.010493,0.009444,0.0,0.002099,26,5.9,0,...,50,200,5,3,76,107,107,170,0,3
6,134,0.001403,0.0,0.012623,0.008415,0.0,0.002805,29,6.3,0,...,50,200,6,3,71,107,106,215,0,3
7,122,0.0,0.0,0.0,0.0,0.0,0.0,83,0.5,6,...,62,130,0,0,122,122,123,3,1,3
8,122,0.0,0.0,0.001517,0.0,0.0,0.0,84,0.5,5,...,62,130,0,0,122,122,123,3,1,3
9,122,0.0,0.0,0.002967,0.0,0.0,0.0,86,0.3,6,...,62,130,1,0,122,122,123,1,1,3


In [5]:
X = data.iloc[:,:-1].values # podaci bez NSP dijagnoze
y = data.iloc[:,-1].values # NSP dijagnoza

Standarizacija podataka:

In [6]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit_transform( X )

array([[-1.35222005, -0.82140279, -0.20304786, ..., -1.18164215,
         1.87056871,  1.11298001],
       [-0.1325256 ,  0.8318261 , -0.20304786, ...,  0.13203796,
        -0.23499819, -0.52452553],
       [-0.03088439,  0.0395395 , -0.20304786, ..., -0.00624416,
        -0.2004807 , -0.52452553],
       ...,
       [ 0.68060404, -0.56734041, -0.20304786, ...,  0.96173066,
        -0.51113811,  1.11298001],
       [ 0.68060404, -0.64547365, -0.20304786, ...,  0.8925896 ,
        -0.51113811,  1.11298001],
       [ 0.88388645, -0.40275396, -0.16842402, ...,  0.47774325,
        -0.61469058, -0.52452553]])

Testiranje nekoliko modela na nepromijenjenim podacima (prije oversamplinga/undesamplinga):

XGBoost s 5-fold stratified cross-validation(sačuvan omjer klasa u svim podskupovima podataka):

In [71]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold # koristit ćemo 5-fold stratified cross-validation
from sklearn.model_selection import cross_val_score

xgb = xgb.XGBClassifier()
skfold = KFold(n_splits = 5, shuffle = True, random_state = 7) 
# I suggest you select a random state value at random and use it for all your experiments. 
xgb_results = cross_val_score(xgb, X, y, cv = skfold)
# The fitting will be done inside the cross_val_score function, you don't need to worry about this beforehand.
# But if you want to use the model afterwards, you will need to fit() on the whole data again.
print("Točnost XGBoost modela: %.2f%% (standardna devijacija %.2f%%)" % (xgb_results.mean()*100, xgb_results.std()*100))

Točnost XGBoost modela: 94.87% (standardna devijacija 0.94%)


Prosjek recall-a za patološku klasu (ako se koristi stratifikacija, prosjek recall-a je OK):

In [72]:
import numpy as np
xgb_recall = 0
for train_index, test_index in skfold.split(X,y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    xgb.fit(X_train, y_train)
    xgb_recall = xgb_recall + (recall_score(y_test, xgb.predict(X_test), average=None))[2]
xgb_recall = ( xgb_recall / 5 ) * 100
print("Recall za patološku klasu s XGBoost: %.2f%% " % xgb_recall)

Recall za patološku klasu s XGBoost: 91.82% 


In [78]:
# konfuzijske matrice za svaki od 5 podskupova:
#from sklearn.metrics import confusion_matrix
#for train_index, test_index in skfold.split(X,y):
#    X_train, X_test = X[train_index], X[test_index]
#    y_train, y_test = y[train_index], y[test_index]
#    xgb.fit(X_train, y_train)
#    print(confusion_matrix(y_test, xgb.predict(X_test)))
#    print("Recall za patološku klasu %f" % (recall_score(y_test, xgb.predict(X_test), average=None))[2])

Dalje ćemo u svim modelima koristiti 5-fold stratified cross-validation.

SVC s 5-fold stratified cross-validation:

In [73]:
from sklearn.svm import SVC 
svc = SVC(kernel = 'linear', random_state = 7)
svc_results = cross_val_score(svc, X, y, cv = skfold)
print("Točnost SVC modela: %.2f%% (standardna devijacija %.2f%%)" % (svc_results.mean()*100, svc_results.std()*100))

Točnost SVC modela: 88.76% (standardna devijacija 0.62%)


Prosjek recall-a za patološku klasu:

In [74]:
svc_recall = 0
for train_index, test_index in skfold.split(X,y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    svc.fit(X_train, y_train)
    svc_recall = svc_recall + (recall_score(y_test, svc.predict(X_test), average=None))[2]
svc_recall = ( svc_recall / 5 ) * 100
print("Recall za patološku klasu s XGBoost: %.2f%% " % svc_recall)

Recall za patološku klasu s XGBoost: 77.84% 


Random forest:

In [76]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=21, max_leaf_nodes=None,
            min_impurity_decrease=1e-07, min_samples_leaf=3,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=10, oob_score=False, random_state=100,
            verbose=0, warm_start=False)
rf_results = cross_val_score(rf, X, y, cv = skfold)
print("Točnost Random Forest modela: %.2f%% (standardna devijacija %.2f%%)" %(rf_results.mean()*100, rf_results.std()*100))

Točnost Random Forest modela: 94.12% (standardna devijacija 1.27%)


Prosjek recall-a za patološku klasu:

In [77]:
rf_recall = 0
for train_index, test_index in skfold.split(X,y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    rf.fit(X_train, y_train)
    rf_recall = rf_recall + (recall_score(y_test, rf.predict(X_test), average=None))[2]
rf_recall = ( rf_recall / 5 ) * 100
print("Recall za patološku klasu s Random Forest: %.2f%% " % rf_recall)

Recall za patološku klasu s Random Forest: 92.31% 
