In [1]:
import pickle

import numpy as np
import pandas as pd
import xgboost as xgb
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [2]:
# Names given to the columns of CTG.xls that the notebook uses.
column_names = [
    'LB', 'AC', 'FM', 'UC', 'DL', 'DS', 'DP',
    'ASTV', 'MSTV', 'ALTV', 'MLTV',
    'Width', 'Min', 'Max', 'Nmax', 'Nzeros',
    'Mode', 'Mean', 'Median', 'Variance', 'Tendency',
    'NSP',
]

# Columns that hold integer values; they are cast to int only for nicer output.
int_columns = [
    'LB', 'ASTV', 'ALTV',
    'Width', 'Min', 'Max', 'Nmax', 'Nzeros',
    'Mode', 'Mean', 'Median', 'Variance', 'Tendency',
    'NSP',
]

In [3]:
# Read the 'Data' sheet of CTG.xls, skipping its first row, keeping only the
# selected column positions (10..30 and 45) and naming them per column_names.
# Rows with at least one unknown (NaN) value are then dropped.
data = pd.read_excel(
    'CTG.xls',
    sheet_name='Data',
    skiprows=1,
    usecols=[*range(10, 31), 45],
    names=column_names,
).dropna()

# Cast the integer-valued columns to int in a single call.
data = data.astype({int_col: int for int_col in int_columns})

Stratificirana podjela u train i test skupove.

In [7]:
# Features are all columns except the last one; the last column is the target.
X, y = data.iloc[:, :-1].values, data.iloc[:, -1].values

# Hold out 20% of the rows for testing, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

Standardizacija.

In [8]:
# Fit the scaler on the training split only, then transform both splits with it.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [12]:
# Optional: reload previously saved splits from CSV instead of recomputing them.
# The whole cell is intentionally commented out; uncomment every line together.
# (The second line of the list used to be left un-commented, which made this
# cell fail with an IndentationError.)
#names = ['LB', 'AC', 'FM', 'UC', 'DL', 'DS', 'DP', 'ASTV', 'MSTV', 'ALTV', 'MLTV', 'Width', 'Min', 'Max',
#         'Nmax', 'Nzeros', 'Mode', 'Mean', 'Median', 'Variance', 'Tendency']
#X_train = pd.read_csv('X_train.csv', names = names, header = None).values
#X_test = pd.read_csv('X_test.csv', names = names, header = None).values
#y_train = pd.read_csv('y_train.csv', names = ['NSP'], header = None).values
#y_test = pd.read_csv('y_test.csv', names = ['NSP'], header = None).values

In [9]:
# Save the standardized splits as CSV files in the current working directory,
# using the same relative file names that the commented-out reload cell reads
# ('X_train.csv', 'X_test.csv', 'y_train.csv', 'y_test.csv').
# The previous version wrote to an absolute path inside one user's Windows
# profile, so the cell failed on any other machine. The commented-out
# DataFrame.to_csv variant was dropped: the splits are NumPy arrays here.
for split_name, split_values in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
):
    np.savetxt(split_name + '.csv', split_values, delimiter=",")

In [10]:
# Pipeline: SMOTE oversampling followed by an SVC with probability estimates.
svc_s_pipeline = Pipeline(steps=[
    ('sampling', SMOTE()),
    ('classification', SVC(probability=True)),
])

# Grid for the pipeline's 'classification' step: a linear-kernel sub-grid over
# C and an RBF-kernel sub-grid over C and gamma.
parameters_svc = [
    {
        'classification__C': [1, 10, 100, 1000],
        'classification__kernel': ['linear'],
    },
    {
        'classification__C': [1, 10, 100, 1000],
        'classification__gamma': [0.001, 0.0001],
        'classification__kernel': ['rbf'],
    },
]

# 3-fold grid search on the training split.
svc_s = GridSearchCV(estimator=svc_s_pipeline, param_grid=parameters_svc, cv=3)
svc_s.fit(X_train, y_train)

# Keep the best pipeline and pickle it to disk.
svc_s_best = svc_s.best_estimator_
with open("svc_SMOTE.pkl", mode='wb') as pkl_file:
    pickle.dump(svc_s_best, pkl_file)

In [None]:
# Pipeline: BorderlineSMOTE oversampling followed by an SVC with probability
# estimates, tuned over the parameters_svc grid.
svc_bs_pipeline = Pipeline(steps=[
    ('sampling', BorderlineSMOTE()),
    ('classification', SVC(probability=True)),
])

# 3-fold grid search on the training split.
svc_bs = GridSearchCV(estimator=svc_bs_pipeline, param_grid=parameters_svc, cv=3)
svc_bs.fit(X_train, y_train)

# Keep the best pipeline and pickle it to disk.
svc_bs_best = svc_bs.best_estimator_
with open("svc_BorderlineSMOTE.pkl", mode='wb') as pkl_file:
    pickle.dump(svc_bs_best, pkl_file)

In [None]:
# Pipeline: ADASYN oversampling followed by an SVC with probability estimates.
svc_a_pipeline = Pipeline([
        ('sampling', ADASYN()),
        ('classification', SVC(probability=True))
    ])
# Use parameters_svc, whose keys carry the 'classification__' prefix needed to
# address the pipeline's classifier step. The previously referenced
# parameters_SVC is only defined further down the notebook (NameError on a
# fresh run) and its unprefixed keys are meant for a bare SVC, not a Pipeline.
svc_a = GridSearchCV(svc_a_pipeline, parameters_svc, cv = 3)
svc_a.fit(X_train, y_train)

# Keep the best pipeline and pickle it to disk.
svc_a_best = svc_a.best_estimator_
with open("svc_ADASYN.pkl" , 'wb') as file:
    pickle.dump(svc_a_best, file)

In [17]:
# Pipeline: SMOTE oversampling followed by an XGBoost classifier.
xgb_s_pipeline = Pipeline(steps=[
    ('sampling', SMOTE()),
    ('classification', xgb.XGBClassifier()),
])

# Grid for the pipeline's 'classification' step.
parameters_xgb = {
    'classification__min_child_weight': [1, 5, 10],
    'classification__gamma': [0.5, 1, 1.5, 2, 5],
    'classification__subsample': [0.6, 0.8, 1.0],
    'classification__colsample_bytree': [0.6, 0.8, 1.0],
    'classification__max_depth': [3, 4, 5],
}

# 3-fold grid search on the training split (target passed flattened).
xgb_s = GridSearchCV(estimator=xgb_s_pipeline, param_grid=parameters_xgb, cv=3)
xgb_s.fit(X_train, y_train.ravel())

# Keep the best pipeline and pickle it to disk.
xgb_s_best = xgb_s.best_estimator_
with open("xgb_SMOTE.pkl", mode='wb') as pkl_file:
    pickle.dump(xgb_s_best, pkl_file)

KeyboardInterrupt: 

In [None]:
# Pipeline: BorderlineSMOTE oversampling followed by an XGBoost classifier,
# tuned over the parameters_xgb grid.
xgb_bs_pipeline = Pipeline(steps=[
    ('sampling', BorderlineSMOTE()),
    ('classification', xgb.XGBClassifier()),
])

# 3-fold grid search on the training split.
xgb_bs = GridSearchCV(estimator=xgb_bs_pipeline, param_grid=parameters_xgb, cv=3)
xgb_bs.fit(X_train, y_train)

# Keep the best pipeline and pickle it to disk.
xgb_bs_best = xgb_bs.best_estimator_
with open("xgb_BorderlineSMOTE.pkl", mode='wb') as pkl_file:
    pickle.dump(xgb_bs_best, pkl_file)

In [None]:
# Pipeline: ADASYN oversampling followed by an XGBoost classifier, tuned over
# the parameters_xgb grid.
xgb_a_pipeline = Pipeline(steps=[
    ('sampling', ADASYN()),
    ('classification', xgb.XGBClassifier()),
])

# 3-fold grid search on the training split.
xgb_a = GridSearchCV(estimator=xgb_a_pipeline, param_grid=parameters_xgb, cv=3)
xgb_a.fit(X_train, y_train)

# Keep the best pipeline and pickle it to disk.
xgb_a_best = xgb_a.best_estimator_
with open("xgb_ADASYN.pkl", mode='wb') as pkl_file:
    pickle.dump(xgb_a_best, pkl_file)

In [31]:
# Pipeline: SMOTE oversampling followed by a seeded random forest.
rf_s_pipeline = Pipeline(steps=[
    ('sampling', SMOTE()),
    ('classification', RandomForestClassifier(random_state=2018)),
])

# Grid for the pipeline's 'classification' step:
# n_estimators over the odd values 1..19, max_features over 1..17.
parameters_rf = {
    'classification__n_estimators': list(range(1, 21, 2)),
    'classification__max_features': list(range(1, 18)),
}

# 3-fold grid search on the training split.
rf_s = GridSearchCV(estimator=rf_s_pipeline, param_grid=parameters_rf, cv=3)
rf_s.fit(X_train, y_train)

# Keep the best pipeline and pickle it to disk.
rf_s_best = rf_s.best_estimator_
with open('rf_SMOTE.pkl', mode='wb') as pkl_file:
    pickle.dump(rf_s_best, pkl_file)

In [33]:
# Test-set accuracy of the best SMOTE random-forest pipeline.
rf_s_test_accuracy = rf_s_best.score(X_test, y_test)
print("Točnost Random forest uz SMOTE: %.2f" % rf_s_test_accuracy)

# Per-class recall on the test set; the entry at index 2 is reported.
rf_s_test_recalls = recall_score(y_test, rf_s_best.predict(X_test), average=None)
print("Recall Random forest uz SMOTE: %.2f" % rf_s_test_recalls[2])

Točnost Random forest uz SMOTE: 0.92
Recall Random forest uz SMOTE: 0.89


In [34]:
# Pipeline: BorderlineSMOTE oversampling followed by a seeded random forest,
# tuned over the parameters_rf grid.
rf_bs_pipeline = Pipeline(steps=[
    ('sampling', BorderlineSMOTE()),
    ('classification', RandomForestClassifier(random_state=2018)),
])

# 3-fold grid search on the training split.
rf_bs = GridSearchCV(estimator=rf_bs_pipeline, param_grid=parameters_rf, cv=3)
rf_bs.fit(X_train, y_train)

# Keep the best pipeline and pickle it to disk.
rf_bs_best = rf_bs.best_estimator_
with open("rf_BorderlineSMOTE.pkl", mode='wb') as pkl_file:
    pickle.dump(rf_bs_best, pkl_file)

In [35]:
# Test-set accuracy of the best BorderlineSMOTE random-forest pipeline.
rf_bs_test_accuracy = rf_bs_best.score(X_test, y_test)
print("Točnost Random forest uz BorderlineSMOTE: %.2f" % rf_bs_test_accuracy)

# Per-class recall on the test set; the entry at index 2 is reported.
rf_bs_test_recalls = recall_score(y_test, rf_bs_best.predict(X_test), average=None)
print("Recall Random forest uz BorderlineSMOTE: %.2f" % rf_bs_test_recalls[2])

Točnost Random forest uz BorderlineSMOTE: 0.93
Recall Random forest uz BorderlineSMOTE: 0.91


In [None]:
# Pipeline: ADASYN oversampling followed by a seeded random forest, tuned over
# the parameters_rf grid.
rf_a_pipeline = Pipeline(steps=[
    ('sampling', ADASYN()),
    ('classification', RandomForestClassifier(random_state=2018)),
])

# 3-fold grid search on the training split.
rf_a = GridSearchCV(estimator=rf_a_pipeline, param_grid=parameters_rf, cv=3)
rf_a.fit(X_train, y_train)

# Keep the best pipeline and pickle it to disk.
rf_a_best = rf_a.best_estimator_
with open("rf_ADASYN.pkl", mode='wb') as pkl_file:
    pickle.dump(rf_a_best, pkl_file)

Training i validation skupovi za cross-validation.

In [20]:
# Five stratified folds over the training split, used by the manual
# cross-validation helpers below.
# shuffle=True is required for random_state to have any effect: without it
# scikit-learn ignores the seed on older releases and raises a ValueError on
# newer ones.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=164981614)

# Parallel lists: element i of each list belongs to fold i.
X_training, X_validation, y_training, y_validation = [], [], [], []

for train_index, validate_index in skf.split(X_train, y_train):
    X_training.append(X_train[train_index])
    X_validation.append(X_train[validate_index])
    y_training.append(y_train[train_index])
    y_validation.append(y_train[validate_index])

Funkcija za određivanje najboljih parametara modela uz zadani oversampler.

In [21]:
def best_model_oversampling(grid_search, X_train, y_train, oversampler):
    """Run ``grid_search`` on every oversampled fold and return the last fold's best model.

    For each fold held in the notebook-level lists ``X_training`` /
    ``y_training`` the training part is resampled with ``oversampler``,
    ``grid_search`` is fitted on the resampled data, and its best estimator is
    evaluated on the matching, non-resampled part from ``X_validation`` /
    ``y_validation``. The mean accuracy and the mean recall (entry at index 2
    of the per-class recall array) over all folds are printed.

    Parameters
    ----------
    grid_search : object with ``fit(X, y)`` and a ``best_estimator_`` attribute
        (a ``GridSearchCV`` in this notebook).
    X_train, y_train : currently unused. The folds are read from the
        notebook-level fold lists instead; the parameters are kept so existing
        calls keep working.
    oversampler : object with ``fit_resample(X, y)``.

    Returns
    -------
    The ``best_estimator_`` found on the last fold; the estimators selected on
    earlier folds are not kept.

    Raises
    ------
    ValueError
        If the fold lists are empty.
    """
    # NOTE(review): grid_search runs its own internal CV on data that was
    # already oversampled, so its inner validation splits can contain
    # synthetic samples - confirm this is intended.
    fold_accuracies = []
    fold_recalls = []
    model = None
    folds = zip(X_training, y_training, X_validation, y_validation)
    for X_fit, y_fit, X_val, y_val in folds:
        X_fit_oversampled, y_fit_oversampled = oversampler.fit_resample(X_fit, y_fit)
        grid_search.fit(X_fit_oversampled, y_fit_oversampled)
        # best_estimator_ is already refit on the data passed to fit(), so it
        # is used as-is instead of being fitted a second time.
        model = grid_search.best_estimator_
        # Predict once and reuse the predictions for both metrics.
        y_pred = model.predict(X_val)
        fold_accuracies.append(accuracy_score(y_val, y_pred))
        fold_recalls.append(recall_score(y_val, y_pred, average=None)[2])

    if model is None:
        raise ValueError("no cross-validation folds available")

    cv_score = sum(fold_accuracies) / len(fold_accuracies)
    cv_recall = sum(fold_recalls) / len(fold_recalls)
    # Report the averages; they used to be computed and silently discarded.
    print("CV točnost: %.2f, CV recall: %.2f" % (cv_score, cv_recall))
    return model

In [22]:
def best_model(grid_search, X_train, y_train):
    """Run ``grid_search`` on every fold (no oversampling) and return the last fold's best model.

    For each fold held in the notebook-level lists ``X_training`` /
    ``y_training``, ``grid_search`` is fitted on the training part and its
    best estimator is evaluated on the matching part from ``X_validation`` /
    ``y_validation``. The mean accuracy and the mean recall (entry at index 2
    of the per-class recall array) over all folds are printed.

    Parameters
    ----------
    grid_search : object with ``fit(X, y)`` and a ``best_estimator_`` attribute
        (a ``GridSearchCV`` in this notebook).
    X_train, y_train : currently unused. The folds are read from the
        notebook-level fold lists instead; the parameters are kept so existing
        calls keep working.

    Returns
    -------
    The ``best_estimator_`` found on the last fold; the estimators selected on
    earlier folds are not kept.

    Raises
    ------
    ValueError
        If the fold lists are empty.
    """
    fold_accuracies = []
    fold_recalls = []
    model = None
    folds = zip(X_training, y_training, X_validation, y_validation)
    for X_fit, y_fit, X_val, y_val in folds:
        grid_search.fit(X_fit, y_fit)
        # best_estimator_ is already refit on the data passed to fit(), so it
        # is used as-is instead of being fitted a second time.
        model = grid_search.best_estimator_
        # Predict once and reuse the predictions for both metrics.
        y_pred = model.predict(X_val)
        fold_accuracies.append(accuracy_score(y_val, y_pred))
        fold_recalls.append(recall_score(y_val, y_pred, average=None)[2])

    if model is None:
        raise ValueError("no cross-validation folds available")

    cv_score = sum(fold_accuracies) / len(fold_accuracies)
    cv_recall = sum(fold_recalls) / len(fold_recalls)
    # Report the averages; they used to be computed and silently discarded.
    print("CV točnost: %.2f, CV recall: %.2f" % (cv_score, cv_recall))
    return model

SVC:

In [None]:
# Grid for a bare SVC (no pipeline, so the keys carry no step prefix).
parameters_SVC = [
    {'C':[1, 10, 100, 1000], 'kernel':['linear']},
    {'C':[1, 10, 100, 1000], 'gamma':[0.001, 0.0001], 'kernel':['rbf']}
    ]
grid_search_SVC = GridSearchCV(SVC(probability=True), parameters_SVC, cv = 3)

# One model per oversampling strategy, plus one without oversampling.
models = []
models.append(best_model_oversampling(grid_search_SVC, X_train, y_train, SMOTE()))
print("Istrenirao SVC SMOTE.")
models.append(best_model_oversampling(grid_search_SVC, X_train, y_train, BorderlineSMOTE()))
print("Istrenirao SVC BorderlineSMOTE.")
models.append(best_model_oversampling(grid_search_SVC, X_train, y_train, ADASYN()))
print("Istrenirao SVC ADASYN.")
models.append(best_model(grid_search_SVC, X_train, y_train))
print("Istrenirao SVC bez oversamplinga.")

# Must be a list: the file names are paired with `models` by position, and a
# set is neither ordered nor subscriptable (pkl_filenames[i] raised TypeError).
# NOTE(review): the first three names are also written by the pipeline cells
# earlier in the notebook, so those files get overwritten here.
pkl_filenames = ["svc_SMOTE.pkl", "svc_BorderlineSMOTE.pkl", "svc_ADASYN.pkl", "svc_obicni.pkl"]

for pkl_filename, trained_model in zip(pkl_filenames, models):
    with open(pkl_filename, 'wb') as file:
        pickle.dump(trained_model, file)

xgBoost:

In [None]:
# Grid for a bare XGBClassifier (no pipeline, so the keys carry no step prefix).
# NOTE(review): this rebinds parameters_xgb, which the pipeline cells earlier
# in the notebook define with 'classification__'-prefixed keys.
parameters_xgb = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }
grid_search_xgb = GridSearchCV(xgb.XGBClassifier(), parameters_xgb, cv = 3)

# One model per oversampling strategy, plus one without oversampling.
models = []
models.append(best_model_oversampling(grid_search_xgb, X_train, y_train, SMOTE()))
print("Istrenirao xgBoost SMOTE.")
models.append(best_model_oversampling(grid_search_xgb, X_train, y_train, BorderlineSMOTE()))
print("Istrenirao xgBoost BorderlineSMOTE.")
models.append(best_model_oversampling(grid_search_xgb, X_train, y_train, ADASYN()))
print("Istrenirao xgBoost ADASYN.")
models.append(best_model(grid_search_xgb, X_train, y_train))
print("Istrenirao xgBoost bez oversamplinga.")

# Must be a list: the file names are paired with `models` by position, and a
# set is neither ordered nor subscriptable (pkl_filenames[i] raised TypeError).
# NOTE(review): the first three names are also written by the pipeline cells
# earlier in the notebook, so those files get overwritten here.
pkl_filenames = ["xgb_SMOTE.pkl", "xgb_BorderlineSMOTE.pkl", "xgb_ADASYN.pkl", "xgb_obicni.pkl"]

for pkl_filename, trained_model in zip(pkl_filenames, models):
    with open(pkl_filename, 'wb') as file:
        pickle.dump(trained_model, file)

Random forest:

In [None]:
# Grid for a bare RandomForestClassifier (no pipeline, so the keys carry no
# step prefix).
# NOTE(review): this rebinds parameters_rf, which the pipeline cells earlier
# in the notebook define with 'classification__'-prefixed keys.
parameters_rf = {
    'n_estimators': list(range(1,21,2)),
    'max_features': list(range(1,18))
    }
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=2018), parameters_rf, cv=3)

# One model per oversampling strategy, plus one without oversampling.
models = []
models.append(best_model_oversampling(grid_search_rf, X_train, y_train, SMOTE()))
print("Istrenirao Random forest SMOTE.")
models.append(best_model_oversampling(grid_search_rf, X_train, y_train, BorderlineSMOTE()))
print("Istrenirao Random forest BorderlineSMOTE.")
models.append(best_model_oversampling(grid_search_rf, X_train, y_train, ADASYN()))
print("Istrenirao Random forest ADASYN.")
models.append(best_model(grid_search_rf, X_train, y_train))
print("Istrenirao Random forest bez oversamplinga.")

# Must be a list: the file names are paired with `models` by position, and a
# set is neither ordered nor subscriptable (pkl_filenames[i] raised TypeError).
# NOTE(review): the first three names are also written by the pipeline cells
# earlier in the notebook, so those files get overwritten here.
pkl_filenames = ["rf_SMOTE.pkl", "rf_BorderlineSMOTE.pkl", "rf_ADASYN.pkl", "rf_obicni.pkl"]

for pkl_filename, trained_model in zip(pkl_filenames, models):
    with open(pkl_filename, 'wb') as file:
        pickle.dump(trained_model, file)

Istrenirao Random forest SMOTE.
