In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
from sklearn.metrics import recall_score, accuracy_score
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import pickle

In [13]:
# Names given to the columns of CTG.xls that are used in this notebook.
column_names = [
    'LB', 'AC', 'FM', 'UC', 'DL', 'DS', 'DP',
    'ASTV', 'MSTV', 'ALTV', 'MLTV',
    'Width', 'Min', 'Max', 'Nmax', 'Nzeros',
    'Mode', 'Mean', 'Median', 'Variance', 'Tendency',
    'NSP',
]

# Columns that hold integer values; listed only so the output prints nicely.
int_columns = [
    'LB', 'ASTV', 'ALTV',
    'Width', 'Min', 'Max', 'Nmax', 'Nzeros',
    'Mode', 'Mean', 'Median', 'Variance', 'Tendency',
    'NSP',
]

In [14]:
# Read the 'Data' sheet, skipping its first row, and keep only the columns at
# positions 10-30 and 45, renamed to the entries of `column_names`.
data = pd.read_excel(
    'CTG.xls',
    sheet_name='Data',
    skiprows=1,
    usecols=list(range(10, 31)) + [45],
    names=column_names,
)

# Drop every row in which at least one variable is unknown (NaN).
data = data.dropna()

# Cast all integer-valued columns in a single call.
data = data.astype({col: int for col in int_columns})

Standardizacija:

Podjela u train i test skupove.

In [15]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
train, test = train_test_split(data, random_state=42, test_size=0.2)
print('Broj primjera za treniranje: ' + str(train.shape[0]))
print('Broj primjera za testiranje: ' + str(test.shape[0]))

Broj primjera za treniranje: 1700
Broj primjera za testiranje: 426


In [17]:
# Save both splits as CSV files (with a header row, without the index column).
# Paths are relative to the working directory, like the 'CTG.xls' input, so the
# cell does not depend on one user's absolute directory layout.
train.to_csv('train.csv', index=False, header=True)
test.to_csv('test.csv', index=False, header=True)

In [18]:
# All columns except the last are features; the last column is the target.
X_train, y_train = train.iloc[:, :-1].values, train.iloc[:, -1].values
X_test, y_test = test.iloc[:, :-1].values, test.iloc[:, -1].values

Standardizacija.

In [19]:
# Fit the scaler on the training features only, then apply the same
# transformation to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

Training i validation skupovi za cross-validation.

In [20]:
# shuffle=True is needed for random_state to have any effect. Without it the
# seed is ignored by older scikit-learn and rejected with a ValueError by
# recent releases.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=164981614)

# Per-fold arrays: position i in each list belongs to fold i.
X_training, X_validation, y_training, y_validation = [], [], [], []

for train_index, validate_index in skf.split(X_train, y_train):
    X_training.append(X_train[train_index])
    X_validation.append(X_train[validate_index])
    y_training.append(y_train[train_index])
    y_validation.append(y_train[validate_index])

Funkcija za određivanje najboljih parametara modela uz zadani oversampler.

In [21]:
def best_model_oversampling(grid_search, X_train, y_train, oversampler):
    """Run a grid search on oversampled cross-validation folds.

    The folds are produced by the module-level ``skf`` splitter applied to
    ``X_train`` / ``y_train``. For each fold the training part is resampled
    with ``oversampler``, ``grid_search`` is fitted on the resampled data and
    its best estimator is evaluated on the untouched validation part
    (accuracy, plus recall of the class at index 2 of the per-class recall
    array). The mean of both metrics over the folds is printed.

    Parameters
    ----------
    grid_search : search object exposing ``fit`` and ``best_estimator_``
        (e.g. ``GridSearchCV`` with refitting enabled).
    X_train, y_train : arrays indexable with integer index arrays
        Training features and labels that are split into folds.
    oversampler : object exposing ``fit_resample(X, y)``.

    Returns
    -------
    The best estimator found by the grid search on the last fold.

    NOTE(review): only the last fold's estimator is returned, so it was fitted
    on a subset of the training data; confirm this is intended.
    """
    fold_accuracies, fold_recalls = [], []
    model = None
    for train_index, validate_index in skf.split(X_train, y_train):
        X_resampled, y_resampled = oversampler.fit_resample(
            X_train[train_index], y_train[train_index])
        grid_search.fit(X_resampled, y_resampled)
        # best_estimator_ only exists when the search refits, and is then
        # already fitted on the whole resampled fold; no second fit is needed.
        model = grid_search.best_estimator_
        y_true = y_train[validate_index]
        y_pred = model.predict(X_train[validate_index])  # predict once, reuse
        fold_accuracies.append(accuracy_score(y_true, y_pred))
        fold_recalls.append(recall_score(y_true, y_pred, average=None)[2])
    cv_score = sum(fold_accuracies) / len(fold_accuracies)
    cv_recall = sum(fold_recalls) / len(fold_recalls)
    print("Mean CV accuracy: {:.4f}, mean CV recall (class index 2): {:.4f}".format(
        cv_score, cv_recall))
    return model

In [22]:
def best_model(grid_search, X_train, y_train):
    """Run a grid search on cross-validation folds without oversampling.

    The folds are produced by the module-level ``skf`` splitter applied to
    ``X_train`` / ``y_train``. For each fold ``grid_search`` is fitted on the
    training part and its best estimator is evaluated on the validation part
    (accuracy, plus recall of the class at index 2 of the per-class recall
    array). The mean of both metrics over the folds is printed.

    Parameters
    ----------
    grid_search : search object exposing ``fit`` and ``best_estimator_``
        (e.g. ``GridSearchCV`` with refitting enabled).
    X_train, y_train : arrays indexable with integer index arrays
        Training features and labels that are split into folds.

    Returns
    -------
    The best estimator found by the grid search on the last fold.

    NOTE(review): only the last fold's estimator is returned, so it was fitted
    on a subset of the training data; confirm this is intended.
    """
    fold_accuracies, fold_recalls = [], []
    model = None
    for train_index, validate_index in skf.split(X_train, y_train):
        grid_search.fit(X_train[train_index], y_train[train_index])
        # best_estimator_ only exists when the search refits, and is then
        # already fitted on the whole training fold; no second fit is needed.
        model = grid_search.best_estimator_
        y_true = y_train[validate_index]
        y_pred = model.predict(X_train[validate_index])  # predict once, reuse
        fold_accuracies.append(accuracy_score(y_true, y_pred))
        fold_recalls.append(recall_score(y_true, y_pred, average=None)[2])
    cv_score = sum(fold_accuracies) / len(fold_accuracies)
    cv_recall = sum(fold_recalls) / len(fold_recalls)
    print("Mean CV accuracy: {:.4f}, mean CV recall (class index 2): {:.4f}".format(
        cv_score, cv_recall))
    return model

SVC:

In [None]:
# Hyper-parameter grid for the SVC: a linear kernel and an RBF kernel variant.
parameters_SVC = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}
    ]
grid_search_SVC = GridSearchCV(SVC(probability=True), parameters_SVC, cv=3)

# One model per resampling strategy, in the same order as the file names below.
models = []
models.append(best_model_oversampling(grid_search_SVC, X_train, y_train, SMOTE()))
print("Istrenirao SVC SMOTE.")
models.append(best_model_oversampling(grid_search_SVC, X_train, y_train, BorderlineSMOTE()))
print("Istrenirao SVC BorderlineSMOTE.")
models.append(best_model_oversampling(grid_search_SVC, X_train, y_train, ADASYN()))
print("Istrenirao SVC ADASYN.")
models.append(best_model(grid_search_SVC, X_train, y_train))
print("Istrenirao SVC bez oversamplinga.")

# A list (not a set) is required here: sets cannot be indexed and have no
# defined order, so file names could not be paired with their models.
pkl_filenames = ["svc_SMOTE.pkl", "svc_BorderlineSMOTE.pkl", "svc_ADASYN.pkl", "svc_obicni.pkl"]

for pkl_filename, model in zip(pkl_filenames, models):
    with open(pkl_filename, 'wb') as pkl_file:
        pickle.dump(model, pkl_file)

XGBoost:

In [None]:
# Hyper-parameter grid for the XGBoost classifier.
parameters_xgb = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }
# NOTE(review): recent xgboost releases may require class labels to start at
# 0 - verify that the labels in y_train are accepted by the installed version.
grid_search_xgb = GridSearchCV(xgb.XGBClassifier(), parameters_xgb, cv=3)

# One model per resampling strategy, in the same order as the file names below.
models = []
models.append(best_model_oversampling(grid_search_xgb, X_train, y_train, SMOTE()))
print("Istrenirao xgBoost SMOTE.")
models.append(best_model_oversampling(grid_search_xgb, X_train, y_train, BorderlineSMOTE()))
print("Istrenirao xgBoost BorderlineSMOTE.")
models.append(best_model_oversampling(grid_search_xgb, X_train, y_train, ADASYN()))
print("Istrenirao xgBoost ADASYN.")
models.append(best_model(grid_search_xgb, X_train, y_train))
print("Istrenirao xgBoost bez oversamplinga.")

# A list (not a set) is required here: sets cannot be indexed and have no
# defined order, so file names could not be paired with their models.
pkl_filenames = ["xgb_SMOTE.pkl", "xgb_BorderlineSMOTE.pkl", "xgb_ADASYN.pkl", "xgb_obicni.pkl"]

for pkl_filename, model in zip(pkl_filenames, models):
    with open(pkl_filename, 'wb') as pkl_file:
        pickle.dump(model, pkl_file)

Random forest:

In [None]:
# Hyper-parameter grid for the random forest: odd tree counts 1..19 and
# 1..17 features considered per split.
parameters_rf = {
    'n_estimators': list(range(1, 21, 2)),
    'max_features': list(range(1, 18))
    }
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=2018), parameters_rf, cv=3)

# One model per resampling strategy, in the same order as the file names below.
models = []
models.append(best_model_oversampling(grid_search_rf, X_train, y_train, SMOTE()))
print("Istrenirao Random forest SMOTE.")
models.append(best_model_oversampling(grid_search_rf, X_train, y_train, BorderlineSMOTE()))
print("Istrenirao Random forest BorderlineSMOTE.")
models.append(best_model_oversampling(grid_search_rf, X_train, y_train, ADASYN()))
print("Istrenirao Random forest ADASYN.")
models.append(best_model(grid_search_rf, X_train, y_train))
print("Istrenirao Random forest bez oversamplinga.")

# A list (not a set) is required here: sets cannot be indexed and have no
# defined order, so file names could not be paired with their models.
pkl_filenames = ["rf_SMOTE.pkl", "rf_BorderlineSMOTE.pkl", "rf_ADASYN.pkl", "rf_obicni.pkl"]

for pkl_filename, model in zip(pkl_filenames, models):
    with open(pkl_filename, 'wb') as pkl_file:
        pickle.dump(model, pkl_file)

Istrenirao Random forest SMOTE.
