In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
from sklearn.metrics import recall_score, accuracy_score
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
import pickle

In [2]:
# Names of the CTG.xls columns we are going to use
column_names = ('LB AC FM UC DL DS DP ASTV MSTV ALTV MLTV Width Min Max '
                'Nmax Nzeros Mode Mean Median Variance Tendency NSP').split()

# Columns that hold integer values; cast later only to get a nicer printout
int_columns = ('LB ASTV ALTV Width Min Max Nmax Nzeros Mode Mean Median Variance '
               'Tendency NSP').split()

In [3]:
# Read the 'Data' sheet of CTG.xls, skipping the first row, keeping the columns at
# positions 10-30 and 45, and labelling them with column_names.
data = pd.read_excel(
    'CTG.xls',
    sheet_name='Data',
    skiprows=1,
    usecols=list(range(10, 31)) + [45],
    names=column_names,
)

# Drop every row in which at least one variable is unknown (NaN), then cast the
# integer-valued columns to int in a single astype call.
data = data.dropna().astype({col: int for col in int_columns})

In [4]:
# Preview the first 15 rows of the loaded frame
data.head(15)

Unnamed: 0,LB,AC,FM,UC,DL,DS,DP,ASTV,MSTV,ALTV,...,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency,NSP
0,120,0.0,0.0,0.0,0.0,0.0,0.0,73,0.5,43,...,62,126,2,0,120,137,121,73,1,2
1,132,0.00638,0.0,0.00638,0.00319,0.0,0.0,17,2.1,0,...,68,198,6,1,141,136,140,12,0,1
2,133,0.003322,0.0,0.008306,0.003322,0.0,0.0,16,2.1,0,...,68,198,5,1,141,135,138,13,0,1
3,134,0.002561,0.0,0.007682,0.002561,0.0,0.0,16,2.4,0,...,53,170,11,0,137,134,137,13,1,1
4,132,0.006515,0.0,0.008143,0.0,0.0,0.0,16,2.4,0,...,53,170,9,0,137,136,138,11,1,1
5,134,0.001049,0.0,0.010493,0.009444,0.0,0.002099,26,5.9,0,...,50,200,5,3,76,107,107,170,0,3
6,134,0.001403,0.0,0.012623,0.008415,0.0,0.002805,29,6.3,0,...,50,200,6,3,71,107,106,215,0,3
7,122,0.0,0.0,0.0,0.0,0.0,0.0,83,0.5,6,...,62,130,0,0,122,122,123,3,1,3
8,122,0.0,0.0,0.001517,0.0,0.0,0.0,84,0.5,5,...,62,130,0,0,122,122,123,3,1,3
9,122,0.0,0.0,0.002967,0.0,0.0,0.0,86,0.3,6,...,62,130,1,0,122,122,123,1,1,3


Podjela u train i test skupove (učitavamo unaprijed spremljene CSV datoteke).

In [5]:
# Load the train/test feature matrices and label vectors from CSV files.
# The files are read with header=None, so the column names are supplied explicitly.
names = ['LB', 'AC', 'FM', 'UC', 'DL', 'DS', 'DP', 'ASTV', 'MSTV', 'ALTV', 'MLTV', 'Width', 'Min', 'Max',
                   'Nmax', 'Nzeros', 'Mode', 'Mean', 'Median', 'Variance', 'Tendency']

# DataFrame.values is a property, not a method: calling it as `.values()` raises
# "TypeError: 'numpy.ndarray' object is not callable". Access it without parentheses
# to get the underlying NumPy array.
X_train = pd.read_csv('X_train.csv', names = names, header = None).values
X_test = pd.read_csv('X_test.csv', names = names, header = None).values

# NOTE(review): with a single 'NSP' column these label arrays are 2-D column vectors;
# scikit-learn accepts that but may warn - call .ravel() if 1-D labels are required.
y_train = pd.read_csv('y_train.csv', names = ['NSP'], header = None).values
y_test = pd.read_csv('y_test.csv', names = ['NSP'], header = None).values

In [8]:
# Load the random forest stored in rf_SMOTE.pkl and report its test-set metrics.
# NOTE: pickle.load can execute arbitrary code - only open model files you trust.
with open('rf_SMOTE.pkl', 'rb') as model_file:
    rf_s_best = pickle.load(model_file)

print("Točnost Random forest uz SMOTE: %.2f" % rf_s_best.score(X_test, y_test))

# average=None yields one recall value per class; index 2 selects the third one
# (presumably NSP = 3 - confirm against the label set).
rf_s_recall = recall_score(y_test, rf_s_best.predict(X_test), average=None)
print("Recall Random forest uz SMOTE: %.2f" % rf_s_recall[2])

Točnost Random forest uz SMOTE: 0.92
Recall Random forest uz SMOTE: 0.89


SVC:

In [None]:
# Load the four pickled SVC models: three oversampling variants plus the plain one.
# NOTE: pickle.load can execute arbitrary code - only open model files you trust.
with open('svc_SMOTE.pkl', 'rb') as model_file:
    svc_SMOTE = pickle.load(model_file)
with open('svc_BorderlineSMOTE.pkl', 'rb') as model_file:
    svc_BorderlineSMOTE = pickle.load(model_file)
with open('svc_ADASYN.pkl', 'rb') as model_file:
    svc_ADASYN = pickle.load(model_file)
with open('svc_obicni.pkl', 'rb') as model_file:
    svc_obicni = pickle.load(model_file)

# Each entry pairs a model with the format strings of its accuracy and recall lines.
svc_reports = (
    (svc_obicni, "Točnost SVC bez oversamplinga: %.2f", "Recall SVC bez oversamplinga: %.2f"),
    (svc_SMOTE, "Točnost SVC uz SMOTE: %.2f", "Recall SVC uz SMOTE: %.2f"),
    (svc_BorderlineSMOTE, "Točnost SVC uz BorderlineSMOTE: %.2f", "Recall SVC uz BorderlineSMOTE: %.2f"),
    (svc_ADASYN, "Točnost SVC uz ADASYN: %.2f", "Recall SVC uz ADASYN: %.2f"),
)

for svc_model, accuracy_line, recall_line in svc_reports:
    print(accuracy_line % svc_model.score(X_test, y_test))
    # average=None yields one recall value per class; index 2 selects the third one
    # (presumably NSP = 3 - confirm against the label set).
    per_class_recall = recall_score(y_test, svc_model.predict(X_test), average=None)
    print(recall_line % per_class_recall[2])

xgBoost:

In [None]:
# Load the four pickled xgBoost models: three oversampling variants plus the plain one.
# NOTE: pickle.load can execute arbitrary code - only open model files you trust.
with open('xgb_SMOTE.pkl', 'rb') as model_file:
    xgb_SMOTE = pickle.load(model_file)
with open('xgb_BorderlineSMOTE.pkl', 'rb') as model_file:
    xgb_BorderlineSMOTE = pickle.load(model_file)
with open('xgb_ADASYN.pkl', 'rb') as model_file:
    xgb_ADASYN = pickle.load(model_file)
with open('xgb_obicni.pkl', 'rb') as model_file:
    xgb_obicni = pickle.load(model_file)

# Each entry pairs a model with the format strings of its accuracy and recall lines.
xgb_reports = (
    (xgb_obicni, "Točnost xgBoost bez oversamplinga: %.2f", "Recall xgBoost bez oversamplinga: %.2f"),
    (xgb_SMOTE, "Točnost xgBoost uz SMOTE: %.2f", "Recall xgBoost uz SMOTE: %.2f"),
    (xgb_BorderlineSMOTE, "Točnost xgBoost uz BorderlineSMOTE: %.2f", "Recall xgBoost uz BorderlineSMOTE: %.2f"),
    (xgb_ADASYN, "Točnost xgBoost uz ADASYN: %.2f", "Recall xgBoost uz ADASYN: %.2f"),
)

for xgb_model, accuracy_line, recall_line in xgb_reports:
    print(accuracy_line % xgb_model.score(X_test, y_test))
    # average=None yields one recall value per class; index 2 selects the third one
    # (presumably NSP = 3 - confirm against the label set).
    per_class_recall = recall_score(y_test, xgb_model.predict(X_test), average=None)
    print(recall_line % per_class_recall[2])

Random forest:

In [None]:
# Load the four pickled random forest models: three oversampling variants plus the plain one.
# NOTE: pickle.load can execute arbitrary code - only open model files you trust.
with open('rf_SMOTE.pkl', 'rb') as model_file:
    rf_SMOTE = pickle.load(model_file)
with open('rf_BorderlineSMOTE.pkl', 'rb') as model_file:
    rf_BorderlineSMOTE = pickle.load(model_file)
with open('rf_ADASYN.pkl', 'rb') as model_file:
    rf_ADASYN = pickle.load(model_file)
with open('rf_obicni.pkl', 'rb') as model_file:
    rf_obicni = pickle.load(model_file)

# Each entry pairs a model with the format strings of its accuracy and recall lines.
rf_reports = (
    (rf_obicni, "Točnost Random forest bez oversamplinga: %.2f", "Recall Random forest bez oversamplinga: %.2f"),
    (rf_SMOTE, "Točnost Random forest uz SMOTE: %.2f", "Recall Random forest uz SMOTE: %.2f"),
    (rf_BorderlineSMOTE, "Točnost Random forest uz BorderlineSMOTE: %.2f", "Recall Random forest uz BorderlineSMOTE: %.2f"),
    (rf_ADASYN, "Točnost Random forest uz ADASYN: %.2f", "Recall Random forest uz ADASYN: %.2f"),
)

for rf_model, accuracy_line, recall_line in rf_reports:
    print(accuracy_line % rf_model.score(X_test, y_test))
    # average=None yields one recall value per class; index 2 selects the third one
    # (presumably NSP = 3 - confirm against the label set).
    per_class_recall = recall_score(y_test, rf_model.predict(X_test), average=None)
    print(recall_line % per_class_recall[2])