## Random forest

In [18]:
import pandas as pd
from sklearn.metrics import accuracy_score
import seaborn as sns
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


In [19]:
#imena stupaca u CTG.xls koje ćemo koristiti
column_names = ['LB', 'AC', 'FM', 'UC', 'DL', 'DS', 'DP', 'ASTV', 'MSTV', 'ALTV', 'MLTV', 'Width', 'Min', 'Max',
                   'Nmax', 'Nzeros', 'Mode', 'Mean', 'Median', 'Variance', 'Tendency', 'NSP']

#stupci koji sadrže int vrijednosti, samo za ljepši ispis
int_columns = ['LB' ,'ASTV' ,'ALTV' ,'Width' ,'Min' ,'Max' ,'Nmax' ,'Nzeros' ,'Mode' ,'Mean' ,'Median' ,'Variance',
               'Tendency', 'NSP']

In [20]:
data = pd.read_excel('CTG.xls', sheet_name = 'Data', skiprows = 1, 
                     usecols = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 45]
                     , names = column_names)

#izbacujemo sve retke u kojima je barem jedna varijabla nepoznata (NaN)
data = data.dropna()

for col in int_columns:
    data[col] = (data[col]).astype(int)

In [21]:
X = data.iloc[:,:-1].values 
y = data.iloc[:,-1].values # NSP dijagnoza

from sklearn.model_selection import RepeatedStratifiedKFold

random_state = 164981614

rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=random_state)

X_train, X_test, y_train, y_test = [], [], [], []

for train_index, test_index in rskf.split(X, y):
    X_train.append(X[train_index])
    X_test.append(X[test_index])
    y_train.append(y[train_index])
    y_test.append(y[test_index])

In [22]:
#Standardization of datasets is a common requirement for many machine learning estimators implemented in scikit-learn; 
#they might behave badly if the individual features do not more or less look like standard normally distributed data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
for i in range(5):
    X_train[i] = sc.fit_transform(X_train[i])
    X_test[i] = sc.transform(X_test[i])

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from scipy.io import arff

In [24]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [25]:
#clf = RandomForestClassifier(n_jobs=2, random_state=0)

#clf.fit(X1_train, y1_train)
clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=21, max_leaf_nodes=None,
            min_impurity_decrease=1e-07, min_samples_leaf=3,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=10, oob_score=False, random_state=100,
            verbose=0, warm_start=False)

clf.fit(X1_train, y1_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=21, max_leaf_nodes=None,
            min_impurity_decrease=1e-07, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=10,
            oob_score=False, random_state=100, verbose=0, warm_start=False)

In [26]:
y_pred = clf.predict(X1_test)
predictions7 = [round(value) for value in y_pred]

In [27]:
accuracy7 = accuracy_score(y1_test, predictions7)
print("Accuracy: %.2f%%" % (accuracy7 * 100.0))

Accuracy: 93.42%


In [28]:
list(zip(X1_train, clf.feature_importances_))

[(array([1.29000000e+02, 3.81194409e-03, 0.00000000e+00, 1.27064803e-03,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.70000000e+01,
         1.20000000e+00, 1.00000000e+00, 7.90000000e+00, 6.00000000e+01,
         9.80000000e+01, 1.58000000e+02, 3.00000000e+00, 0.00000000e+00,
         1.33000000e+02, 1.33000000e+02, 1.34000000e+02, 4.00000000e+00,
         0.00000000e+00]), 0.027338920047138275),
 (array([1.42000000e+02, 3.17460317e-03, 5.00000000e-02, 2.38095238e-03,
         5.55555556e-03, 0.00000000e+00, 7.93650794e-04, 6.60000000e+01,
         3.30000000e+00, 0.00000000e+00, 3.00000000e-01, 1.13000000e+02,
         6.70000000e+01, 1.80000000e+02, 7.00000000e+00, 0.00000000e+00,
         1.41000000e+02, 1.11000000e+02, 1.47000000e+02, 9.00000000e+01,
         1.00000000e+00]), 0.05047882265130073),
 (array([1.19000000e+02, 7.74443369e-03, 0.00000000e+00, 4.84027106e-03,
         9.68054211e-04, 0.00000000e+00, 0.00000000e+00, 3.00000000e+01,
         1.80000000e+00, 

In [29]:
# CV model
#clf = RandomForestClassifier()
kfold = KFold(n_splits=5, random_state=7)
results = cross_val_score(clf, X, y, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Accuracy: 87.44% (7.60%)


In [30]:
from sklearn.model_selection import StratifiedKFold
clf_cv_strat = clf
kfold_strat = StratifiedKFold(n_splits=5, random_state=7)
results = cross_val_score(clf_cv_strat, X, y, cv=kfold_strat)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Accuracy: 85.79% (8.49%)


In [31]:
#accuracy samo za svaki train/test set; cross validacija
clf2 = clf

for i in range(5):
    print("i: %d", i)
    clf2.fit(X_train[i], y_train[i])
    y2_pred = clf2.predict(X_test[i])
    predictions77 = [round(value) for value in y2_pred]
    accuracy77 = accuracy_score(y_test[i], predictions77)
    print("Accuracy: %.2f%%" % (accuracy77 * 100.0))

i: %d 0
Accuracy: 95.33%
i: %d 1
Accuracy: 95.33%
i: %d 2
Accuracy: 96.26%
i: %d 3
Accuracy: 93.46%
i: %d 4
Accuracy: 95.79%
