In [1]:
from os import listdir
from sys import argv

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import f1_score, roc_curve, roc_auc_score, precision_recall_curve

from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split

# import sys
# sys.path.insert(0, "/mnt/Dados/Documentos/xgboost/python-package/xgboost/")
import xgboost as xgb

from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif,chi2
from sklearn.preprocessing import Binarizer, scale, StandardScaler, minmax_scale

from sklearn.svm import SVC

import time

from sklearn.externals import joblib
import pandas as pd
from sklearn.manifold import Isomap, TSNE
from sklearn.model_selection import GroupKFold, LeavePGroupsOut, LeaveOneGroupOut

def shuffled(array):
    x = array.values.copy()
    np.random.shuffle(x)
    return x

In [2]:
data = pd.read_csv("datasetfull.csv")

In [3]:
data = data.loc[shuffled(data.index)]

In [4]:
y = data["GT"]
IMG = data["IMG"]
BLOCK = data["BLOCK"]
solo = data["solo"]
base = data["base"]

del data["GT"]
del data["IMG"] # salto enorme de precisão se deixar essa feature
del data["BLOCK"]
del data["solo"]
del data["base"]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.33, random_state=42)

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical


In [None]:
model = Sequential()

In [None]:
model.add(Dense(units=10, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(units=2, activation='softmax'))

In [None]:
model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])

In [None]:
y_binary = to_categorical(y_train)

In [None]:
model.fit(X_train, y_binary, epochs=5, batch_size=32)

In [None]:
y_binary = to_categorical(y_test)

In [None]:
classes = model.predict(X_test, batch_size=128)

In [None]:
classes

In [6]:
start = time.time()
ratio = float(np.sum(y_train == 1)) / np.sum(y_train==0)

clf = xgb.XGBClassifier(
                max_depth = 4,
                n_estimators=1000,
                learning_rate=0.3, 
                nthread=3,
                subsample=1.0,
                colsample_bytree=1,
                #min_child_weight = 3,
                scale_pos_weight = ratio,
                reg_alpha=0.09,
                seed=1301)

clf.fit(X_train, y_train, early_stopping_rounds=30, eval_metric="auc",
        eval_set=[(X_train, y_train), (X_test, y_test)], verbose = True)

print("%f" % (time.time() - start))

[0]	validation_0-auc:0.968679	validation_1-auc:0.970011
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 30 rounds.
[1]	validation_0-auc:0.975711	validation_1-auc:0.975256
[2]	validation_0-auc:0.978821	validation_1-auc:0.978625
[3]	validation_0-auc:0.980017	validation_1-auc:0.979664
[4]	validation_0-auc:0.981084	validation_1-auc:0.98086
[5]	validation_0-auc:0.98189	validation_1-auc:0.981554
[6]	validation_0-auc:0.98255	validation_1-auc:0.98215
[7]	validation_0-auc:0.983492	validation_1-auc:0.982998
[8]	validation_0-auc:0.983844	validation_1-auc:0.983183
[9]	validation_0-auc:0.984544	validation_1-auc:0.98385
[10]	validation_0-auc:0.985225	validation_1-auc:0.984388
[11]	validation_0-auc:0.985692	validation_1-auc:0.984966
[12]	validation_0-auc:0.98593	validation_1-auc:0.98517
[13]	validation_0-auc:0.986118	validation_1-auc:0.985285
[14]	validation_0-auc:0.986764	validation_1-auc:0.98586
[15]	v

In [None]:
print(time.time() - start)

In [None]:
def XGBTrain(data, train_index, test_index):
    X_train, y_train = data.iloc[train_index], y.iloc[train_index]
    X_test, y_test = data.iloc[test_index], y.iloc[test_index]
    ratio = float(np.sum(y_train == 1)) / np.sum(y_train==0)

    clf = xgb.XGBClassifier(
                    max_depth = 4,
                    n_estimators=1000,
                    learning_rate=0.2, 
                    nthread=3,
                    subsample=1.0,
                    colsample_bytree=1,
                    #min_child_weight = 3,
                    scale_pos_weight = ratio,
                    reg_alpha=0.03,
                    seed=1301)

    clf.fit(X_train, y_train, early_stopping_rounds=30, eval_metric="auc",
            eval_set=[(X_test, y_test)], verbose = False)
    
    return clf

In [None]:
n_splits = 3

In [None]:
# group_kfold = GroupKFold(n_splits=n_splits)
group_kfold = LeaveOneGroupOut()

In [None]:
clfs = [None for _ in range (n_splits)]
splits = list(group_kfold.split(data, y, groups=solo))
for i, (train_index, test_index) in enumerate(splits):
    print(i, end = ' ')
    clf = XGBTrain(data, train_index, test_index)
    clfs[i] = clf
    print(clf.best_score, clf.best_iteration)


In [None]:
fimp = sum(clfs[i].feature_importances_ for i in range(n_splits)) / n_splits

In [None]:
importance = pd.Series(data=fimp, index=data.columns)
# importance = pd.Series(data=clf.feature_importances_, index=data.columns)

In [None]:
importance.sort_values(ascending=False)

# Análise do Resultados

In [None]:
clf_ind = 0
clf = clfs[clf_ind]
train_index, test_index = splits[clf_ind]
X_train, y_train = data.iloc[train_index], y.iloc[train_index]
X_test, y_test = data.iloc[test_index], y.iloc[test_index]

In [None]:
predict = clf.predict_proba(X_test, ntree_limit=clf.best_iteration)[:,-1]

In [None]:
# Melhor média de TPR E TNR
FPR, TPR, TH = roc_curve(y_test, predict)
i_max = max(range(len(TPR)), key = lambda x : TPR[x] + 1 - FPR[x])
print((TPR[i_max] + 1 - FPR[i_max])/2)
plt.plot(FPR,TPR)

In [None]:
plt.figure(figsize=(12,9))
for col in importance.sort_values(ascending=False).iloc[:15].index:
    score = roc_auc_score(y, data[col])
    print("Haralick - %-30s AUC =" % col, score)
    
    fpr, tpr = roc_curve(y, data[col])[:2]
    plt.plot(fpr, tpr, label = col)

plt.plot(FPR,TPR, "k--", label = "predict", lw = 1.5)
plt.legend()
    

In [None]:
plt.figure(figsize=(17,5))
plt.scatter(predict, y_test, alpha = .1)
th = TH[i_max]
plt.plot([th, th], [1,0], "r-")

In [None]:
train_pred = clf.predict_proba(data, ntree_limit=clf.best_iteration)[:,-1]

In [None]:
m1 = ((y == 1) & (train_pred < th))
m0 = ((y == 0) & (train_pred > th))
m1.sum(), m0.sum()

In [None]:
# m = ((y_test == 1) & (predict < th)) | ((y_test == 0) & (predict > th))
# m.sum()

In [None]:
dec = PCA()

In [None]:
pca = dec.fit_transform(StandardScaler().fit_transform(data.fillna(10**-5)))

In [None]:
plt.figure(figsize=(17,5))
xvr = dec.explained_variance_ratio_[:30]
plt.bar(range(len(xvr)), np.cumsum(xvr))

In [None]:
plt.figure(figsize=(12,5))
for c in {0,1}:
    plt.subplot(1,2,c+1)
    plt.scatter(pca[y==(1-c),0], pca[y==(1-c),1], alpha = 0 * .2, label = "classe %d" % (1-c))
    plt.scatter(pca[y==c,0], pca[y==c,1], alpha = .2, label = "classe %d" % c, color = "blue" if c == 0 else "yellow")
    plt.plot([0,0], [pca[:,1].min(), pca[:,1].max()], "r--")
    plt.plot([pca[:,0].min(), pca[:,0].max()], [0,0], "r--")
    plt.legend()

In [None]:
plt.figure(figsize=(12,9))
for c in {0,1}:
    plt.scatter(pca[y==c,0], pca[y==c,1], alpha = .2, label = "classe %d" % c)
plt.scatter(pca[m0,0], pca[m0,1], label = "miss0")
plt.scatter(pca[m1,0], pca[m1,1], label = "miss1")
plt.legend()

In [None]:
man = Isomap(n_jobs=3)
# man = TSNE()

In [None]:
man.fit(data.iloc[::10])

In [None]:
iso = man.transform(StandardScaler().fit_transform(data))

In [None]:
plt.figure(figsize=(12,9))
for c in {0,1}:
    plt.scatter(iso[(y==c),0], iso[(y==c),1], alpha = .2, label = "classe %d" % c)
plt.scatter(iso[m0,0], iso[m0,1], label = "miss0")
plt.scatter(iso[m1,0], iso[m1,1], label = "miss1")
plt.legend()

In [None]:
from skimage.util.shape import view_as_blocks
from skimage.io import imread


In [None]:
paths = ["imgs_orig", "gts_orig"] # TROCAR AQUI DEPENDENDO DA BASE

path_imgs = sorted([ paths[0] + '/' + i for i in listdir(paths[0]) ])
path_gts = sorted([ paths[1] + '/' + i for i in listdir(paths[1]) ])

n_images = len(path_imgs)

In [None]:
misses = pd.concat([IMG[m0].value_counts().rename(0), IMG[m1].value_counts().rename(1)], axis = 1).fillna(0)

In [None]:
misses["sum"] = misses[0] + misses[1]

In [None]:
misses.sort_values("sum",ascending=False)

In [None]:
def highlight(ind, color):
    x_ini = (ind % 32) * 16
    x_end = x_ini + 15
    y_ini = (ind // 32) * 16
    y_end = y_ini + 15
    plt.fill_between([x_ini, x_end], y_ini, y_end, alpha =.4, color = color)

In [None]:
i = 17
img = imread(path_imgs[i])
gt = imread(path_gts[i], as_grey=True)

In [None]:
plt.figure(figsize=(16,7))
plt.subplots_adjust(wspace = 0, hspace = 0)

plt.subplot(1,2,1)
plt.imshow(img)
for b in BLOCK[(IMG == i) & m1]:
    highlight(b, "red")
for b in BLOCK[(IMG == i) & m0]:
    highlight(b, "blue")
plt.ylim((511,0))
plt.xlim((0,511))

plt.subplot(1,2,2)
plt.imshow(gt)
for b in BLOCK[(IMG == i) & (y == 1)]:
    highlight(b, "green")
plt.ylim((511,0))
plt.xlim((0,511))
