In [1]:

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import f1_score, roc_curve, roc_auc_score, precision_recall_curve, accuracy_score

from sklearn.model_selection import train_test_split

# sys.path.insert(0, "/mnt/Dados/Documentos/xgboost/python-package/")
import xgboost as xgb

from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif,chi2
from sklearn.preprocessing import Binarizer, scale, StandardScaler, minmax_scale

from sklearn.svm import SVC

import time

from sklearn.externals import joblib
import pandas as pd
from sklearn.manifold import Isomap, TSNE
from sklearn.model_selection import GroupKFold, LeavePGroupsOut, LeaveOneGroupOut

def shuffled(array):
    x = array.values.copy()
    np.random.shuffle(x)
    return x

def shuffled2(array):
    x = array.copy()
    np.random.shuffle(x)
    return x

In [2]:
%%time
# Read database from disk
data = pd.read_csv("../IC2017_DATA/augmented_1.csv")

CPU times: user 1min 42s, sys: 8.38 s, total: 1min 51s
Wall time: 1min 58s


In [3]:
# Shuffle data
# data = data.loc[shuffled(data.index)]

In [4]:
data.columns

Index(['ExG_contrast_np.pi/4', 'ExG_contrast_0', 'ExG_contrast_3*np.pi/2',
       'ExG_contrast_7*np.pi/4', 'ExG_correlation_np.pi/4',
       'ExG_correlation_0', 'ExG_correlation_3*np.pi/2',
       'ExG_correlation_7*np.pi/4', 'ExG_energy_np.pi/4', 'ExG_energy_0',
       ...
       'interior_61', 'interior_62', 'interior_63', 'target', 'img_num',
       'noise_num', 'rot_num', 'sh_num', 'block_num', 'base_num'],
      dtype='object', length=321)

In [5]:
y = data['target']
IMG = data['img_num']
BLOCK = data['block_num']
solo = data['base_num']

NOISE = data['noise_num']
ROT = data['rot_num']
SHIFT = data['sh_num']

In [6]:
# del data['target']
# del data['img_num']
# del data['block_num']
# del data['base_num']

# del data['noise_num']
# del data['rot_num']
# del data['sh_num']

In [7]:
data.drop(columns=['target', 'img_num', 'block_num', 'base_num', 'noise_num', 'rot_num', 'sh_num'], inplace=True)

In [8]:
ORIGINAL = (NOISE == 0) & (ROT == 0) & (SHIFT == 0)

# 0)  AUC and Mean Acc. Analysis of everything

In [9]:
def getBestTreshold(FPR, TPR, TH):
    """This function is for calculating the Mean Accuracy, given a ROC curve"""
    i_max = max(range(len(TPR)), key = lambda x : TPR[x] + 1 - FPR[x])
    mean_accuracy = (TPR[i_max] + 1 - FPR[i_max])/2
    return mean_accuracy, TH[i_max]

In [10]:
def getPerformance(true, prediction):
    auc = roc_auc_score(true, prediction)        
    fpr, tpr, th = roc_curve(true, prediction)
    if auc < .5:
        fpr, tpr = tpr, fpr
        auc = 1 - auc
    mean_acc, bestTH = getBestTreshold(fpr, tpr, th)
    return auc, mean_acc, bestTH, fpr, tpr

In [11]:
def splitByImages(data, test_size, repetitions):
    images = list(set(IMG.loc[data.index]))
    n = len(images)
    for _ in range(repetitions):
        np.random.shuffle(images)
        split_point = int(n * (1-test_size))
        yield images[:split_point], images[split_point:]

In [12]:
def XGBTrain(data, train_imgs, val_imgs, y):

    X_train, y_train = data.loc[IMG.isin(train_imgs)], y.loc[IMG.isin(train_imgs)]
    X_val, y_val = data.loc[IMG.isin(val_imgs)], y.loc[IMG.isin(val_imgs)]
    # print(len(train_imgs), len(val_imgs))
    
    ratio = float(np.sum(y_train == 1)) / np.sum(y_train==0)

    clf = xgb.XGBClassifier(
                    max_depth = 4,
                    n_estimators=1000,
                    learning_rate=0.2, 
                    nthread=6,
                    subsample=1.0,
                    colsample_bytree=1,
                    scale_pos_weight = ratio,
                    reg_alpha=0.03,
                    seed=1301)

    clf.fit(X_train, y_train, early_stopping_rounds=30, eval_metric="auc",
            eval_set=[(X_val, y_val)], verbose = False)
    
    return clf

In [13]:
LessCompact = [0]
MoreCompact = [1]
Calibrated = [0, 1]
NotCalibrated = [2]
experiments = [[LessCompact, MoreCompact], [MoreCompact, LessCompact],
              [Calibrated, NotCalibrated], [NotCalibrated, Calibrated]]

In [14]:
file = open('AccTest_augmented_results.txt', 'a')

In [15]:
n_reps = 5
auc_matrix = np.zeros((4, n_reps))
mac_matrix = np.zeros((4, n_reps))
auc_matrix_val = np.zeros((4, n_reps))
clfs = [[None] * n_reps] * 4 # 4 is the number of experiments

for e, (left, test) in enumerate(experiments):
    print("Experiment", left, test)
    print("Experiment", left, test, file = file)
    Xleft = data.loc[solo.isin(left)]
    Xtest = data.loc[solo.isin(test)]
    # print(len(Xleft), len(Xtest))
    
    
    for i, (train_imgs, val_imgs) in enumerate(splitByImages(Xleft, 1/4, n_reps)):
        # train
        clf = XGBTrain(data, train_imgs, val_imgs, y)
        
        # get val performance of this trained model: ROC, AUC and MAc, and also the best threshold value for test
        mask = IMG.isin(val_imgs) & ORIGINAL
        prediction = clf.predict_proba(data.loc[mask])[:,-1]
        val_auc, val_mean_acc, bestTH, fpr, tpr = getPerformance(y.loc[mask], prediction)
        
        # get train either?
        
        # get test performance
        mask = solo.isin(test) & ORIGINAL
        prediction = clf.predict_proba(data.loc[mask])[:,-1]
        mean_acc = accuracy_score(y.loc[mask], 1 * (prediction > bestTH)) # 1 * array = array.astype(int)
        
               
        # save performance
        mac_matrix[e, i] = mean_acc
        auc_matrix_val[e, i] = clf.best_score
        
        # save classifiers for further analysis
#         clfs[e][i] = clf
        
        print(" > %d iterations \t Val vs Test (MAc) = %.3f vs %.3f" % (clf.best_iteration, 
                                                                               val_mean_acc, mean_acc))
        print(" > %d iterations \t Val vs Test (MAc) = %.3f vs %.3f" % (clf.best_iteration, 
                                                                               val_mean_acc, mean_acc), file = file)
        
print("Done.")  
file.close()

Experiment [0] [1]
 > 26 iterations 	 Val vs Test (MAc) = 0.948 vs 0.621
 > 4 iterations 	 Val vs Test (MAc) = 0.976 vs 0.804
 > 42 iterations 	 Val vs Test (MAc) = 0.960 vs 0.850
 > 35 iterations 	 Val vs Test (MAc) = 0.984 vs 0.864
 > 25 iterations 	 Val vs Test (MAc) = 0.969 vs 0.652
Experiment [1] [0]
 > 39 iterations 	 Val vs Test (MAc) = 0.925 vs 0.954
 > 113 iterations 	 Val vs Test (MAc) = 0.932 vs 0.947
 > 31 iterations 	 Val vs Test (MAc) = 0.917 vs 0.955
 > 38 iterations 	 Val vs Test (MAc) = 0.933 vs 0.956
 > 23 iterations 	 Val vs Test (MAc) = 0.926 vs 0.957
Experiment [0, 1] [2]
 > 12 iterations 	 Val vs Test (MAc) = 0.925 vs 0.722
 > 13 iterations 	 Val vs Test (MAc) = 0.922 vs 0.715
 > 29 iterations 	 Val vs Test (MAc) = 0.925 vs 0.718
 > 33 iterations 	 Val vs Test (MAc) = 0.922 vs 0.716
 > 26 iterations 	 Val vs Test (MAc) = 0.902 vs 0.708
Experiment [2] [0, 1]
 > 12 iterations 	 Val vs Test (MAc) = 0.895 vs 0.866
 > 3 iterations 	 Val vs Test (MAc) = 0.968 vs 0.898
 

In [None]:
for e, (left, test) in enumerate(experiments):
    print("Experiment", left, test)
    print("Experiment", left, test, file = file)
    Xleft = data.loc[solo.isin(left)]
    Xtest = data.loc[solo.isin(test)]
    for i, (train_imgs, val_imgs) in enumerate(splitByImages(Xleft, 1/4, n_reps)):
        
        print(" > %d iterations \t Val vs Test (MAc) = %.3f vs %.3f" % (clf.best_iteration, 
                                                                               val_mean_acc, mean_acc))

In [None]:
# Plot ROC curves for each soil for all columns
# and save AUC and MAc for each soil for all columns in two separate dataframes

scores_auc = pd.DataFrame(None, columns = [0, 1, 2], index=data.columns)
scores_mac = pd.DataFrame(None, columns = [0, 1, 2], index=data.columns)
for soil in range(3):
    plt.figure(figsize=(12,9))
    print("\nsolo:", soil)
    
    for col in data.columns:
        score = roc_auc_score(y.loc[solo == soil], data.loc[solo == soil][col])
        fpr, tpr, th = roc_curve(y.loc[solo == soil], data.loc[solo == soil][col])
        if score < .5:
            fpr, tpr = tpr, fpr
            score = 1 - score
        mean_acc, _ = getBestTreshold(fpr, tpr, th)

        scores_auc.loc[col, soil] = score
        scores_mac.loc[col, soil] = mean_acc

        #print(" > %-30s AUC = %.3f \t MeanAcc = %.3f" % (col, score, mean_acc))
        plt.plot(fpr, tpr)

    plt.ylim((0,1))
    plt.xlim((0,1))
    plt.plot([0,1], [0,1], "k--")
    plt.ylabel("TPR")
    plt.xlabel("FPR")
    plt.show()


In [None]:
scores_mac["amean"] = scores_mac[[0,1,2]].mean(axis=1)
scores_mac["min"] = scores_mac[[0,1,2]].min(axis=1)
#scores_mac["gmean"] = np.power(scores_mac[[0,1,2]].prod(axis=1) , 1/3)

In [None]:
best_features = scores_mac.sort_values("min", ascending=False)[[0,1,2]].iloc[:10]
pyperclip.copy(best_features.to_latex())
best_features

In [None]:
cols = scores_mac.sort_values("min", ascending=False)[[0,1,2]].iloc[:10].index
plt.figure(figsize=(16,5))
for soil in range(3):
    plt.subplot(1,3,soil+1)
    
    for col in cols:
        score = roc_auc_score(y.loc[solo == soil], data.loc[solo == soil][col])
        fpr, tpr, th = roc_curve(y.loc[solo == soil], data.loc[solo == soil][col])
        if score < .5:
            fpr, tpr = tpr, fpr
            score = 1 - score
            
        #print(" > %-30s AUC = %.3f \t MeanAcc = %.3f" % (col, score, mean_acc))
        plt.plot(fpr, tpr, label = col)
    plt.title("Solo %d" % soil)
    plt.ylim((0,1))
    plt.xlim((0,1))
    plt.plot([0,1], [0,1], "k--")
    plt.ylabel("TPR")
    plt.xlabel("FPR")
    plt.legend(loc = 4)
plt.savefig("report1/figures/10bestminfeatures.png")

In [None]:
scores_mac.sort_values(2, ascending=False)[2].iloc[:10]

# 1) Check VI discriminant capability

### In this section, we check the AUC, Mean Accuracy and ROC curves for each VI in each soil type (0, 1, 2)

In [None]:
names_VI = ["ExG", "ExGR", "CIVE", "VEG", "WI", "NGRDI"]

In [None]:
VI_cols = [name + "_mean" for name in names_VI]

In [None]:
# Get only columns with mean of VI values of 16x16 regions
VI_data = data[[name + "_mean" for name in names_VI]]

In [None]:
"""
for each generalization test:
    separate into test and rest (how many times?)
    separate rest into train and val (how many times?)
    find best th value and best VI (which mean: keep the VI used and the TH.
        > maybe save all in a table and then order it
    test the best
"""

In [None]:
mac_matrix = np.zeros((4, reps))
VI_scores = [{name + "_mean" : None for name in names_VI}] * 4 # 4 is the number of experiments

for e, (left, test) in enumerate(experiments):
    print("Experiment", left, test)
    Xleft = data.loc[solo.isin(left)]
    Xtest = data.loc[solo.isin(test)]
    
    
    for col in VI_cols:
        
        # get val performance of this trained model: ROC, AUC and MAc, and also the best threshold value for test
        prediction = Xleft[col]
        val_auc, val_mean_acc, bestTH, fpr, tpr = getPerformance(y.loc[solo.isin(left)], prediction)

        
        # get test performance
        prediction = Xtest[col]
        mean_acc = accuracy_score(y.loc[solo.isin(test)], 1 * (prediction > bestTH)) # 1 * array = array.astype(int)
        if col == "CIVE_mean":
            mean_acc = accuracy_score(y.loc[solo.isin(test)], 1 * (prediction < bestTH))
        
               
        # save performance
        VI_scores[e][col] = mean_acc
        
        print(" > %s \t Val vs Test (MAc) = %.3f vs %.3f" % (col, val_mean_acc, mean_acc))
        
print("Done.")    

In [None]:
n_reps = 5
auc_matrix = np.zeros((4, reps))
mac_matrix = np.zeros((4, reps))
auc_matrix_val = np.zeros((4, reps))
clfs = [[None] * n_reps] * 4 # 4 is the number of experiments

for e, (left, test) in enumerate(experiments):
    print("Experiment", left, test)
    Xleft = data.loc[solo.isin(left)]
    Xtest = data.loc[solo.isin(test)]
    
    
    for i, (train_imgs, val_imgs) in enumerate(splitByImages(Xleft, 1/4, n_reps)):
        # train
        clf = XGBTrain(data, train_imgs, val_imgs, y)
        
        # get val performance of this trained model: ROC, AUC and MAc, and also the best threshold value for test
        prediction = clf.predict_proba(data.loc[IMG.isin(val_imgs)])[:,-1]
        val_auc, val_mean_acc, bestTH, fpr, tpr = getPerformance(y.loc[IMG.isin(val_imgs)], prediction)
        
        # get train either?
        
        # get test performance
        prediction = clf.predict_proba(data.loc[solo.isin(test)])[:,-1]
        mean_acc = accuracy_score(y.loc[solo.isin(test)], 1 * (prediction > bestTH)) # 1 * array = array.astype(int)
        
               
        # save performance
        mac_matrix[e, i] = mean_acc
        auc_matrix_val[e, i] = clf.best_score
        
        # save classifiers for further analysis
        clfs[e][i] = clfs
        
        print(" > %d iterations \t Val vs Test (MAc) = %.3f vs %.3f" % (clf.best_iteration, 
                                                                               val_mean_acc, mean_acc))
        
print("Done.")    

# Análise do Resultados

In [None]:
clf_ind = 2 # base 2 out is better to find out why training in 0 and 1 fail in generalizing to 2
clf = clfs[clf_ind][0]

train_index, test_index = splits[clf_ind]
X_train, y_train = data.iloc[train_index], y.iloc[train_index]
X_test, y_test = data.iloc[test_index], y.iloc[test_index]

predict = clf.predict_proba(X_test, ntree_limit=clf.best_iteration)[:,-1]

In [None]:
plt.figure(figsize=(12,9))
for col in scores_mac[2].sort_values(ascending=False).index[:10]:
    score = roc_auc_score(y.loc[solo==2], data.loc[solo==2][col])
    fpr, tpr, th = roc_curve(y.loc[solo==2], data.loc[solo==2][col])
    if score < .5:
        fpr, tpr = tpr, fpr
        score = 1 - score
    mean_acc, _ = getBestTreshold(fpr, tpr, th)
    print(" > %-30s AUC = %.3f \t MeanAcc = %.3f" % (col, score, mean_acc))
    plt.plot(fpr, tpr, label = col)

score = roc_auc_score(y_test, predict)   
FPR, TPR, TH = roc_curve(y_test, predict)
mean_acc, _ = getBestTreshold(FPR, TPR, TH)
print(" > %-30s AUC = %.3f \t MeanAcc = %.3f" % ("predict", score, mean_acc))
plt.plot(FPR,TPR, "k-", label = "predict", lw = 1.5)

plt.title("Top 10 features with higher minimum Mean Accuracy in all three bases vs Learning")
plt.ylim((0,1))
plt.xlim((0,1))
plt.plot([0,1], [0,1], "k--")
plt.ylabel("TPR")
plt.xlabel("FPR")
plt.legend();
plt.savefig("report1/figures/prediction_vs_bestFeatures.png")

In [None]:
plt.figure(figsize=(12,9))
for col in importance_single[2].sort_values(ascending=False)[:10].index:
    score = roc_auc_score(y.loc[solo==2], data.loc[solo==2][col])
    fpr, tpr, th = roc_curve(y.loc[solo==2], data.loc[solo==2][col])
    if score < .5:
        fpr, tpr = tpr, fpr
        score = 1 - score
    mean_acc, _ = getBestTreshold(fpr, tpr, th)
    print(" > %-30s AUC = %.3f \t MeanAcc = %.3f" % (col, score, mean_acc))
    plt.plot(fpr, tpr, label = col)

score = roc_auc_score(y_test, predict)   
FPR, TPR, TH = roc_curve(y_test, predict)
mean_acc, _ = getBestTreshold(FPR, TPR, TH)
print(" > %-30s AUC = %.3f \t MeanAcc = %.3f" % ("predict", score, mean_acc))
plt.plot(FPR,TPR, "k-", label = "predict", lw = 1.5)

plt.title("Top 10 features with higher minimum importace value in all three bases vs Learning")
plt.ylim((0,1))
plt.xlim((0,1))
plt.plot([0,1], [0,1], "k--")
plt.ylabel("TPR")
plt.xlabel("FPR")
plt.legend()
plt.savefig("report1/figures/10bestminGAIN_vs_predict")

In [None]:
plt.figure(figsize=(17,5))
plt.scatter(predict, y_test, alpha = .1)
th = TH[th_ix]
plt.plot([th, th], [1,0], "r-")

In [None]:
train_pred = clf.predict_proba(data, ntree_limit=clf.best_iteration)[:,-1]

In [None]:
m1 = ((y == 1) & (train_pred < th))
m0 = ((y == 0) & (train_pred > th))
m1.sum(), m0.sum()

In [None]:
dec = PCA()

In [None]:
pca = dec.fit_transform(StandardScaler().fit_transform(data.fillna(10**-5)))

In [None]:
plt.figure(figsize=(17,5))
xvr = dec.explained_variance_ratio_[:30]
plt.bar(range(len(xvr)), np.cumsum(xvr))

In [None]:
plt.figure(figsize=(12,5))
for c in {0,1}:
    plt.subplot(1,2,c+1)
    plt.scatter(pca[y==(1-c),0], pca[y==(1-c),1], alpha = 0 * .2, label = "classe %d" % (1-c))
    plt.scatter(pca[y==c,0], pca[y==c,1], alpha = .2, label = "classe %d" % c, color = "blue" if c == 0 else "yellow")
    plt.plot([0,0], [pca[:,1].min(), pca[:,1].max()], "r--")
    plt.plot([pca[:,0].min(), pca[:,0].max()], [0,0], "r--")
    plt.legend()

In [None]:
plt.figure(figsize=(12,9))
for c in {0,1}:
    plt.scatter(pca[y==c,0], pca[y==c,1], alpha = .2, label = "classe %d" % c)
plt.scatter(pca[m0,0], pca[m0,1], label = "miss0")
plt.scatter(pca[m1,0], pca[m1,1], label = "miss1")
plt.legend()

In [None]:
from skimage.util.shape import view_as_blocks
from skimage.io import imread


In [None]:
paths = ["imgs_orig", "gts_orig"] # TROCAR AQUI DEPENDENDO DA BASE

path_imgs = sorted([ paths[0] + '/' + i for i in listdir(paths[0]) ])
path_gts = sorted([ paths[1] + '/' + i for i in listdir(paths[1]) ])


paths2 = ["imgs", "gts"] # TROCAR AQUI DEPENDENDO DA BASE

path_imgs2 = sorted([ paths2[0] + '/' + i for i in listdir(paths2[0]) ])
path_gts2 = sorted([ paths2[1] + '/' + i for i in listdir(paths2[1]) ])

In [None]:
misses = pd.concat([IMG[m0].value_counts().rename("NOVEG-miss"), IMG[m1].value_counts().rename("VEG-miss")], axis = 1).fillna(0)

In [None]:
misses["sum"] = misses["NOVEG-miss"] + misses["VEG-miss"]

In [None]:
misses.sort_values("sum",ascending=False)

In [None]:
def highlight(ind, color):
    x_ini = (ind % 32) * 16
    x_end = x_ini + 15
    y_ini = (ind // 32) * 16
    y_end = y_ini + 15
    plt.fill_between([x_ini, x_end], y_ini, y_end, alpha =.4, color = color)

In [None]:
i = 41
if i >= 40:
    img = imread(path_imgs2[i-40])
    gt = imread(path_gts2[i-40], as_grey=True)
else:
    img = imread(path_imgs[i])
    gt = imread(path_gts[i], as_grey=True)


In [None]:
plt.figure(figsize=(16,7))
plt.subplots_adjust(wspace = 0, hspace = 0)

plt.subplot(1,2,1)
plt.imshow(img)
for b in BLOCK[(IMG == i) & m1]:
    highlight(b, "red")
for b in BLOCK[(IMG == i) & m0]:
    highlight(b, "blue")
plt.ylim((511,0))
plt.xlim((0,511))

plt.subplot(1,2,2)
plt.imshow(gt)
for b in BLOCK[(IMG == i) & (y == 1)]:
    highlight(b, "green")
plt.ylim((511,0))
plt.xlim((0,511))

plt.savefig("report1/figures/errovisu%02d.png" % i)