## Example analysis of a single experimental setting
with additional plots/visualizations

In [None]:
import importlib
import pickle
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import umap
from sklearn.cluster import OPTICS
from sklearn.metrics import (
    adjusted_mutual_info_score,
    adjusted_rand_score,
    classification_report,
    cohen_kappa_score,
    completeness_score,
    confusion_matrix,
    fowlkes_mallows_score,
    homogeneity_score,
    matthews_corrcoef,
    mean_absolute_error,
    mean_squared_error,
    mutual_info_score,
    normalized_mutual_info_score,
    rand_score,
    v_measure_score,
)

sys.path.append('..')
from analysis import label_tools as lt
from analysis import visualization as viz

In [None]:
# Stadtwald data set, "best" run (per the original note).
fc1_path = Path('/home/richard/data/Bamberg_Stadtwald/encodings/resnet_clahe_polygon_pred.pickle')
le_path = Path('/home/richard/data/Bamberg_Stadtwald/label_encodings/resnet_clahe_label_encodings.pickle')

# Unpickle the encodings dictionary and the label encoder.
# NOTE(review): pickle can execute arbitrary code on load -- only open trusted files.
with fc1_path.open('rb') as features_file, le_path.open('rb') as encoder_file:
    data = pickle.load(features_file)
    le = pickle.load(encoder_file)

In [None]:
# Pull the three entries used below out of the loaded dictionary.
files, fc1, labels = (data[key] for key in ('filename', 'features', 'labels'))
# Encode the ground-truth labels with the loaded label encoder.
y_gt = le.transform(labels)

In [None]:
# Dimensionality reduction: 2-D UMAP embedding (cosine metric, fixed seed).
reducer = umap.UMAP(random_state=990561, metric='cosine', n_components=2)
reduced = reducer.fit_transform(fc1)
print(reduced.shape)

# Scatter the two embedding columns with equal scaling on both axes.
plt.scatter(*reduced.T)
plt.gca().set_aspect('equal', adjustable='datalim')
plt.title('UMAP projection Stadtwald', fontsize=24)

In [None]:
# show pano plot of UMAP
# NOTE(review): `viz.pano_plot` is a project helper that is not visible here;
# presumably it draws the image files at their embedding coordinates -- confirm
# against analysis/visualization.
viz.pano_plot(reduced, files)

In [None]:
# Clustering: run OPTICS on the 2-D embedding.
# (original note: min_samples=3 is good for Stadtwald)
clustering = OPTICS(min_samples=5)
clustering.fit(reduced)
labels_unmatched = clustering.labels_
# NOTE(review): `lt.label_matcher` is a project helper; presumably it maps the raw
# cluster ids onto the ground-truth label codes -- confirm against analysis/label_tools.
y_pred = lt.label_matcher(labels_unmatched, y_gt)
# All entries labelled -1 (OPTICS marks noise samples with -1).
zeros = list(labels_unmatched[labels_unmatched == -1])

In [None]:
# Min-max scale both embedding axes to [0, 1] for plotting.
tx, ty = reduced[:, 0], reduced[:, 1]
tx = (tx - np.min(tx)) / (np.max(tx) - np.min(tx))
ty = (ty - np.min(ty)) / (np.max(ty) - np.min(ty))
labels_ordered = le.inverse_transform(range(len(le.mapper)))

# Decoded versions of the encoded labels (y_pred_str is reused by a later cell).
y_pred_str = le.inverse_transform(y_pred)
y_gt_str = le.inverse_transform(y_gt)

# Plotting frame, indexed by file name. Only the encoded labels are stored: the
# earlier version first wrote the original/decoded labels into the same columns
# and then immediately overwrote them, so those assignments had no effect.
df = pd.DataFrame({'files': files,
                   'x': tx,
                   'y': ty,
                  },
                  index=files)
df['labels'] = pd.Series(y_gt, index=files)
df['y_pred_labels'] = pd.Series(y_pred, index=files)

fig, ax = plt.subplots(1, 2, figsize=(8, 5), dpi=150)

# Both panels share one hue order so a given label code gets the same colour.
# NOTE(review): predicted codes that never occur in y_gt are not part of this
# order and will presumably not be drawn -- confirm this is intended.
hue_order = sorted(set(y_gt))
sns.scatterplot(data=df, x='x', y='y', hue='labels', palette='tab10',
                hue_order=hue_order, ax=ax[0])  # ground truth labels
sns.scatterplot(data=df, x='x', y='y', hue='y_pred_labels', palette='tab10',
                hue_order=hue_order, ax=ax[1])  # predicted labels

# One legend (on the right panel) is enough since the colours are shared.
ax[0].get_legend().remove()
ax[1].legend(bbox_to_anchor=(1.05, 1))
ax[0].set_title('ground truth labels')
ax[1].set_title('predicted labels')
for axis in ax:
    axis.set_xlabel(None)
    axis.set_ylabel(None)
fig.tight_layout()
plt.show()

In [None]:
# Confusion matrix of encoded ground truth vs. predictions.
labels_ordered = le.inverse_transform(range(len(le.mapper)))
CM = confusion_matrix(y_gt, y_pred)
# Accuracy: diagonal entries (agreeing samples) divided by the total count.
print('Accuracy: {:.3f}'.format(np.trace(CM) / np.sum(CM)))
viz.pretty_cm(CM, labels_ordered)

In [None]:
# Per-class report; zero_division=0 reports undefined ratios as 0.
print(classification_report(y_true=y_gt, y_pred=y_pred, zero_division=0))

In [None]:
def tp(y_true, y_pred):
    """Count true positives for a binary (one-vs-rest) problem.

    Args:
        y_true: Sequence of booleans, True where the sample belongs to the class.
        y_pred: Sequence of booleans, True where the class was predicted.

    Returns:
        Number of positions that are True in both ``y_pred`` and ``y_true``.
    """
    # Boolean masks replace the per-element ``== True`` comparison and the
    # multiplication by raw ``y_true`` values.
    true_mask = np.asarray(y_true, dtype=bool)
    pred_mask = np.asarray(y_pred, dtype=bool)
    return np.sum(pred_mask & true_mask)
def fp(y_true, y_pred):
    """Count false positives for a binary (one-vs-rest) problem.

    Args:
        y_true: Sequence of booleans, True where the sample belongs to the class.
        y_pred: Sequence of booleans, True where the class was predicted.

    Returns:
        Number of positions that are True in ``y_pred`` but False in ``y_true``.
    """
    # Boolean masks replace the per-element ``== True`` / ``not`` comprehensions.
    true_mask = np.asarray(y_true, dtype=bool)
    pred_mask = np.asarray(y_pred, dtype=bool)
    return np.sum(pred_mask & ~true_mask)
def tn(y_true, y_pred):
    """Count true negatives for a binary (one-vs-rest) problem.

    Args:
        y_true: Sequence of booleans, True where the sample belongs to the class.
        y_pred: Sequence of booleans, True where the class was predicted.

    Returns:
        Number of positions that are False in both ``y_pred`` and ``y_true``.
    """
    # Boolean masks replace the per-element ``== False`` / ``not`` comprehensions.
    true_mask = np.asarray(y_true, dtype=bool)
    pred_mask = np.asarray(y_pred, dtype=bool)
    return np.sum(~pred_mask & ~true_mask)
def fn(y_true, y_pred):
    """Count false negatives for a binary (one-vs-rest) problem.

    Args:
        y_true: Sequence of booleans, True where the sample belongs to the class.
        y_pred: Sequence of booleans, True where the class was predicted.

    Returns:
        Number of positions that are False in ``y_pred`` but True in ``y_true``.
    """
    # Boolean masks replace the per-element ``== False`` comparison and the
    # multiplication by raw ``y_true`` values.
    true_mask = np.asarray(y_true, dtype=bool)
    pred_mask = np.asarray(y_pred, dtype=bool)
    return np.sum(~pred_mask & true_mask)

def get_multiclass_cm_values(y_true, y_pred):
    """Return class-averaged confusion counts computed one-vs-rest.

    For every distinct label in ``y_true`` the labels are reduced to
    "this class vs. everything else", the four confusion counts are taken,
    and each count is then averaged over those classes.

    Args:
        y_true: Sequence of ground-truth labels.
        y_pred: Sequence of predicted labels, same length as ``y_true``.

    Returns:
        Tuple ``(TP, FP, TN, FN)`` holding the mean of each count over the
        classes that occur in ``y_true``.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    tp_values = []
    fp_values = []
    tn_values = []
    fn_values = []
    for cls in np.unique(y_true):
        # One-vs-rest masks for the current class.
        is_true = y_true == cls
        is_pred = y_pred == cls
        tp_values.append(np.sum(is_pred & is_true))
        fp_values.append(np.sum(is_pred & ~is_true))
        tn_values.append(np.sum(~is_pred & ~is_true))
        fn_values.append(np.sum(~is_pred & is_true))
    # Each slot returns its own mean; previously the FP mean was returned in the
    # TN and FN positions as well.
    return (np.mean(tp_values), np.mean(fp_values),
            np.mean(tn_values), np.mean(fn_values))

def f_star(y_gt, y_pred):
    """Compute TP / (FN + FP + TP) from the class-averaged confusion counts.

    Args:
        y_gt: Sequence of ground-truth labels.
        y_pred: Sequence of predicted labels.

    Returns:
        The ratio of the averaged true positives to the sum of the averaged
        false negatives, false positives and true positives.
    """
    # The averaged true-negative count is not part of the score.
    true_pos, false_pos, _, false_neg = get_multiclass_cm_values(y_gt, y_pred)
    denominator = false_neg + false_pos + true_pos
    return true_pos / denominator

In [None]:
# ...more cluster metrics
# f-star
print("f*-score")
print(f_star(y_gt, y_pred))
# mean squared error, followed by its square root (RMSE)
print("\nMSE")
print(mean_squared_error(y_gt, y_pred))
# `squared=False` was deprecated in scikit-learn 1.4 and later removed; taking
# the root explicitly gives the same value for 1-D targets on every version.
print(np.sqrt(mean_squared_error(y_gt, y_pred)))
print("\nMAE")
print(mean_absolute_error(y_gt, y_pred))
# rand score (plain, then adjusted)
print("\nRand index")
print(rand_score(y_gt, y_pred))
print(adjusted_rand_score(y_gt, y_pred))
# mutual information based scores (plain, adjusted, normalized)
print("\nMutual information based scores")
print(mutual_info_score(y_gt, y_pred))
print(adjusted_mutual_info_score(y_gt, y_pred))
print(normalized_mutual_info_score(y_gt, y_pred))
# homogeneity, completeness and v-measure
print("\nHomogeneity, completeness and v-measure")
print(homogeneity_score(y_gt, y_pred))
print(completeness_score(y_gt, y_pred))
print(v_measure_score(y_gt, y_pred))
# fowlkes
print("\nFowlkes-mallows-score")
print(fowlkes_mallows_score(y_gt, y_pred))
# Cohen-kappa (imported from sklearn.metrics in the import cell)
print("\nCohen-kappa")
print(cohen_kappa_score(y_gt, y_pred))
# Matthews correlation coefficient (imported from sklearn.metrics in the import cell)
print("\nMCC")
print(matthews_corrcoef(y_gt, y_pred))

In [None]:
# print image examples: the first n_row * n_col files, titled with their predicted label
df = pd.DataFrame(columns=['filename','label'])
df['filename'] = files
df['label'] = y_pred_str
n_row = 8
n_col = 2
_, axs = plt.subplots(n_row, n_col, figsize=(25, 25))
axs = axs.flatten()
imgs = df['filename']
n_examples = n_row * n_col
# Walk file names and labels in lockstep so every image gets the label of its
# own row (a lookup by file name would pick the first match for duplicates).
for img_path, label_value, ax in zip(imgs[0:n_examples], df['label'][0:n_examples], axs):
    # plt.imread needs no extra import and does not leave a file handle open.
    img = plt.imread(img_path)
    ax.set_title(label_value)
    ax.imshow(img)
    # grid(None) would toggle the grid; False switches it off explicitly.
    ax.grid(False)
# Lay the figure out once, after all panels are drawn.
plt.tight_layout()