# Praca domowa nr 4 - generator danych do raportu

### Wojciech Celej

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Global plotting defaults used by every figure in this notebook.
plt.rcParams["figure.figsize"] = (15, 10)
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*"
# and later dropped the old aliases; try the new name first and fall back to
# the legacy one so the notebook runs on both old and new Matplotlib.
try:
    plt.style.use("seaborn-v0_8-whitegrid")
except OSError:
    plt.style.use("seaborn-whitegrid")
sns.set_context("notebook", font_scale=1.2)

---

## Załadowanie zbiorów testowych

Generacja słownika `benchmark_set` przechowującego tabele opisujące dany zbiór benchmarkowy. Klucz dla danego zbioru jest tworzony według schematu: `<nazwa_folderu>_<nazwa_zbioru>`

In [None]:
# Root folder whose sub-directories are scanned for benchmark files.
dataset_directory = "pd4-zbiory-benchmarkowe"
# File-name suffix that identifies a file holding the point coordinates.
data_suffix = ".data.gz"
# Suffix appended to a dataset name to locate its reference-label file.
label_suffix = ".labels0.gz"

In [None]:
# Collect the non-hidden sub-directories of the benchmark root.
directories_list = [
    f
    for f in os.listdir(dataset_directory)
    if os.path.isdir(os.path.join(dataset_directory, f)) and not f.startswith(".")
]

# Maps "<directory>_<dataset name>" to a DataFrame with columns x, y, label.
benchmark_set = {}
for directory in directories_list:
    path_to_dataset = os.path.join(dataset_directory, directory)
    for file in os.listdir(path_to_dataset):
        if not file.endswith(data_suffix):
            continue
        data = np.loadtxt(os.path.join(path_to_dataset, file), ndmin=2)
        # Strip the known suffix instead of splitting on the first dot, so a
        # dataset name that itself contains a dot is not truncated.
        data_set_name = file[: -len(data_suffix)]
        label_file_name = data_set_name + label_suffix
        # `np.int` was removed in NumPy 1.24; the builtin `int` is the
        # documented replacement.
        label = np.loadtxt(os.path.join(path_to_dataset, label_file_name), dtype=int)
        # Only the first two data columns are kept as the point coordinates.
        d = {"x": data[:, 0], "y": data[:, 1], "label": label}
        df = pd.DataFrame(data=d)
        benchmark_set[directory + "_" + data_set_name] = df

Tak utworzone zbiory można narysować

In [None]:
# Draw one scatter plot per benchmark set, coloured by the reference label.
for key, value in benchmark_set.items():
    # The first three columns supply the x axis, the y axis and the hue.
    x_column, y_column, hue_column = value.columns[:3]
    ax = sns.scatterplot(
        data=value,
        x=x_column,
        y=y_column,
        hue=hue_column,
        legend="full",
        palette="tab20",
    )
    ax.set_title(key)
    plt.show()

---

In [None]:
import spectral
import scipy as sp
import genieclust
import sklearn.cluster
from sklearn.metrics import *

---

## Testowanie algorytmów klasteryzacji na poszczególnych zbiorach

Algorytmy, które zostaną zbadane:
* algorytm spektralny (własna implementacja)
* algorytmy hierarchiczne z pakietu `scipy.cluster.hierarchy.linkage`
* algorytm *Genie* z pakietu `genieclust`
* 3 algorytmy pochodzące z `sklearn.cluster`: `KMeans`, `AffinityPropagation` i `Birch`

Utworzona zostanie tabela `DataFrame`, gdzie każdy jej wiersz będzie zawierał:
* nazwę zbioru
* nazwę użytego algorytmu  
oraz wartości:
* indeksu Fowlkesa-Mallowsa (FM): `sklearn.metrics.fowlkes_mallows_score()`
* skorygowanego indeksu informacji wzajemnej (AM): `sklearn.metrics.adjusted_mutual_info_score()`
* skorygowanego indeksu Randa (AR): `sklearn.metrics.adjusted_rand_score()`  
* analogiczne 3 wartości indeksów dla zmiennych ustandaryzowanych.

In [None]:
# Schema of the score table: two identifying columns, then the three indices
# for the raw data, then the same three indices for the standardised data.
column_names = (
    "benchmark_set",
    "algorithm",
    "FM",
    "AM",
    "AR",
    "FM_std",
    "AM_std",
    "AR_std",
)
# Start with an empty table; rows are added as each algorithm is evaluated.
results = pd.DataFrame(columns=column_names)

In [None]:
def append_row(df: pd.DataFrame, row) -> pd.DataFrame:
    """
    Return a copy of ``df`` with ``row`` added as its last row.

    df - DataFrame to append to; it is not modified
    row - tuple or list of values, ordered like ``df.columns``
    return: new DataFrame object with a fresh 0..n-1 index
    raises: ValueError when ``row`` does not have one value per column
    """
    if len(df.columns) != len(row):
        raise ValueError(
            f"row has {len(row)} values but the DataFrame has {len(df.columns)} columns"
        )
    new_row = pd.DataFrame([list(row)], columns=df.columns)
    if len(df) == 0:
        # Nothing to concatenate with; this also avoids pandas' warning about
        # concatenating empty frames.
        return new_row
    # `DataFrame.append` was removed in pandas 2.0; `pd.concat` is its
    # documented replacement.
    return pd.concat([df, new_row], ignore_index=True)

In [None]:
def parse_point_df(df):
    """
    Split a benchmark table into the arrays used by the clustering cells.

    df - DataFrame with the columns "x", "y" and "label"
    return: tuple of
        the (x, y) coordinates as an array,
        the same coordinates with each column centred on its mean and divided
        by its sample standard deviation (ddof=1),
        the values of the "label" column,
        the number of distinct values in the "label" column
    """
    points = df[["x", "y"]].to_numpy()
    column_means = points.mean(axis=0)
    column_spreads = points.std(axis=0, ddof=1)
    points_std = (points - column_means) / column_spreads

    label_column = df["label"]
    return points, points_std, label_column.to_numpy(), len(label_column.unique())

In [None]:
def clustering_perf_evaluation(labels_true, labels_pred):
    """
    Score a predicted labelling against the reference labelling.

    labels_true - reference labels
    labels_pred - predicted labels; must support element-wise ``>= 0``
    return: tuple (FM, AM, AR) - Fowlkes-Mallows score, adjusted mutual
        information (arithmetic averaging) and adjusted Rand score
    raises: AssertionError if any predicted label is negative
    """
    scores = (
        fowlkes_mallows_score(labels_true, labels_pred),
        adjusted_mutual_info_score(labels_true, labels_pred, average_method="arithmetic"),
        adjusted_rand_score(labels_true, labels_pred),
    )
    # Sanity check: presumably guards against noise markers such as -1 - confirm.
    assert np.all(labels_pred >= 0)
    return scores

---

## 1. Własna implementacja algorytmu spektralnego

In [None]:
# Values passed as the third argument of spectral.spectral_clustering, one run
# per value (presumably the neighbour count of its graph - confirm in spectral).
n_neighbors = [5, 10]

In [None]:
# Score the custom spectral clustering once per neighbour setting.
for n_neigh in n_neighbors:
    algorithm_name = "spectral_n_neigh_" + str(n_neigh)
    for set_name, set_data in benchmark_set.items():
        # Only datasets whose key starts with "sipu_unbal" are processed here.
        if set_name.startswith("sipu_unbal"):
            X, X_std, labels_true, numOfClusters = parse_point_df(set_data)
            # Cluster the raw points first, then the standardised ones.
            labels_pred, labels_pred2 = [
                spectral.spectral_clustering(points, numOfClusters, n_neigh)
                for points in (X, X_std)
            ]
            row_values = (
                set_name,
                algorithm_name,
                *clustering_perf_evaluation(labels_true, labels_pred),
                *clustering_perf_evaluation(labels_true, labels_pred2),
            )
            results = append_row(results, row_values)

---

## 2. Algorytmy hierarchiczne

In [None]:
# Linkage criteria passed as `method` to scipy's hierarchy.linkage, one run each.
hierarchical_methods = ["single", "complete", "average", "weighted", "centroid", "median", "ward"]

In [None]:
# Score every linkage criterion on every benchmark set.
for method in hierarchical_methods:
    algorithm_name = "hierarchy_" + method
    for set_name, set_data in benchmark_set.items():
        X, X_std, labels_true, numOfClusters = parse_point_df(set_data)
        # Build the linkage and cut it into the reference number of clusters,
        # first for the raw points and then for the standardised ones.
        labels_pred, labels_pred2 = [
            sp.cluster.hierarchy.cut_tree(
                sp.cluster.hierarchy.linkage(points, method=method),
                n_clusters=numOfClusters,
            ).reshape(-1)
            for points in (X, X_std)
        ]
        row_values = (
            set_name,
            algorithm_name,
            *clustering_perf_evaluation(labels_true, labels_pred),
            *clustering_perf_evaluation(labels_true, labels_pred2),
        )
        results = append_row(results, row_values)

---

## 3. Algorytm *Genie*

In [None]:
# Values passed as `gini_threshold` to genieclust's Genie, one run per value.
gini_tresholds = [0.2, 0.3, 0.4, 0.5]

In [None]:
# Score Genie for every Gini threshold on every benchmark set.
for treshold in gini_tresholds:
    algorithm_name = "genieclust_tresh_" + str(treshold)
    for set_name, set_data in benchmark_set.items():
        X, X_std, labels_true, numOfClusters = parse_point_df(set_data)
        g = genieclust.genie.Genie(n_clusters=numOfClusters, gini_threshold=treshold)
        # The same estimator is fitted on the raw points, then refitted on
        # the standardised ones.
        labels_pred, labels_pred2 = [g.fit_predict(points) for points in (X, X_std)]
        row_values = (
            set_name,
            algorithm_name,
            *clustering_perf_evaluation(labels_true, labels_pred),
            *clustering_perf_evaluation(labels_true, labels_pred2),
        )
        results = append_row(results, row_values)

---

## 4. Wybrane 3 algorytmy z pakietu `sklearn.cluster`

### KMeans

In [None]:
# Score KMeans (library defaults apart from the cluster count) on every set.
for set_name, set_data in benchmark_set.items():
    X, X_std, labels_true, numOfClusters = parse_point_df(set_data)
    g = sklearn.cluster.KMeans(n_clusters=numOfClusters)
    # The same estimator is fitted on the raw points, then refitted on the
    # standardised ones.
    labels_pred, labels_pred2 = [g.fit_predict(points) for points in (X, X_std)]
    row_values = (
        set_name,
        "KMeans",
        *clustering_perf_evaluation(labels_true, labels_pred),
        *clustering_perf_evaluation(labels_true, labels_pred2),
    )
    results = append_row(results, row_values)

### AffinityPropagation

In [None]:
# Values passed as `damping` to AffinityPropagation, one run per value.
damp_values = [0.7, 0.9]
for damp in damp_values:
    algorithm_name = "AffProp_damp_" + str(damp)
    for set_name, set_data in benchmark_set.items():
        # Only datasets whose key starts with "sipu_" are processed here.
        if set_name.startswith("sipu_"):
            X, X_std, labels_true, numOfClusters = parse_point_df(set_data)
            # The cluster count is not passed to this estimator.
            g = sklearn.cluster.AffinityPropagation(damping=damp)
            # The same estimator is fitted on the raw points, then refitted
            # on the standardised ones.
            labels_pred, labels_pred2 = [g.fit_predict(points) for points in (X, X_std)]
            row_values = (
                set_name,
                algorithm_name,
                *clustering_perf_evaluation(labels_true, labels_pred),
                *clustering_perf_evaluation(labels_true, labels_pred2),
            )
            results = append_row(results, row_values)

### Birch

In [None]:
# Values passed as `threshold` to sklearn's Birch, one run per value.
threshold_values = [0.4, 0.3, 0.2]

In [None]:
# Score Birch for every threshold on every benchmark set.
for threshold in threshold_values:
    algorithm_name = "Birch_thresh_" + str(threshold)
    for set_name, set_data in benchmark_set.items():
        X, X_std, labels_true, numOfClusters = parse_point_df(set_data)
        g = sklearn.cluster.Birch(n_clusters=numOfClusters, threshold=threshold)
        # The same estimator is fitted on the raw points, then refitted on
        # the standardised ones.
        labels_pred, labels_pred2 = [g.fit_predict(points) for points in (X, X_std)]
        row_values = (
            set_name,
            algorithm_name,
            *clustering_perf_evaluation(labels_true, labels_pred),
            *clustering_perf_evaluation(labels_true, labels_pred2),
        )
        results = append_row(results, row_values)

---

## Zapisanie wyników do pliku `csv`

In [None]:
# Write the collected scores to results.csv in the working directory;
# index=False leaves the DataFrame's row index out of the file.
results.to_csv("results.csv", index=False)