In [4]:
import networkx as nx
import pandas as pd
import numpy as np
import seaborn as sns
import time

import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [None]:
# Read the eight GraphML exports (one graph per hashtag/topic) in a single pass
# and bind them to the names the rest of the notebook uses.
Gmr, Gv8, Gnb, Gbml, Gcov, Gpri, Gqua, Gsj = map(
    nx.read_graphml,
    [
        "data/Marta_Rovira-link-list_out.graphml",
        "data/Vaga8Nov-link-list_out.graphml",
        "data/nochebuena-link-list_out.graphml",
        "data/BLM-link-list_out.graphml",
        "data/COVID19-link-list_out.graphml",
        "data/primavera-link-list_out.graphml",
        "data/quarantine-link-list_out.graphml",
        "data/santjordi-link-list_out.graphml",
    ],
)

In [None]:
# Show NetworkX's textual summary of the graph loaded from the "primavera" file.
# NOTE(review): nx.info() appears to have been removed in NetworkX 3.0 — confirm
# the pinned NetworkX version before re-running on a newer environment.
nx.info(Gpri)

In [None]:
# Number of nodes in the Vaga8Nov graph.
Gv8.number_of_nodes()

In [None]:
# Node -> attribute-dict mapping for the primavera graph (used to inspect the
# per-node attributes that become DataFrame columns below).
{node: attrs for node, attrs in Gpri.nodes(data=True)}

In [1]:
def _node_attribute_frame(graph):
    """Return one row per node of ``graph`` with its attributes as columns, minus 'community'."""
    node_attrs = dict(graph.nodes)
    frame = pd.DataFrame.from_dict(node_attrs, orient='index')
    return frame.drop(columns=['community'])


# One attribute table per graph; the 'community' column is dropped from each.
data_mr = _node_attribute_frame(Gmr)
data_vaga = _node_attribute_frame(Gv8)
data_nochebuena = _node_attribute_frame(Gnb)
data_blm = _node_attribute_frame(Gbml)
data_covid19 = _node_attribute_frame(Gcov)
data_primavera = _node_attribute_frame(Gpri)
data_quarantine = _node_attribute_frame(Gqua)
data_santjordi = _node_attribute_frame(Gsj)

NameError: name 'pd' is not defined

In [None]:
# List the node-attribute columns available in the primavera table.
data_primavera.columns

In [None]:
# Distinct values taken by the 'dispersion_index' attribute in the primavera table.
data_primavera["dispersion_index"].unique()

In [None]:
# Dataset name -> node-attribute table, in the order the datasets are processed.
dict_sources = dict([
    ("Marta_Rovira", data_mr),
    ("Nochebuena", data_nochebuena),
    ("Vaga8Nov", data_vaga),
    ("BLM", data_blm),
    ("COVID19", data_covid19),
    ("primavera", data_primavera),
    ("quarantine", data_quarantine),
    ("sant_jordi", data_santjordi),
])
dict_sources.keys()

## DBSCAN Clustering

In [None]:
def cluster(train_df, hyperparameters, name, isVisualized, max_clusters=10000):
    """Run DBSCAN for every (eps, minPts) pair and report clustering statistics.

    Three PCA projections are first added to ``train_df`` as the columns
    'pca-one', 'pca-two' and 'pca-three'; together with the original columns
    they form the feature set used for clustering and scoring.
    NOTE(review): clustering on the raw columns *plus* their PCA projections
    duplicates information — kept to preserve existing results; confirm intended.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Feature table (assumed all-numeric, since it is fed to PCA/DBSCAN).
        It is modified in place: PCA columns are added, and a 'label' column is
        written when ``isVisualized`` is true. Pass a copy to protect the source.
    hyperparameters : iterable of (eps, minPts)
        DBSCAN settings to evaluate, in order.
    name : str
        Dataset name, used in the log output and the plot titles.
    isVisualized : bool
        If true, each scorable clustering is plotted via ``visualization``;
        otherwise four summary bar plots (one per statistic) are drawn at the end.
    max_clusters : int, optional
        A run with at least this many clusters is reported as
        'TOO MANY CLUSTERS!' and is neither scored nor visualized.

    Returns
    -------
    pandas.DataFrame
        One row per hyperparameter pair with the columns 'params',
        'num_clusters', 'noise_pts', 'DB' and 'silhouette'. The two scores are
        NaN for runs that could not be scored.
    """
    print('Dataset: ', name)

    # PCA
    pca = PCA(n_components=3)
    pca_result = pca.fit_transform(train_df.values)
    train_df['pca-one'] = pca_result[:, 0]
    train_df['pca-two'] = pca_result[:, 1]
    train_df['pca-three'] = pca_result[:, 2]
    print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))

    # Freeze the feature set here: the 'label' column written below (and any
    # columns the plotting helper adds) must not be fed back into DBSCAN or the
    # scores on later iterations.
    feature_columns = list(train_df.columns)
    n_samples = len(train_df)

    # Collect plain records and build the frame once (DataFrame.append is gone
    # in pandas 2.x and was quadratic anyway).
    records = []
    for eps, minPts in hyperparameters:
        start = time.time()
        print('EPS:', eps, ' minPts:', minPts)
        features = train_df[feature_columns]
        labels = DBSCAN(
            eps=eps,
            min_samples=minPts,
            n_jobs=-1
        ).fit(features).labels_

        # Membership must be tested on the label values; `-1 in Series` would
        # test the index and never detect the noise label.
        unique_labels = set(labels.tolist())
        n_clusters = len(unique_labels) - (1 if -1 in unique_labels else 0)
        n_noise = int(np.count_nonzero(labels == -1))

        # The sklearn scores need between 2 and n_samples - 1 distinct labels.
        too_many = n_clusters >= max_clusters or len(unique_labels) >= n_samples
        scorable = n_clusters > 1 and not too_many

        DB = float('nan')
        silhouette = float('nan')
        if scorable:
            # As before, noise points (label -1) are scored as their own group.
            DB = metrics.davies_bouldin_score(features, labels)
            silhouette = metrics.silhouette_score(features, labels)

        records.append(
            {
                'params': str(eps) + '/' + str(minPts),
                'num_clusters': n_clusters,
                'noise_pts': n_noise,
                'DB': DB,
                'silhouette': silhouette
            }
        )

        print('Estimated number of clusters: %d' % n_clusters)
        print('Estimated number of noise points: %d' % n_noise)
        if scorable:
            print('DB: ', DB)
            print('Silhouette: ', silhouette)
        elif n_clusters > 1:
            print('TOO MANY CLUSTERS!')
        else:
            print('NO CLUSTERS FORMED!')
        print('Time: ', time.time() - start)
        print('-------------------------------------------------------------')

        if scorable and isVisualized:
            # Assign the raw array so labels line up by position even when
            # train_df does not carry a default RangeIndex.
            train_df['label'] = labels
            visualization(train_df, (eps, minPts), unique_labels)

    stats_df = pd.DataFrame(
        records,
        columns=['params', 'num_clusters', 'noise_pts', 'DB', 'silhouette']
    )

    if not isVisualized and not stats_df.empty:
        for column in ['num_clusters', 'noise_pts', 'DB', 'silhouette']:
            fig, ax = plt.subplots(figsize=(16, 10))
            sns.barplot(x="params", y=column, data=stats_df, ax=ax)
            ax.set_title(name + " dataset")

    return stats_df

In [None]:
def visualization(train_df, params, labels, random_state=None):
    """Embed the samples with t-SNE and draw a scatter plot coloured by cluster label.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Feature table that already contains a 'label' column (used as the hue).
        The columns 'tsne-first' and 'tsne-second' are written to it in place.
    params : tuple
        The (eps, minPts) pair that produced the labels; shown in the title.
    labels : collection
        Distinct cluster labels; only its length is used, to size the palette.
    random_state : int or None, optional
        Forwarded to ``TSNE`` so the embedding can be made reproducible.
        The default (None) keeps the previous unseeded behaviour.
    """
    (eps, minPts) = params

    # Embed the features only: the cluster label (and any embedding columns left
    # over from an earlier call) must not influence the layout, otherwise the
    # plot overstates how well the clusters separate.
    feature_df = train_df.drop(
        columns=['label', 'tsne-first', 'tsne-second'],
        errors='ignore'
    )

    # TSNE Training
    time_start = time.time()
    # NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter`;
    # confirm against the pinned version before upgrading.
    tsne = TSNE(
        n_components=2,
        verbose=1,
        perplexity=40,
        n_iter=300,
        random_state=random_state
    )
    tsne_results = tsne.fit_transform(feature_df.values)
    print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

    # VISUALIZATION
    train_df['tsne-first'] = tsne_results[:, 0]
    train_df['tsne-second'] = tsne_results[:, 1]
    fig, ax = plt.subplots(figsize=(16, 10))
    sns.scatterplot(
        x="tsne-first",
        y='tsne-second',
        hue="label",
        palette=sns.color_palette("hls", len(labels)),
        data=train_df,
        legend=False,
        alpha=0.3,
        ax=ax
    )
    ax.set_title('Params: ' + str(eps) + ', ' + str(minPts))

**Davies–Bouldin (DB) index**: lower is better (the minimum is 0).<br>
**Silhouette coefficient**: ranges from -1 to 1; higher is better.

In [None]:
# Cap on the number of rows taken from each dataset for the grid search.
N = 50000

# Every (eps, minPts) combination to try, eps-major.
params = [
    (eps, pts)
    for eps in [0.1, 0.3, 0.5, 1, 1.5, 2]
    for pts in [2, 3, 5, 8]
]

# Summarise the whole grid for each dataset (bar plots, no t-SNE).
for name, df in dict_sources.items():
    train = df.head(N).copy().reset_index(drop=True)
    cluster(train, params, name, False)

In [None]:
# Per-dataset list of (eps, minPts) pairs used for the visualised DBSCAN run.
dict_params = dict(
    Marta_Rovira=[(2, 5)],
    Nochebuena=[(2, 3)],
    Vaga8Nov=[(2, 5)],
    BLM=[(2, 2)],
    COVID19=[(2, 2)],
    primavera=[(2, 2)],
    quarantine=[(2, 2)],
    sant_jordi=[(2, 2)],
)

In [None]:
# Cluster and plot every dataset with the (eps, minPts) listed for it in dict_params.
for name, df in dict_sources.items():
    chosen_params = dict_params[name]
    train = df.copy()
    train = train.reset_index(drop=True)
    cluster(train, chosen_params, name, True)