In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_validate

import warnings
from sklearn.exceptions import ConvergenceWarning, FitFailedWarning

In [3]:
rs = 1729

In [4]:
original_df = pd.read_csv(r'../../datasets/timeseries_dataset/timeseries_dataset.csv')[0:5000]
# original_df = original_df.loc[:, ~original_df.columns.isin(['id'])]
original_df.head()

Unnamed: 0,Time,Open,High,Low,Close,Volume
0,1451347000.0,1.09746,1.09783,1.09741,1.09772,486680000.0
1,1451348000.0,1.09772,1.098,1.0977,1.0979,445920000.0
2,1451349000.0,1.0979,1.09805,1.09782,1.09792,1210700000.0
3,1451350000.0,1.09792,1.09825,1.09775,1.09808,1116910000.0
4,1451351000.0,1.09808,1.09824,1.09791,1.09822,503880000.0


In [5]:
columns = list(original_df.columns)
n_samples, n_features = original_df.shape

In [6]:
synthetic_df = pd.read_csv(r'../../datasets/timeseries_dataset/synthetic.csv')
synthetic_df = synthetic_df[columns]
synthetic_df.head()

Unnamed: 0,Time,Open,High,Low,Close,Volume
0,1455043000.0,1.114657,1.114469,1.114591,1.114875,226172800.0
1,1454257000.0,1.084449,1.083999,1.083764,1.083197,-12299860.0
2,1454773000.0,1.10681,1.105997,1.10773,1.104799,-10927810.0
3,1453602000.0,1.080069,1.079876,1.078143,1.079582,124786000.0
4,1455342000.0,1.120671,1.120474,1.119948,1.120266,30890580.0


In [7]:
predictors = {'categorical': [],
             'continuous': ['Time', 'Open', 'High', 'Low', 'Close', 'Volume']}
targets = {'categorical': [],
           'continuous': []}


In [8]:
from sklearn.preprocessing import LabelEncoder

def to_numpy(df, preds, trgts, encs=None):
    columns = list(df.columns)
    X = np.empty(df[[i for k, v in preds.items() for i in v]].shape)
    Y = np.empty(df[[i for k, v in trgts.items() for i in v]].shape)
    
    for i, col in enumerate(preds['continuous']):
        X[:, columns.index(col)] = df[col]
    for i, col in enumerate(trgts['continuous']):
        Y[:, i] = df[col]
    
    if encs is None:
        encs = dict()
    for i, col in enumerate(preds['categorical']):
        if col not in encs:
            enc = LabelEncoder()
            enc.fit(df[col])
            encs[col] = enc
        else:
            enc = encs[col]
        X[:, columns.index(col)] = enc.transform(df[col])
    for i, col in enumerate(trgts['categorical']):
        if col not in encs:
            enc = LabelEncoder()
            enc.fit(df[col])
            encs[col] = enc
        else:
            enc = encs[col]
        Y[:, i] = enc.transform(df[col])
    
    return X, Y, encs

original_X, original_Y, original_encs = to_numpy(original_df, predictors, targets)
synthetic_X, synthetic_Y, synthetic_encs = to_numpy(synthetic_df, predictors, targets, original_encs)


In [9]:
OX = np.hstack((original_X, original_Y))
SX = np.hstack((synthetic_X, synthetic_Y))
MX = np.vstack((OX, SX))
print(OX.shape, SX.shape, MX.shape)

(5000, 6) (5000, 6) (10000, 6)


In [10]:
def to_numpy_x(df, columns):
    """
    Convert dataframe to numpy array by converting categorical variables as integer labels
    :param df: The dataframe to convert
    :param columns: The continuous & discrete columns
    :return: Converted numpy arrays of X & Y and the label encoders
    """

    # declaring empty arrays for the predictor and target variables
    X = np.empty(df[[i for i in columns]].shape)

    # Assign the values from the columns, that are declared continuous,
    # into the corresponding positions in the numpy array
#     for i, col in enumerate(columns):
#         X[:, columns.index(col)] = df[col]

    # Encoding for X (predictors)
    for i, col in enumerate(columns):
        # Initialize, fit and store the label encoder for column
        enc = LabelEncoder()
        enc.fit(df[col])

        # Transform the column using the encoder object
        X[:, columns.index(col)] = enc.transform(df[col])

    return X


In [11]:
categorical_cols = list()
categorical_cols.extend(predictors['categorical'])
categorical_cols.extend(targets['categorical'])

continuous_cols = list()
continuous_cols.extend(predictors['continuous'])
continuous_cols.extend(targets['continuous'])

original_cat = to_numpy_x(original_df[categorical_cols], categorical_cols)
original_con = original_df[continuous_cols].to_numpy()
syn_cat = to_numpy_x(synthetic_df[categorical_cols], categorical_cols)
syn_con = synthetic_df[continuous_cols].to_numpy()

In [10]:
# original_clusters = clustering.fit_predict(OX)
# print(original_clusters)

In [11]:
# mixed_clusters = clustering.fit_predict(MX)
# print(mixed_clusters)

In [13]:
import math
from sklearn.cluster import AgglomerativeClustering

def log_cluster_metric(X, Xn, n_clusters=2, categorical=False):
    if categorical:
        clusterer = AgglomerativeClustering(n_clusters=n_clusters, affinity='manhattan', linkage='average')
    else:
        clusterer = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='average')
    print(clusterer)
    Xm = np.concatenate((X, Xn), axis=0)
    Na = X.shape[0]
    Nb = Xn.shape[0]
    
    original_labels = clusterer.fit_predict(X)
    merged_labels = clusterer.fit_predict(Xm)
    c = Na / (Na + Nb)
    print(c)

    unique_m, counts_m = np.unique(merged_labels, return_counts=True)
    unique_o, counts_o = np.unique(original_labels, return_counts=True)

    _sum = 0

    for _index, i in enumerate(unique_m):
        _sum += ((counts_m[_index] / counts_o[_index]) - c) ** 2
#     score = math.log(_sum / len(unique_m))
    score = _sum / len(unique_m)
    return score, original_labels, merged_labels


clusters = 5
# print(f'Clustering and scoring categorical data')
# cat_score, cat_original_labels, cat_merged_labels = log_cluster_metric(original_cat, syn_cat, n_clusters=clusters, categorical=True)
print(f'Clustering and scoring continuous data')
con_score, con_original_labels, con_merged_labels = log_cluster_metric(original_con, syn_con, n_clusters=clusters)


Clustering and scoring continuous data
AgglomerativeClustering(linkage='average', n_clusters=5)
0.5


In [15]:
print(math.log(np.mean([con_score])))

8.956555457309022


In [16]:
corr = original_df.corr('pearson')
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
corr[mask] = np.nan
(corr
 .style
 .background_gradient(cmap='coolwarm', axis=None, vmin=-1, vmax=1)
 .highlight_null(null_color='#f1f1f1')
 .set_precision(2))

  (corr


Unnamed: 0,Time,Open,High,Low,Close,Volume
Time,,,,,,
Open,0.73,,,,,
High,0.73,1.0,,,,
Low,0.73,1.0,1.0,,,
Close,0.73,1.0,1.0,1.0,,
Volume,0.27,0.19,0.21,0.17,0.19,


In [17]:
corr = synthetic_df.corr('pearson')
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
corr[mask] = np.nan
(corr
 .style
 .background_gradient(cmap='coolwarm', axis=None, vmin=-1, vmax=1)
 .highlight_null(null_color='#f1f1f1')
 .set_precision(2))

  (corr


Unnamed: 0,Time,Open,High,Low,Close,Volume
Time,,,,,,
Open,0.73,,,,,
High,0.73,1.0,,,,
Low,0.73,1.0,0.99,,,
Close,0.73,1.0,1.0,1.0,,
Volume,0.25,0.16,0.18,0.14,0.16,


In [18]:
print(np.linalg.norm(original_df.corr('pearson')-synthetic_df.corr('pearson')))

0.09666693420148625
