In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_validate

import warnings
from sklearn.exceptions import ConvergenceWarning, FitFailedWarning

In [2]:
rs = 1729

In [3]:
original_df = pd.read_csv(r'../../datasets/continuous_w_discreteTarget/WineQT.csv')
original_df = original_df.loc[:, ~original_df.columns.isin(['Id'])]
original_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
columns = list(original_df.columns)
synthetic_df = pd.read_csv(r'../../datasets/continuous_w_discreteTarget/synthetic.csv')
synthetic_df = synthetic_df[columns]
synthetic_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,9.582067,0.864727,0.21643,2.695922,0.081322,8.063538,15.173011,0.99471,3.252378,0.505812,11.72244,5
1,8.208484,0.465102,0.028454,3.658549,0.072635,9.360043,22.988563,0.994127,3.463537,0.744871,11.866676,5
2,9.099823,0.663394,0.236752,1.062344,0.09394,5.910659,20.380723,0.997278,3.376956,0.793022,11.513966,5
3,7.569382,0.53702,0.080665,3.241168,0.047722,2.577581,5.795272,0.996135,3.38215,0.421073,9.309241,5
4,5.88665,0.691991,-0.011778,1.188431,0.059731,4.130674,11.205299,0.995342,3.362238,0.700269,10.308484,5


In [5]:
predictors = {'categorical': [],
              'continuous': ['fixed acidity', 'volatile acidity', 'citric acid',
              'residual sugar', 'chlorides', 'free sulfur dioxide',
              'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']}
targets = {'categorical': ['quality'],
           'continuous': []}

In [6]:
from sklearn.preprocessing import LabelEncoder

def to_numpy(df, preds, trgts, encs=None):
    columns = list(df.columns)
    X = np.empty(df[[i for k, v in preds.items() for i in v]].shape)
    Y = np.empty(df[[i for k, v in trgts.items() for i in v]].shape)
    
    for i, col in enumerate(preds['continuous']):
        X[:, columns.index(col)] = df[col]
    for i, col in enumerate(trgts['continuous']):
        Y[:, i] = df[col]
    
    if encs is None:
        encs = dict()
    for i, col in enumerate(preds['categorical']):
        if col not in encs:
            enc = LabelEncoder()
            enc.fit(df[col])
            encs[col] = enc
        else:
            enc = encs[col]
        X[:, columns.index(col)] = enc.transform(df[col])
    for i, col in enumerate(trgts['categorical']):
        if col not in encs:
            enc = LabelEncoder()
            enc.fit(df[col])
            encs[col] = enc
        else:
            enc = encs[col]
        Y[:, i] = enc.transform(df[col])
    
    return X, Y, encs

original_X, original_Y, original_encs = to_numpy(original_df, predictors, targets)
synthetic_X, synthetic_Y, synthetic_encs = to_numpy(synthetic_df, predictors, targets, original_encs)


In [7]:
OX = np.hstack((original_X, original_Y))
SX = np.hstack((synthetic_X, synthetic_Y))
MX = np.vstack((OX, SX))
print(OX.shape, SX.shape, MX.shape)

(1143, 12) (1000, 12) (2143, 12)


In [8]:
def to_numpy_x(df, columns):
    """
    Convert dataframe to numpy array by converting categorical variables as integer labels
    :param df: The dataframe to convert
    :param columns: The continuous & discrete columns
    :return: Converted numpy arrays of X & Y and the label encoders
    """

    # declaring empty arrays for the predictor and target variables
    X = np.empty(df[[i for i in columns]].shape)

    # Assign the values from the columns, that are declared continuous,
    # into the corresponding positions in the numpy array
#     for i, col in enumerate(columns):
#         X[:, columns.index(col)] = df[col]

    # Encoding for X (predictors)
    for i, col in enumerate(columns):
        # Initialize, fit and store the label encoder for column
        enc = LabelEncoder()
        enc.fit(df[col])

        # Transform the column using the encoder object
        X[:, columns.index(col)] = enc.transform(df[col])

    return X


In [9]:
categorical_cols = list()
categorical_cols.extend(predictors['categorical'])
categorical_cols.extend(targets['categorical'])

continuous_cols = list()
continuous_cols.extend(predictors['continuous'])
continuous_cols.extend(targets['continuous'])

original_cat = to_numpy_x(original_df[categorical_cols], categorical_cols)
original_con = original_df[continuous_cols].to_numpy()
syn_cat = to_numpy_x(synthetic_df[categorical_cols], categorical_cols)
syn_con = synthetic_df[continuous_cols].to_numpy()

In [10]:
# original_clusters = clustering.fit_predict(OX)
# print(original_clusters)

In [11]:
# mixed_clusters = clustering.fit_predict(MX)
# print(mixed_clusters)

In [27]:
import math
from sklearn.cluster import AgglomerativeClustering

def log_cluster_metric(X, Xn, n_clusters=2, categorical=False):
    if categorical:
        clusterer = AgglomerativeClustering(n_clusters=n_clusters, affinity='manhattan', linkage='average')
    else:
        clusterer = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='average')
    print(clusterer)
    Xm = np.concatenate((X, Xn), axis=0)
    Na = X.shape[0]
    Nb = Xn.shape[0]
    
    original_labels = clusterer.fit_predict(X)
    merged_labels = clusterer.fit_predict(Xm)
    c = Na / (Na + Nb)
    print(c)

    unique_m, counts_m = np.unique(merged_labels, return_counts=True)
    unique_o, counts_o = np.unique(original_labels, return_counts=True)

    _sum = 0

    for _index, i in enumerate(unique_m):
        _sum += ((counts_m[_index] / counts_o[_index]) - c) ** 2
#     score = math.log(_sum / len(unique_m))
    score = _sum / len(unique_m)
    return score, original_labels, merged_labels


clusters = 20
print(f'Clustering and scoring categorical data')
cat_score, cat_original_labels, cat_merged_labels = log_cluster_metric(original_cat, syn_cat, n_clusters=clusters, categorical=True)
print(f'Clustering and scoring continuous data')
con_score, con_original_labels, con_merged_labels = log_cluster_metric(original_con, syn_con, n_clusters=clusters)


Clustering and scoring categorical data
AgglomerativeClustering(affinity='manhattan', linkage='average', n_clusters=20)
0.5333644423705086
Clustering and scoring continuous data
AgglomerativeClustering(linkage='average', n_clusters=20)
0.5333644423705086


In [28]:
print(math.log(np.mean([cat_score, con_score])))

4.613299539593365


In [32]:
o_corr = original_df.corr('pearson')
mask = np.zeros_like(o_corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
o_corr[mask] = np.nan
(o_corr
 .style
 .background_gradient(cmap='coolwarm', axis=None, vmin=-1, vmax=1)
 .highlight_null(null_color='#f1f1f1')
 .set_precision(2))

  (o_corr


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
fixed acidity,,,,,,,,,,,,
volatile acidity,-0.25,,,,,,,,,,,
citric acid,0.67,-0.54,,,,,,,,,,
residual sugar,0.17,-0.01,0.18,,,,,,,,,
chlorides,0.11,0.06,0.25,0.07,,,,,,,,
free sulfur dioxide,-0.16,-0.0,-0.06,0.17,0.02,,,,,,,
total sulfur dioxide,-0.11,0.08,0.04,0.19,0.05,0.66,,,,,,
density,0.68,0.02,0.38,0.38,0.21,-0.05,0.05,,,,,
pH,-0.69,0.22,-0.55,-0.12,-0.28,0.07,-0.06,-0.35,,,,
sulphates,0.17,-0.28,0.33,0.02,0.37,0.03,0.03,0.14,-0.19,,,


In [33]:
s_corr = synthetic_df.corr('pearson')
mask = np.zeros_like(s_corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
s_corr[mask] = np.nan
(s_corr
 .style
 .background_gradient(cmap='coolwarm', axis=None, vmin=-1, vmax=1)
 .highlight_null(null_color='#f1f1f1')
 .set_precision(2))

  (s_corr


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
fixed acidity,,,,,,,,,,,,
volatile acidity,-0.21,,,,,,,,,,,
citric acid,0.66,-0.48,,,,,,,,,,
residual sugar,0.24,0.0,0.2,,,,,,,,,
chlorides,0.16,-0.0,0.33,0.07,,,,,,,,
free sulfur dioxide,-0.17,0.02,-0.06,0.2,0.02,,,,,,,
total sulfur dioxide,-0.15,0.05,0.03,0.17,0.02,0.65,,,,,,
density,0.6,0.03,0.34,0.43,0.15,0.0,0.04,,,,,
pH,-0.7,0.22,-0.57,-0.18,-0.29,0.07,-0.04,-0.31,,,,
sulphates,0.14,-0.29,0.36,0.02,0.41,0.05,0.1,0.08,-0.21,,,


In [25]:
print(np.linalg.norm(original_df.corr('pearson')-synthetic_df.corr('pearson')))

1.3475263671295594
