In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_validate

import warnings
from sklearn.exceptions import ConvergenceWarning, FitFailedWarning

In [2]:
rs = 1729

In [3]:
original_df = pd.read_csv(r'../../datasets/imbalanced_dataset/aug_train.csv')[0:10000]
original_df = original_df.loc[:, ~original_df.columns.isin(['id'])]
original_df.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,Male,22,1,7.0,1,< 1 Year,No,2630.0,152.0,16,0
1,Male,42,1,28.0,0,1-2 Year,Yes,43327.0,26.0,135,0
2,Female,66,1,33.0,0,1-2 Year,Yes,35841.0,124.0,253,0
3,Female,22,1,33.0,0,< 1 Year,No,27645.0,152.0,69,0
4,Male,28,1,46.0,1,< 1 Year,No,29023.0,152.0,211,0


In [37]:
columns = list(original_df.columns)
synthetic_df = pd.read_csv(r'../../datasets/imbalanced_dataset/synthetic.csv')
synthetic_df = synthetic_df[columns]
synthetic_df.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,Male,40,1,9.0,0,1-2 Year,Yes,46994.601769,124.0,287,1
1,Male,38,1,3.0,0,> 2 Years,No,34303.440793,26.0,293,0
2,Male,38,1,32.0,0,1-2 Year,Yes,65557.032343,154.0,73,0
3,Female,43,1,29.0,0,1-2 Year,Yes,64987.866446,124.0,282,1
4,Male,59,1,10.0,0,1-2 Year,Yes,68556.509949,26.0,186,1


In [5]:
predictors = {'categorical': ['Gender', 'Age', 'Driving_License',
                              'Region_Code', 'Previously_Insured', 'Vehicle_Age',
                              'Vehicle_Damage', 'Policy_Sales_Channel', 'Vintage'],
             'continuous': ['Annual_Premium']}
targets = {'categorical': ['Response'],
           'continuous': []}


In [6]:
from sklearn.preprocessing import LabelEncoder

def to_numpy(df, preds, trgts, encs=None):
    columns = list(df.columns)
    X = np.empty(df[[i for k, v in preds.items() for i in v]].shape)
    Y = np.empty(df[[i for k, v in trgts.items() for i in v]].shape)
    
    for i, col in enumerate(preds['continuous']):
        X[:, columns.index(col)] = df[col]
    for i, col in enumerate(trgts['continuous']):
        Y[:, i] = df[col]
    
    if encs is None:
        encs = dict()
    for i, col in enumerate(preds['categorical']):
        if col not in encs:
            enc = LabelEncoder()
            enc.fit(df[col])
            encs[col] = enc
        else:
            enc = encs[col]
        X[:, columns.index(col)] = enc.transform(df[col])
    for i, col in enumerate(trgts['categorical']):
        if col not in encs:
            enc = LabelEncoder()
            enc.fit(df[col])
            encs[col] = enc
        else:
            enc = encs[col]
        Y[:, i] = enc.transform(df[col])
    
    return X, Y, encs

original_X, original_Y, original_encs = to_numpy(original_df, predictors, targets)
synthetic_X, synthetic_Y, synthetic_encs = to_numpy(synthetic_df, predictors, targets, original_encs)


In [7]:
OX = np.hstack((original_X, original_Y))
SX = np.hstack((synthetic_X, synthetic_Y))
MX = np.vstack((OX, SX))
print(OX.shape, SX.shape, MX.shape)

(10000, 11) (1000, 11) (11000, 11)


In [8]:
def to_numpy_x(df, columns):
    """
    Convert dataframe to numpy array by converting categorical variables as integer labels
    :param df: The dataframe to convert
    :param columns: The continuous & discrete columns
    :return: Converted numpy arrays of X & Y and the label encoders
    """

    # declaring empty arrays for the predictor and target variables
    X = np.empty(df[[i for i in columns]].shape)

    # Assign the values from the columns, that are declared continuous,
    # into the corresponding positions in the numpy array
#     for i, col in enumerate(columns):
#         X[:, columns.index(col)] = df[col]

    # Encoding for X (predictors)
    for i, col in enumerate(columns):
        # Initialize, fit and store the label encoder for column
        enc = LabelEncoder()
        enc.fit(df[col])

        # Transform the column using the encoder object
        X[:, columns.index(col)] = enc.transform(df[col])

    return X


In [9]:
categorical_cols = list()
categorical_cols.extend(predictors['categorical'])
categorical_cols.extend(targets['categorical'])

continuous_cols = list()
continuous_cols.extend(predictors['continuous'])
continuous_cols.extend(targets['continuous'])

original_cat = to_numpy_x(original_df[categorical_cols], categorical_cols)
original_con = original_df[continuous_cols].to_numpy()
syn_cat = to_numpy_x(synthetic_df[categorical_cols], categorical_cols)
syn_con = synthetic_df[continuous_cols].to_numpy()

In [10]:
# original_clusters = clustering.fit_predict(OX)
# print(original_clusters)

In [11]:
# mixed_clusters = clustering.fit_predict(MX)
# print(mixed_clusters)

In [42]:
import math
from sklearn.cluster import AgglomerativeClustering

def log_cluster_metric(X, Xn, n_clusters=2, categorical=False):
    if categorical:
        clusterer = AgglomerativeClustering(n_clusters=n_clusters, affinity='manhattan', linkage='average')
    else:
        clusterer = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='average')
    print(clusterer)
    Xm = np.concatenate((X, Xn), axis=0)
    Na = X.shape[0]
    Nb = Xn.shape[0]
    
    original_labels = clusterer.fit_predict(X)
    merged_labels = clusterer.fit_predict(Xm)
    c = Na / (Na + Nb)
    print(c)

    unique_m, counts_m = np.unique(merged_labels, return_counts=True)
    unique_o, counts_o = np.unique(original_labels, return_counts=True)

    _sum = 0

    for _index, i in enumerate(unique_m):
        _sum += ((counts_m[_index] / counts_o[_index]) - c) ** 2
#     score = math.log(_sum / len(unique_m))
    score = _sum / len(unique_m)
    return score, original_labels, merged_labels


clusters = 5
print(f'Clustering and scoring categorical data')
cat_score, cat_original_labels, cat_merged_labels = log_cluster_metric(original_cat, syn_cat, n_clusters=clusters, categorical=True)
print(f'Clustering and scoring continuous data')
con_score, con_original_labels, con_merged_labels = log_cluster_metric(original_con, syn_con, n_clusters=clusters)


Clustering and scoring categorical data
AgglomerativeClustering(affinity='manhattan', linkage='average', n_clusters=5)
0.9090909090909091
Clustering and scoring continuous data
AgglomerativeClustering(linkage='average', n_clusters=5)
0.9090909090909091


In [43]:
print(math.log(np.mean([cat_score, con_score])))

0.4862190818344094


In [47]:
corr = original_df.corr('pearson')
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
corr[mask] = np.nan
(corr
 .style
 .background_gradient(cmap='coolwarm', axis=None, vmin=-1, vmax=1)
 .highlight_null(null_color='#f1f1f1')
 .set_precision(2))

  (corr


Unnamed: 0,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Response
Age,,,,,,,,
Driving_License,-0.08,,,,,,,
Region_Code,0.05,-0.01,,,,,,
Previously_Insured,-0.27,0.02,-0.03,,,,,
Annual_Premium,0.05,-0.0,-0.0,0.03,,,,
Policy_Sales_Channel,-0.59,0.05,-0.05,0.23,-0.09,,,
Vintage,-0.01,0.02,-0.0,-0.0,-0.01,0.01,,
Response,0.13,0.01,0.01,-0.43,0.01,-0.17,-0.0,


In [48]:
corr = synthetic_df.corr('pearson')
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
corr[mask] = np.nan
(corr
 .style
 .background_gradient(cmap='coolwarm', axis=None, vmin=-1, vmax=1)
 .highlight_null(null_color='#f1f1f1')
 .set_precision(2))

  (corr


Unnamed: 0,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Response
Age,,,,,,,,
Driving_License,-0.08,,,,,,,
Region_Code,-0.01,-0.01,,,,,,
Previously_Insured,-0.31,-0.03,-0.03,,,,,
Annual_Premium,0.3,0.02,0.02,-0.21,,,,
Policy_Sales_Channel,-0.49,-0.02,0.02,0.24,-0.21,,,
Vintage,-0.0,0.03,0.01,-0.02,-0.02,-0.04,,
Response,0.22,0.01,-0.0,-0.44,0.13,-0.16,0.02,


In [39]:
print(np.linalg.norm(original_df.corr('pearson')-synthetic_df.corr('pearson')))

0.6290196949110876
