In [1]:
import sys
import os
# Add the directory containing 'thesis_utils.py' to the system path
sys.path.append(os.path.abspath('..'))  # If 'thesis_utils.py' is one level up

import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport

from thesis_utils import *

In [2]:
# Build 30 synthetic copies of the cleaned credit-score dataset and write each
# one to its own CSV file.
data = pd.read_csv("data/credit_score_cleaned.csv")
# remove columns that are not categorical or numerical
data = data.drop(['ID', 'Customer_ID', 'Name', 'SSN', 'Type_of_Loan', 'Payment_Behaviour'], axis=1)

# Generate the profiling report
profile_full = ProfileReport(data, title="Credit Score", explorative=True)

# Get the description object
desc_full = profile_full.get_description()

# Access the correlation matrix
correlation_matrix = desc_full.correlations["auto"]

categorical_cols = get_categorical_columns(data)
distributions = fit_distributions(data)
marginals = convert_distfit_to_marginals(distributions, data)

# to_csv does not create missing directories, so make sure the target exists.
output_dir = "data/_copula-cloning/credit_score"
os.makedirs(output_dir, exist_ok=True)

n_datasets = 30
# Cap the number of generation attempts so that a generator which never
# produces a clean sample fails loudly instead of looping forever.
max_attempts = 50 * n_datasets

# NOTE(review): no random seed is set in this cell, so the files written are
# presumably different on every run - confirm whether that is intended.
i = 1
attempts = 0
while i <= n_datasets:
    if attempts >= max_attempts:
        raise RuntimeError(
            f"Gave up after {attempts} attempts with only {i - 1} of "
            f"{n_datasets} valid synthetic datasets written to {output_dir}"
        )
    attempts += 1

    data_syn = generate_synthetic_dataset(
        original_data=data,
        correlation_matrix=correlation_matrix,
        categorical_columns=categorical_cols,
        marginals=marginals,
        n_rows=414,  # NOTE(review): same value in every cell - confirm it should not follow len(data)
        correlation_threshold=0.1,
        edge_strategy="random",
        sampling_strategy="local-chunks"
    )

    # Accept the sample only if no column contains a missing value and every
    # numeric value is finite; otherwise discard it and draw again.
    numeric_data = data_syn.select_dtypes(include=[np.number])
    if not data_syn.isnull().values.any() and np.isfinite(numeric_data.values).all():
        data_syn.to_csv(f"{output_dir}/cc_credit_score_{i}.csv", index=False)
        i += 1
    else:
        print("resample")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 21/21 [00:00<00:00, 335.77it/s]


In [3]:
# Build 30 synthetic copies of the cleaned air-quality dataset and write each
# one to its own CSV file.
data = pd.read_csv("data/air_quality_cleaned.csv")
# remove columns that are not categorical or numerical
data = data.drop(['Date', 'Time'], axis=1)

# Generate the profiling report
profile_full = ProfileReport(data, title="Air", explorative=True)

# Get the description object
desc_full = profile_full.get_description()

# Access the correlation matrix
correlation_matrix = desc_full.correlations["auto"]

categorical_cols = get_categorical_columns(data)
distributions = fit_distributions(data)
marginals = convert_distfit_to_marginals(distributions, data)

# to_csv does not create missing directories, so make sure the target exists.
output_dir = "data/_copula-cloning/air_quality"
os.makedirs(output_dir, exist_ok=True)

n_datasets = 30
# Cap the number of generation attempts so that a generator which never
# produces a clean sample fails loudly instead of looping forever.
max_attempts = 50 * n_datasets

# NOTE(review): no random seed is set in this cell, so the files written are
# presumably different on every run - confirm whether that is intended.
i = 1
attempts = 0
while i <= n_datasets:
    if attempts >= max_attempts:
        raise RuntimeError(
            f"Gave up after {attempts} attempts with only {i - 1} of "
            f"{n_datasets} valid synthetic datasets written to {output_dir}"
        )
    attempts += 1

    data_syn = generate_synthetic_dataset(
        original_data=data,
        correlation_matrix=correlation_matrix,
        categorical_columns=categorical_cols,
        marginals=marginals,
        n_rows=414,  # NOTE(review): same value in every cell - confirm it should not follow len(data)
        correlation_threshold=0.1,
        edge_strategy="random",
        sampling_strategy="local-chunks"
    )

    # Accept the sample only if no column contains a missing value and every
    # numeric value is finite; otherwise discard it and draw again.
    numeric_data = data_syn.select_dtypes(include=[np.number])
    if not data_syn.isnull().values.any() and np.isfinite(numeric_data.values).all():
        data_syn.to_csv(f"{output_dir}/cc_air_quality_{i}.csv", index=False)
        i += 1
    else:
        print("resample")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 12/12 [00:00<00:00, 595.13it/s]


In [4]:
# Build 30 synthetic copies of the customer-churn dataset and write each one
# to its own CSV file.
data = pd.read_csv("data/customer_churn.csv")

# Generate the profiling report
profile_full = ProfileReport(data, title="Customer", explorative=True)

# Get the description object
desc_full = profile_full.get_description()

# Access the correlation matrix
correlation_matrix = desc_full.correlations["auto"]

categorical_cols = get_categorical_columns(data)
distributions = fit_distributions(data)
marginals = convert_distfit_to_marginals(distributions, data)

# to_csv does not create missing directories, so make sure the target exists.
output_dir = "data/_copula-cloning/customer_churn"
os.makedirs(output_dir, exist_ok=True)

n_datasets = 30
# Cap the number of generation attempts so that a generator which never
# produces a clean sample fails loudly instead of looping forever.
max_attempts = 50 * n_datasets

# NOTE(review): no random seed is set in this cell, so the files written are
# presumably different on every run - confirm whether that is intended.
i = 1
attempts = 0
while i <= n_datasets:
    if attempts >= max_attempts:
        raise RuntimeError(
            f"Gave up after {attempts} attempts with only {i - 1} of "
            f"{n_datasets} valid synthetic datasets written to {output_dir}"
        )
    attempts += 1

    data_syn = generate_synthetic_dataset(
        original_data=data,
        correlation_matrix=correlation_matrix,
        categorical_columns=categorical_cols,
        marginals=marginals,
        n_rows=414,  # NOTE(review): same value in every cell - confirm it should not follow len(data)
        correlation_threshold=0.1,
        edge_strategy="random",
        sampling_strategy="local-chunks"
    )

    # Accept the sample only if no column contains a missing value and every
    # numeric value is finite; otherwise discard it and draw again.
    numeric_data = data_syn.select_dtypes(include=[np.number])
    if not data_syn.isnull().values.any() and np.isfinite(numeric_data.values).all():
        data_syn.to_csv(f"{output_dir}/cc_customer_churn_{i}.csv", index=False)
        i += 1
    else:
        print("resample")


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 14/14 [00:00<00:00, 5159.50it/s]


In [5]:
# Build 30 synthetic copies of the insurance dataset and write each one to its
# own CSV file.
data = pd.read_csv("data/insurance_original.csv")

# Generate the profiling report
profile_full = ProfileReport(data, title="Insurance", explorative=True)

# Get the description object
desc_full = profile_full.get_description()

# Access the correlation matrix
correlation_matrix = desc_full.correlations["auto"]

categorical_cols = get_categorical_columns(data)
distributions = fit_distributions(data)
marginals = convert_distfit_to_marginals(distributions, data)

# to_csv does not create missing directories, so make sure the target exists.
output_dir = "data/_copula-cloning/insurance"
os.makedirs(output_dir, exist_ok=True)

n_datasets = 30
# Cap the number of generation attempts so that a generator which never
# produces a clean sample fails loudly instead of looping forever.
max_attempts = 50 * n_datasets

# NOTE(review): no random seed is set in this cell, so the files written are
# presumably different on every run - confirm whether that is intended.
i = 1
attempts = 0
while i <= n_datasets:
    if attempts >= max_attempts:
        raise RuntimeError(
            f"Gave up after {attempts} attempts with only {i - 1} of "
            f"{n_datasets} valid synthetic datasets written to {output_dir}"
        )
    attempts += 1

    data_syn = generate_synthetic_dataset(
        original_data=data,
        correlation_matrix=correlation_matrix,
        categorical_columns=categorical_cols,
        marginals=marginals,
        n_rows=414,  # NOTE(review): same value in every cell - confirm it should not follow len(data)
        correlation_threshold=0.1,
        edge_strategy="random",
        sampling_strategy="local-chunks"
    )

    # Accept the sample only if no column contains a missing value and every
    # numeric value is finite; otherwise discard it and draw again.
    numeric_data = data_syn.select_dtypes(include=[np.number])
    if not data_syn.isnull().values.any() and np.isfinite(numeric_data.values).all():
        data_syn.to_csv(f"{output_dir}/cc_insurance_{i}.csv", index=False)
        i += 1
    else:
        print("resample")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 7/7 [00:00<00:00, 266.78it/s]


In [6]:
# Build 30 synthetic copies of the cleaned real-estate valuation dataset and
# write each one to its own CSV file.
data = pd.read_csv("data/real_estate_valuation_cleaned.csv")

# Generate the profiling report
profile_full = ProfileReport(data, title="Real Estate", explorative=True)

# Get the description object
desc_full = profile_full.get_description()

# Access the correlation matrix
correlation_matrix = desc_full.correlations["auto"]

categorical_cols = get_categorical_columns(data)
distributions = fit_distributions(data)
marginals = convert_distfit_to_marginals(distributions, data)

# to_csv does not create missing directories, so make sure the target exists.
output_dir = "data/_copula-cloning/real_estate"
os.makedirs(output_dir, exist_ok=True)

n_datasets = 30
# Cap the number of generation attempts so that a generator which never
# produces a clean sample fails loudly instead of looping forever.
max_attempts = 50 * n_datasets

# NOTE(review): no random seed is set in this cell, so the files written are
# presumably different on every run - confirm whether that is intended.
i = 1
attempts = 0
while i <= n_datasets:
    if attempts >= max_attempts:
        raise RuntimeError(
            f"Gave up after {attempts} attempts with only {i - 1} of "
            f"{n_datasets} valid synthetic datasets written to {output_dir}"
        )
    attempts += 1

    data_syn = generate_synthetic_dataset(
        original_data=data,
        correlation_matrix=correlation_matrix,
        categorical_columns=categorical_cols,
        marginals=marginals,
        n_rows=414,  # NOTE(review): same value in every cell - confirm it should not follow len(data)
        correlation_threshold=0.1,
        edge_strategy="random",
        sampling_strategy="local-chunks"
    )

    # Accept the sample only if no column contains a missing value and every
    # numeric value is finite; otherwise discard it and draw again.
    numeric_data = data_syn.select_dtypes(include=[np.number])
    if not data_syn.isnull().values.any() and np.isfinite(numeric_data.values).all():
        data_syn.to_csv(f"{output_dir}/cc_real_estate_{i}.csv", index=False)
        i += 1
    else:
        print("resample")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 6/6 [00:00<00:00, 65707.11it/s]
