In [1]:
import numpy as np
import pandas as pd
from scipy.stats import kstest, norm, uniform, beta, expon, truncnorm, anderson
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
import warnings
from joblib import Parallel, delayed
import scipy.stats as stats
from tqdm import tqdm
from scipy.optimize import brentq
from itertools import combinations
import time
import random
import logging
from sklearn.metrics import pairwise_distances
from scipy.sparse.csgraph import laplacian
from sklearn.manifold import SpectralEmbedding
from sklearn.decomposition import PCA
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import eigsh
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import rbf_kernel
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import diags
from scipy.sparse.csgraph import laplacian
from scipy.sparse import csgraph
import scipy
from numpy.linalg import eigh
from scipy.spatial.distance import pdist, squareform
warnings.filterwarnings(action='ignore')



class GaussianCopulaKmeansSynthesizer:

    def __init__(self, data):
        # self.filepath = filepath
        self.data = data
        self.execution_times = {}
        self.distributions = {}
        self.match_column = []
        self.num_column = []
        self.category_column = []
        self.special_numeric_series_columns = []
        self.datetime_column = []
        self.special_column_prefixes = []
        self.hyphenated_numeric_columns = []
        self.binary_columns_for_missing = []
        self.digit_counts = {}  # Store digit counts for each column
        self.hyphenated_format = {}
        self.column_min = {}  # Store minimum values for each column
        self.column_max = {}
        self.columns_with_missing_values = []
        self.category_intervals = {}
        self.num_of_clusters = 0
        self.gmms = {}
        self.column_cdfs = {}
        self.parameters_collections = {}
        self.column_inverse_cdfs = {}
        self.processed_data = None
        self.synthetic_data_raw = pd.DataFrame()
        self.synthetic_data = pd.DataFrame()
        self.cdf_results = pd.DataFrame()
        self.standard_gaussian_df = pd.DataFrame()

    def _start_timer(self):
        """
        Start the timer for a process.
        """
        self._timer_start = time.time()

    def _stop_timer(self, process_name):
        """
        Stop the timer and store the elapsed time for the process.

        Parameters:
        - process_name (str): The name of the process for which the timer was started.
        """
        elapsed_time = time.time() - self._timer_start
        self.execution_times[process_name] = elapsed_time


    # def detect_match(self):
    #     column_pairs = list(combinations(self.data.columns, 2))
    #     for column_pair in column_pairs:
    #         temp = self.data[[column_pair[0], column_pair[1]]].apply(lambda x: x[column_pair[0]] == x[column_pair[1]], axis=1)
    #         count_true = temp[temp == True].count()
    #         ratio = count_true/len(temp)
    #         if ratio > 0.95:
    #             self.match_column.append(column_pair)



    def _identify_columns(self):
        """
        Identify and classify the columns of the dataset.

        Determines the type of each column (numerical, categorical, datetime, unique identifier, etc.)
        and classifies them into appropriate attributes of the class.
        """

        self._start_timer()
        self.unique_identifier_columns = []
        temp_special_columns = []
        temp_special_num_pattern = []

        data_info = self.data.dtypes.to_dict()
        for key, dtype in data_info.items():
            unique_values = self.data[key].nunique()
            total_values = len(self.data[key])

            # Check for uniqueness
            if unique_values == total_values:
                self.unique_identifier_columns.append(key)
                continue  # Skip further checks for this column

            if dtype == 'object' and pd.to_datetime(self.data[key], errors='coerce').notna().any():
                self.datetime_column.append(key)
            elif dtype == 'object':
                self.category_column.append(key)
            elif dtype in ['int64', 'float64']:
                self.num_column.append(key)

            if self.data[key].isnull().any():
                self.columns_with_missing_values.append(key)

        temp_column = self.unique_identifier_columns

        for column in temp_column:
            try:
                if self.data[column].str.match(r'[A-Za-z]+\d+').all():
                    extracted, prefix_lengths, number_lengths = self.extract_number_part(self.data[column])
                    self.special_numeric_series_columns.append(column)
                    # Store extracted prefixes and their lengths
                    self.special_column_prefixes[column] = extracted[0].iloc[0]
                    self.digit_counts[column] = number_lengths.iloc[0]

                elif self.data[column].str.match(r'\d+-\d+-\d+').all():
                    # Handling hyphenated numeric columns
                    self.hyphenated_numeric_columns.append(column)
                    combined_numbers, digit_counts = self.extract_numbers(self.data[column])
                    self.hyphenated_format[column] = digit_counts.iloc[0].tolist()  # Store digit count format
            except AttributeError:
                self.unique_identifier_columns.remove(column)
                self.category_column.append(column)

        if self.hyphenated_numeric_columns == [] and self.special_numeric_series_columns == []:
            self.category_column.extend(self.unique_identifier_columns)
            self.unique_identifier_columns.clear()



        #self.unique_identifier_columns = [col for col in self.unique_identifier_columns if col not in temp_special_columns]
        self.special_numeric_series_columns.extend(temp_special_columns)

        #self.unique_identifier_columns = [col for col in self.unique_identifier_columns if col not in temp_special_num_pattern]
        self.hyphenated_numeric_columns.extend(temp_special_num_pattern)

        self._stop_timer("_identify_columns")

    def extract_number_part(self, series):
        """
        Extract alphanumeric prefix and numeric part from a series.

        Parameters:
        - series (pd.Series): A pandas Series from which to extract the alphanumeric prefix and numeric part.

        Returns:
        - tuple: A tuple containing the extracted prefix, prefix lengths, and number lengths.
        """

        # Extract both alphanumeric prefix and numeric part
        regex_pattern = r'([A-Za-z]+)(\d+)'
        extracted = series.str.extract(regex_pattern)
        # Calculate the length of each part (prefix and number)
        prefix_lengths = extracted[0].apply(lambda x: len(x) if pd.notnull(x) else 0)
        number_lengths = extracted[1].apply(lambda x: len(x) if pd.notnull(x) else 0)

        return extracted, prefix_lengths, number_lengths

    def extract_numbers(self, series):

        regex_pattern = r'(\d+)-(\d+)-(\d+)'
        extracted = series.str.extract(regex_pattern)

        # Calculate the number of digits in each part
        digit_counts = extracted.applymap(lambda x: len(str(x)) if pd.notnull(x) else 0)

        # Combine the numbers into a single string
        combined_numbers = extracted.apply(lambda row: '-'.join(row.dropna()), axis=1)

        return combined_numbers.astype(str), digit_counts

    def generate_random_hyphenated_number(self, format_pattern):
        """
        Generate a random number based on the observed format pattern.
        format_pattern: List of integers representing the length of each numeric segment.
        """
        random_number_parts = [str(random.randint(0, 10**length - 1)).zfill(length) for length in format_pattern]
        return '-'.join(random_number_parts)

    def generate_random_number(self, length):
        """
        Generate a random number of a specified length.
        """
        return str(random.randint(0, 10**length - 1)).zfill(length)

    def convert_datetime_to_numerical(self):
        ref_dt = pd.Timestamp('1970-01-01')

        for column in self.datetime_column:
            # Convert to datetime, coerce errors to NaT (missing values)
            self.data[column] = pd.to_datetime(self.data[column], errors='coerce')

            # Convert datetime to numerical value (e.g., days since reference date)
            self.data[column] = (self.data[column] - ref_dt) / np.timedelta64(1, 'D')

            # Reclassify as a numerical column
            self.num_column.append(column)
            if column in self.category_column:
                self.category_column.remove(column)

    def transform_to_original_format(self, column, synthetic_series):
        format_pattern = self.hyphenated_format[column]
        formatted_series = synthetic_series.apply(lambda x: '-'.join(part.zfill(length) for part, length in zip(x.split('-'), format_pattern)))
        return formatted_series

    def handle_missing_values(self):
        self._start_timer()
        for column in self.data.columns:
            if self.data[column].isnull().any():
                # Create a binary column to mark missing values
                binary_column_name = column + '_missing'
                self.data[binary_column_name] = self.data[column].isnull().astype(int)
                self.binary_columns_for_missing.append(binary_column_name)
                # Fill missing values in the original column
                self.data[column] = self.data[column].fillna(method='ffill').fillna(method='bfill')
        self._stop_timer("handle_missing_value")

    def convert_special_numerical(self):
        for column in self.special_numeric_series_columns:
            numerical_part = self.extract_number_part(self.data[column])
            self.data[column] = numerical_part


    def assign_intervals(self):
        self.category_intervals = {}
        for column_name in self.category_column:
            column_data = self.data[column_name]
            freq = column_data.value_counts(normalize=True)
            intervals = freq.cumsum()
            category_intervals = {}
            a = 0
            for category, cum_freq in intervals.items():
                b = cum_freq
                category_intervals[category] = (a, b)
                a = b
            self.category_intervals[column_name] = category_intervals

    def sample_from_category(self, category_value, column_name):
        try:
            a, b = self.category_intervals[column_name][category_value]
            mean = (a + b) / 2
            sd = (b - a) / 6
            dist = truncnorm((0 - mean) / sd, (1 - mean) / sd, loc=mean, scale=sd)
            return dist.rvs()
        except KeyError:
            # Debugging information
            print(f"KeyError encountered in sample_from_category:")
            print(f"Column: {column_name}, Category Value: {category_value}")
            print(f"Available Categories in '{column_name}': {self.category_intervals[column_name]}")
            raise


    def preprocess_data(self):
        """
        Function: Convert all the categorical column into numerical column
        Result: Make the dataset have the same data type and prepare for the CDF
        """
        # self._identify_columns()
        # self.assign_intervals()
        self._start_timer()
        for column_name in tqdm(self.category_column):
            self.data[column_name] = self.data[column_name].apply(lambda x: self.sample_from_category(x, column_name))
        self._stop_timer("preprocess_data")

    def best_fit_distribution(self, column_data,column_name):
        distributions = ['norm', 'uniform', 'beta', 'expon', 'truncnorm']
        best_fit = None
        best_p_value = -1
        truncation_threshold = 0.05  # Threshold for considering truncation (5% of data at bounds)
        if column_data.min() < 0 or column_data.skew() > 1:
            distributions.remove('beta')
        # Check for potential truncation
        min_count = np.sum(column_data == column_data.min())
        max_count = np.sum(column_data == column_data.max())
        if min_count + max_count >= truncation_threshold * len(column_data):
            # If significant data at bounds, consider it as truncated
            best_fit = 'truncnorm'
            mean, std_dev = norm.fit(column_data)
            lower_bound = (column_data.min() - mean) / std_dev
            upper_bound = (column_data.max() - mean) / std_dev
            params = truncnorm.fit(column_data, lower_bound, upper_bound)
            _, best_p_value = kstest(column_data, 'truncnorm', args=params)
            return best_fit, best_p_value

        # If not truncated, proceed with other distributions
        for distribution in distributions:
            try:
                if distribution == 'norm':
                    params = norm.fit(column_data)
                    _, p_value = kstest(column_data, 'norm', args=params)
                elif distribution == 'uniform':
                    params = uniform.fit(column_data)
                    _, p_value = kstest(column_data, 'uniform', args=params)
                elif distribution == 'beta':
                    epsilon = 1e-10
                    scaled_data = (column_data - column_data.min() + epsilon) / (column_data.max() - column_data.min() + 2 * epsilon)
                    params = beta.fit(scaled_data, floc=0, fscale=1)
                    _, p_value = kstest(scaled_data, 'beta', args=params)
                elif distribution == 'expon':
                    params = expon.fit(column_data)
                    _, p_value = kstest(column_data, 'expon', args=params)

                if p_value > best_p_value:
                    best_p_value = p_value
                    best_fit = distribution
            except Exception as e:
                logging.error(f"Error in fitting {distribution} distribution for column '{column_name}': {e}")
                logging.info(f"Column '{column_name}' data statistics: {column_data.describe()}")
                logging.info(f"Column '{column_name}' data values (sample): {column_data.sample(10)}")



        return best_fit, best_p_value

    def get_distribution(self):
        self.relevant_columns = (set(self.num_column) | set(self.category_column)) - set(self.unique_identifier_columns) - set(self.binary_columns_for_missing)
        for column in self.relevant_columns:
            try:
                best_fit, best_p_value = self.best_fit_distribution(self.data[column], column)
                self.distributions[column] = best_fit
            except Exception as e:
                print(f"Error fitting distribution for column '{column}': {e}")
                # Optionally, you could log this error to a file instead of printing.
                # logging.error(f"Error fitting distribution for column '{column}': {e}")

        for column in self.relevant_columns:
            try:
                self.column_min[column] = self.data[column].min()
                self.column_max[column] = self.data[column].max()
            except Exception as e:
                print(f"Error calculating min/max for column '{column}': {e}")
                # Similarly, handle or log this error as needed.

    def calculate_cdf(self, column, distribution):
        if distribution == 'norm':
            mean, std = norm.fit(self.data[column])
            self.parameters_collections[column] = {'distribution':'norm', 'mean': mean, 'std': std}
            return norm.cdf(self.data[column], loc=mean, scale=std)

        if distribution == 'beta':
            data_normalized = (self.data[column] - self.data[column].min()) / (self.data[column].max() - self.data[column].min())

            # Estimate the parameters of the beta distribution
            a, b, loc, scale = beta.fit(data_normalized)

            # Calculating the CDF values using the beta distribution
            cdf_values = beta.cdf(data_normalized, a, b, loc, scale)

            self.parameters_collections[column] = {'distribution':'beta', 'a': a, 'b': b, 'loc': loc, 'scale':scale}
            return cdf_values

        if distribution == 'truncnorm':
            mean = self.data[column].mean()
            std = self.data[column].std()
            low = self.data[column].min()
            upp = self.data[column].max()
            low_std, upp_std = (low - mean) / std, (upp - mean) / std
            self.parameters_collections[column] = {'distribution':'truncnorm', 'mean': mean, 'std': std, 'low_std':low_std, 'upp_std': upp_std}
            return truncnorm.cdf(self.data[column], low_std, upp_std, loc=mean, scale=std)

        if distribution == 'uniform':
            min_value = self.data[column].min()
            max_value = self.data[column].max()
            scale = max_value - min_value
            cdf_values = uniform.cdf(self.data[column], loc=min_value, scale=scale)
            self.parameters_collections[column] = {'distribution':'uniform', 'min_value': min_value, 'max_value': max_value}
            return cdf_values

        if distribution == 'expon':
            # The scale parameter for the exponential distribution is the inverse of the mean
            scale = 1 / self.data[column].mean()
            cdf_values = expon.cdf(self.data[column], scale=scale)
            self.parameters_collections[column] = {'distribution': 'expon', 'scale': scale}
            return cdf_values

    def calculate_cdfs(self):
        for column in self.relevant_columns:
            distribution = self.distributions.get(column)
            if distribution:
                self.cdf_results[column] = self.calculate_cdf(column, distribution)
        epsilon = 1e-10  # A small epsilon value
        self.cdf_results = self.cdf_results.mask(self.cdf_results == 0, epsilon)
        self.cdf_results = self.cdf_results.mask(self.cdf_results == 1, 1 - epsilon)

    def standard_gaussian(self, p_value):
        return norm.ppf(p_value)

    def standard_gaussian_all(self):
        for column in self.cdf_results:
            self.column_inverse_cdfs[column] = self.cdf_results[column].apply(lambda x: self.standard_gaussian(x))

    def get_affinity(self):
      """
      Construct the affinity matrix using the Gaussian kernel method.

      Parameters:
      - gamma: float, the scale parameter for the Gaussian kernel.
      """
      # Ensure your data is in a suitable DataFrame format
      self.column_inverse_cdfs_df = pd.DataFrame(self.column_inverse_cdfs)

      # Define the gamma parameter for the Gaussian kernel
      gamma = 5

      # Compute the pairwise squared Euclidean distances between points in the dataset
      sq_dists = squareform(pdist(self.column_inverse_cdfs_df, 'sqeuclidean'))

      # Apply the Gaussian kernel to compute affinities
      affinities = np.exp(-gamma * sq_dists)

      # Store the dense affinities matrix directly
      self.affinity_matrix = affinities


    def get_laplacian(self):
        self.laplacian_matrix = laplacian(self.affinity_matrix, normed=False)
        self.laplacian_matrix = (self.laplacian_matrix + self.laplacian_matrix.T) / 2

    def optimal_clusters_dynamic(self):
      if scipy.sparse.issparse(self.laplacian_matrix):
        # If the Laplacian matrix is sparse, convert it to a dense array
        self.laplacian_matrix = self.laplacian_matrix.toarray()
      # Perform eigenvalue decomposition
      self.laplacian_matrix = self.laplacian_matrix.astype(np.float64)
      np.fill_diagonal(self.laplacian_matrix, self.laplacian_matrix.diagonal() + 1e-5)
      # upper_limit_k = min(self.laplacian_matrix.shape[0]-2, 100)
      eigenvalues, eigenvectors = np.linalg.eigh(self.laplacian_matrix)

      # Sort the eigenvalues in descending order
      sorted_eigenvalues = np.sort(eigenvalues)[::-1]

      # Calculate the percentage change between consecutive eigenvalues
      delta_eigenvalues = np.diff(sorted_eigenvalues) / sorted_eigenvalues[:-1]
      largest_drop_index = np.argmin(delta_eigenvalues)
      # # Calculate percentiles for categorizing delta eigenvalues
      # lower_percentile = np.percentile(delta_eigenvalues, 25)
      # upper_percentile = np.percentile(delta_eigenvalues, 75)

      # # Identifying potential cluster boundaries
      # cluster_boundaries = []
      # for i, delta in enumerate(delta_eigenvalues):
      #     if delta > upper_percentile or (lower_percentile < delta < upper_percentile):
      #         cluster_boundaries.append(i)

      # num_clusters = len(cluster_boundaries) + 1 if cluster_boundaries else 1

      self.num_of_clusters = largest_drop_index + 1

    ## discard since the spectral embedding is not possible to restore the data from the reduced dimension space
    def spectral_embedding(self):
      # Ensure the Laplacian matrix is in the correct format
      if scipy.sparse.issparse(self.laplacian_matrix):
          laplacian_matrix = self.laplacian_matrix.toarray()
      else:
          laplacian_matrix = self.laplacian_matrix

      # Convert the Laplacian matrix to float64 type and add a small value to its diagonal
      laplacian_matrix = laplacian_matrix.astype(np.float64)
      np.fill_diagonal(laplacian_matrix, laplacian_matrix.diagonal() + 1e-5)

      # Compute the eigenvalues and eigenvectors
      eigenvalues, eigenvectors = np.linalg.eigh(laplacian_matrix)

      # Update the spectral embedding result
      self.spectral_embedding_result = eigenvectors[:, 1:self.num_of_clusters + 1]


    # def apply_pca(self):
    #     # Use PCA for dimensionality reduction
    #     if isinstance(self.column_inverse_cdfs, dict):
    #         self.column_inverse_cdfs = pd.DataFrame(self.column_inverse_cdfs)

    #     self.pca = PCA(n_components=self.num_of_clusters)
    #     self.pca_result = self.pca.fit_transform(self.column_inverse_cdfs)

    def get_Kmeans(self):
        kmeans = KMeans(n_clusters=self.num_of_clusters+1, random_state=0)
        self.cluster_labels = kmeans.fit_predict(self.spectral_embedding_result)
        if self.cluster_labels is None:
            raise RuntimeError('KMeans has not been executed.')

        for cluster_label in range(self.num_of_clusters+1):
            # Select data points that belong to the current cluster
            self.df = pd.DataFrame(self.column_inverse_cdfs)
            cluster_data = self.df[self.cluster_labels == cluster_label]
            if len(cluster_data) > 1:
              num_components = min(len(cluster_data), 3)
              gmm = GaussianMixture(n_components = num_components, covariance_type='full', random_state=0)
              gmm.fit(cluster_data)
              self.gmms[cluster_label] = {
                  'model': gmm,
                  'n_samples': cluster_data.shape[0],
                  'is_single_sample': False
              }
            else:
              self.gmms[cluster_label] = {
                  'single_sample': cluster_data,
                  'is_single_sample': True
              }
        # self.gmms = {}
        # centroids = kmeans.cluster_centers_
        # self.gmm = GaussianMixture(n_components=self.num_of_clusters, covariance_type='full', random_state=0)
        # self.gmm.means_init = centroids ## Initial the means to let the apply components on different cluster
        # self.gmm.fit(self.pca_result)

    def generate_data(self):
        sampled_data = []
        for cluster_label, cluster_info in self.gmms.items():
          if cluster_info['is_single_sample']:
            sampled_data.append(cluster_info['single_sample'].to_numpy())
          else:
            # Sample data points for the current cluster based on the original count
            samples, _ = cluster_info['model'].sample(n_samples=cluster_info['n_samples'])
            sampled_data.append(samples)
        sampled_data_combined = np.vstack(sampled_data)
        # synthetic_overall = self.gmm.sample(len(self.data))[0]
        column_names = list(self.relevant_columns)
        # synthetic_original_format = self.pca.inverse_transform(synthetic_overall)
        self.synthetic_data_raw = pd.DataFrame(sampled_data_combined, columns=column_names)
        self.covariance = self.synthetic_data_raw.cov()

    def inverse_cdf(self, p, column, distribution):

        if distribution == 'norm':
            mean, std = self.parameters_collections[column]['mean'], self.parameters_collections[column]['std']
            return norm.ppf(p, loc=mean, scale=std)

        if distribution == 'beta':
            a, b, loc, scale = self.parameters_collections[column]['a'], self.parameters_collections[column]['b'], self.parameters_collections[column]['loc'], self.parameters_collections[column]['scale']
            normalized_values = beta.ppf(p, a, b, loc, scale)
            return normalized_values * (self.data[column].max() - self.data[column].min()) + self.data[column].min()

        if distribution == 'truncnorm':
            p = np.clip(p, 1e-25, 1-1e-25)
            mean = self.parameters_collections[column]['mean']
            std = self.parameters_collections[column]['std']
            low_std, upp_std = self.parameters_collections[column]['low_std'], self.parameters_collections[column]['upp_std']
            return truncnorm.ppf(p, low_std, upp_std, loc=mean, scale=std)

        if distribution == 'uniform':
            min_value = self.parameters_collections[column]['min_value']
            max_value = self.parameters_collections[column]['max_value']
            scale = max_value - min_value
            return uniform.ppf(p, loc=min_value, scale=scale)

        if distribution == 'expon':
            scale = self.parameters_collections[column]['scale']
            return expon.ppf(p, scale=scale)

        # Add logic for other distributions if needed
        return None

    def sample(self, F_inv, Sigma):
        """
        Sample numerical values from the distribution and covariances of the columns.

        Parameters:
        - F_inv: A list of inverse CDF functions for the marginals.
        - Sigma: The covariance matrix.

        Returns:
        - A sample vector x in the original space.
        """
        epsilon = 1e-10
        Sigma_reg = Sigma + np.eye(Sigma.shape[0]) * epsilon
        n = Sigma_reg.shape[0]
        v = np.random.randn(n)
        try:
            L = np.linalg.cholesky(Sigma_reg)
        except np.linalg.LinAlgError:
            print("Change to SVD Decomposition")
            # Fallback: SVD
            U, S, VT = np.linalg.svd(Sigma_reg)
            L = U * np.sqrt(S)

        u = L.dot(v)
        x = [F_inv_i(norm.cdf(u_i)) for F_inv_i, u_i in zip(F_inv, u)]
        return x

    def generate_synthetic_data(self, num_rows):
        """
        Generate synthetic data based on fitted GMMs and a covariance matrix.

        Parameters:
        - num_rows: Number of rows to generate.
        - covariance_matrix: Covariance matrix for the Gaussian Copula.

        Returns:
        - Synthetic dataset.
        """
        self._start_timer()
        F_inv = []
        # inverse_df = pd.DataFrame(self.column_inverse_cdfs)
        relevant_columns = list((set(self.data.columns) - set(self.binary_columns_for_missing)) - set(self.unique_identifier_columns))
        for column in relevant_columns:
            F_inv.append(lambda p, column = column, distribution = self.distributions.get(column): self.inverse_cdf(p, column, distribution))

        synthetic_dataset = [self.sample(F_inv, self.covariance) for _ in tqdm(range(num_rows))]
        synthetic_df = pd.DataFrame(synthetic_dataset, columns= relevant_columns)
        column_mean = synthetic_df.mean()
        self.synthetic_data = synthetic_df.fillna(column_mean)

        for column in self.relevant_columns:
            min_val = self.column_min[column]
            max_val = self.column_max[column]
            self.synthetic_data[column] = self.synthetic_data[column].clip(lower=min_val, upper=max_val)

        for column in self.special_numeric_series_columns:
            prefix = self.special_column_prefixes[column]
            number_length = self.digit_counts[column]
            # Generate synthetic data based on extracted structure
            self.synthetic_data[column] = [
                prefix + self.generate_random_number(number_length)
                for _ in range(num_rows)
            ]

        for column in self.hyphenated_numeric_columns:
            format_pattern = self.hyphenated_format.get(column, [])
            self.synthetic_data[column] = [
                self.generate_random_hyphenated_number(format_pattern) for _ in range(num_rows)
            ]
        self._stop_timer("generate_syn_data")
        return self.synthetic_data

    def numerical_to_category(self, num_value, column_name):
        """ Convert a numerical value back to its corresponding category. """
        for category, (a, b) in self.category_intervals[column_name].items():
            if a <= num_value < b:
                return category
            elif num_value > 1 and round(b) == 1:
                return category
            elif num_value < 0 and round(a) == 0:
                return category
        return None  # Return None or some default value if no category matches

    def numerical_to_datetime(self, num_value):
        """
        Convert a numerical value back to its corresponding datetime.
        """
        return pd.Timestamp("1970-01-01") + pd.to_timedelta(num_value, unit='s')

    def post_process(self):
        """
        Convert all numerical columns back to their original categorical form.

        Parameters:
        - synthetic_df: DataFrame containing synthetic data with numerical values for categorical columns.

        Returns:
        - DataFrame with categorical columns converted back to their original categories.
        """
        self._start_timer()
        for column_name in set(self.category_column) - set(self.unique_identifier_columns) - set(self.binary_columns_for_missing):
            self.synthetic_data[column_name] = self.synthetic_data[column_name].apply(lambda x: self.numerical_to_category(x, column_name))
            synthetic_data = self.synthetic_data

        for column_name in self.datetime_column:
            self.synthetic_data[column_name] = self.synthetic_data[column_name].round().astype(int)
        # Ensure the column is numeric and represents days since the epoch
            self.synthetic_data[column_name] = pd.to_timedelta(self.synthetic_data[column_name], unit='d') + pd.Timestamp("1970-01-01")

        for column in self.columns_with_missing_values:
            original_column = column.replace('_missing', '')
            self.synthetic_data.loc[self.data[column + '_missing'] == 1, original_column] = np.nan
        self._stop_timer("post_process")
        return self.synthetic_data

    def get_execution_times_df(self):
        """
        Convert the execution times dictionary to a DataFrame.

        Returns:
        - pd.DataFrame: A DataFrame with process names and their corresponding execution times.
        """
        return pd.DataFrame(list(self.execution_times.items()), columns=['Process', 'Time (seconds)'])

In [6]:
import pandas as pd
data = pd.read_csv('American_Income.csv')
data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [5]:
G = GaussianCopulaKmeansSynthesizer(data)
G._identify_columns()
G.convert_datetime_to_numerical()
G.handle_missing_values()
G.assign_intervals()
G.preprocess_data()
G.get_distribution()
G.calculate_cdfs()
G.standard_gaussian_all()
G.get_affinity()
G.get_laplacian()
G.optimal_clusters_dynamic()
G.spectral_embedding()
G.get_Kmeans()
G.generate_data()
G.generate_synthetic_data(32561)
G.post_process()

100%|██████████| 9/9 [01:28<00:00,  9.81s/it]
100%|██████████| 32561/32561 [02:30<00:00, 217.02it/s]


Unnamed: 0,marital-status,native-country,education,workclass,education-num,capital-gain,age,hours-per-week,race,relationship,fnlwgt,sex,salary,capital-loss,occupation
0,Divorced,United-States,Some-college,Private,10.836977,166.533298,35.100848,29.415786,White,Own-child,109708.678915,Female,<=50K,551.134282,Craft-repair
1,Never-married,United-States,HS-grad,Private,9.796547,40.256364,36.757348,27.438150,Black,Own-child,218378.252360,Female,<=50K,768.228003,Prof-specialty
2,Married-civ-spouse,United-States,HS-grad,Private,6.431016,8396.496674,35.880076,48.600852,White,Husband,43777.887822,Male,<=50K,688.492539,Transport-moving
3,Married-civ-spouse,United-States,Some-college,Local-gov,10.436430,26725.681815,54.457744,54.257099,White,Husband,299138.481726,Male,>50K,207.308443,Craft-repair
4,Married-civ-spouse,United-States,HS-grad,Private,8.125959,4064.054526,39.327172,20.777603,White,Not-in-family,239440.802337,Female,<=50K,1.089971,Sales
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,Married-civ-spouse,United-States,HS-grad,Private,8.273412,9543.611381,30.772561,42.318478,White,Not-in-family,164399.032969,Male,<=50K,32.720028,Handlers-cleaners
32557,Never-married,United-States,Masters,Private,15.243916,7277.401715,41.397979,35.353132,White,Own-child,378344.346148,Female,>50K,2.236645,Prof-specialty
32558,Never-married,United-States,HS-grad,Local-gov,9.625036,14726.874225,32.048178,44.985374,White,Not-in-family,137281.156801,Male,<=50K,412.135306,Handlers-cleaners
32559,Separated,United-States,Some-college,Local-gov,9.934696,17151.200879,42.240031,18.244887,White,Not-in-family,19302.000000,Male,<=50K,52.180549,Craft-repair


In [9]:
data = pd.read_csv('American_Income.csv')
gckm_varies_AMI = G.synthetic_data
gckm_varies_AMI = gckm_varies_AMI[data.columns]
# gckm_varies_covtype1 = gckm_varies_covtype.astype(int)
# gckm_varies_covtype.to_csv("gckm_varies_covtype.csv", index = False)
# gckm_varies_covtype1
# gckm_varies_TI['Gender'] = gckm_varies_TI['Gender'].replace('Unknown', np.nan)

# # Revert 999 values in 'Age' to np.nan
# gckm_varies_TI['Age'] = gckm_varies_TI['Age'].replace(999, np.nan)

In [None]:
gckm_varies_AMI.to_csv("gckm_varies_AMI.csv", index = False)

### covtype

In [14]:
import pandas as pd
data = pd.read_csv('covtype.csv')

# test_data = data.sample(n=25000, random_state=1)


In [15]:
dsicard_column = ['Soil_Type15', 'Soil_Type7', 'Soil_Type25']

data_modified = data.drop(dsicard_column, axis = 1)

In [16]:
data_modified

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581007,2396,153,20,85,17,108,240,237,118,837,...,0,0,0,0,0,0,0,0,0,3
581008,2391,152,19,67,12,95,240,237,119,845,...,0,0,0,0,0,0,0,0,0,3
581009,2386,159,17,60,7,90,236,241,130,854,...,0,0,0,0,0,0,0,0,0,3
581010,2384,170,15,60,5,90,230,245,143,864,...,0,0,0,0,0,0,0,0,0,3


In [None]:
G = GaussianCopulaKmeansSynthesizer(data_modified)
G._identify_columns()
G.convert_datetime_to_numerical()
G.handle_missing_values()
G.assign_intervals()
G.preprocess_data()
G.get_distribution()
G.calculate_cdfs()
G.standard_gaussian_all()
G.get_affinity()
G.get_laplacian()
G.optimal_clusters_dynamic()
G.spectral_embedding()
G.get_Kmeans()
G.generate_data()
G.generate_synthetic_data(581012)
G.post_process()

0it [00:00, ?it/s]
100%|██████████| 581012/581012 [3:18:55<00:00, 48.68it/s]


Unnamed: 0,Soil_Type28,Soil_Type1,Soil_Type27,Wilderness_Area2,Soil_Type20,Soil_Type34,Wilderness_Area3,Soil_Type23,Aspect,Soil_Type3,...,Soil_Type30,Soil_Type16,Soil_Type14,Soil_Type13,Soil_Type33,Soil_Type2,Soil_Type24,Soil_Type9,Soil_Type19,Elevation
0,0.018781,0.023366,0.023347,0.129231,0.052205,0.023188,0.000000e+00,1.336271e-01,284.478045,0.157186,...,6.403886e-01,0.084927,0.019415,0.000919,0.033405,0.139443,0.005104,0.039586,0.019080,2488.097095
1,0.044317,0.062513,0.055928,0.137513,0.258282,0.024772,2.996657e-01,4.633661e-07,10.863575,0.095926,...,5.146831e-08,0.072088,0.009672,0.095048,0.746711,0.407837,0.006878,0.046044,0.044581,2568.309519
2,0.018972,0.025791,0.012503,0.889474,0.068539,0.058033,4.018715e-01,5.680456e-01,360.000000,0.086837,...,3.703282e-01,0.082699,0.014625,0.086798,0.465384,0.050695,0.000484,0.028707,0.085442,3364.794083
3,0.031052,0.075753,0.034443,0.198733,0.396411,0.066999,3.502641e-01,3.998671e-07,147.290133,0.030150,...,1.509052e-01,0.104825,0.020013,0.011056,0.998677,0.166730,0.000005,0.013510,0.113687,2754.107587
4,0.020637,0.115737,0.037342,0.279970,0.017136,0.031815,1.255742e-09,9.987285e-01,99.311722,0.012872,...,7.124210e-04,0.018290,0.018217,0.321472,0.000031,0.001023,0.001168,0.039694,0.156007,3301.585198
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581007,0.049058,0.011355,0.032393,0.224948,0.049859,0.070041,9.873916e-01,5.929452e-01,146.784988,0.000284,...,4.535462e-06,0.013190,0.020806,0.008937,0.001916,0.109361,0.233884,0.018863,0.125789,2983.558133
581008,0.036872,0.017912,0.033581,0.000051,0.363692,0.046111,1.000000e+00,4.492311e-09,1.880963,0.000016,...,1.734364e-01,0.011762,0.028080,0.062826,1.000000,0.027897,0.112165,0.025245,0.005508,3338.564521
581009,0.014101,0.143434,0.073862,0.040220,0.157744,0.036659,9.999912e-01,5.028765e-01,81.686850,0.036434,...,9.993761e-01,0.068628,0.020404,0.333676,0.106842,0.257077,0.005989,0.033668,0.107332,2483.447152
581010,0.018045,0.063388,0.042520,0.357916,0.243961,0.030296,6.605237e-01,2.527795e-10,100.379550,0.033785,...,7.210535e-04,0.028141,0.021241,0.000679,0.402033,0.107278,0.035865,0.060718,0.010631,3024.283052


In [None]:
gckm_covtype = G.synthetic_data
gckm_covtype[dsicard_column] = data[dsicard_column].values

gckm_covtype = gckm_covtype[data.columns]

gckm_covtype.astype(int)

gckm_covtype.to_csv("gckm_varies_covtype.csv", index = False)

In [None]:
gckm_covtype

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,2488.097095,284.478045,6.957768,0.003730,26.232055,1619.804282,227.446983,233.466802,144.732773,1446.120603,...,3.438236e-10,0.033405,0.023188,0.005719,0.006836,0.011801,0.985026,0.000045,0.009078,5.069944
1,2568.309519,10.863575,23.492412,0.000059,66.762971,390.136048,234.150267,203.424398,88.840819,159.002291,...,9.824280e-03,0.746711,0.024772,0.047689,0.009607,0.011514,0.000338,0.105039,0.055005,1.001957
2,3364.794083,360.000000,6.037717,0.005840,131.758733,4361.784094,208.336808,254.000000,182.075047,2974.576736,...,5.896094e-01,0.465384,0.058033,0.014836,0.009753,0.015903,0.255418,0.176824,0.064182,1.489700
3,2754.107587,147.290133,23.172100,0.000106,108.685090,1947.058119,252.604512,211.217532,47.389499,3305.847673,...,5.865535e-02,0.998677,0.066999,0.035395,0.008850,0.011687,0.018361,0.030674,0.071890,1.008692
4,3301.585198,99.311722,1.905793,0.000001,-64.195537,6090.961298,214.405313,229.199523,146.635175,3100.456218,...,1.927114e-01,0.000031,0.031815,0.017969,0.010192,0.018415,0.427394,0.231431,0.113134,1.037167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581007,2983.558133,146.784988,6.932425,0.005900,158.181989,2063.928342,225.785697,231.831427,143.597589,925.786350,...,1.000000e+00,0.001916,0.070041,0.040814,0.008299,0.011972,0.177958,0.000116,0.000804,4.938756
581008,3338.564521,1.880963,10.613831,0.004390,115.010891,5937.567003,206.273094,198.241661,130.266114,2813.996090,...,2.282226e-03,1.000000,0.046111,0.044029,0.010910,0.013201,0.015269,0.115547,0.049981,6.764417
581009,2483.447152,81.686850,18.658150,0.000005,-45.938587,1289.930244,239.638862,225.067256,107.528924,2646.165512,...,9.989151e-01,0.106842,0.036659,0.029652,0.009799,0.014037,0.148615,0.099694,0.000152,6.425261
581010,3024.283052,100.379550,27.756896,0.012733,112.634091,1652.907351,188.933350,176.882637,129.884808,2752.929530,...,6.308180e-01,0.402033,0.030296,0.099255,0.009029,0.009470,0.192755,0.183247,0.091629,1.001370


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
gckm_covtype.to_csv('/content/drive/My Drive/mydata.csv', index = False)

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd


# Load the MNIST dataset
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Normalize the dataset to a range of 0 to 1
x_train, x_test = x_train / 255.0, x_test / 255.0

# Merge the training and test datasets
x_combined = np.concatenate((x_train, x_test), axis=0)

# Resize the images to 12x12
x_resized = tf.image.resize(x_combined[..., tf.newaxis], [12, 12])

# Flatten the images to a shape of (n, 144)
x_flattened = tf.reshape(x_resized, (-1, 12*12))

# Apply a threshold to convert pixel values to boolean (True for pixels > 0.5, False otherwise)
x_boolean = (x_flattened.numpy() > 0.5).astype(int)

# Create a DataFrame from the boolean array
df_mnist_boolean = pd.DataFrame(x_boolean)

# Optional: Give columns meaningful names
column_names = [f'pixel_{i}' for i in range(1, 145)]
df_mnist_boolean.columns = column_names

y_combined = np.concatenate((y_train, y_test), axis=0)

df_mnist_boolean['Number'] = y_combined

df_mnist_boolean

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


Unnamed: 0,pixel_1,pixel_2,pixel_3,pixel_4,pixel_5,pixel_6,pixel_7,pixel_8,pixel_9,pixel_10,...,pixel_136,pixel_137,pixel_138,pixel_139,pixel_140,pixel_141,pixel_142,pixel_143,pixel_144,Number
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
69996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
69997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
69998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5


In [None]:
proportion_zeros = (df_mnist_boolean == 0).mean()

columns_to_keep = proportion_zeros[proportion_zeros < 0.98].index

df_filtered = df_mnist_boolean[columns_to_keep]

columns_out = set(df_mnist_boolean.columns) - set(columns_to_keep)

if 'Number' not in df_filtered.columns:
    df_filtered['Number'] = df_mnist_boolean['Number']

df_filtered.to_csv('minist_12_filtered.csv', index = False)

minister_12_filtered = pd.read_csv('minist_12_filtered.csv')

In [None]:
G = GaussianCopulaKmeansSynthesizer(minister_12_filtered)
G._identify_columns()
G.convert_datetime_to_numerical()
G.handle_missing_values()
G.assign_intervals()
G.preprocess_data()
G.get_distribution()
G.calculate_cdfs()
G.standard_gaussian_all()
G.get_affinity()
G.get_laplacian()
G.optimal_clusters_dynamic()
G.spectral_embedding()
G.get_Kmeans()
G.generate_data()
G.generate_synthetic_data(70000)
G.post_process()

0it [00:00, ?it/s]
100%|██████████| 70000/70000 [55:57<00:00, 20.85it/s]


Unnamed: 0,pixel_45,pixel_106,pixel_124,pixel_32,pixel_28,pixel_117,pixel_19,pixel_101,pixel_93,pixel_67,...,pixel_83,Number,pixel_34,pixel_91,pixel_63,pixel_56,pixel_94,pixel_69,pixel_118,pixel_128
0,0.998881,0.013912,2.454810e-01,4.340896e-02,0.011799,1.000000e+00,5.560666e-01,7.276023e-01,1.000000e+00,1.000000e+00,...,0.255930,8.396702e+00,0.789920,1.471669e-01,6.096211e-01,2.664535e-15,1.000000e+00,6.564072e-01,0.454278,9.911268e-01
1,0.002725,0.999999,9.925331e-02,2.954647e-07,0.166546,9.936124e-01,8.677593e-12,9.999662e-01,5.655170e-01,1.000000e+00,...,0.277927,8.426519e+00,0.554020,4.587226e-01,1.721190e-02,0.000000e+00,7.798818e-02,9.970380e-01,0.000003,9.752819e-01
2,0.999999,0.038367,4.805469e-01,1.538812e-02,0.006468,9.993045e-01,7.755672e-01,1.110223e-16,9.985565e-01,1.899168e-03,...,0.137430,4.819514e+00,0.273675,1.665335e-16,1.036671e-01,0.000000e+00,8.520945e-04,5.376235e-01,0.281153,9.996919e-01
3,0.478350,1.000000,5.026000e-12,9.986001e-01,0.577578,9.335761e-01,9.976480e-02,9.718722e-01,8.451775e-07,9.892636e-04,...,0.041403,8.999714e+00,0.008227,9.919715e-01,3.274102e-01,9.073736e-01,6.321698e-01,1.000000e+00,0.346608,6.546829e-01
4,0.999457,0.116532,3.357118e-01,2.750869e-02,0.524319,9.335219e-01,1.743879e-01,1.820321e-01,2.220446e-16,6.911138e-14,...,0.050756,9.000000e+00,0.419595,9.190469e-02,1.656698e-01,9.987887e-01,5.711904e-06,6.083203e-01,0.140854,9.999999e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,0.804498,0.807453,6.883402e-01,7.850415e-01,0.524785,1.000000e+00,2.440934e-02,2.200996e-03,1.000000e+00,0.000000e+00,...,0.143754,4.302081e-08,0.501236,1.438916e-07,2.019873e-03,9.998993e-01,2.119410e-03,1.000000e+00,0.305177,4.814168e-01
69996,0.000000,0.279253,8.234577e-06,2.020110e-01,0.197268,1.345339e-01,9.967904e-01,3.250598e-07,2.220446e-16,6.709756e-10,...,0.222000,3.609772e+00,0.003897,1.000000e+00,8.026237e-08,3.238890e-02,9.987237e-01,1.789647e-02,0.069469,3.107615e-09
69997,0.001340,0.195050,3.620523e-01,5.687419e-03,0.001797,4.163336e-17,8.333225e-02,1.000000e+00,1.515967e-02,1.000000e+00,...,0.224536,6.164638e+00,0.002285,9.170119e-01,3.148787e-08,9.999998e-01,2.801470e-01,9.594880e-01,0.131626,1.946705e-05
69998,0.785675,1.000000,4.102746e-01,1.000000e+00,0.539224,9.992256e-01,8.668845e-01,1.110223e-16,5.112701e-01,5.320153e-01,...,0.075464,2.959789e-01,0.159005,3.282929e-02,4.342273e-01,3.618666e-06,9.980050e-01,5.551115e-17,0.030877,9.864893e-01


In [None]:
gckm_varies_mnist = G.synthetic_data

gckm_varies_mnist = gckm_varies_mnist[minister_12_filtered.columns]

gckm_varies_mnist = gckm_varies_mnist.astype(int)

gckm_varies_mnist[list(columns_out)] = df_mnist_boolean[list(columns_out)].values
gckm_varies_mnist = gckm_varies_mnist[df_mnist_boolean.columns]
gckm_varies_mnist.to_csv('gckm_varies_mnist_12_new.csv', index = False)