# Analysis and Plots of samples on the different datasets

In [None]:
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

# append the parent directory to the path
import sys
sys.path.append('..')

from sklearn.model_selection import train_test_split

import jellyfish

import yaml

import experiment_utils
import seaborn as sns
import tqdm

import csv

import analysis 
import utils

# re-load upon module change
%load_ext autoreload
%autoreload 2

#### load the samples on all the different datasets

In [None]:
# datasets whose original csv and model samples are addressed by name
datasets = ['IRIS', 'uci-wine', 'sklearn-diabetes', 'titanic-train',  'openml-diabetes', 'adult', 'california-housing']

# one dictionary per source, each keyed by dataset name (dtype=str is passed to every loader)
original_data, gpt4_samples, gpt35_samples = {}, {}, {}
for dataset in datasets:
    # read all three files first, then register them together
    frames = (
        utils.load_csv_df(f'../csv/{dataset}.csv', dtype=str),
        pd.read_csv(f'../results/gpt-3.5-turbo/samples/{dataset}-temperature-0.7.csv', dtype=str),
        pd.read_csv(f'../results/gpt-4-32k-0314/samples/{dataset}-temperature-0.7.csv', dtype=str),
    )
    original_data[dataset], gpt35_samples[dataset], gpt4_samples[dataset] = frames

# fico is read from a separate, private directory with its own file naming
datasets.append('fico')
original_data['fico'], gpt35_samples['fico'], gpt4_samples['fico'] = (
    pd.read_csv('../../private-do-not-distribute/fico.csv', dtype=str),
    pd.read_csv('../../private-do-not-distribute/results/fico-samples-gpt-3.5-temperature-0.7.csv', dtype=str),
    pd.read_csv('../../private-do-not-distribute/results/fico-samples-gpt-4-temperature-0.7.csv', dtype=str),
)

In [None]:
# short display names for the leading columns of every dataset
formatted_feature_names = {
    'IRIS': ['S-Length', 'S-Width', 'P-Length', 'P-Width', 'Species'],
    'uci-wine': ['T', 'Alc', 'MAc', 'Ash', 'Alca', 'Mag', 'Phen', 'Flav', 'NFP', 'Pro.', 'Inten', 'Hue', 'od', 'Prol'],
    'sklearn-diabetes': ['Age', 'Sex', 'bmi', 'bp', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'Y'],
    'titanic-train': ['Id', 'Surv', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embkd'],
    'openml-diabetes': ['Preg', 'Gluc', 'BP', 'STkns', 'Insul', 'BMI', 'DiaPF', 'Age', 'Out'],
    'adult': ['Age', 'Work', 'fnl', 'Edu', 'EduNum', 'Mar', 'Occ', 'Rel', 'Race', 'Gen', 'CGain', 'CLoss', 'Hours', 'Coun', 'Inc'],
    'california-housing': ['Long', 'Lat', 'MAge', 'NR', 'NBR', 'Pop', 'Hou', 'Inc', 'Val', 'Oce'],
    'fico': ['RP', 'ERE', 'MSO', 'MSM', 'AMIF', 'NST', 'NT60', 'NT90', 'PTND', 'MSMR', 'MDP12'],
}

for d in datasets:
    short_names = formatted_feature_names[d]
    # the short names replace the leading labels; any further columns keep their original label
    untouched = list(original_data[d].columns)[len(short_names):]
    colnames = short_names + untouched

    # the labels derived from the original data are applied to all three frames
    for frames in (original_data, gpt4_samples, gpt35_samples):
        frames[d].columns = colnames

## feature values

In [None]:
# print the first five rows of the original data, the GPT-3.5 samples and the
# GPT-4 samples (in that order) for every dataset, without line breaks
head_format = dict(index=False, header=False, line_width=1000)
for dataset in datasets:
    previews = [source[dataset].copy(deep=True) for source in (original_data, gpt35_samples, gpt4_samples)]
    print(dataset)
    for preview in previews:
        print(preview.head(5).to_string(**head_format))

### correlation table for all datasets (at most 8 features)

In [None]:
# Correlation heatmaps of the original data next to the GPT-3.5 and GPT-4 samples.
# For every dataset this writes
#   figures/{dataset}-heatmap.png          joint plot without titles
#   figures/{dataset}-heatmap.pdf          joint plot with titles
#   figures/{dataset}-heatmap-{name}.png   one plot per source (data, gpt35, gpt4)

# savefig does not create missing directories
os.makedirs('figures', exist_ok=True)

# increase font size (global seaborn setting, so it only has to be applied once)
sns.set(font_scale=1.4)

# number of leading numeric features shown in each heatmap
max_features = 8

# every heatmap shares the same color scale, palette and annotation format
v_min = -1
v_max = 1
heatmap_style = dict(annot=True, fmt='.2f', vmin=v_min, vmax=v_max, cmap=sns.diverging_palette(20, 220, n=200), square=True, annot_kws={"size": 14})

for dataset in datasets:
    print(dataset)
    # work on copies: the conversion below modifies the frames in place
    df_data = original_data[dataset].copy(deep=True)
    df_gpt35 = gpt35_samples[dataset].copy(deep=True)
    df_gpt4 = gpt4_samples[dataset].copy(deep=True)

    # convert every column that parses as float and discard the others
    for df in [df_data, df_gpt35, df_gpt4]:
        non_numeric = []
        for var in df.columns:
            try:
                df[var] = df[var].astype(float)
            except (ValueError, TypeError):
                # only a failed conversion marks the column as non-numeric;
                # anything else (e.g. KeyboardInterrupt) must propagate
                non_numeric.append(var)
        # drop after the loop so the columns are not modified while iterating over them
        df.drop(columns=non_numeric, inplace=True)

    # correlation matrices of the first features, computed once and reused by all plots
    corr_data = df_data.iloc[:, :max_features].corr()
    corr_gpt35 = df_gpt35.iloc[:, :max_features].corr()
    corr_gpt4 = df_gpt4.iloc[:, :max_features].corr()

    ############ JOINT PLOT ############
    # a figure with 3 heatmaps, one for each source
    fig, axs = plt.subplots(1, 3, figsize=(25, 5), layout='compressed')
    for ax, corr in zip(axs, (corr_data, corr_gpt35, corr_gpt4)):
        sns.heatmap(corr, ax=ax, **heatmap_style)

    # only the leftmost heatmap keeps its y tick labels
    axs[1].set_yticks([])
    axs[2].set_yticks([])
    # remove the colorbar from all three heatmaps
    for ax in axs:
        ax.collections[0].colorbar.remove()

    # the png is saved before the titles are added, the pdf afterwards
    plt.savefig(f'figures/{dataset}-heatmap.png', dpi=600, bbox_inches='tight')

    axs[0].set_title('Dataset')
    axs[1].set_title('GPT-3.5')
    axs[2].set_title('GPT-4')

    plt.savefig(f'figures/{dataset}-heatmap.pdf', bbox_inches='tight')
    plt.show()
    # release the figure explicitly so open figures do not pile up across datasets
    plt.close(fig)

    ############ INDIVIDUAL PLOTS ############
    for corr, name in [(corr_data, 'data'), (corr_gpt35, 'gpt35'), (corr_gpt4, 'gpt4')]:
        # plot the correlation matrix
        fig, ax = plt.subplots(figsize=(7, 7))
        sns.heatmap(corr, ax=ax, **heatmap_style)
        ax.collections[0].colorbar.remove()
        plt.savefig(f'figures/{dataset}-heatmap-{name}.png', dpi=600, bbox_inches='tight')
        plt.show()
        plt.close(fig)

### 

### fraction of samples from data vs. fraction of values from data

In [None]:
# for every dataset and model, print the fraction of sampled rows for which
# analysis.is_in_df reports a match in the original data (row taken as a whole)
for dataset in datasets:
    data_df = original_data[dataset]

    print(dataset)
    for model_name, model_samples in (('GPT-3.5-turbo', gpt35_samples), ('GPT-4', gpt4_samples)):
        samples_df = model_samples[dataset]
        n_rows = samples_df.shape[0]
        # count the sampled rows that are found in the original data
        num_in_df = sum(1 for idx in range(n_rows) if analysis.is_in_df(data_df, samples_df.iloc[idx]))
        print(model_name, num_in_df / n_rows)
    print('-'*80)

In [None]:
# pick the dataset at index 6 of `datasets` and pull out its three frames for inspection
d = datasets[6]
data_df, samples35_df, samples4_df = original_data[d], gpt35_samples[d], gpt4_samples[d]

In [None]:
# preview the first rows of the original data of the selected dataset
data_df.head()

In [None]:
# preview the first rows of the GPT-3.5 samples of the selected dataset
samples35_df.head()

In [None]:
# preview the first rows of the GPT-4 samples of the selected dataset
samples4_df.head()

### best n-gram match

In [None]:
# for every dataset and model, print the number of columns minus the mean of the
# minimum distances returned by analysis.find_matches, followed by the number of columns
for dataset in datasets:
    data_df = original_data[dataset]
    n_columns = len(data_df.columns)

    for model_name, model_samples in (('GPT-3.5-turbo', gpt35_samples), ('GPT-4', gpt4_samples)):
        samples_df = model_samples[dataset]

        print(dataset)
        # find_matches returns a pair; only its first element (the minimum distance) is kept
        match_results = (
            analysis.find_matches(data_df, samples_df.iloc[idx], utils.strings_unequal)
            for idx in range(samples_df.shape[0])
        )
        n_gram_distance = [min_dist for min_dist, _ in match_results]
        print(model_name, n_columns-np.mean(n_gram_distance), n_columns)

    print('-'*80)

#### individual feature values from the training data

In [None]:
# For every dataset and model, match each individual feature value of the first
# `num_rows` samples against the original data with analysis.find_matches and
# print a percentage derived from the mean minimum distance.
num_rows = 100

for dataset in datasets:
    data_df = original_data[dataset]
    feature_names = list(data_df.columns)
    n_features = len(feature_names)

    print(dataset)
    for model_name, model_samples in (('GPT-3.5-turbo', gpt35_samples), ('GPT-4', gpt4_samples)):
        samples_df = model_samples[dataset]
        # never index past the last sample: .iloc raises IndexError for an out-of-range position
        rows_to_check = min(num_rows, samples_df.shape[0])

        fvd = []
        for i in range(rows_to_check):
            row = samples_df.iloc[i]
            for feature_name in feature_names:
                # row[[feature_name]] is a one-element Series holding only this feature
                min_dist, _ = analysis.find_matches(data_df, row[[feature_name]], utils.strings_unequal)
                fvd.append(min_dist)
        # NOTE(review): this is the same expression as in the n-gram cell. If find_matches
        # returns 0 or 1 for a one-element row, 100*(1-np.mean(fvd)) may be the intended
        # percentage -- confirm against analysis.find_matches before changing it.
        print(model_name, 100*(n_features-np.mean(fvd))/n_features)
    print('-'*80)