# Analysis and Plots of ordered feature completion

In [None]:
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

# append the parent directory to the path
import sys
sys.path.append('..')

from sklearn.model_selection import train_test_split

import jellyfish

import yaml

import experiment_utils
import seaborn as sns
import tqdm

import seaborn as sns
import pickle

import analysis 
import utils

# re-load upon module change
%load_ext autoreload
%autoreload 2

### We analyse the completion of the last feature in the data frame

In [None]:
def conditional_completion_analysis(csv_file, completions_df, rng=None):
    """Analysis for the conditional completion test.

    For every distinct value ``k`` of ``completions_df["num_prefix_features"]``,
    the feature at position ``k`` of ``utils.get_feature_names(csv_file)`` is
    treated as the completed feature. Each response row is cut down to its first
    ``k + 1`` entries (the prefix plus the completed feature) and passed to
    ``analysis.is_in_df(data_df, response)`` three times:

    1. with the completion as given in the response,
    2. with the completion replaced by a value drawn at random from that
       feature's column in the dataset (marginal baseline),
    3. with the completion replaced by the most frequent value of that
       feature in the dataset (mode baseline).

    The mean and standard error of each of the three outcome lists are printed
    per feature.

    Args:
        csv_file: Path handed to ``utils.load_csv_df`` and
            ``utils.get_feature_names``.
        completions_df: Data frame of responses with a
            ``"num_prefix_features"`` column. It is not modified.
        rng: Optional object exposing ``choice`` (for example
            ``np.random.default_rng(seed)``) used for the marginal baseline.
            Defaults to the global ``np.random`` state, i.e. the previous,
            unseeded behaviour.

    Returns:
        dict mapping ``(completion_feature_name, num_prefix_features)`` to a
        dict with the lists ``"valid_completions"``,
        ``"valid_marginal_completions"`` and ``"valid_mode_completions"``
        (one ``analysis.is_in_df`` result per response row; presumably
        booleans -- confirm against ``analysis.is_in_df``).
    """
    data_df = utils.load_csv_df(csv_file)
    feature_names = utils.get_feature_names(csv_file)
    sampler = np.random if rng is None else rng
    results = {}

    # one pass per distinct number of prefix features
    for num_prefix_features in completions_df["num_prefix_features"].unique().tolist():
        completion_feature_name = feature_names[num_prefix_features]
        marginal_distribution = data_df[completion_feature_name].values
        mode_completion = data_df[completion_feature_name].mode().values[0]
        valid_completions = []
        valid_marginal_completions = []
        valid_mode_completions = []

        # the responses that were generated with this number of prefix features
        df = completions_df[completions_df["num_prefix_features"] == num_prefix_features]

        for _, row in df.iterrows():
            # Look at the response up to num_prefix_features + 1 (that is, including
            # the first completed feature). Take an explicit copy so that the
            # replacements below can never write through to `row` / `completions_df`.
            response = row.iloc[: num_prefix_features + 1].copy()

            # does the response, as completed, occur in the dataset?
            valid_completions.append(analysis.is_in_df(data_df, response))

            # replace the actual completion with a draw from the marginal distribution in the dataset
            response[completion_feature_name] = sampler.choice(marginal_distribution)
            valid_marginal_completions.append(analysis.is_in_df(data_df, response))

            # replace the actual completion with the most common completion in the dataset
            response[completion_feature_name] = mode_completion
            valid_mode_completions.append(analysis.is_in_df(data_df, response))

        # report mean and standard error for the three variants
        print("Feature: ", completion_feature_name)
        for outcomes in (valid_completions, valid_marginal_completions, valid_mode_completions):
            print(np.mean(outcomes), np.std(outcomes) / np.sqrt(len(outcomes)))
        print('-'*80)

        results[(completion_feature_name, num_prefix_features)] = {
            "valid_completions": valid_completions,
            "valid_marginal_completions": valid_marginal_completions,
            "valid_mode_completions": valid_mode_completions
        }
    return results

In [None]:
# Adult: completion analysis for a single feature
csv_file = '../csv/adult.csv'

adult_feature_names = utils.get_feature_names(csv_file)
features = [
    'Education', 'EducationNum', 'Occupation', 'Gender',
    'CapitalLoss', 'HoursPerWeek', 'NativeCountry', 'Income',
]

# the feature whose stored completions are analysed in this cell
feature_name = 'Occupation'
completions_path = f'../results/gpt-4-32k-0314/ordered-completion/adult-{feature_name}.csv'
completions_df = pd.read_csv(completions_path, dtype=str)
# (alternative input used earlier: '../results/adult-completion-v1(full).csv')

# every column except the last one is part of the prefix
num_columns = len(completions_df.columns)
completions_df["num_prefix_features"] = num_columns - 1

conditional_completion_analysis(csv_file, completions_df)

### adult

In [None]:
# Run the completion analysis for every completed Adult feature and collect the
# per-feature results in a single dictionary.
adult_results = {}
csv_file = '../csv/adult.csv'
feature_names = utils.get_feature_names(csv_file)
completion_fn = ['Education', 'EducationNum', 'Occupation', 'Gender', 'CapitalLoss', 'HoursPerWeek', 'NativeCountry', 'Income']
for feature_name in completion_fn:
    # GPT-3.5-turbo completions; for GPT-4 read
    # f'../results/gpt-4-32k-0314/ordered-completion/adult-{feature_name}.csv' instead
    completions_df = pd.read_csv(f'../results/gpt-3.5-turbo/ordered-completion/adult-{feature_name}.csv', dtype=str)
    # every column except the last one is part of the prefix
    completions_df["num_prefix_features"] = len(completions_df.columns) - 1
    # for debugging, restrict to the first rows: completions_df = completions_df[:10]
    results = conditional_completion_analysis(csv_file, completions_df)
    # join dictionaries (the stray trailing `1` here used to be a SyntaxError)
    adult_results.update(results)

In [None]:
# store results with pickle (GPT-4 path; only valid if the loop above was run on the GPT-4 completions)
#with open('../results/gpt-4-32k-0314/ordered-completion/adult-results.pickle', 'wb') as handle:
#    pickle.dump(adult_results, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# store results with pickle (GPT-3.5-turbo path; only valid if the loop above was run on the GPT-3.5-turbo completions)
#with open('../results/gpt-3.5-turbo/ordered-completion/adult-results.pickle', 'wb') as handle:
#    pickle.dump(adult_results, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# load the stored results for both models
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# were written by this notebook.
with open('../results/gpt-4-32k-0314/ordered-completion/adult-results.pickle', 'rb') as handle:
    adult_results = pickle.load(handle)
with open('../results/gpt-3.5-turbo/ordered-completion/adult-results.pickle', 'rb') as handle:
    adult_results35 = pickle.load(handle)

In [None]:
# Summary statistics for the plot: the x axis holds the features and there is
# one curve per type of completion. For every entry we compute the mean and the
# half-width of the band, 1.96 * std / sqrt(n), and derive the bounds from them.

# GPT-4 completions
gpt4_band = [
    (np.mean(run['valid_completions']),
     1.96 * np.std(run['valid_completions']) / np.sqrt(len(run['valid_completions'])))
    for run in adult_results.values()
]
completions_mean = [center for center, half in gpt4_band]
completions_lower_95 = [center - half for center, half in gpt4_band]
completions_upper_95 = [center + half for center, half in gpt4_band]

# GPT-3.5-turbo completions
gpt35_band = [
    (np.mean(run['valid_completions']),
     1.96 * np.std(run['valid_completions']) / np.sqrt(len(run['valid_completions'])))
    for run in adult_results35.values()
]
completions35_mean = [center for center, half in gpt35_band]
completions35_lower_95 = [center - half for center, half in gpt35_band]
completions35_upper_95 = [center + half for center, half in gpt35_band]

# baseline: completion drawn from the marginal distribution
marginal_band = [
    (np.mean(run['valid_marginal_completions']),
     1.96 * np.std(run['valid_marginal_completions']) / np.sqrt(len(run['valid_marginal_completions'])))
    for run in adult_results.values()
]
marginal_completions_mean = [center for center, half in marginal_band]
marginal_completions_lower_95 = [center - half for center, half in marginal_band]
marginal_completions_upper_95 = [center + half for center, half in marginal_band]

# baseline: most common completion
mode_band = [
    (np.mean(run['valid_mode_completions']),
     1.96 * np.std(run['valid_mode_completions']) / np.sqrt(len(run['valid_mode_completions'])))
    for run in adult_results.values()
]
mode_completions_mean = [center for center, half in mode_band]
mode_completions_lower_95 = [center - half for center, half in mode_band]
mode_completions_upper_95 = [center + half for center, half in mode_band]

In [None]:
# adjust overall plot font size
sns.set(font_scale=1.4)

# white, no grid
sns.set_style("white")

# x axis: one tick per completed feature; the tick positions are derived from
# this list so labels and positions cannot get out of sync
feature_labels = ['Education', 'EducationNum', 'Occupation', 'Gender', 'CapitalLoss', 'HoursPerWeek', 'NativeCountry', 'Income']

# plot
fig, ax = plt.subplots(figsize=(10, 5))

# one dashed line with dot markers plus a shaded band per curve
# (the 'Mode' baseline, mode_completions_*, is intentionally not plotted)
curves = [
    ('GPT-4', 'blue', completions_mean, completions_lower_95, completions_upper_95),
    ('GPT-3.5-turbo', 'green', completions35_mean, completions35_lower_95, completions35_upper_95),
    ('Marginal Distribution', 'orange', marginal_completions_mean, marginal_completions_lower_95, marginal_completions_upper_95),
]
for curve_label, curve_color, curve_mean, curve_lower, curve_upper in curves:
    ax.plot(curve_mean, label=curve_label, linestyle='dashed', marker='o', markersize=8, linewidth=2, color=curve_color)
    ax.fill_between(range(len(curve_mean)), curve_lower, curve_upper, alpha=0.2, color=curve_color)

ax.set_xticks(range(len(feature_labels)))
ax.set_xticklabels(feature_labels, rotation=25)

# y axis label
ax.set_ylabel('Probability of a Valid Completion')

# title
ax.set_title('Adult Income')

# legend below the plot
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.3), fancybox=True, shadow=True, ncol=3)

# save plot (create the output directory first so savefig cannot fail on a fresh checkout)
os.makedirs('figures', exist_ok=True)
fig.savefig('figures/adult-ordered-completion.pdf', bbox_inches='tight')

### fico

In [None]:
# Run the completion analysis for every completed FICO feature and collect the
# per-feature results in a single dictionary.
fico_results = {}
csv_file = '../../private-do-not-distribute/fico.csv'
feature_names = utils.get_feature_names(csv_file)
completion_fn = [
    'AverageMInFile',
    'NumTrades90Ever2DerogPubRec',
    'MaxDelq2PublicRecLast12M',
    'NumTradesOpeninLast12M',
    'NumInqLast6M',
    'NetFractionInstallBurden',
    'NumBank2NatlTradesWHighUtilization',
    'PercentTradesWBalance',
]
for feature_name in completion_fn:
    # GPT-3.5-turbo completions; the GPT-4 files follow the pattern
    # '../../private-do-not-distribute/results/fico-ordered-completion-gpt4-{feature_name}.csv'
    completions_path = f'../../private-do-not-distribute/results/fico-ordered-completion-gpt3.5-turbo-{feature_name}.csv'
    completions_df = pd.read_csv(completions_path, dtype=str)
    # every column except the last one is part of the prefix
    num_columns = len(completions_df.columns)
    completions_df["num_prefix_features"] = num_columns - 1
    # for debugging, restrict to the first rows: completions_df = completions_df[:10]
    results = conditional_completion_analysis(csv_file, completions_df)
    # merge this feature's results into the overall dictionary
    fico_results.update(results)

In [None]:
# store results with pickle (GPT-4 path; only valid if the loop above was run on the GPT-4 completions)
#with open('../../private-do-not-distribute/results/fico-ordered-completion-gpt4-results.pickle', 'wb') as handle:
#   pickle.dump(fico_results, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
#with open('../../private-do-not-distribute/results/fico-ordered-completion-gpt3.5-turbo-results.pickle', 'wb') as handle:
#   pickle.dump(fico_results, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# load the stored FICO results for both models
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# were written by this notebook.
with open('../../private-do-not-distribute/results/fico-ordered-completion-gpt4-results.pickle', 'rb') as handle:
    fico_results_gpt4 = pickle.load(handle)
with open('../../private-do-not-distribute/results/fico-ordered-completion-gpt3.5-turbo-results.pickle', 'rb') as handle:
    fico_results_gpt35 = pickle.load(handle)

In [None]:
# Summary statistics for the FICO plot. For every entry we compute the mean and
# the half-width of the band, 1.96 * std / sqrt(n), and derive the bounds from them.

# GPT-4 completions
gpt4_band = [
    (np.mean(run['valid_completions']),
     1.96 * np.std(run['valid_completions']) / np.sqrt(len(run['valid_completions'])))
    for run in fico_results_gpt4.values()
]
completions_mean = [center for center, half in gpt4_band]
completions_lower_95 = [center - half for center, half in gpt4_band]
completions_upper_95 = [center + half for center, half in gpt4_band]

# GPT-3.5-turbo completions
gpt35_band = [
    (np.mean(run['valid_completions']),
     1.96 * np.std(run['valid_completions']) / np.sqrt(len(run['valid_completions'])))
    for run in fico_results_gpt35.values()
]
completions35_mean = [center for center, half in gpt35_band]
completions35_lower_95 = [center - half for center, half in gpt35_band]
completions35_upper_95 = [center + half for center, half in gpt35_band]

# baseline: completion drawn from the marginal distribution
marginal_band = [
    (np.mean(run['valid_marginal_completions']),
     1.96 * np.std(run['valid_marginal_completions']) / np.sqrt(len(run['valid_marginal_completions'])))
    for run in fico_results_gpt4.values()
]
marginal_completions_mean = [center for center, half in marginal_band]
marginal_completions_lower_95 = [center - half for center, half in marginal_band]
marginal_completions_upper_95 = [center + half for center, half in marginal_band]

# baseline: most common completion
mode_band = [
    (np.mean(run['valid_mode_completions']),
     1.96 * np.std(run['valid_mode_completions']) / np.sqrt(len(run['valid_mode_completions'])))
    for run in fico_results_gpt4.values()
]
mode_completions_mean = [center for center, half in mode_band]
mode_completions_lower_95 = [center - half for center, half in mode_band]
mode_completions_upper_95 = [center + half for center, half in mode_band]



In [None]:
# adjust overall plot font size
sns.set(font_scale=1.4)

# white, no grid
sns.set_style("white")

# x axis: shortened feature names, one tick per completed feature. The tick
# positions are derived from this list (not from `completion_fn` defined in
# another cell) so the cell does not depend on which loop was run last.
feature_labels = ['AverageMInFile', 'NumTrades90', 'MaxDelq2', 'NumTradesOpen', 'NumInqLast6M', 'InstallBurden', 'TradesWHighUtilization', 'PercentTradesWBalance']

# plot
fig, ax = plt.subplots(figsize=(10, 5))

# one dashed line with dot markers plus a shaded band per curve
# (the 'Mode' baseline, mode_completions_*, is intentionally not plotted)
curves = [
    ('GPT-4', 'blue', completions_mean, completions_lower_95, completions_upper_95),
    ('GPT-3.5-turbo', 'green', completions35_mean, completions35_lower_95, completions35_upper_95),
    ('Marginal Distribution', 'orange', marginal_completions_mean, marginal_completions_lower_95, marginal_completions_upper_95),
]
for curve_label, curve_color, curve_mean, curve_lower, curve_upper in curves:
    ax.plot(curve_mean, label=curve_label, linestyle='dashed', marker='o', markersize=8, linewidth=2, color=curve_color)
    ax.fill_between(range(len(curve_mean)), curve_lower, curve_upper, alpha=0.2, color=curve_color)

ax.set_xticks(range(len(feature_labels)))
ax.set_xticklabels(feature_labels, rotation=20)

# y axis label
ax.set_ylabel('Probability of a Valid Completion')

# title
ax.set_title('FICO')

# legend below the plot
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.3), fancybox=True, shadow=True, ncol=3)

# save as pdf (create the output directory first so savefig cannot fail on a fresh checkout)
os.makedirs('figures', exist_ok=True)
fig.savefig('figures/fico-ordered-completion.pdf', bbox_inches='tight')

In [None]:
# display the current list of completed feature names
completion_fn

In [None]:
# shortened FICO feature labels (the same list is used for the x-axis ticks of the FICO plot)
['AverageMInFile', 'NumTrades90', 'MaxDelq2', 'NumTradesOpen', 'NumInqLast6M', 'InstallBurden', 'TradesWHighUtilization', 'PercentTradesWBalance']