In [15]:
! pip install scikit-learn

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting scikit-learn
  Downloading scikit_learn-1.5.0-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.0-cp311-cp311-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
    --------------------------------------- 0.2/11.0 MB 9.6 MB/s eta 0:00:02
   --- ------------------------------------ 0.9/11.0 MB 13.5 MB/s eta 0:00:01
   ------ --------------------------------- 1.8/11.0 MB 16.6 MB/s eta 0:00:01
   ----------- ---------------------------- 3.3/11.0 MB 20.7 MB/s eta 0:00:01
   ---------------- ----------------------- 4.5/11.0 MB 22.2 MB/s eta 0:00:01
   ---------------------- ----------------- 6.2/11.0 MB 24.7 MB/s eta 0:00

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.utils import resample

# Data preparation
baseline_data = {
    'Model': ['Llama 2', 'Mistral Instruct', 'gpt-3.5-turbo-0125', 'gpt-4-turbo-2024-04-09', 'gpt-4o-2024-05-13'],
    'Accuracy': [21, 21, 87, 91, 94],
    'Precision': [0, 0, 5, 26, 31],
    'TCE': [4, 21, 7, 15, 15]
}

post_ft_data = {
    'Model': ['Llama 2 FT', 'Mistral Instruct FT', 'gpt-3.5-turbo-0125', 'gpt-4-turbo-2024-04-09', 'gpt-4o-2024-05-13'],
    'Accuracy': [77, 83, 87, 91, 94],
    'Precision': [35, 50, 5, 26, 31],
    'TCE': [9, 4, 7, 15, 15]
}

# Create DataFrames
baseline_df = pd.DataFrame(baseline_data)
post_ft_df = pd.DataFrame(post_ft_data)

# Calculate summary statistics
def calculate_summary_statistics(df, metric):
    mean = np.mean(df[metric])
    median = np.median(df[metric])
    std_dev = np.std(df[metric])
    return mean, median, std_dev

# Baseline statistics
baseline_accuracy_stats = calculate_summary_statistics(baseline_df, 'Accuracy')
baseline_precision_stats = calculate_summary_statistics(baseline_df, 'Precision')
baseline_tce_stats = calculate_summary_statistics(baseline_df, 'TCE')

# Post FT statistics
post_ft_accuracy_stats = calculate_summary_statistics(post_ft_df, 'Accuracy')
post_ft_precision_stats = calculate_summary_statistics(post_ft_df, 'Precision')
post_ft_tce_stats = calculate_summary_statistics(post_ft_df, 'TCE')

# Print summary statistics
print("Baseline Accuracy: Mean =", baseline_accuracy_stats[0], ", Median =", baseline_accuracy_stats[1], ", Std Dev =", baseline_accuracy_stats[2])
print("Baseline Precision: Mean =", baseline_precision_stats[0], ", Median =", baseline_precision_stats[1], ", Std Dev =", baseline_precision_stats[2])
print("Baseline TCE: Mean =", baseline_tce_stats[0], ", Median =", baseline_tce_stats[1], ", Std Dev =", baseline_tce_stats[2])

print("Post FT Accuracy: Mean =", post_ft_accuracy_stats[0], ", Median =", post_ft_accuracy_stats[1], ", Std Dev =", post_ft_accuracy_stats[2])
print("Post FT Precision: Mean =", post_ft_precision_stats[0], ", Median =", post_ft_precision_stats[1], ", Std Dev =", post_ft_precision_stats[2])
print("Post FT TCE: Mean =", post_ft_tce_stats[0], ", Median =", post_ft_tce_stats[1], ", Std Dev =", post_ft_tce_stats[2])

Baseline Accuracy: Mean = 62.8 , Median = 87.0 , Std Dev = 34.20175434096912
Baseline Precision: Mean = 12.4 , Median = 5.0 , Std Dev = 13.365627557282899
Baseline TCE: Mean = 12.4 , Median = 15.0 , Std Dev = 6.118823416311341
Post FT Accuracy: Mean = 86.4 , Median = 87.0 , Std Dev = 5.986651818838307
Post FT Precision: Mean = 29.4 , Median = 31.0 , Std Dev = 14.59588983241515
Post FT TCE: Mean = 10.0 , Median = 9.0 , Std Dev = 4.381780460041329


In [17]:
# Comparative analysis
accuracy_change = post_ft_df['Accuracy'] - baseline_df['Accuracy']
precision_change = post_ft_df['Precision'] - baseline_df['Precision']
tce_change = post_ft_df['TCE'] - baseline_df['TCE']

# Create a DataFrame to hold the changes with model names
changes_df = pd.DataFrame({
    'Model': baseline_df['Model'],
    'Accuracy Change': accuracy_change,
    'Precision Change': precision_change,
    'TCE Change': tce_change
})

# Print comparative analysis with model names
print(changes_df.iloc[0:2,:])

              Model  Accuracy Change  Precision Change  TCE Change
0           Llama 2               56                35           5
1  Mistral Instruct               62                50         -17


In [18]:
import scipy.stats as stats
import pandas as pd

# Baseline and post-fine-tuning data
baseline_data = {
    'Model': ['Llama 2', 'Mistral Instruct', 'gpt-3.5-turbo-0125', 'gpt-4-turbo-2024-04-09', 'gpt-4o-2024-05-13'],
    'Accuracy': [21, 21, 87, 91, 94],
    'Precision': [0, 0, 5, 26, 31],
    'TCE': [4, 21, 7, 15, 15]
}

post_ft_data = {
    'Model': ['Llama 2 FT', 'Mistral Instruct FT', 'gpt-3.5-turbo-0125', 'gpt-4-turbo-2024-04-09', 'gpt-4o-2024-05-13'],
    'Accuracy': [77, 83, 87, 91, 94],
    'Precision': [35, 50, 5, 26, 31],
    'TCE': [9, 4, 7, 15, 15]
}

# Create DataFrames
baseline_df = pd.DataFrame(baseline_data)
post_ft_df = pd.DataFrame(post_ft_data)

# Calculate the differences
accuracy_diff = post_ft_df['Accuracy'] - baseline_df['Accuracy']
precision_diff = post_ft_df['Precision'] - baseline_df['Precision']
tce_diff = post_ft_df['TCE'] - baseline_df['TCE']

# Perform Shapiro-Wilk test for normality
accuracy_normality = stats.shapiro(accuracy_diff)
precision_normality = stats.shapiro(precision_diff)
tce_normality = stats.shapiro(tce_diff)

print("Accuracy Shapiro-Wilk test:", accuracy_normality)
print("Precision Shapiro-Wilk test:", precision_normality)
print("TCE Shapiro-Wilk test:", tce_normality)

# Choose the test based on normality
if accuracy_normality.pvalue > 0.05:
    accuracy_test = stats.ttest_rel(baseline_df['Accuracy'], post_ft_df['Accuracy'])
else:
    accuracy_test = stats.wilcoxon(baseline_df['Accuracy'], post_ft_df['Accuracy'])

if precision_normality.pvalue > 0.05:
    precision_test = stats.ttest_rel(baseline_df['Precision'], post_ft_df['Precision'])
else:
    precision_test = stats.wilcoxon(baseline_df['Precision'], post_ft_df['Precision'])

if tce_normality.pvalue > 0.05:
    tce_test = stats.ttest_rel(baseline_df['TCE'], post_ft_df['TCE'])
else:
    tce_test = stats.wilcoxon(baseline_df['TCE'], post_ft_df['TCE'])

# Print results
print("Accuracy Test:", accuracy_test)
print("Precision Test:", precision_test)
print("TCE Test:", tce_test)



Accuracy Shapiro-Wilk test: ShapiroResult(statistic=0.7138381228579564, pvalue=0.013290752758146148)
Precision Shapiro-Wilk test: ShapiroResult(statistic=0.7619236826069935, pvalue=0.038235798010029756)
TCE Shapiro-Wilk test: ShapiroResult(statistic=0.7496660163102402, pvalue=0.029551876952296566)
Accuracy Test: WilcoxonResult(statistic=0.0, pvalue=0.17971249487899976)
Precision Test: WilcoxonResult(statistic=0.0, pvalue=0.17971249487899976)
TCE Test: WilcoxonResult(statistic=1.0, pvalue=0.6547208460185769)


  temp = _wilcoxon_iv(x, y, zero_method, correction, alternative, method, axis)


In [35]:
import numpy as np

# Your baseline and post fine-tuning data
baseline_df = pd.DataFrame({
    'Model': ['Llama 2', 'Mistral Instruct', 'gpt-3.5-turbo-0125', 'gpt-4-turbo-2024-04-09', 'gpt-4o-2024-05-13'],
    'Accuracy': [21, 21, 87, 91, 94],
    'Precision': [0, 0, 5, 26, 31],
    'TCE': [4, 21, 7, 15, 15]
})

post_ft_df = pd.DataFrame({
    'Model': ['Llama 2 FT', 'Mistral Instruct FT', 'gpt-3.5-turbo-0125', 'gpt-4-turbo-2024-04-09', 'gpt-4o-2024-05-13'],
    'Accuracy': [77, 83, 87, 91, 94],
    'Precision': [35, 50, 5, 26, 31],
    'TCE': [9, 4, 7, 15, 15]
})

# Function to calculate Cohen's d for paired samples
def cohen_d(x, y):
    diff = x - y
    return np.mean(diff) / np.std(diff, ddof=1)

# Calculate effect sizes for each metric
accuracy_effect_size = cohen_d(post_ft_df['Accuracy'], baseline_df['Accuracy'])
precision_effect_size = cohen_d(post_ft_df['Precision'], baseline_df['Precision'])
tce_effect_size = cohen_d(post_ft_df['TCE'], baseline_df['TCE'])

print("Effect Size for Accuracy:", accuracy_effect_size)
print("Effect Size for Precision:", precision_effect_size)
print("Effect Size for TCE:", tce_effect_size)

Effect Size for Accuracy: 0.7287283470634874
Effect Size for Precision: 0.7120516440958823
Effect Size for TCE: -0.2842277497795395
Effect Size for Accuracy: 0.7287283470634874
Effect Size for Precision: 0.7120516440958823
Effect Size for TCE: -0.2842277497795395


In [20]:
from sklearn.utils import resample

# Function to calculate bootstrap p-values
def bootstrap_pvalue(data1, data2, n_resamples=10000):
    observed_diff = np.mean(data1) - np.mean(data2)
    combined = np.concatenate([data1, data2])
    count = 0
    for _ in range(n_resamples):
        resampled = resample(combined, replace=True, n_samples=len(combined))
        resampled_diff = np.mean(resampled[:len(data1)]) - np.mean(resampled[len(data1):])
        if abs(resampled_diff) >= abs(observed_diff):
            count += 1
    p_value = count / n_resamples
    return p_value

# Bootstrap p-values
accuracy_pvalue_bootstrap = bootstrap_pvalue(post_ft_df['Accuracy'], baseline_df['Accuracy'])
precision_pvalue_bootstrap = bootstrap_pvalue(post_ft_df['Precision'], baseline_df['Precision'])
tce_pvalue_bootstrap = bootstrap_pvalue(post_ft_df['TCE'], baseline_df['TCE'])

print("Bootstrap p-value for Accuracy:", accuracy_pvalue_bootstrap)
print("Bootstrap p-value for Precision:", precision_pvalue_bootstrap)
print("Bootstrap p-value for TCE:", tce_pvalue_bootstrap)


Bootstrap p-value for Accuracy: 0.1944
Bootstrap p-value for Precision: 0.1068
Bootstrap p-value for TCE: 0.4945


In [33]:
import math

# Define the baseline and post fine-tuning statistics
metrics = {
    'Accuracy': {'baseline_mean': 62.8, 'post_ft_mean': 86.4, 'baseline_sd': 33.71, 'post_ft_sd': 6.48},
    'Precision': {'baseline_mean': 12.4, 'post_ft_mean': 29.4, 'baseline_sd': 14.66, 'post_ft_sd': 16.47},
    'TCE': {'baseline_mean': 12.4, 'post_ft_mean': 10.0, 'baseline_sd': 6.51, 'post_ft_sd': 4.96}
}

# Function to calculate Cohen's d
def cohen_d(baseline_mean, post_ft_mean, baseline_sd, post_ft_sd):
    mean_diff = post_ft_mean - baseline_mean
    pooled_sd = math.sqrt((baseline_sd ** 2 + post_ft_sd ** 2) / 2)
    d = mean_diff / pooled_sd
    return mean_diff, pooled_sd, d

# Calculate and print Cohen's d for each metric
results = {}
for metric, values in metrics.items():
    mean_diff, pooled_sd, d = cohen_d(values['baseline_mean'], values['post_ft_mean'], values['baseline_sd'], values['post_ft_sd'])
    results[metric] = {'Mean Change': mean_diff, 'Pooled Std Dev': pooled_sd, 'Cohens d': d}

# Print the results
print("Summary of Cohens d Calculations")
print(f"{'Metric':<10} {'Mean Change':<12} {'Pooled Std Dev':<15} {'Cohens d':<10} {'Interpretation'}")
for metric, res in results.items():
    interpretation = "Small effect" if abs(res['Cohens d']) < 0.2 else \
                     "Medium effect" if abs(res['Cohens d']) < 0.5 else \
                     "Large effect"
    print(f"{metric:<10} {res['Mean Change']:<12.2f} {res['Pooled Std Dev']:<15.2f} {res['Cohens d']:<10.2f} {interpretation}")

Summary of Cohens d Calculations
Metric     Mean Change  Pooled Std Dev  Cohens d   Interpretation
Accuracy   23.60        24.27           0.97       Large effect
Precision  17.00        15.59           1.09       Large effect
TCE        -2.40        5.79            -0.41      Medium effect
