# Statistical tests with graphs

In [None]:
import boto3
import pandas as pd
from io import StringIO

# Where the source CSV lives in S3
BUCKET_NAME = "blue-blood-data"
FILE_KEY = "final_df.csv"  # Change to your actual file path in S3

# Open an S3 client and request the object
s3 = boto3.client("s3")
response = s3.get_object(Key=FILE_KEY, Bucket=BUCKET_NAME)

# Pull the payload, decode it as UTF-8 text and parse that text as CSV
csv_content = str(response["Body"].read(), "utf-8")
df = pd.read_csv(StringIO(csv_content))

# Show the first rows as a quick sanity check
print(df.head())

In [None]:
# Drop these four columns from df. `columns=` already names the axis, so the
# redundant `axis=1` (ignored by pandas when `columns=` is given) is left out.
df = df.drop(
    columns=['subject_id', 'pre_charttime', 'prescription_start', 'post_charttime']
)

In [None]:
import pandas as pd
import numpy as np
import re

def clean_and_convert(x):
    """Parse a bracketed, whitespace-separated string of numbers into a float array.

    Args:
        x: One cell value, e.g. ``"[0.1  0.2\\n 0.3]"``.

    Returns:
        A 1-D numpy float array for a parseable string (an empty array for
        ``"[]"``). Any non-string value (NaN/None, or an array produced by an
        earlier run of the cell) is returned unchanged. A string that cannot
        be parsed is reported with ``print`` and returned exactly as received.
    """
    # Non-strings pass straight through. This also avoids calling pd.notna on
    # an array, whose element-wise result cannot be used in an `if`.
    if not isinstance(x, str):
        return x

    # Drop the square brackets; str.split() without an argument then handles
    # any run of whitespace (spaces, tabs, newlines) and leading/trailing blanks.
    tokens = re.sub(r'[\[\]]', '', x).split()
    try:
        return np.array([float(token) for token in tokens])
    except ValueError as e:
        # `x` was never reassigned, so the caller really gets the original back.
        print(f"Error processing value: {x}. Error: {e}")
        return x

# Preview the raw column as it was read from the CSV
raw_embeddings = df['prescription_rx_embeddings']
print(raw_embeddings.head())

# Run every entry through clean_and_convert and store the result in place
df['prescription_rx_embeddings'] = raw_embeddings.apply(clean_and_convert)

# Preview the converted column
print(df['prescription_rx_embeddings'].head())

In [None]:
# Lay the per-row embedding arrays out as a wide frame sharing df's index
embedding_rows = df['prescription_rx_embeddings'].tolist()
expanded_columns = pd.DataFrame(embedding_rows, index=df.index)

# Label the new columns P0, P1, ... by position
expanded_columns.columns = [f'P{i}' for i in range(len(expanded_columns.columns))]

# Swap the original 'prescription_rx_embeddings' column for the expanded ones
df = pd.concat(
    [df.drop(columns=['prescription_rx_embeddings']), expanded_columns],
    axis=1,
)

df

In [None]:
import boto3
import tarfile
import os
import pandas as pd
from io import BytesIO

# S3 bucket and key extraction
s3_uri = "s3://sagemaker-us-east-1-211125439249/pytorch-training-2025-03-21-04-33-20-724/output/output.tar.gz"
parts = s3_uri.replace("s3://", "").split("/")
bucket_name = parts[0]
key = "/".join(parts[1:])

# Initialize S3 client
s3 = boto3.client('s3')

# Download the whole .tar.gz archive into memory
response = s3.get_object(Bucket=bucket_name, Key=key)
file_obj = BytesIO(response['Body'].read())

# Reset first, so a df_synthetic left over from an earlier run of this cell
# cannot be silently reused when the archive holds no matching file.
df_synthetic = None

# Read the first archive member whose name contains "synthetic_data.csv"
with tarfile.open(fileobj=file_obj, mode='r:gz') as tar:
    for member in tar.getmembers():
        if "synthetic_data.csv" in member.name:
            csv_file = tar.extractfile(member)
            if csv_file is None:
                # extractfile() returns None for entries that are not
                # regular files or links (e.g. a directory); keep looking.
                continue
            df_synthetic = pd.read_csv(csv_file)
            break

# Fail here with a clear message instead of a NameError further down
if df_synthetic is None:
    raise FileNotFoundError(f"No 'synthetic_data.csv' member found in {s3_uri}")

# Display the DataFrame
df_synthetic

In [None]:
import pandas as pd
from scipy.stats import skew, kurtosis, chi2_contingency

def calculate_mean(df1, df2):
    """Return the column-wise means of two DataFrames.

    Only numeric columns are averaged: ``numeric_only=True`` keeps text
    columns from raising ``TypeError`` on pandas >= 2.0.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.

    Returns:
        tuple: ``(means_of_df1, means_of_df2)``, each a Series indexed by
        numeric column name.
    """
    return df1.mean(numeric_only=True), df2.mean(numeric_only=True)

def calculate_median(df1, df2):
    """Return the column-wise medians of two DataFrames.

    Only numeric columns are used: ``numeric_only=True`` keeps text columns
    from raising ``TypeError`` on pandas >= 2.0.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.

    Returns:
        tuple: ``(medians_of_df1, medians_of_df2)``, each a Series indexed by
        numeric column name.
    """
    return df1.median(numeric_only=True), df2.median(numeric_only=True)

def calculate_std(df1, df2):
    """Return the column-wise standard deviations of two DataFrames.

    Only numeric columns are used: ``numeric_only=True`` keeps text columns
    from raising ``TypeError`` on pandas >= 2.0.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.

    Returns:
        tuple: ``(std_of_df1, std_of_df2)``, each a Series indexed by numeric
        column name (pandas' default ``ddof`` is used).
    """
    return df1.std(numeric_only=True), df2.std(numeric_only=True)

def calculate_skewness(df1, df2):
    """Return the column-wise skewness of two DataFrames.

    Non-numeric columns are removed before ``scipy.stats.skew`` is applied,
    because it cannot process text values. NaNs inside a column are ignored
    (``nan_policy='omit'``).

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.

    Returns:
        tuple: ``(skewness_of_df1, skewness_of_df2)``, each indexed by numeric
        column name.
    """
    def column_skewness(frame):
        numeric = frame.select_dtypes(include="number")
        return numeric.apply(lambda col: skew(col, nan_policy='omit'))

    return column_skewness(df1), column_skewness(df2)

def calculate_kurtosis(df1, df2):
    """Return the column-wise kurtosis of two DataFrames.

    Non-numeric columns are removed before ``scipy.stats.kurtosis`` is
    applied, because it cannot process text values. NaNs inside a column are
    ignored (``nan_policy='omit'``); scipy's other defaults are kept.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.

    Returns:
        tuple: ``(kurtosis_of_df1, kurtosis_of_df2)``, each indexed by numeric
        column name.
    """
    def column_kurtosis(frame):
        numeric = frame.select_dtypes(include="number")
        return numeric.apply(lambda col: kurtosis(col, nan_policy='omit'))

    return column_kurtosis(df1), column_kurtosis(df2)

def calculate_chi_square(df1, df2, cat_column1, cat_column2):
    """Run a chi-square test of independence on the same column pair in two frames.

    Each DataFrame is tested separately: a contingency table of
    ``cat_column1`` against ``cat_column2`` is built with ``pd.crosstab`` and
    passed to ``scipy.stats.chi2_contingency``.

    Args:
        df1: First DataFrame (reported under the "(df)" keys).
        df2: Second DataFrame (reported under the "(new df)" keys).
        cat_column1: Name of the first column to cross-tabulate.
        cat_column2: Name of the second column to cross-tabulate.

    Returns:
        dict: statistic, p-value, degrees of freedom and expected
        frequencies for each of the two DataFrames.
    """
    # Build both contingency tables before running either test
    observed_tables = [
        pd.crosstab(frame[cat_column1], frame[cat_column2])
        for frame in (df1, df2)
    ]

    # Test the first frame, then the second
    stat_a, p_a, dof_a, expected_a = chi2_contingency(observed_tables[0])
    stat_b, p_b, dof_b, expected_b = chi2_contingency(observed_tables[1])

    outcome = {}
    outcome["Chi2 Statistic (df)"] = stat_a
    outcome["P-value (df)"] = p_a
    outcome["Degrees of Freedom (df)"] = dof_a
    outcome["Expected Frequencies (df)"] = expected_a
    outcome["Chi2 Statistic (new df)"] = stat_b
    outcome["P-value (new df)"] = p_b
    outcome["Degrees of Freedom (new df)"] = dof_b
    outcome["Expected Frequencies (new df)"] = expected_b
    return outcome

# Run every comparison on (df, df_synthetic) and print the results.

def _report_statistic(header, label, stat_func):
    """Print `header`, compute one statistic for both frames, print and return both."""
    print(header)
    original_stat, new_stat = stat_func(df, df_synthetic)
    print(f"Original DataFrame {label}:\n", original_stat)
    print(f"New DataFrame {label}:\n", new_stat)
    return original_stat, new_stat


mean_df, mean_new_df = _report_statistic(
    "Mean Comparison between Original and New DataFrame:",
    "Mean",
    calculate_mean,
)
median_df, median_new_df = _report_statistic(
    "\nMedian Comparison between Original and New DataFrame:",
    "Median",
    calculate_median,
)
std_df, std_new_df = _report_statistic(
    "\nStandard Deviation Comparison between Original and New DataFrame:",
    "Standard Deviation",
    calculate_std,
)
skew_df, skew_new_df = _report_statistic(
    "\nSkewness Comparison between Original and New DataFrame:",
    "Skewness",
    calculate_skewness,
)
kurt_df, kurt_new_df = _report_statistic(
    "\nKurtosis Comparison between Original and New DataFrame:",
    "Kurtosis",
    calculate_kurtosis,
)

# Chi-square test on the dose value / dose unit column pair, for both frames
print("\nChi-Square Test between prescription_dose_val_rx and prescription_dose_unit_rx:")
chi_square_results = calculate_chi_square(
    df,
    df_synthetic,
    cat_column1='prescription_dose_val_rx',
    cat_column2='prescription_dose_unit_rx',
)
print(chi_square_results)

In [None]:
def compute_statistics(real_data, synthetic_data):
    """Compare the per-column means of a real and a synthetic DataFrame.

    Every column of ``real_data`` with a numeric dtype is looked up in both
    frames. A column that raises (for example because it is missing from
    ``synthetic_data``) is reported with ``print`` and left out.

    Args:
        real_data: DataFrame whose columns drive the comparison.
        synthetic_data: DataFrame read under the same column names.

    Returns:
        tuple: ``(mean_diff_df, avg_values_df)`` -- the first holds the
        absolute mean difference per column, the second the two means.
    """
    # One (column, real mean, synthetic mean, |difference|) record per column
    records = []
    for col in real_data.columns:
        try:
            if not pd.api.types.is_numeric_dtype(real_data[col]):
                continue  # non-numeric columns are not compared
            real_mean = real_data[col].mean()
            synthetic_mean = synthetic_data[col].mean()
            gap = abs(real_mean - synthetic_mean)
            records.append((col, real_mean, synthetic_mean, gap))
        except Exception as e:
            print(f"Error processing column {col}: {e}")

    # Table 1: absolute difference of the means
    mean_diff_df = pd.DataFrame(
        [(name, gap) for name, _real, _synthetic, gap in records],
        columns=['Column', 'Mean Difference'],
    )

    # Table 2: the two means side by side
    avg_values_df = pd.DataFrame({
        'Column': [record[0] for record in records],
        'Real Data Mean': [record[1] for record in records],
        'Synthetic Data Mean': [record[2] for record in records],
    })

    return mean_diff_df, avg_values_df

# Build the two comparison tables for the real and the synthetic frame
mean_diff_df, avg_values_df = compute_statistics(df, df_synthetic)

# Print the tables. The header bullet is U+1F539; it was previously stored as
# mis-decoded bytes, so it is written as an escape to survive re-encoding.
print("\n\U0001F539 Mean Differences Between Real & Synthetic Data:")
print(mean_diff_df)

print("\n\U0001F539 Average Values in Real & Synthetic Data:")
print(avg_values_df)

In [None]:
# Define real and synthetic data
real_data = df
synthetic_data = df_synthetic

# Prepare a list to store results
results = []

# Iterate through all numerical columns of the real data
for column in real_data.select_dtypes(include=['number']).columns:
    real_mean = real_data[column].mean()
    synthetic_mean = synthetic_data[column].mean()
    abs_difference = abs(real_mean - synthetic_mean)

    # "Distribution Difference" is the absolute gap between the two standard
    # deviations; no KL divergence is computed here.
    distribution_diff = abs(real_data[column].std() - synthetic_data[column].std())

    # Append results
    results.append([column, real_mean, synthetic_mean, abs_difference, distribution_diff])

# Convert results to a DataFrame
summary_df = pd.DataFrame(
    results, columns=['Column', 'Real Mean', 'Synthetic Mean', 'Abs Difference', 'Distribution Difference']
)

# Format numbers to avoid scientific notation. Series.map is applied per
# column because DataFrame.applymap is deprecated since pandas 2.1.
for value_column in ['Real Mean', 'Synthetic Mean', 'Abs Difference', 'Distribution Difference']:
    summary_df[value_column] = summary_df[value_column].map(lambda x: f"{x:,.6f}")

# Display the table
print(summary_df.to_string(index=False))

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import skew, kurtosis

# Every column of df is a candidate for the bar charts
columns = df.columns
# Upper bound on the number of columns drawn in a single chart
chunk_size = 25

def calculate_mean(df1, df2):
    """Return the column-wise means of two DataFrames.

    Only numeric columns are averaged: ``numeric_only=True`` keeps text
    columns from raising ``TypeError`` on pandas >= 2.0.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.

    Returns:
        tuple: ``(means_of_df1, means_of_df2)``, each a Series indexed by
        numeric column name.
    """
    return df1.mean(numeric_only=True), df2.mean(numeric_only=True)

def calculate_median(df1, df2):
    """Return the column-wise medians of two DataFrames.

    Only numeric columns are used: ``numeric_only=True`` keeps text columns
    from raising ``TypeError`` on pandas >= 2.0.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.

    Returns:
        tuple: ``(medians_of_df1, medians_of_df2)``, each a Series indexed by
        numeric column name.
    """
    return df1.median(numeric_only=True), df2.median(numeric_only=True)

def calculate_std(df1, df2):
    """Return the column-wise standard deviations of two DataFrames.

    Only numeric columns are used: ``numeric_only=True`` keeps text columns
    from raising ``TypeError`` on pandas >= 2.0.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.

    Returns:
        tuple: ``(std_of_df1, std_of_df2)``, each a Series indexed by numeric
        column name (pandas' default ``ddof`` is used).
    """
    return df1.std(numeric_only=True), df2.std(numeric_only=True)

def calculate_skewness(df1, df2):
    """Return the column-wise skewness of two DataFrames.

    Non-numeric columns are removed before ``scipy.stats.skew`` is applied,
    because it cannot process text values. NaNs inside a column are ignored
    (``nan_policy='omit'``).

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.

    Returns:
        tuple: ``(skewness_of_df1, skewness_of_df2)``, each indexed by numeric
        column name.
    """
    def column_skewness(frame):
        numeric = frame.select_dtypes(include="number")
        return numeric.apply(lambda col: skew(col, nan_policy='omit'))

    return column_skewness(df1), column_skewness(df2)

def calculate_kurtosis(df1, df2):
    """Return the column-wise kurtosis of two DataFrames.

    Non-numeric columns are removed before ``scipy.stats.kurtosis`` is
    applied, because it cannot process text values. NaNs inside a column are
    ignored (``nan_policy='omit'``); scipy's other defaults are kept.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.

    Returns:
        tuple: ``(kurtosis_of_df1, kurtosis_of_df2)``, each indexed by numeric
        column name.
    """
    def column_kurtosis(frame):
        numeric = frame.select_dtypes(include="number")
        return numeric.apply(lambda col: kurtosis(col, nan_policy='omit'))

    return column_kurtosis(df1), column_kurtosis(df2)

def plot_chunked_barcharts(stat_func, stat_label, df1, df2, columns, chunk_size=25):
    """Draw grouped bar charts comparing one statistic of two DataFrames.

    ``columns`` is split into chunks and each chunk gets its own subplot in a
    three-column grid, with one "Original" and one "New" bar per column.

    Args:
        stat_func: Callable taking two DataFrames and returning a pair of
            per-column statistics (e.g. ``calculate_mean``).
        stat_label: Text used at the start of every subplot title.
        df1: DataFrame plotted as "Original".
        df2: DataFrame plotted as "New"; it is indexed with the same column
            names as ``df1``.
        columns: Column names to plot.
        chunk_size: Maximum number of columns per subplot.

    Returns:
        None. The figure is created through ``matplotlib.pyplot``.
    """
    # Ceiling division gives just enough chunks that none exceeds chunk_size.
    # (`len(columns) // chunk_size + 1` produced one chunk too many whenever
    # the column count was an exact multiple of chunk_size.)
    num_chunks = max(1, -(-len(columns) // chunk_size))
    column_chunks = np.array_split(columns, num_chunks)

    rows = -(-num_chunks // 3)  # ceil(num_chunks / 3)
    fig, axes = plt.subplots(rows, 3, figsize=(15, rows * 5))  # Create grid
    axes = axes.flatten()  # Flatten for easy iteration

    used_axes = 0  # Number of subplots actually drawn so far

    for chunk in column_chunks:
        stat_df1, stat_df2 = stat_func(df1[chunk], df2[chunk])
        stats_df = pd.DataFrame({"Original": stat_df1, "New": stat_df2})
        # Drop entries for which neither frame produced a value
        stats_df.dropna(how="all", inplace=True)

        if stats_df.empty:
            continue

        # Draw on the next free axis rather than on the chunk's own position:
        # the drawn charts then stay contiguous, so the clean-up below removes
        # only blank axes and never a chart that follows a skipped chunk.
        stats_df.plot(
            kind="bar",
            alpha=0.7,
            ax=axes[used_axes],
            title=f"{stat_label} ({chunk[0]} to {chunk[-1]})",
        )
        used_axes += 1

    # Hide any unused subplots
    for ax in axes[used_axes:]:
        fig.delaxes(ax)

    plt.tight_layout()  # Adjust layout

# Draw one chunked bar-chart figure per statistic, in this order
for stat_func, stat_label in (
    (calculate_mean, "Mean"),
    (calculate_median, "Median"),
    (calculate_std, "Standard Deviation"),
    (calculate_skewness, "Skewness"),
    (calculate_kurtosis, "Kurtosis"),
):
    plot_chunked_barcharts(stat_func, stat_label, df, df_synthetic, columns, chunk_size)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import skew, kurtosis

# Every column of df is a candidate for the error-difference charts
columns = df.columns
# Upper bound on the number of columns drawn in a single chart
chunk_size = 10

def calculate_mean(df1, df2):
    """Return the column-wise means of two DataFrames.

    Only numeric columns are averaged: ``numeric_only=True`` keeps text
    columns from raising ``TypeError`` on pandas >= 2.0.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.

    Returns:
        tuple: ``(means_of_df1, means_of_df2)``, each a Series indexed by
        numeric column name.
    """
    return df1.mean(numeric_only=True), df2.mean(numeric_only=True)

def calculate_median(df1, df2):
    """Return the column-wise medians of two DataFrames.

    Only numeric columns are used: ``numeric_only=True`` keeps text columns
    from raising ``TypeError`` on pandas >= 2.0.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.

    Returns:
        tuple: ``(medians_of_df1, medians_of_df2)``, each a Series indexed by
        numeric column name.
    """
    return df1.median(numeric_only=True), df2.median(numeric_only=True)

def calculate_std(df1, df2):
    """Return the column-wise standard deviations of two DataFrames.

    Only numeric columns are used: ``numeric_only=True`` keeps text columns
    from raising ``TypeError`` on pandas >= 2.0.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.

    Returns:
        tuple: ``(std_of_df1, std_of_df2)``, each a Series indexed by numeric
        column name (pandas' default ``ddof`` is used).
    """
    return df1.std(numeric_only=True), df2.std(numeric_only=True)

def calculate_skewness(df1, df2):
    """Return the column-wise skewness of two DataFrames.

    Non-numeric columns are removed before ``scipy.stats.skew`` is applied,
    because it cannot process text values. NaNs inside a column are ignored
    (``nan_policy='omit'``).

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.

    Returns:
        tuple: ``(skewness_of_df1, skewness_of_df2)``, each indexed by numeric
        column name.
    """
    def column_skewness(frame):
        numeric = frame.select_dtypes(include="number")
        return numeric.apply(lambda col: skew(col, nan_policy='omit'))

    return column_skewness(df1), column_skewness(df2)

def calculate_kurtosis(df1, df2):
    """Return the column-wise kurtosis of two DataFrames.

    Non-numeric columns are removed before ``scipy.stats.kurtosis`` is
    applied, because it cannot process text values. NaNs inside a column are
    ignored (``nan_policy='omit'``); scipy's other defaults are kept.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.

    Returns:
        tuple: ``(kurtosis_of_df1, kurtosis_of_df2)``, each indexed by numeric
        column name.
    """
    def column_kurtosis(frame):
        numeric = frame.select_dtypes(include="number")
        return numeric.apply(lambda col: kurtosis(col, nan_policy='omit'))

    return column_kurtosis(df1), column_kurtosis(df2)

def plot_error_difference_grid(stat_func, stat_label, df1, df2, columns, chunk_size=10):
    """Plot the absolute difference of one statistic between two DataFrames.

    ``columns`` is split into chunks. For each chunk the statistic is computed
    on both frames, and the columns whose absolute difference is greater than
    zero are drawn as red bars in their own subplot of a three-column grid.
    Chunks without any such column get no subplot.

    Args:
        stat_func: Callable taking two DataFrames and returning a pair of
            per-column statistics (e.g. ``calculate_mean``).
        stat_label: Text used in subplot titles and y-axis labels.
        df1: First DataFrame.
        df2: Second DataFrame; it is indexed with the same column names as
            ``df1``.
        columns: Column names to compare.
        chunk_size: Maximum number of columns per subplot.

    Returns:
        None. When there is nothing to draw a short notice is printed and no
        figure is created.
    """
    # Ceiling division gives just enough chunks that none exceeds chunk_size.
    # (`len(columns) // chunk_size + 1` produced one chunk too many whenever
    # the column count was an exact multiple of chunk_size.)
    num_chunks = max(1, -(-len(columns) // chunk_size))
    column_chunks = np.array_split(columns, num_chunks)

    # Keep only the chunks that have at least one non-zero difference
    filtered_chunks = []
    for chunk in column_chunks:
        stat_df1, stat_df2 = stat_func(df1[chunk], df2[chunk])
        error_diff = abs(stat_df1 - stat_df2)
        # NaN differences compare False here and are dropped as well
        non_zero_error_columns = error_diff[error_diff > 0]

        if not non_zero_error_columns.empty:
            filtered_chunks.append((non_zero_error_columns, chunk))

    num_plots = len(filtered_chunks)
    if num_plots == 0:
        # plt.subplots(0, 3) would raise, so stop before creating a figure
        print(f"No non-zero {stat_label} differences to plot.")
        return

    # Determine grid size
    rows = -(-num_plots // 3)  # ceil(num_plots / 3)
    fig, axes = plt.subplots(rows, 3, figsize=(15, rows * 5))
    axes = axes.flatten()

    for ax, (non_zero_error_columns, chunk) in zip(axes, filtered_chunks):
        ax.bar(non_zero_error_columns.index, non_zero_error_columns.values, alpha=0.7, color='red')
        ax.set_title(f"{stat_label} Error Diff ({chunk[0]} to {chunk[-1]})")
        ax.set_xlabel("Columns")
        ax.set_ylabel(f"{stat_label} Error Difference")
        ax.tick_params(axis='x', rotation=90)

    # Remove unused subplots
    for ax in axes[num_plots:]:
        fig.delaxes(ax)

    plt.tight_layout()

# Draw one error-difference figure per statistic, in this order
for stat_func, stat_label in (
    (calculate_mean, "Mean"),
    (calculate_median, "Median"),
    (calculate_std, "Standard Deviation"),
    (calculate_skewness, "Skewness"),
    (calculate_kurtosis, "Kurtosis"),
):
    plot_error_difference_grid(stat_func, stat_label, df, df_synthetic, columns, chunk_size)