In [10]:
import numpy as np
import pandas as pd
from statsmodels.stats.proportion import proportions_ztest
from scipy.stats import ttest_ind

In [11]:
# Welch T-Tests comparing HOME vs OFFICE days for every tenured/area slice
_T_TEST_COLUMNS = [
    'Tenured', 'Area', 'Metric', 'WFH Mean', 'Office Mean',
    'T-Statistic', 'P-Value', 'Significant',
]


def _welch_home_vs_office(subset, metric, tenured, area_label, alpha):
    """Build one result row comparing HOME and OFFICE values of ``metric``.

    Missing values are dropped from each group before testing. Returns
    ``None`` when either group has no remaining observations, because no
    comparison is possible in that case.
    """
    values = subset[metric]
    wfh_data = values[subset['day_type'] == 'HOME'].dropna()
    office_data = values[subset['day_type'] == 'OFFICE'].dropna()

    if wfh_data.empty or office_data.empty:
        return None

    # equal_var=False selects Welch's test (no equal-variance assumption)
    t_stat, p_value = ttest_ind(wfh_data, office_data, equal_var=False)

    return {
        'Tenured': tenured,
        'Area': area_label,
        'Metric': metric,
        'WFH Mean': wfh_data.mean(),
        'Office Mean': office_data.mean(),
        'T-Statistic': t_stat,
        'P-Value': round(p_value, 5),
        # Compare the unrounded p-value so rounding cannot flip the verdict;
        # a NaN p-value (too few observations) is reported as not significant.
        'Significant': bool(p_value < alpha),
    }


def perform_t_tests(df, metrics, alpha=0.05):
    """Run Welch T-Tests of HOME vs OFFICE days for each metric.

    For every value of ``df['tenured']`` the test is run once per value of
    ``df['area']`` and once more across all areas together (reported with
    ``Area == 'All Areas Combined'``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'tenured', 'area', 'day_type' and every
        column named in ``metrics``. Rows are compared where ``day_type`` is
        'HOME' versus 'OFFICE'.
    metrics : iterable of str
        Names of the columns to test.
    alpha : float, default 0.05
        Significance level used to fill the 'Significant' column.

    Returns
    -------
    pandas.DataFrame
        One row per (tenured, area, metric) slice that has at least one
        non-missing observation in both groups, with columns 'Tenured',
        'Area', 'Metric', 'WFH Mean', 'Office Mean', 'T-Statistic',
        'P-Value' (rounded to 5 decimals) and 'Significant'. The columns are
        present even when no slice could be tested.
    """
    areas = df['area'].unique()
    results = []

    for tenured in df['tenured'].unique():
        # Filter once per tenured status instead of once per metric
        tenured_subset = df[df['tenured'] == tenured]

        for area in areas:
            area_subset = tenured_subset[tenured_subset['area'] == area]
            for metric in metrics:
                row = _welch_home_vs_office(area_subset, metric, tenured, area, alpha)
                if row is not None:
                    results.append(row)

        # Same comparison across every area for the current tenured status
        for metric in metrics:
            row = _welch_home_vs_office(
                tenured_subset, metric, tenured, 'All Areas Combined', alpha
            )
            if row is not None:
                results.append(row)

    return pd.DataFrame(results, columns=_T_TEST_COLUMNS)

# `result_df` must already exist in the kernel before this cell runs (load or
# build it in an earlier cell); otherwise the call below raises NameError.
# perform_t_tests reads its 'tenured', 'area' and 'day_type' columns plus the
# metric columns listed here.
metrics = ['avg_interaction_count', 'aht', 'avg_productivity']

# Significance level (0.05 corresponds to a 95% confidence level)
alpha = 0.05

# Run the T-Tests, forwarding the alpha configured above rather than silently
# relying on the function's default value
t_test_results = perform_t_tests(result_df, metrics, alpha=alpha)


print(t_test_results.head(5))

NameError: name 'result_df' is not defined

In [None]:
# Hand-transcribed WFH proportions, one entry per area:
# (area name, sample size, {metric: WFH proportion})
_wfh_by_area = [
    ("Chat", 4, {"Adherence": 0.755, "Consults": 0.153, "OSAT": 0.552, "Transfers": 0.129}),
    ("Client Banking Services", 46, {"Adherence": 0.912, "Consults": 0.206, "OSAT": 0.472, "Transfers": 0.148}),
    ("Core Service", 594, {"Adherence": 0.836, "Consults": 0.178, "OSAT": 0.468, "Transfers": 0.188}),
    ("Tier 2 Support", 5, {"Adherence": 0.888, "Consults": 0.104, "OSAT": 1.000, "Transfers": 0.241}),
    ("Trader Service", 6, {"Adherence": 0.822, "Consults": 0.266, "OSAT": 0.571, "Transfers": 0.273}),
]

# Flatten into one record per (area, metric); every record is for Tenured == "No"
data = [
    {
        "Tenured": "No",
        "Area Name": area_name,
        "Sample": sample_size,
        "Metric": metric_name,
        "WFH Proportion": wfh_share,
    }
    for area_name, sample_size, shares in _wfh_by_area
    for metric_name, wfh_share in shares.items()
]

# Tabulate the records
df = pd.DataFrame(data)

# Placeholder Office proportions: uniform random draws, seeded so reruns match
np.random.seed(42)
df["Office Proportion"] = np.random.rand(len(df))

# Significance level (0.05 corresponds to a 95% confidence level)
alpha = 0.05

# Column layout shared by the per-row and the combined result tables
result_columns = ["Tenured", "Area Name", "Sample", "Metric", "WFH Proportion", "Office Proportion", "Z-Statistic", "P-Value", "Significant"]


def _proportion_ztest_record(tenured, area_name, sample, metric, wfh_proportion, office_proportion, alpha):
    """Z-test WFH vs Office proportions and return one result record.

    Both groups are given the same number of observations (``sample``);
    success counts are reconstructed as ``proportion * sample`` and may
    therefore be non-integer. 'Significant' is True when the p-value is
    below ``alpha``.
    """
    count = np.array([wfh_proportion * sample, office_proportion * sample])
    nobs = np.array([sample, sample])
    z_score, p_value = proportions_ztest(count, nobs)
    return {
        "Tenured": tenured,
        "Area Name": area_name,
        "Sample": sample,
        "Metric": metric,
        "WFH Proportion": wfh_proportion,
        "Office Proportion": office_proportion,
        "Z-Statistic": z_score,
        "P-Value": p_value,
        "Significant": p_value < alpha,
    }


# Perform Z-tests for each row
results = [
    _proportion_ztest_record(
        row["Tenured"], row["Area Name"], row["Sample"], row["Metric"],
        row["WFH Proportion"], row["Office Proportion"], alpha,
    )
    for _, row in df.iterrows()
]

# Convert results to a DataFrame
df_results = pd.DataFrame(results)

# Add one "COMBINED" row per (tenured status, metric), pooling all areas
combined_results = []

for tenured in df['Tenured'].unique():
    # Filter once per tenured status instead of once per metric
    tenured_rows = df[df['Tenured'] == tenured]
    for metric in df['Metric'].unique():
        combined_subset = tenured_rows[tenured_rows['Metric'] == metric]
        combined_sample = combined_subset['Sample'].sum()
        if combined_sample == 0:
            # No observations for this tenured/metric pair: skip it rather
            # than divide by zero when computing the weighted proportions.
            continue

        # Sample-weighted average proportion across the pooled areas
        combined_wfh_proportion = (combined_subset['WFH Proportion'] * combined_subset['Sample']).sum() / combined_sample
        combined_office_proportion = (combined_subset['Office Proportion'] * combined_subset['Sample']).sum() / combined_sample

        combined_results.append(
            _proportion_ztest_record(
                tenured, "COMBINED", combined_sample, metric,
                combined_wfh_proportion, combined_office_proportion, alpha,
            )
        )

# Convert combined results to a DataFrame and append to the original results
df_combined_results = pd.DataFrame(combined_results)
df_final_results = pd.concat([df_results, df_combined_results], ignore_index=True)

# Enforce the shared column order
df_final_results = df_final_results[result_columns]

print(df_final_results.head(5))