# Hypothesis Testing (Task 3)

### Data Preparation

In [10]:
import os
import pandas as pd

# Correct path construction (use raw string or forward slashes)
file_path = r'C:\Users\user\Desktop\insurance-risk-analytics\notebooks\data\Insurance_dataset.csv'  # Add .csv

# Verify file exists
if os.path.exists(file_path):
    df = pd.read_csv(file_path)
    print("Data loaded successfully! Shape:", df.shape)
else:
    print("File not found. Check:")
    print("- File exists at path")
    print("- Correct extension (.csv/.parquet/.xlsx)")
    print("- No typos in path")

  df = pd.read_csv(file_path)


Data loaded successfully! Shape: (1000098, 52)


In [12]:
import pandas as pd
from scipy.stats import ttest_ind, chi2_contingency, f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Create essential binary flags
df['HadClaim'] = (df['TotalClaims'] > 0).astype(int)  # Claim occurrence
df['ProfitMargin'] = df['TotalPremium'] - df['TotalClaims']  # Monetary impact

### Provincial Risk Differences

In [16]:
# Frequency test (Chi-square)
province_freq = pd.crosstab(df['Province'], df['HadClaim'])
chi2, pval, _, _ = chi2_contingency(province_freq)
print(f"Province Claim Frequency p-value: {pval:.4f}")

# Severity test (ANOVA + Tukey HSD)
province_groups = df.groupby('Province')['TotalClaims'].apply(list)
f_stat, pval = f_oneway(*province_groups)
print(f"Province Claim Severity ANOVA p-value: {pval:.4f}")

# Post-hoc analysis (if p < 0.05)
if pval < 0.05:
    tukey = pairwise_tukeyhsd(df['TotalClaims'], df['Province'])
    print(tukey.summary())

Province Claim Frequency p-value: 0.0000
Province Claim Severity ANOVA p-value: 0.0000
         Multiple Comparison of Means - Tukey HSD, FWER=0.05         
    group1        group2    meandiff p-adj    lower    upper   reject
---------------------------------------------------------------------
 Eastern Cape    Free State  -0.8905    1.0  -93.3782  91.5973  False
 Eastern Cape       Gauteng  29.9166  0.469  -14.1438   73.977  False
 Eastern Cape KwaZulu-Natal  39.5209 0.1625    -6.572  85.6137  False
 Eastern Cape       Limpopo  -3.7859    1.0  -67.0642  59.4924  False
 Eastern Cape    Mpumalanga  -5.9283    1.0  -59.2172  47.3607  False
 Eastern Cape    North West   -3.396    1.0  -50.1304  43.3384  False
 Eastern Cape Northern Cape -30.6867  0.991  -132.535  71.1616  False
 Eastern Cape  Western Cape   16.118 0.9765  -29.9541  62.1902  False
   Free State       Gauteng   30.807 0.9662  -52.2009  113.815  False
   Free State KwaZulu-Natal  40.4113 0.8605  -43.6931 124.5157  False
   

### Zip Code Analysis

In [14]:
# Top/Bottom 20% zip codes by loss ratio
zip_stats = df.groupby('PostalCode').agg(
    Claims=('TotalClaims', 'sum'),
    Premium=('TotalPremium', 'sum')
).assign(LossRatio=lambda x: x['Claims']/x['Premium'])

high_risk = df[df['PostalCode'].isin(zip_stats.nlargest(20, 'LossRatio').index)]
low_risk = df[df['PostalCode'].isin(zip_stats.nsmallest(20, 'LossRatio').index)]

# Margin comparison
t_stat, pval = ttest_ind(high_risk['ProfitMargin'], low_risk['ProfitMargin'])
print(f"\nZip Code Margin p-value: {pval:.4f}")


Zip Code Margin p-value: 0.0000


###  Gender Test

In [15]:
# Clean gender data
valid_genders = df['Gender'].dropna().unique()
print(f"Unique genders: {valid_genders}")  # Check for unexpected values

# Frequency test
gender_freq = pd.crosstab(df['Gender'], df['HadClaim'])
chi2, pval, _, _ = chi2_contingency(gender_freq)
print(f"\nGender Claim Frequency p-value: {pval:.4f}")

# Severity test
male_claims = df[df['Gender'] == 'Male']['TotalClaims']
female_claims = df[df['Gender'] == 'Female']['TotalClaims']
t_stat, pval = ttest_ind(male_claims, female_claims)
print(f"Gender Claim Severity p-value: {pval:.4f}")

Unique genders: ['Not specified' 'Male' 'Female']

Gender Claim Frequency p-value: 0.0266
Gender Claim Severity p-value: 0.8041
