In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import warnings
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')
os.chdir(r'D:\10academy\week_3\ACIS-Insurance-Analysis')
sys.path.append(os.getcwd()) 

In [2]:
# Define the data directory and load the data
DATA_DIR = 'notebooks/data'
data = pd.read_csv(os.path.join(DATA_DIR, 'MachineLearningRating_v3.txt'), sep='|')

# Ensure numerical columns are correctly typed
data['TotalPremium'] = pd.to_numeric(data['TotalPremium'], errors='coerce')
data['TotalClaims'] = pd.to_numeric(data['TotalClaims'], errors='coerce')

# Drop rows where essential financial data is missing
data.dropna(subset=['TotalPremium', 'TotalClaims', 'Province', 'Gender', 'PostalCode'], inplace=True)

# Calculate Claim Frequency
data['HasClaim'] = (data['TotalClaims'] > 0).astype(int)

# Calculate Claim Severity (only for policies with claims)
# Avoid division by zero if TotalPremium is 0
data['ClaimSeverity'] = data.apply(lambda row: row['TotalClaims'] if row['HasClaim'] == 1 else np.nan, axis=1)

# Calculate Margin
data['Margin'] = data['TotalPremium'] - data['TotalClaims']

print("Data preparation for hypothesis testing complete. Head of data with new metrics:")
print(data[['TotalPremium', 'TotalClaims', 'HasClaim', 'ClaimSeverity', 'Margin']].head())


Data preparation for hypothesis testing complete. Head of data with new metrics:
   TotalPremium  TotalClaims  HasClaim  ClaimSeverity      Margin
0     21.929825          0.0         0            NaN   21.929825
1     21.929825          0.0         0            NaN   21.929825
2      0.000000          0.0         0            NaN    0.000000
3    512.848070          0.0         0            NaN  512.848070
4      0.000000          0.0         0            NaN    0.000000


In [3]:
## H0: There are no risk differences across provinces

print("\n--- Testing H0: No risk differences across provinces ---")

# Data segmentation by Province
provinces = data['Province'].unique()

# --- Test for Claim Frequency by Province (Chi-squared test) ---
print("\n--- Claim Frequency by Province ---")
claim_province_crosstab = pd.crosstab(data['Province'], data['HasClaim'])

if claim_province_crosstab.shape[0] > 1 and claim_province_crosstab.shape[1] > 1 and (claim_province_crosstab > 0).all().all():
    chi2, p_value_freq, _, _ = stats.chi2_contingency(claim_province_crosstab)
    print(f"Chi-squared test for Claim Frequency by Province: p-value = {p_value_freq:.4f}")
    if p_value_freq < 0.05:
        print("Conclusion: Reject H0. There are statistically significant differences in claim frequency across provinces.")
        # Interpretation
        province_claim_frequency = data.groupby('Province')['HasClaim'].mean().sort_values(ascending=False)
        print("\nClaim Frequency by Province:")
        print(province_claim_frequency)
        print(f"Interpretation: Provinces like {province_claim_frequency.index[0]} show a higher claim frequency compared to others, such as {province_claim_frequency.index[-1]}.")
    else:
        print("Conclusion: Fail to reject H0. No statistically significant differences in claim frequency across provinces.")
else:
    print("Cannot perform Chi-squared test due to insufficient data or all zero values in contingency table.")


# --- Test for Claim Severity by Province (ANOVA) ---
print("\n--- Claim Severity by Province (for policies with claims) ---")
# Filter for policies with claims for severity analysis
claim_severity_data = data[data['HasClaim'] == 1].dropna(subset=['ClaimSeverity', 'Province'])

if len(claim_severity_data['Province'].unique()) > 1:
    groups = [claim_severity_data['ClaimSeverity'][claim_severity_data['Province'] == p] for p in claim_severity_data['Province'].unique()]
    groups = [g for g in groups if not g.empty]

    if len(groups) > 1:
        f_stat_severity, p_value_severity = stats.f_oneway(*groups)
        print(f"ANOVA test for Claim Severity by Province: p-value = {p_value_severity:.4f}")
        if p_value_severity < 0.05:
            print("Conclusion: Reject H0. There are statistically significant differences in claim severity across provinces.")
            # Interpretation
            province_claim_severity = claim_severity_data.groupby('Province')['ClaimSeverity'].mean().sort_values(ascending=False)
            print("\nAverage Claim Severity by Province:")
            print(province_claim_severity)
            print(f"Interpretation: Policies in {province_claim_severity.index[0]} tend to have higher average claim amounts when a claim occurs, compared to {province_claim_severity.index[-1]}.")
        else:
            print("Conclusion: Fail to reject H0. No statistically significant differences in claim severity across provinces.")
    else:
        print("Cannot perform ANOVA: Not enough distinct provinces with claims data.")
else:
    print("Cannot perform ANOVA: Only one province found with claim severity data.")


--- Testing H0: No risk differences across provinces ---

--- Claim Frequency by Province ---
Chi-squared test for Claim Frequency by Province: p-value = 0.0000
Conclusion: Reject H0. There are statistically significant differences in claim frequency across provinces.

Claim Frequency by Province:
Province
Gauteng          0.003388
KwaZulu-Natal    0.002874
Limpopo          0.002698
North West       0.002436
Mpumalanga       0.002428
Western Cape     0.002166
Eastern Cape     0.001648
Free State       0.001358
Northern Cape    0.001254
Name: HasClaim, dtype: float64
Interpretation: Provinces like Gauteng show a higher claim frequency compared to others, such as Northern Cape.

--- Claim Severity by Province (for policies with claims) ---
ANOVA test for Claim Severity by Province: p-value = 0.0000
Conclusion: Reject H0. There are statistically significant differences in claim severity across provinces.

Average Claim Severity by Province:
Province
Free State       32265.661085
KwaZulu-

In [4]:
## H0: There are no risk differences between zip codes

print("\n--- Testing H0: No risk differences between zip codes ---")

# To make this manageable, let's consider a subset of top N most frequent PostalCodes
top_n_zip_codes = data['PostalCode'].value_counts().nlargest(10).index.tolist()
data_top_zips = data[data['PostalCode'].isin(top_n_zip_codes)].copy()

if not data_top_zips.empty and len(data_top_zips['PostalCode'].unique()) > 1:
    print("\n--- Claim Frequency by Top N Postal Codes ---")
    claim_zip_crosstab = pd.crosstab(data_top_zips['PostalCode'], data_top_zips['HasClaim'])
    if claim_zip_crosstab.shape[0] > 1 and claim_zip_crosstab.shape[1] > 1 and (claim_zip_crosstab > 0).all().all():
        chi2_zip_freq, p_value_zip_freq, _, _ = stats.chi2_contingency(claim_zip_crosstab)
        print(f"Chi-squared test for Claim Frequency by Top N Postal Codes: p-value = {p_value_zip_freq:.4f}")
        if p_value_zip_freq < 0.05:
            print("Conclusion: Reject H0. There are statistically significant differences in claim frequency across these top zip codes.")
            # Interpretation
            zip_claim_frequency = data_top_zips.groupby('PostalCode')['HasClaim'].mean().sort_values(ascending=False)
            print("\nClaim Frequency by Top N Postal Codes:")
            print(zip_claim_frequency)
            print(f"Interpretation: Some zip codes ({zip_claim_frequency.index[0]} e.g.) show a higher propensity for claims.")
        else:
            print("Conclusion: Fail to reject H0. No statistically significant differences in claim frequency across these top zip codes.")
    else:
        print("Cannot perform Chi-squared test for top N zip codes due to insufficient data or all zero values.")

    # --- Test for Claim Severity by Top N PostalCodes (ANOVA) ---
    print("\n--- Claim Severity by Top N Postal Codes (for policies with claims) ---")
    claim_severity_data_top_zips = data_top_zips[data_top_zips['HasClaim'] == 1].dropna(subset=['ClaimSeverity', 'PostalCode'])

    if len(claim_severity_data_top_zips['PostalCode'].unique()) > 1:
        groups_zip_severity = [claim_severity_data_top_zips['ClaimSeverity'][claim_severity_data_top_zips['PostalCode'] == z] for z in claim_severity_data_top_zips['PostalCode'].unique()]
        groups_zip_severity = [g for g in groups_zip_severity if not g.empty]

        if len(groups_zip_severity) > 1:
            f_stat_zip_severity, p_value_zip_severity = stats.f_oneway(*groups_zip_severity)
            print(f"ANOVA test for Claim Severity by Top N Postal Codes: p-value = {p_value_zip_severity:.4f}")
            if p_value_zip_severity < 0.05:
                print("Conclusion: Reject H0. There are statistically significant differences in claim severity across these top zip codes.")
                # Interpretation
                zip_claim_severity = claim_severity_data_top_zips.groupby('PostalCode')['ClaimSeverity'].mean().sort_values(ascending=False)
                print("\nAverage Claim Severity by Top N Postal Codes:")
                print(zip_claim_severity)
                print(f"Interpretation: Certain zip codes ({zip_claim_severity.index[0]} e.g.) are associated with higher average claim costs when claims occur.")
            else:
                print("Conclusion: Fail to reject H0. No statistically significant differences in claim severity across these top zip codes.")
        else:
            print("Cannot perform ANOVA for top N zip codes: Not enough distinct zip codes with claims data.")
    else:
        print("Cannot perform ANOVA for top N zip codes: Only one postal code found with claim severity data.")
else:
    print("Insufficient data or only one unique postal code among top N for testing risk differences between zip codes.")


--- Testing H0: No risk differences between zip codes ---

--- Claim Frequency by Top N Postal Codes ---
Chi-squared test for Claim Frequency by Top N Postal Codes: p-value = 0.0000
Conclusion: Reject H0. There are statistically significant differences in claim frequency across these top zip codes.

Claim Frequency by Top N Postal Codes:
PostalCode
8000    0.004324
470     0.004303
122     0.004271
2000    0.003641
7100    0.002756
299     0.002623
1724    0.002474
458     0.002323
7784    0.001749
7405    0.001566
Name: HasClaim, dtype: float64
Interpretation: Some zip codes (8000 e.g.) show a higher propensity for claims.

--- Claim Severity by Top N Postal Codes (for policies with claims) ---
ANOVA test for Claim Severity by Top N Postal Codes: p-value = 0.0022
Conclusion: Reject H0. There are statistically significant differences in claim severity across these top zip codes.

Average Claim Severity by Top N Postal Codes:
PostalCode
7784    35156.653709
8000    33685.329976
1724   

In [5]:
## H0: There are no significant margin (profit) difference between zip codes

print("\n--- Testing H0: No significant margin (profit) difference between zip codes ---")
margin_data_top_zips = data_top_zips.dropna(subset=['Margin', 'PostalCode'])

if len(margin_data_top_zips['PostalCode'].unique()) > 1:
    groups_zip_margin = [margin_data_top_zips['Margin'][margin_data_top_zips['PostalCode'] == z] for z in margin_data_top_zips['PostalCode'].unique()]
    groups_zip_margin = [g for g in groups_zip_margin if not g.empty]

    if len(groups_zip_margin) > 1:
        f_stat_zip_margin, p_value_zip_margin = stats.f_oneway(*groups_zip_margin)
        print(f"ANOVA test for Margin by Top N Postal Codes: p-value = {p_value_zip_margin:.4f}")
        if p_value_zip_margin < 0.05:
            print("Conclusion: Reject H0. There are statistically significant differences in margin across these top zip codes.")
            # Interpretation
            zip_margin = margin_data_top_zips.groupby('PostalCode')['Margin'].mean().sort_values(ascending=False)
            print("\nAverage Margin by Top N Postal Codes:")
            print(zip_margin)
            print(f"Interpretation: Certain zip codes ({zip_margin.index[0]} e.g.) yield significantly higher profit margins, suggesting opportunities for targeted marketing or premium adjustments.")
        else:
            print("Conclusion: Fail to reject H0. No statistically significant differences in margin across these top zip codes.")
    else:
        print("Cannot perform ANOVA for top N zip codes: Not enough distinct zip codes with margin data.")
else:
    print("Insufficient data or only one unique postal code among top N for testing margin differences between zip codes.")


--- Testing H0: No significant margin (profit) difference between zip codes ---
ANOVA test for Margin by Top N Postal Codes: p-value = 0.3108
Conclusion: Fail to reject H0. No statistically significant differences in margin across these top zip codes.


In [6]:
## H0: There are not significant risk difference between Women and Men

print("\n--- Testing H0: No significant risk difference between Women and Men ---")

# Data segmentation by Gender
genders = data['Gender'].unique()
if 'Male' in genders and 'Female' in genders: # Ensure both genders are present
    print("\n--- Claim Frequency by Gender ---")
    claim_gender_crosstab = pd.crosstab(data['Gender'], data['HasClaim'])
    if claim_gender_crosstab.shape[0] > 1 and claim_gender_crosstab.shape[1] > 1 and (claim_gender_crosstab > 0).all().all():
        chi2_gender_freq, p_value_gender_freq, _, _ = stats.chi2_contingency(claim_gender_crosstab)
        print(f"Chi-squared test for Claim Frequency by Gender: p-value = {p_value_gender_freq:.4f}")
        if p_value_gender_freq < 0.05:
            print("Conclusion: Reject H0. There are statistically significant differences in claim frequency between genders.")
            gender_claim_frequency = data.groupby('Gender')['HasClaim'].mean().sort_values(ascending=False)
            print("\nClaim Frequency by Gender:")
            print(gender_claim_frequency)
            print(f"Interpretation: {gender_claim_frequency.index[0]} show a higher claim frequency compared to {gender_claim_frequency.index[1]}.")
        else:
            print("Conclusion: Fail to reject H0. No statistically significant differences in claim frequency between genders.")
    else:
        print("Cannot perform Chi-squared test for gender due to insufficient data or all zero values.")
    print("\n--- Claim Severity by Gender (for policies with claims) ---")
    claim_severity_data_gender = data[data['HasClaim'] == 1].dropna(subset=['ClaimSeverity', 'Gender'])

    if 'Male' in claim_severity_data_gender['Gender'].unique() and 'Female' in claim_severity_data_gender['Gender'].unique():
        male_claims = claim_severity_data_gender[claim_severity_data_gender['Gender'] == 'Male']['ClaimSeverity']
        female_claims = claim_severity_data_gender[claim_severity_data_gender['Gender'] == 'Female']['ClaimSeverity']

        if not male_claims.empty and not female_claims.empty:
            t_stat_gender_severity, p_value_gender_severity = stats.ttest_ind(male_claims, female_claims, equal_var=False) # Welch's t-test assuming unequal variances
            print(f"Independent t-test for Claim Severity by Gender: p-value = {p_value_gender_severity:.4f}")
            if p_value_gender_severity < 0.05:
                print("Conclusion: Reject H0. There are statistically significant differences in claim severity between genders.")
                # Interpretation
                gender_claim_severity = claim_severity_data_gender.groupby('Gender')['ClaimSeverity'].mean().sort_values(ascending=False)
                print("\nAverage Claim Severity by Gender:")
                print(gender_claim_severity)
                print(f"Interpretation: When claims occur, {gender_claim_severity.index[0]} tend to have higher average claim amounts than {gender_claim_severity.index[1]}.")
            else:
                print("Conclusion: Fail to reject H0. No statistically significant differences in claim severity between genders.")
        else:
            print("Cannot perform t-test for gender severity: One or both gender groups have no claim severity data.")
    else:
        print("Cannot perform t-test for gender severity: Not enough distinct genders with claim severity data (requires Male and Female).")
else:
    print("Cannot perform gender-based hypothesis testing: 'Male' and/or 'Female' gender categories not found in data.")


--- Testing H0: No significant risk difference between Women and Men ---

--- Claim Frequency by Gender ---
Chi-squared test for Claim Frequency by Gender: p-value = 0.0266
Conclusion: Reject H0. There are statistically significant differences in claim frequency between genders.

Claim Frequency by Gender:
Gender
Not specified    0.002833
Male             0.002195
Female           0.002073
Name: HasClaim, dtype: float64
Interpretation: Not specified show a higher claim frequency compared to Male.

--- Claim Severity by Gender (for policies with claims) ---
Independent t-test for Claim Severity by Gender: p-value = 0.5680
Conclusion: Fail to reject H0. No statistically significant differences in claim severity between genders.
