In [6]:
import pandas as pd
from scipy import stats

ALPHA = 0.05

# Assume 'df' is our preloaded and cleaned DataFrame from Task 1
df = pd.read_csv("./../data/MachineLearningRating_v3.txt", sep = "|")

# 1. Engineer 'HasClaim' for Claim Frequency analysis
# A binary flag: 1 if a claim was made, 0 otherwise.
df['HasClaim'] = (df['TotalClaims'] > 0).astype(int)

# 2. Engineer 'Margin' for profitability analysis
# The difference between premium and claims.
df['Margin'] = df['TotalPremium'] - df['TotalClaims']

# 3. For Claim Severity analysis, we will filter the data to only include policies with claims.
claims_only_df = df[df['HasClaim'] == 1].copy()

print("Data prepared for hypothesis testing.")
print(f"Total policies: {len(df)}")
print(f"Policies with claims: {len(claims_only_df)}")

  df = pd.read_csv("./../data/MachineLearningRating_v3.txt", sep = "|")


Data prepared for hypothesis testing.
Total policies: 1000098
Policies with claims: 2788


In [7]:
# ===== Hypothesis 1: No risk differences across PROVINCES =====
print("\n>>> Testing Hypothesis 1: No risk differences across Provinces <<<")

# A. Claim Frequency by Province (Chi-Squared Test)
print("\n--- H1(a): Testing Claim Frequency by Province ---")
contingency_table_prov = pd.crosstab(df['Province'], df['HasClaim'])
chi2, p_value, _, _ = stats.chi2_contingency(contingency_table_prov)
print(f"Chi-Squared Test Result: p-value = {p_value:.4f}")
if p_value < ALPHA:
    print("Result: Reject H₀. There is a statistically significant difference in claim frequency across provinces.")
else:
    print("Result: Fail to reject H₀. No significant evidence of a difference in claim frequency across provinces.")

# B. Claim Severity by Province (One-Way ANOVA)
print("\n--- H1(b): Testing Claim Severity by Province ---")
province_groups = claims_only_df.groupby('Province')['TotalClaims'].apply(list)
# Ensure we have more than one group to compare
if len(province_groups) > 1:
    f_statistic, p_value = stats.f_oneway(*province_groups)
    print(f"One-Way ANOVA Result: p-value = {p_value:.4f}")
    if p_value < ALPHA:
        print("Result: Reject H₀. There is a statistically significant difference in claim severity (average claim amount) across provinces.")
    else:
        print("Result: Fail to reject H₀. No significant evidence of a difference in claim severity across provinces.")
else:
    print("Skipping test: Only one province group found in the data with claims.")
print("-" * 50)


>>> Testing Hypothesis 1: No risk differences across Provinces <<<

--- H1(a): Testing Claim Frequency by Province ---
Chi-Squared Test Result: p-value = 0.0000
Result: Reject H₀. There is a statistically significant difference in claim frequency across provinces.

--- H1(b): Testing Claim Severity by Province ---
One-Way ANOVA Result: p-value = 0.0000
Result: Reject H₀. There is a statistically significant difference in claim severity (average claim amount) across provinces.
--------------------------------------------------


In [8]:
# ===== Hypothesis 2 & 3: No risk or margin differences between ZIP CODES =====
print("\n>>> Testing Hypotheses 2 & 3: No risk or margin differences across top 20 Zip Codes <<<")

# Prepare data for top 20 zip codes by policy volume
top_20_zipcodes = df['PostalCode'].value_counts().nlargest(20).index
df_top_zips = df[df['PostalCode'].isin(top_20_zipcodes)].copy()
claims_only_df_top_zips = df_top_zips[df_top_zips['HasClaim'] == 1].copy()
print(f"Analysis focused on the top 20 zip codes, covering {len(df_top_zips)} policies.")

# A. Claim Frequency by Zip Code (Chi-Squared Test)
print("\n--- H2(a): Testing Claim Frequency by Zip Code ---")
contingency_table_zip = pd.crosstab(df_top_zips['PostalCode'], df_top_zips['HasClaim'])
chi2, p_value, _, _ = stats.chi2_contingency(contingency_table_zip)
print(f"Chi-Squared Test Result: p-value = {p_value:.4f}")
if p_value < ALPHA:
    print("Result: Reject H₀. There is a statistically significant difference in claim frequency across top zip codes.")
else:
    print("Result: Fail to reject H₀. No significant evidence of a difference in claim frequency across top zip codes.")

# B. Claim Severity by Zip Code (One-Way ANOVA)
print("\n--- H2(b): Testing Claim Severity by Zip Code ---")
zip_groups_severity = claims_only_df_top_zips.groupby('PostalCode')['TotalClaims'].apply(list)
if len(zip_groups_severity) > 1:
    f_statistic, p_value = stats.f_oneway(*zip_groups_severity)
    print(f"One-Way ANOVA Result: p-value = {p_value:.4f}")
    if p_value < ALPHA:
        print("Result: Reject H₀. There is a statistically significant difference in claim severity across top zip codes.")
    else:
        print("Result: Fail to reject H₀. No significant evidence of a difference in claim severity across top zip codes.")
else:
    print("Skipping test: Only one zip code group found in the data with claims.")

# C. Margin by Zip Code (One-Way ANOVA)
print("\n--- H3: Testing Margin (Profit) by Zip Code ---")
zip_groups_margin = df_top_zips.groupby('PostalCode')['Margin'].apply(list)
if len(zip_groups_margin) > 1:
    f_statistic, p_value = stats.f_oneway(*zip_groups_margin)
    print(f"One-Way ANOVA Result: p-value = {p_value:.4f}")
    if p_value < ALPHA:
        print("Result: Reject H₀. There is a statistically significant difference in margin across top zip codes.")
    else:
        print("Result: Fail to reject H₀. No significant evidence of a difference in margin across top zip codes.")
else:
    print("Skipping test: Only one zip code group found for margin analysis.")
print("-" * 50)



>>> Testing Hypotheses 2 & 3: No risk or margin differences across top 20 Zip Codes <<<
Analysis focused on the top 20 zip codes, covering 399225 policies.

--- H2(a): Testing Claim Frequency by Zip Code ---
Chi-Squared Test Result: p-value = 0.0000
Result: Reject H₀. There is a statistically significant difference in claim frequency across top zip codes.

--- H2(b): Testing Claim Severity by Zip Code ---
One-Way ANOVA Result: p-value = 0.0000
Result: Reject H₀. There is a statistically significant difference in claim severity across top zip codes.

--- H3: Testing Margin (Profit) by Zip Code ---
One-Way ANOVA Result: p-value = 0.0116
Result: Reject H₀. There is a statistically significant difference in margin across top zip codes.
--------------------------------------------------


In [9]:
# ===== Hypothesis 4: No significant risk difference between WOMEN and MEN =====
print("\n>>> Testing Hypothesis 4: No significant risk difference between Genders <<<")

# A. Claim Frequency by Gender (Chi-Squared Test)
print("\n--- H4(a): Testing Claim Frequency by Gender ---")
contingency_table_gender = pd.crosstab(df['Gender'], df['HasClaim'])
chi2, p_value, _, _ = stats.chi2_contingency(contingency_table_gender)
print(f"Chi-Squared Test Result: p-value = {p_value:.4f}")
if p_value < ALPHA:
    print("Result: Reject H₀. There is a statistically significant difference in claim frequency between genders.")
else:
    print("Result: Fail to reject H₀. No significant evidence of a difference in claim frequency between genders.")

# B. Claim Severity by Gender (Independent t-test)
print("\n--- H4(b): Testing Claim Severity by Gender ---")
claims_men = claims_only_df[claims_only_df['Gender'] == 'M']['TotalClaims']
claims_women = claims_only_df[claims_only_df['Gender'] == 'F']['TotalClaims']

# Ensure there are claims for both genders to compare
if len(claims_men) > 1 and len(claims_women) > 1:
    # Use Welch's t-test which does not assume equal variances
    t_statistic, p_value = stats.ttest_ind(claims_men, claims_women, equal_var=False)
    print(f"Independent t-test Result: p-value = {p_value:.4f}")
    if p_value < ALPHA:
        print("Result: Reject H₀. There is a statistically significant difference in claim severity between genders.")
    else:
        print("Result: Fail to reject H₀. No significant evidence of a difference in claim severity between genders.")
else:
    print("Skipping test: Not enough claim data for at least one gender.")
print("-" * 50)


>>> Testing Hypothesis 4: No significant risk difference between Genders <<<

--- H4(a): Testing Claim Frequency by Gender ---
Chi-Squared Test Result: p-value = 0.0266
Result: Reject H₀. There is a statistically significant difference in claim frequency between genders.

--- H4(b): Testing Claim Severity by Gender ---
Skipping test: Not enough claim data for at least one gender.
--------------------------------------------------


In [10]:
print("\n--- Hypothesis Testing Complete ---")
print("The results above should be used to formulate business recommendations for the final report.")


--- Hypothesis Testing Complete ---
The results above should be used to formulate business recommendations for the final report.
