In [13]:
import sys
import os
import pandas as pd
import numpy as np

import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

# Resolve the project root as the parent of the current working directory.
# NOTE: this assumes the kernel was started inside the 'notebooks' folder.
project_root = os.path.dirname(os.getcwd())

# Project folders that hold importable helper modules.
scripts_path = os.path.join(project_root, "scripts")
analysis_path = os.path.join(project_root, "analysis")

# Register each folder on sys.path only once, so re-running this cell
# does not keep stacking duplicate entries.
for _import_dir in (project_root, scripts_path, analysis_path):
    if _import_dir not in sys.path:
        sys.path.append(_import_dir)

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats


In [2]:
# Load the cleaned policy data. low_memory=False makes pandas infer each column's
# dtype from the whole file rather than chunk by chunk, which avoids mixed-type
# columns (and the DtypeWarning that chunked inference emits for them).
df_clean = pd.read_csv("../data/df_clean.csv", low_memory=False)

  df_clean = pd.read_csv("../data/df_clean.csv")


##### Policy-Level Feature Engineering
Create the binary claim indicator (`HasClaim`) and the `Margin` feature on the cleaned DataFrame (`df_clean`).

In [3]:
from analysis import filter_gender_for_testing, run_frequency_test,run_mean_test

In [4]:
# Claim-frequency target: 1 when the policy has a positive claim amount, else 0.
positive_claim_mask = df_clean['TotalClaims'] > 0
df_clean['HasClaim'] = np.where(positive_claim_mask, 1, 0)

# Margin: premium collected minus claims paid, per policy row.
df_clean['Margin'] = df_clean['TotalPremium'].sub(df_clean['TotalClaims'])

# Severity is measured only on rows flagged as having a claim, so keep
# an independent copy of that subset.
claims_only_df = df_clean.loc[df_clean['HasClaim'].eq(1)].copy()

In [5]:
# Build the sample used for the gender hypothesis. Per the summary the helper
# prints, only policies labelled Male/Female are kept; the helper is
# project-local, so confirm its exact filtering rules in the `analysis` module.
df_gender_test = filter_gender_for_testing(df_clean)

--- Gender Data Filtering ---
Original Policies Count: 1,000,098
Filtered Policies Count (Male/Female only): 49,572
Data retained: 4.96%


In [6]:
# Claimed-policy subset of the gender sample, used for the severity comparison.
gender_claim_mask = df_gender_test['HasClaim'].eq(1)
df_gender_claims_only_test = df_gender_test.loc[gender_claim_mask].copy()

##### Hypothesis 4: Risk Difference Between Women and Men ($H_0$: No risk difference between Women and Men)

We assume the Gender column is correctly labeled; the tests below compare the 'Male' and 'Female' groups.

In [9]:
def _h0_decision(p_value, alpha=0.05):
    """Return the H0 decision label for a p-value, or None when it is unusable.

    A p-value of None or NaN means the test was skipped or could not be
    computed, so no decision is reported for it.
    """
    if p_value is None or pd.isna(p_value):
        return None
    return 'Reject H0' if p_value < alpha else 'Fail to Reject H0'


# Hypothesis 4: compare Male vs Female policies on frequency, severity and margin.
if 'Gender' in df_gender_test.columns:
    group_men = df_gender_test[df_gender_test['Gender'] == 'Male']
    group_women = df_gender_test[df_gender_test['Gender'] == 'Female']

    # 1. Frequency Test (Risk)
    # Per the original note, the helper returns (None, None, NaN, NaN) if skipped.
    stat_freq, p_freq, freq_M, freq_F = run_frequency_test(group_men, group_women, 'Gender Frequency')

    # 2. Severity Test (Risk) - uses only policies that actually had a claim
    claims_men = df_gender_claims_only_test[df_gender_claims_only_test['Gender'] == 'Male']
    claims_women = df_gender_claims_only_test[df_gender_claims_only_test['Gender'] == 'Female']
    stat_sev, p_sev, sev_M, sev_F = run_mean_test(claims_men, claims_women, 'TotalClaims', 'Gender Severity')

    # 3. Margin Test (Profit)
    stat_margin, p_margin, margin_M, margin_F = run_mean_test(group_men, group_women, 'Margin', 'Gender Margin')

    print("\n--- Gender (M vs F) Statistical Analysis ---")

    # --- Print Frequency Results ---
    # NaN is checked as well as None so that an undefined p-value is never
    # reported as "Fail to Reject H0" (NaN < 0.05 is simply False).
    result_freq = _h0_decision(p_freq)
    if result_freq is not None:
        print(f"Frequency: M={freq_M:.4f}, F={freq_F:.4f} | P-value: {p_freq:.5f} ({result_freq})")
    else:
        print("Frequency: Test skipped due to small sample size.")

    # --- Print Severity Results ---
    result_sev = _h0_decision(p_sev)
    if result_sev is not None:
        print(f"Severity: M={sev_M:,.2f}, F={sev_F:,.2f} | P-value: {p_sev:.5f} ({result_sev})")
    else:
        print("Severity: Test skipped or returned NaN (small sample warning).")

    # --- Print Margin Results ---
    result_margin = _h0_decision(p_margin)
    if result_margin is not None:
        print(f"Margin: M={margin_M:,.2f}, F={margin_F:,.2f} | P-value: {p_margin:.5f} ({result_margin})")
    else:
        print("Margin: Test skipped or returned NaN (small sample warning).")
else:
    # Make the skip visible instead of silently producing no output.
    print("Gender: 'Gender' column not found in df_gender_test; Hypothesis 4 tests were not run.")


--- Gender (M vs F) Statistical Analysis ---
Frequency: M=0.0022, F=0.0021 | P-value: 0.84049 (Fail to Reject H0)
Severity: M=14,858.55, F=17,874.72 | P-value: 0.56803 (Fail to Reject H0)
Margin: M=4.28, F=8.03 | P-value: 0.80155 (Fail to Reject H0)


##### Hypothesis 1: Risk Differences Across Provinces ($H_0$: No risk differences across provinces)
To test provinces, we typically use the two largest provinces as the A/B groups, as required by the task ("select two categories to split the data into Group A and Group B").


#### PROVINCE HYPOTHESIS TESTING

In [11]:
# Groups A and B are the two provinces with the most policies.
province_policy_counts = df_clean['Province'].value_counts()
top_provinces = province_policy_counts.nlargest(2).index
prov_A_name, prov_B_name = top_provinces[0], top_provinces[1]

prov_A = df_clean.loc[df_clean['Province'] == prov_A_name]
prov_B = df_clean.loc[df_clean['Province'] == prov_B_name]

# Frequency (risk): compared over every policy in each province.
stat_freq, p_freq, freq_A, freq_B = run_frequency_test(prov_A, prov_B, 'Province Frequency')

# Severity (risk): compared only over policies that had a claim.
claims_prov_A = claims_only_df.loc[claims_only_df['Province'] == prov_A_name]
claims_prov_B = claims_only_df.loc[claims_only_df['Province'] == prov_B_name]
stat_sev, p_sev, sev_A, sev_B = run_mean_test(claims_prov_A, claims_prov_B, 'TotalClaims', 'Province Severity')

# Margin (profit): compared over every policy in each province.
stat_margin, p_margin, margin_A, margin_B = run_mean_test(prov_A, prov_B, 'Margin', 'Province Margin')


# ------------------------------------------------------------------------------
# Reporting: headline naming the two provinces being compared
# ------------------------------------------------------------------------------
province_report_header = f"\n--- Province ({prov_A_name} vs {prov_B_name}) Statistical Analysis ---"
print(province_report_header)

# Helper function for printing results safely
def print_result(p_value, mean_A, mean_B, name_A, name_B, metric_format, alpha=0.05):
    """Print one A/B comparison line together with its hypothesis-test decision.

    Parameters
    ----------
    p_value : float or None
        P-value of the test. None or NaN is treated as "test not available".
    mean_A, mean_B : float
        Group-level metric values shown for group A and group B.
    name_A, name_B : str
        Display names of the two groups.
    metric_format : str
        Format spec applied to both metric values (e.g. '.4f' or ',.2f').
    alpha : float, default 0.05
        Significance level; H0 is rejected when ``p_value < alpha``.

    Returns
    -------
    None
        The result is printed to stdout.
    """
    # Guard clause: no usable p-value means there is no decision to report.
    if p_value is None or pd.isna(p_value):
        print(f"{name_A} vs {name_B}: Test skipped or returned NaN (sample size too small).")
        return

    result = 'Reject H0' if p_value < alpha else 'Fail to Reject H0'
    # metric_format is applied as a nested format spec to both group values.
    print(f"{name_A}: {mean_A:{metric_format}}, {name_B}: {mean_B:{metric_format}} | P-value: {p_value:.5f} ({result})")

# One row per KPI: heading, p-value, group A value, group B value, number format.
province_report_rows = (
    ("Claim Frequency:", p_freq, freq_A, freq_B, '.4f'),
    ("Claim Severity:", p_sev, sev_A, sev_B, ',.2f'),
    ("Margin:", p_margin, margin_A, margin_B, ',.2f'),
)

for kpi_heading, kpi_p_value, kpi_value_A, kpi_value_B, kpi_format in province_report_rows:
    print(kpi_heading)
    print_result(kpi_p_value, kpi_value_A, kpi_value_B, prov_A_name, prov_B_name, kpi_format)


--- Province (Gauteng vs Western Cape) Statistical Analysis ---
Claim Frequency:
Gauteng: 0.0034, Western Cape: 0.0022 | P-value: 0.00000 (Reject H0)
Claim Severity:
Gauteng: 22,243.88, Western Cape: 28,095.85 | P-value: 0.03060 (Reject H0)
Margin:
Gauteng: -13.56, Western Cape: -3.41 | P-value: 0.16360 (Fail to Reject H0)


### HYPOTHESES 2 & 3: ZIP CODE ANOVA ANALYSIS

In [14]:
# Restrict the ANOVA to the highest-volume zip codes so that every group
# has enough policies for the test to be meaningful.
TOP_N_ZIPS = 10
zip_policy_counts = df_clean['PostalCode'].value_counts()
top_zip_codes = zip_policy_counts.nlargest(TOP_N_ZIPS).index

# --- HYPOTHESIS 3: Margin Difference (Profitability) ---
# H0: the mean Margin is identical across the selected top zip codes.

# One Margin series per zip code, in the same order as top_zip_codes,
# with missing values removed.
margin_groups = [
    df_clean.loc[df_clean['PostalCode'] == zc, 'Margin'].dropna()
    for zc in top_zip_codes
]

# One-way ANOVA; each series is passed to f_oneway as its own sample.
margin_anova_result = stats.f_oneway(*margin_groups)
f_stat_margin, p_margin_anova = margin_anova_result

# --- Report Margin Results ---
margin_decision = 'Reject H0' if p_margin_anova < 0.05 else 'Fail to Reject H0'
print(f"\n--- ANOVA for Margin Across Top {TOP_N_ZIPS} Zip Codes (Hypothesis 3) ---")
print(f"F-statistic: {f_stat_margin:.2f} | P-value: {p_margin_anova:.5f}")
print(f"Result: {margin_decision}")

# --- HYPOTHESIS 2: Claim Severity Difference (Risk) ---
# H0: the mean claim severity is identical across the selected top zip codes.
# Severity is defined on claimed policies only, hence claims_only_df.

# One TotalClaims series per zip code, in the same order as top_zip_codes.
severity_groups = [
    claims_only_df.loc[claims_only_df['PostalCode'] == zc, 'TotalClaims'].dropna()
    for zc in top_zip_codes
]

# Drop zip codes with fewer than 5 claims; groups that small would make
# the ANOVA fail or give unreliable results.
severity_groups_valid = list(filter(lambda claims: len(claims) >= 5, severity_groups))
valid_zip_count = len(severity_groups_valid)

if valid_zip_count >= 2:
    # Enough groups remain: run the one-way ANOVA and report it.
    f_stat_severity, p_severity_anova = stats.f_oneway(*severity_groups_valid)
    severity_decision = 'Reject H0' if p_severity_anova < 0.05 else 'Fail to Reject H0'

    print(f"\n--- ANOVA for Severity Across Top {valid_zip_count} Claiming Zip Codes (Hypothesis 2) ---")
    print(f"F-statistic: {f_stat_severity:.2f} | P-value: {p_severity_anova:.5f}")
    print(f"Result: {severity_decision}")
else:
    # ANOVA needs at least two groups to compare.
    print(f"\n--- ANOVA for Severity Across Zip Codes (Hypothesis 2) ---")
    print("Skipped: Not enough Zip Codes (min 2) had sufficient claim data for ANOVA.")


--- ANOVA for Margin Across Top 10 Zip Codes (Hypothesis 3) ---
F-statistic: 1.05 | P-value: 0.39636
Result: Fail to Reject H0

--- ANOVA for Severity Across Top 10 Claiming Zip Codes (Hypothesis 2) ---
F-statistic: 5.24 | P-value: 0.00000
Result: Reject H0


# Task 3: Statistical Validation and Segmentation Strategy

The goal of this task is to statistically validate key hypotheses related to risk and profit metrics (Claim Frequency, Claim Severity, and Margin) to inform AlphaCare's new segmentation and pricing strategy. The significance threshold for rejecting the Null Hypothesis ($H_0$) is set at $\alpha = 0.05$.

---

## 1. Hypothesis 4: Gender Comparison (Completed)

**$H_0$: There is no significant risk difference between Women and Men.**

| KPI (Metric) | Group M (Male) | Group F (Female) | P-value | Conclusion | Risk/Profit Implication |
| :--- | :---: | :---: | :---: | :---: | :--- |
| **Claim Frequency** | 0.0022 | 0.0021 | **0.84049** | Fail to Reject $H_0$ | No statistically significant difference in claim frequency. |
| **Claim Severity** | 14,858.55 | 17,874.72 | **0.56803** | Fail to Reject $H_0$ | No statistically significant difference in average claim size. |
| **Margin (Profit)** | 4.28 | 8.03 | **0.80155** | Fail to Reject $H_0$ | No statistically significant difference in profitability. |

### Strategic Recommendation (Gender)

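Gender is **NOT** a statistically significant driver of risk or profit. AlphaCare should adopt a **gender-neutral pricing and segmentation strategy**. Note that this test covers only the 49,572 policies (4.96% of the data) with a recorded Male/Female gender, so the conclusion applies to that subset.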

---

## 2. Hypothesis 1: Province Comparison

**$H_0$: There are no risk differences across provinces (Comparing the two largest provinces).**

(Test compares Gauteng vs. Western Cape)

### 2.1 Summary of Results

| KPI (Metric) | Gauteng (Group A) | Western Cape (Group B) | P-value | Conclusion | Risk/Profit Implication |
| :--- | :---: | :---: | :---: | :---: | :--- |
| **Claim Frequency** | 0.0034 | 0.0022 | **< 0.00001** | **Reject $H_0$** | **Gauteng has significantly higher claim frequency.** |
| **Claim Severity** | 22,243.88 | 28,095.85 | **0.03060** | **Reject $H_0$** | **Western Cape has significantly higher average claim size.** |
| **Margin (Profit)** | -13.56 | -3.41 | **0.16360** | Fail to Reject $H_0$ | Difference in mean profit is NOT statistically significant. |

### 2.2 Strategic Recommendation (Province)

The null hypothesis is **rejected for both key risk metrics (Frequency and Severity)**. **Province is a critical risk driver** and must be integrated into the segmentation and pricing models to capture distinct risk profiles (Gauteng: high frequency; Western Cape: high severity).

---

## 3. Hypotheses 2 & 3: Zip Code ANOVA (Completed)

**$H_0$: There are no significant differences in risk (Severity) or profit (Margin) among the top 10 Zip Codes.**

### 3.1 Summary of Results

| KPI (Metric) | F-statistic | P-value | Conclusion | Business Implication |
| :--- | :---: | :---: | :---: | :--- |
| **Claim Severity** (H2) | 5.24 | **< 0.00001** | **Reject $H_0$** | **Claim Severity is NOT the same; local average claim cost differs.** |
| **Margin** (H3) | 1.05 | **0.39636** | Fail to Reject $H_0$ | No statistically significant difference in mean profitability across the top 10 Zip Codes. |

### 3.2 Strategic Recommendation (Zip Code)

The null hypothesis is **rejected for Claim Severity**, confirming that local geographic differences significantly impact the cost of claims.

* **Refine Pricing by Severity:** The model **must use Zip Code** to capture the differences in **Claim Severity** (high-cost risk areas) for fine-tuned pricing.
* **Margin Stability:** The overall profitability structure (Margin) can be maintained, but the underlying risk volatility must be addressed by pricing for severity.

---

## 4. Final Segmentation Strategy Summary

The statistical validation confirms the necessity of focusing the new segmentation strategy primarily on geographical factors to accurately price risk.

| Risk Driver | Statistical Impact | Recommended Action |
| :--- | :---: | :--- |
| **Gender** | No significant effect detected (failed to reject H0) | **Exclude from Pricing Model.** Adopt Gender-Neutral pricing. |
| **Province** | **High Impact** (H0 rejected) | **Must use as a mandatory feature.** Segment pricing by regional risk profile. |
| **Zip Code** | **High Impact** on Severity (H0 rejected) | **Use for Fine-Tuned Pricing.** Essential for capturing high-cost risk variance. |