In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from scipy.stats import f_oneway, mannwhitneyu


# Load the dataset.
# NOTE(review): this is an absolute, machine-specific path; consider reading it
# from a configurable data directory so the notebook runs on other machines.
df = pd.read_csv('C:\\Users\\Toshiba\\Documents\\Kifiya\\week 3\\Data\\converted_data.csv')

# Data cleaning and missing-value handling.
# Numeric KPI columns: replace missing values with the column mean.
imputer = SimpleImputer(strategy='mean')
df[['TotalClaims', 'TotalPremium']] = imputer.fit_transform(df[['TotalClaims', 'TotalPremium']])

# Categorical columns: replace missing values with the most frequent value.
# The filled Series is assigned back to the frame rather than calling
# `df[col].fillna(..., inplace=True)`: that form operates on an intermediate
# object, raises a FutureWarning today, and stops updating `df` in pandas 3.0.
df['PostalCode'] = df['PostalCode'].fillna(df['PostalCode'].mode()[0])
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])

# Derive the margin KPI (premium minus claims) unless the file already has it.
if 'ProfitMargin' not in df.columns:
    df['ProfitMargin'] = df['TotalPremium'] - df['TotalClaims']

# Column names of the KPIs used by the hypothesis tests below.
kpi_claims = 'TotalClaims'
kpi_margin = 'ProfitMargin'

# Segment the data and run one statistical test per hypothesis.

# Hypothesis 1: do claims differ across provinces? (one-way ANOVA)
province_groups = df.groupby('Province')[kpi_claims]
prov_risks = [claims for _province, claims in province_groups]
f_stat_provinces, p_value_provinces = f_oneway(*prov_risks)

# Hypothesis 2: do claims differ between zip codes? (one-way ANOVA)
zip_code_groups = df.groupby('PostalCode')[kpi_claims]
zip_risks = [claims for _postal_code, claims in zip_code_groups]
f_stat_zip_codes, p_value_zip_codes = f_oneway(*zip_risks)

# Hypothesis 3: does the margin differ between zip codes? (one-way ANOVA)
zip_margin_groups = df.groupby('PostalCode')[kpi_margin]
zip_margin = [margins for _postal_code, margins in zip_margin_groups]
f_stat_margin_zip_codes, p_value_margin_zip_codes = f_oneway(*zip_margin)

# Hypothesis 4: do claims differ between women and men?
# (two-sided Mann-Whitney U test on the two gender subsets)
is_female = df['Gender'] == 'Female'
is_male = df['Gender'] == 'Male'
women_risk = df.loc[is_female, kpi_claims]
men_risk = df.loc[is_male, kpi_claims]
u_stat_gender, p_value_gender = mannwhitneyu(
    women_risk,
    men_risk,
    alternative='two-sided',
)

# Analyze and Report
def analyze_and_report(p_value, hypothesis_name, alpha=0.05):
    """Turn a p-value into a one-line verdict for a named hypothesis.

    Args:
        p_value: p-value produced by a statistical test.
        hypothesis_name: Label placed at the start of the returned line.
        alpha: Significance level; the null hypothesis is rejected when
            ``p_value < alpha``. Defaults to 0.05.

    Returns:
        A string of the form ``"<hypothesis_name>: <verdict> (p-value: 0.1234)"``.
    """
    import math  # local import so the function does not rely on other cells

    if math.isnan(p_value):
        # NaN compares False against any threshold, so without this branch an
        # undefined p-value would be reported as "fail to reject".
        result = "Test inconclusive: p-value is undefined (NaN)."
    elif p_value < alpha:
        result = "Reject the null hypothesis: Evidence suggests a significant effect."
    else:
        result = "Fail to reject the null hypothesis: No significant effect detected."
    return f"{hypothesis_name}: {result} (p-value: {p_value:.4f})"

# Print one verdict line per hypothesis, in numbered order.
hypothesis_results = [
    (p_value_provinces, "1. Risk Differences Across Provinces"),
    (p_value_zip_codes, "2. Risk Differences Between Zip Codes"),
    (p_value_margin_zip_codes, "3. Margin Differences Between Zip Codes"),
    (p_value_gender, "4. Risk Differences Between Women and Men"),
]

print("\nHypothesis Testing Results:\n")
for hypothesis_p_value, hypothesis_label in hypothesis_results:
    print(analyze_and_report(hypothesis_p_value, hypothesis_label))

# Summarise how many groups each segmentation produced.
print("\nData Segmentation Overview:\n")
print(f"Data Segmented by Province: {len(province_groups)} provinces")
print(f"Data Segmented by Zip Code: {len(zip_code_groups)} zip codes")

# # Optionally: Save results (before uncommenting, add `import os` to the imports; this code uses os.makedirs and os.path.join)
# results_dir = 'C:\\Users\\Toshiba\\Documents\\Kifiya\\week 3\\Results' 
# os.makedirs(results_dir, exist_ok=True)

# # Save results
# results_df = pd.DataFrame({
#     'Hypothesis': [
#         "Risk Differences Across Provinces",
#         "Risk Differences Between Zip Codes",
#         "Margin Differences Between Zip Codes",
#         "Risk Differences Between Women and Men"
#     ],
#     'P-Value': [p_value_provinces, p_value_zip_codes, p_value_margin_zip_codes, p_value_gender],
#     'Result': [
#         analyze_and_report(p_value_provinces, "1. Risk Differences Across Provinces"),
#         analyze_and_report(p_value_zip_codes, "2. Risk Differences Between Zip Codes"),
#         analyze_and_report(p_value_margin_zip_codes, "3. Margin Differences Between Zip Codes"),
#         analyze_and_report(p_value_gender, "4. Risk Differences Between Women and Men")
#     ]
# })
# results_df.to_csv(os.path.join(results_dir, 'hypothesis_testing_results.csv'), index=False)


  df = pd.read_csv('C:\\Users\\Toshiba\\Documents\\Kifiya\\week 3\\Data\\converted_data.csv')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['PostalCode'].fillna(df['PostalCode'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)



Hypothesis Testing Results:

1. Risk Differences Across Provinces: Reject the null hypothesis: Evidence suggests a significant effect. (p-value: 0.0000)
2. Risk Differences Between Zip Codes: Fail to reject the null hypothesis: No significant effect detected. (p-value: 0.8907)
3. Margin Differences Between Zip Codes: Fail to reject the null hypothesis: No significant effect detected. (p-value: 0.9977)
4. Risk Differences Between Women and Men: Fail to reject the null hypothesis: No significant effect detected. (p-value: 0.8417)

Data Segmentation Overview:

Data Segmented by Province: 9 provinces
Data Segmented by Zip Code: 888 zip codes
