In [1]:
# Import all required libraries
import pandas as pd
from io import StringIO
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Set visualization style.
# sns.set_theme is the preferred interface; sns.set is only an alias for it
# that the seaborn docs say may be removed in the future.
sns.set_theme(style="whitegrid")
# Default size (width, height in inches) for figures that do not pass figsize.
plt.rcParams['figure.figsize'] = (12, 6)



In [2]:
# Load the data - make sure to use your correct file path
DATA_PATH = 'insurance_claims_cleaned_20250613_020951.csv'

try:
    # The file is pipe-delimited. low_memory=False makes pandas infer each
    # column's dtype from the whole column instead of chunk by chunk.
    # NOTE(review): the recorded output shows a warning raised on this call,
    # presumably a mixed-type DtypeWarning - confirm it is gone after this change.
    df = pd.read_csv(DATA_PATH, sep='|', low_memory=False)
    print("✅ Data loaded successfully")
except Exception as e:
    print(f"❌ Error loading data: {str(e)}")
    # Re-raise rather than calling exit(): in a notebook exit() shuts the
    # kernel down, whereas an exception stops the run and keeps the traceback.
    raise

# Check basic info
print("\n=== DATA STRUCTURE ===")
# DataFrame.info() prints its report and returns None, so it is not wrapped
# in print() (that would append a stray "None" line to the output).
df.info()

# Calculate LossRatio if needed
if 'TotalPremium' in df.columns and 'TotalClaims' in df.columns:
    # Row-level claims / premium. Rows with a zero premium yield inf or NaN
    # here; the hypothesis-testing step filters those out before testing.
    df['LossRatio'] = df['TotalClaims'] / df['TotalPremium']
else:
    print("❌ Missing required columns for analysis")
    raise KeyError("Columns 'TotalPremium' and 'TotalClaims' are required for analysis")
    

  df = pd.read_csv('insurance_claims_cleaned_20250613_020951.csv', sep='|')


✅ Data loaded successfully

=== DATA STRUCTURE ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000098 entries, 0 to 1000097
Data columns (total 52 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   UnderwrittenCoverID       1000098 non-null  int64  
 1   PolicyID                  1000098 non-null  int64  
 2   TransactionMonth          1000098 non-null  object 
 3   IsVATRegistered           1000098 non-null  bool   
 4   Citizenship               1000098 non-null  object 
 5   LegalType                 1000098 non-null  object 
 6   Title                     1000098 non-null  object 
 7   Language                  1000098 non-null  object 
 8   Bank                      1000098 non-null  object 
 9   AccountType               1000098 non-null  object 
 10  MaritalStatus             1000098 non-null  object 
 11  Gender                    1000098 non-null  object 
 12  Country                   1000098

In [None]:
# ======================
# 3. EXPLORATORY DATA ANALYSIS
# ======================
# Draws, in order: premium/claims histograms, Gender/Province counts,
# mean loss ratio per province, a correlation heatmap, and monthly totals.
# Reads `df` from the loading cell; the only column it may add is LossRatio.
print("\n=== STEP 3: EXPLORATORY DATA ANALYSIS ===")

# 3.1 Univariate Analysis
print("\n--- Univariate Analysis ---")

# Numerical distributions
fig, (premium_ax, claims_ax) = plt.subplots(1, 2, figsize=(14, 6))
sns.histplot(df['TotalPremium'], kde=True, color='blue', ax=premium_ax)
premium_ax.set_title('Total Premium Distribution')
sns.histplot(df['TotalClaims'], kde=True, color='red', ax=claims_ax)
claims_ax.set_title('Total Claims Distribution')
fig.tight_layout()
plt.show()

# Categorical distributions
fig, (gender_ax, province_ax) = plt.subplots(1, 2, figsize=(14, 6))
df['Gender'].value_counts().plot(kind='bar', color='green', ax=gender_ax)
gender_ax.set_title('Gender Distribution')
df['Province'].value_counts().plot(kind='bar', color='purple', ax=province_ax)
province_ax.set_title('Province Distribution')
fig.tight_layout()
plt.show()

# 3.2 Bivariate Analysis

# Calculate Loss Ratio if not already done
if 'LossRatio' not in df.columns:
    df['LossRatio'] = df['TotalClaims'] / df['TotalPremium']

# Plot from a two-column view restricted to finite ratios, so rows where the
# division produced inf/NaN are excluded explicitly rather than left to the
# plotting library's handling.
province_loss = df[['Province', 'LossRatio']]
province_loss = province_loss[np.isfinite(province_loss['LossRatio'])]

# Set style for better visuals
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(14, 7))

# Loss Ratio by Province. This is the only copy of this chart: an earlier
# duplicate that passed `palette` without `hue` was removed.
province_plot = sns.barplot(
    x='Province',
    y='LossRatio',
    data=province_loss,
    hue='Province',  # Explicitly set hue to Province
    palette='viridis',
    legend=False,    # Disable legend since hue=Province
    dodge=False,     # Ensure single bars per province
    ax=ax,
)

# Customize the plot
ax.set_title('Loss Ratio by Province (Higher = More Risky)', fontsize=16, pad=20)
ax.set_xlabel('Province', fontsize=14)
ax.set_ylabel('Loss Ratio (Claims/Premium)', fontsize=14)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Add value labels on top of bars
for bar in province_plot.patches:
    province_plot.annotate(
        f"{bar.get_height():.2f}",
        (bar.get_x() + bar.get_width() / 2., bar.get_height()),
        ha='center', va='center',
        xytext=(0, 10),
        textcoords='offset points',
        fontsize=10
    )

fig.tight_layout()
plt.show()

# Correlation Analysis
fig, ax = plt.subplots(figsize=(10, 8))
corr_matrix = df[['TotalPremium', 'TotalClaims', 'CustomValueEstimate']].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

# 3.3 Temporal Analysis
print("\n--- Temporal Analysis ---")

# Monthly trends.
# TransactionMonth is loaded as text, so it is parsed into a local datetime
# Series (df itself is left untouched) to get a real, chronologically ordered
# time axis instead of one categorical tick per distinct string.
# Values that cannot be parsed become NaT and are left out of the grouping.
transaction_month = pd.to_datetime(df['TransactionMonth'], errors='coerce')
monthly_data = (
    df[['TotalPremium', 'TotalClaims']]
    .groupby(transaction_month)
    .sum()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(monthly_data['TransactionMonth'], monthly_data['TotalPremium'], label='Total Premium')
ax.plot(monthly_data['TransactionMonth'], monthly_data['TotalClaims'], label='Total Claims')
ax.set_title('Monthly Premium vs Claims Trend')
ax.set_xlabel('Date')
ax.set_ylabel('Amount')
ax.legend()
fig.autofmt_xdate()  # slant the date tick labels so they do not overlap
plt.show()



=== STEP 3: EXPLORATORY DATA ANALYSIS ===

--- Univariate Analysis ---


In [None]:
# ======================
# 4. HYPOTHESIS TESTING (FIXED)
# ======================
# Tests whether the row-level loss ratio differs (1) across provinces with a
# one-way ANOVA and (2) between 'Male' and 'Female' with Welch's t-test.
# Side effect: `df` is rebound to the subset of rows with a finite LossRatio.
print("\n=== STEP 4: HYPOTHESIS TESTING ===")

# Significance level shared by both tests.
ALPHA = 0.05

# Ensure LossRatio is calculated properly
df['LossRatio'] = df['TotalClaims'] / df['TotalPremium']

# Remove infinite or NaN values from LossRatio.
# np.isfinite is False for +inf, -inf and NaN alike, so one mask covers all
# three. The explicit copy makes the filtered frame independent of the
# original, so later cells that add columns are not writing into a slice.
df = df[np.isfinite(df['LossRatio'])].copy()

# Hypothesis 1: Risk differences across provinces
print("\n--- Hypothesis 1: Province Risk Differences ---")
# A single groupby pass replaces one full-frame boolean scan per province.
# Groups produced by groupby are never empty, and rows with a missing
# Province are skipped (they matched no province in the per-value scan either).
province_groups = [
    losses for _, losses in df.groupby('Province', sort=False)['LossRatio']
]

if len(province_groups) >= 2:  # Need at least 2 groups for ANOVA
    f_stat, p_value = stats.f_oneway(*province_groups)
    print(f"ANOVA p-value: {p_value:.4f}")
    print("Conclusion:", "Reject null hypothesis" if p_value < ALPHA else "Fail to reject null hypothesis")
else:
    print("Insufficient data for ANOVA test")

# Hypothesis 2: Gender risk differences
print("\n--- Hypothesis 2: Gender Risk Differences ---")
gender_groups = df['Gender'].unique()
if len(gender_groups) >= 2:
    # Only the literal labels 'Male' and 'Female' are compared; any other
    # Gender value present in the data is left out of the test.
    male_loss = df.loc[df['Gender'] == 'Male', 'LossRatio']
    female_loss = df.loc[df['Gender'] == 'Female', 'LossRatio']

    if len(male_loss) > 1 and len(female_loss) > 1:  # Need at least 2 samples per group
        # equal_var=False selects Welch's t-test (no equal-variance assumption).
        t_stat, p_value = stats.ttest_ind(male_loss, female_loss, equal_var=False)
        print(f"Welch's t-test p-value: {p_value:.4f}")
        print("Conclusion:", "Reject null hypothesis" if p_value < ALPHA else "Fail to reject null hypothesis")
    else:
        print("Insufficient data for t-test")
else:
    print("Insufficient gender groups for comparison")

In [None]:
# ======================
# 5. FEATURE ENGINEERING (FIXED)
# ======================
# Adds VehicleAge and ProfitMargin to `df`, label-encodes the categorical
# feature columns, and builds X / y plus an 80/20 train-test split with
# TotalClaims as the target.
print("\n=== STEP 5: FEATURE ENGINEERING ===")

# Check column names (case sensitivity)
print("Available columns:", [col for col in df.columns if col.lower() in ['make', 'model']])

# Use correct column names (note your data shows 'make' not 'Make')
correct_make_col = 'make' if 'make' in df.columns else 'Make'
correct_model_col = 'model' if 'model' in df.columns else 'Model'

# Create new features.
# TransactionMonth is loaded as text (object dtype), and the .dt accessor only
# works on datetime-like values, so parse it first. Parsing into a local
# Series leaves the stored column unchanged and also works if the column has
# already been converted. Unparseable values become NaT, which gives a NaN
# VehicleAge that the dropna step below removes.
transaction_dates = pd.to_datetime(df['TransactionMonth'], errors='coerce')
df['VehicleAge'] = transaction_dates.dt.year - df['RegistrationYear']
df['ProfitMargin'] = df['TotalPremium'] - df['TotalClaims']

# Encode categorical variables, keeping only the columns that actually exist
label_encoders = {}
categorical_cols = [
    col
    for col in [correct_make_col, correct_model_col, 'Gender', 'Province']
    if col in df.columns
]

# NOTE(review): this overwrites the original labels in df with integer codes,
# so re-running this cell or the earlier cells afterwards sees the codes, not
# the text. Kept as-is because later cells may rely on the encoded columns.
for col in categorical_cols:
    le = LabelEncoder()
    # astype(str) also turns missing values into the literal category 'nan'.
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le  # kept so the codes can be mapped back to labels
    print(f"Encoded {col} with {len(le.classes_)} unique values")

# Select features and target
features = categorical_cols + ['VehicleAge', 'SumInsured']
X = df[features].copy()
y = df['TotalClaims'].copy()

# Verify no missing values
print("\nMissing values before split:")
print(X.isnull().sum())

# Drop any remaining NA (if necessary)
X = X.dropna()
y = y.loc[X.index]  # Align y with cleaned X by index label

# Train-test split
if len(X) > 0:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"\nTrain shape: {X_train.shape}, Test shape: {X_test.shape}")
else:
    print("\n❌ No valid data remaining after cleaning")