In [293]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as datetime
import warnings
warnings.filterwarnings('ignore')
import os

In [294]:
df = pd.read_csv("./data/raw/policy_data.csv")
df.shape  # to check the shape of the dataframe

(500000, 9)

In [295]:
# Display the first few rows of the dataframe
df.head()

Unnamed: 0,policy_id,policy_start_date,policy_end_date,exposure,vehicle_age,region,channel,sum_insured,premium
0,1,17-08-2018,16-08-2019,0.99726,2,South,Agent,607200,9720.0
1,2,02-06-2023,30-05-2024,0.994521,14,East,Agent,408700,9478.0
2,3,01-08-2022,28-05-2023,0.821918,2,North,Direct,1341200,17632.0
3,4,26-01-2021,04-02-2022,1.024658,6,East,Online,735600,13541.0
4,5,11-01-2021,06-02-2022,1.071233,4,South,Online,3442000,59936.0


In [296]:
# Display summary information about the dataframe and daata types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   policy_id          500000 non-null  int64  
 1   policy_start_date  500000 non-null  object 
 2   policy_end_date    500000 non-null  object 
 3   exposure           500000 non-null  float64
 4   vehicle_age        500000 non-null  int64  
 5   region             500000 non-null  object 
 6   channel            500000 non-null  object 
 7   sum_insured        500000 non-null  int64  
 8   premium            498503 non-null  float64
dtypes: float64(2), int64(3), object(4)
memory usage: 34.3+ MB


In [297]:
df.describe()

Unnamed: 0,policy_id,exposure,vehicle_age,sum_insured,premium
count,500000.0,500000.0,500000.0,500000.0,498503.0
mean,250000.5,0.958991,7.497408,809444.7,14958.301013
std,144337.711635,0.091743,4.60536,492532.3,9319.428396
min,1.0,-0.2,0.0,-7018700.0,1147.0
25%,125000.75,0.890411,4.0,481200.0,8608.0
50%,250000.5,0.958904,7.0,699100.0,12709.0
75%,375000.25,1.027397,11.0,1013900.0,18698.0
max,500000.0,2.0,15.0,9305500.0,220300.0


In [298]:
df.dtypes

policy_id              int64
policy_start_date     object
policy_end_date       object
exposure             float64
vehicle_age            int64
region                object
channel               object
sum_insured            int64
premium              float64
dtype: object

In [299]:
# Check for missing values in each column
df.isnull().sum()

policy_id               0
policy_start_date       0
policy_end_date         0
exposure                0
vehicle_age             0
region                  0
channel                 0
sum_insured             0
premium              1497
dtype: int64

In [300]:
# check policy _id 
df['policy_id'].is_unique
df.columns

Index(['policy_id', 'policy_start_date', 'policy_end_date', 'exposure',
       'vehicle_age', 'region', 'channel', 'sum_insured', 'premium'],
      dtype='object')

In [301]:
#exposure are negative and more that 1 and it is not hence drop exposure
df = df.drop(columns=['exposure'],axis=1)
df.head()

Unnamed: 0,policy_id,policy_start_date,policy_end_date,vehicle_age,region,channel,sum_insured,premium
0,1,17-08-2018,16-08-2019,2,South,Agent,607200,9720.0
1,2,02-06-2023,30-05-2024,14,East,Agent,408700,9478.0
2,3,01-08-2022,28-05-2023,2,North,Direct,1341200,17632.0
3,4,26-01-2021,04-02-2022,6,East,Online,735600,13541.0
4,5,11-01-2021,06-02-2022,4,South,Online,3442000,59936.0


In [302]:
df['channel'].value_counts()

channel
Agent            200198
Online           109723
Broker            89978
Direct            60049
Bancassurance     40052
Name: count, dtype: int64

In [303]:
# Check for negative values in 'sum_insured' column
(df['sum_insured'] < 0).sum()

np.int64(1560)

In [304]:
df_clean = df[df['sum_insured'] > 0].copy()
df_clean.shape

(498440, 8)

In [305]:
(df_clean['sum_insured'] < 0).sum()
df_clean.dtypes

policy_id              int64
policy_start_date     object
policy_end_date       object
vehicle_age            int64
region                object
channel               object
sum_insured            int64
premium              float64
dtype: object

In [306]:
# Parse dates first
df_clean['policy_start_date'] = pd.to_datetime(df['policy_start_date'])
df_clean['policy_end_date'] = pd.to_datetime(df['policy_end_date'])

# check valid dates and remove invalid dates
df_clean_valid = df_clean[df_clean['policy_start_date'] < df_clean['policy_end_date']]
df_clean.shape

(498440, 8)

In [307]:
# Calculate exposure in years and earned premium
df_clean_valid['exposure'] = ((df_clean_valid['policy_end_date'] - df_clean_valid['policy_start_date']).dt.days + 1) / 365.25
df_clean_valid['exposure'] = df_clean_valid['exposure'].clip(lower=0, upper=1)
df.shape
df_clean_valid['earned_premium'] = df_clean_valid['premium'] * df_clean_valid['exposure']

In [324]:
# Save cleaned dataframe
df_clean_valid.to_csv('./data/processed/policy_clean.csv', index=False)
print("✅ Saved: policy_clean")

✅ Saved: policy_clean


In [309]:
#clean claims data
# Load claims data
df = pd.read_csv('./data/raw/claims_data.csv')
df.shape

(89084, 21)

In [310]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89084 entries, 0 to 89083
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   policy_id         44587 non-null  float64
 1   claim_id          44587 non-null  float64
 2   accident_date     44587 non-null  object 
 3   accident_year     44587 non-null  float64
 4   development_year  89084 non-null  int64  
 5   claim_amount      44587 non-null  float64
 6   claim_status      44587 non-null  object 
 7   Unnamed: 7        0 non-null      float64
 8   Unnamed: 8        0 non-null      float64
 9   Unnamed: 9        0 non-null      float64
 10  Unnamed: 10       0 non-null      float64
 11  Unnamed: 11       12 non-null     object 
 12  Unnamed: 12       12 non-null     object 
 13  Unnamed: 13       10 non-null     float64
 14  Unnamed: 14       9 non-null      float64
 15  Unnamed: 15       8 non-null      float64
 16  Unnamed: 16       7 non-null      float6

In [311]:
df = df.iloc[:,[0,1,2,3,4,5]]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89084 entries, 0 to 89083
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   policy_id         44587 non-null  float64
 1   claim_id          44587 non-null  float64
 2   accident_date     44587 non-null  object 
 3   accident_year     44587 non-null  float64
 4   development_year  89084 non-null  int64  
 5   claim_amount      44587 non-null  float64
dtypes: float64(4), int64(1), object(1)
memory usage: 4.1+ MB


In [312]:
# Check for missing values
(df['claim_amount'] <= 0).sum()

np.int64(0)

In [313]:
df.head()

Unnamed: 0,policy_id,claim_id,accident_date,accident_year,development_year,claim_amount
0,24.0,1.0,06-06-2025,2025.0,0,521910.0
1,32.0,2.0,24-10-2019,2019.0,1,92636.0
2,46.0,3.0,12-11-2024,2024.0,1,28492.0
3,55.0,4.0,28-05-2022,2022.0,3,17377.0
4,55.0,5.0,14-09-2021,2021.0,0,13725.0


In [314]:
df['policy_id'].is_unique

False

In [315]:
df['accident_date'] = pd.to_datetime(df['accident_date'], dayfirst=True , errors='coerce')
df.dtypes

policy_id                  float64
claim_id                   float64
accident_date       datetime64[ns]
accident_year              float64
development_year             int64
claim_amount               float64
dtype: object

In [316]:
df = df.astype({
    'claim_id': 'Int64',
    'accident_year': 'Int64'
})


In [317]:
df.dtypes

policy_id                  float64
claim_id                     Int64
accident_date       datetime64[ns]
accident_year                Int64
development_year             int64
claim_amount               float64
dtype: object

In [318]:
# Save cleaned dataframe
df.to_csv('./data/processed/claims_clean.csv', index=False)
print("✅ Saved: claims_clean")

✅ Saved: claims_clean


In [319]:
#cleaning and loading payment_data
payment = pd.read_csv('./data/raw/payment_data.csv').copy()

In [320]:
payment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155694 entries, 0 to 155693
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   claim_id      155694 non-null  int64 
 1   payment_date  155694 non-null  object
 2   paid_amount   155694 non-null  int64 
 3   case_reserve  155694 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 4.8+ MB


In [321]:
payment['payment_date'] = pd.to_datetime(payment['payment_date'], dayfirst=True , errors='coerce')
payment.dtypes

claim_id                 int64
payment_date    datetime64[ns]
paid_amount              int64
case_reserve             int64
dtype: object

In [322]:
#checking Negative values
print((payment['case_reserve'] < 0).sum())
print((payment['paid_amount'] < 0).sum())

0
0


In [323]:
# Save cleaned dataframe
payment.to_csv('./data/processed/payment_clean.csv', index=False)
print("✅ Saved: payment_clean")

✅ Saved: payment_clean
