In [1]:
import pandas as pd
import numpy as np

# loading dataset

In [14]:
df = pd.read_csv('insurance_claims.csv')

In [15]:
df.shape

(1000, 40)

In [16]:
data=df.replace('?',np.NAN) # missing values are represented with ? so we replaced it with Nan

In [17]:
data.columns

Index(['months_as_customer', 'age', 'policy_number', 'policy_bind_date',
       'policy_state', 'policy_csl', 'policy_deductable',
       'policy_annual_premium', 'umbrella_limit', 'insured_zip', 'insured_sex',
       'insured_education_level', 'insured_occupation', 'insured_hobbies',
       'insured_relationship', 'capital-gains', 'capital-loss',
       'incident_date', 'incident_type', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_state', 'incident_city',
       'incident_location', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'property_damage', 'bodily_injuries',
       'witnesses', 'police_report_available', 'total_claim_amount',
       'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make',
       'auto_model', 'auto_year', 'fraud_reported', '_c39'],
      dtype='object')

In [18]:
cols_to_drop = ['policy_number','policy_bind_date','policy_state','insured_zip','incident_date','incident_location', 'incident_state', 'incident_city','incident_hour_of_the_day','incident_state', 'incident_city','auto_make','auto_model', 'auto_year','insured_hobbies']

In [19]:
len(cols_to_drop)

15

In [20]:
data.drop(columns=cols_to_drop,inplace = True)

In [21]:
data.head()

Unnamed: 0,months_as_customer,age,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_sex,insured_education_level,insured_occupation,insured_relationship,...,property_damage,bodily_injuries,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,fraud_reported,_c39
0,328,48,250/500,1000,1406.91,0,MALE,MD,craft-repair,husband,...,YES,1,2,YES,71610,6510,13020,52080,Y,
1,228,42,250/500,2000,1197.22,5000000,MALE,MD,machine-op-inspct,other-relative,...,,0,0,,5070,780,780,3510,Y,
2,134,29,100/300,2000,1413.14,5000000,FEMALE,PhD,sales,own-child,...,NO,2,3,NO,34650,7700,3850,23100,N,
3,256,41,250/500,2000,1415.74,6000000,FEMALE,PhD,armed-forces,unmarried,...,,1,2,NO,63400,6340,6340,50720,Y,
4,228,44,500/1000,1000,1583.91,6000000,MALE,Associate,sales,unmarried,...,NO,0,1,NO,6500,1300,650,4550,N,


In [25]:
data.isna().sum()

months_as_customer                0
age                               0
policy_csl                        0
policy_deductable                 0
policy_annual_premium             0
umbrella_limit                    0
insured_sex                       0
insured_education_level           0
insured_occupation                0
insured_relationship              0
capital-gains                     0
capital-loss                      0
incident_type                     0
collision_type                  178
incident_severity                 0
authorities_contacted             0
number_of_vehicles_involved       0
property_damage                 360
bodily_injuries                   0
witnesses                         0
police_report_available         343
total_claim_amount                0
injury_claim                      0
property_claim                    0
vehicle_claim                     0
fraud_reported                    0
_c39                           1000
dtype: int64

In [43]:
data.drop(labels=['_c39'],axis = 'columns',inplace = True)

In [45]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 26 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   months_as_customer           1000 non-null   int64  
 1   age                          1000 non-null   int64  
 2   policy_csl                   1000 non-null   object 
 3   policy_deductable            1000 non-null   int64  
 4   policy_annual_premium        1000 non-null   float64
 5   umbrella_limit               1000 non-null   int64  
 6   insured_sex                  1000 non-null   object 
 7   insured_education_level      1000 non-null   object 
 8   insured_occupation           1000 non-null   object 
 9   insured_relationship         1000 non-null   object 
 10  capital-gains                1000 non-null   int64  
 11  capital-loss                 1000 non-null   int64  
 12  incident_type                1000 non-null   object 
 13  collision_type     

In [49]:
# !pip install sklearn-pandas==1.5.0

In [50]:
from sklearn_pandas import CategoricalImputer

In [51]:
imputer = CategoricalImputer()

In [52]:
data['collision_type'] = imputer.fit_transform(data['collision_type'])
data['property_damage'] = imputer.fit_transform(data['property_damage'])
data['police_report_available'] = imputer.fit_transform(data['police_report_available'])


In [80]:
categ_df = data.select_dtypes(include = ['object']).copy()

In [72]:
categ_df.shape

(1000, 12)

In [58]:
categ_df.columns

Index(['policy_csl', 'insured_sex', 'insured_education_level',
       'insured_occupation', 'insured_relationship', 'incident_type',
       'collision_type', 'incident_severity', 'authorities_contacted',
       'property_damage', 'police_report_available', 'fraud_reported'],
      dtype='object')

In [59]:
categ_df.policy_csl.unique()

array(['250/500', '100/300', '500/1000'], dtype=object)

In [60]:
categ_df.insured_sex.unique()

array(['MALE', 'FEMALE'], dtype=object)

In [61]:
categ_df.insured_education_level.unique()

array(['MD', 'PhD', 'Associate', 'Masters', 'High School', 'College',
       'JD'], dtype=object)

In [82]:
categ_df.insured_sex.unique()

array(['MALE', 'FEMALE'], dtype=object)

In [83]:
categ_df.policy_csl = categ_df.policy_csl.map({'250/500':1, '100/300':2.5, '500/1000':5})
categ_df.insured_education_level = categ_df.insured_education_level.map({'MD':6, 'PhD':7, 'Associate':5, 'Masters':4, 'High School':2, 'College':3,  'JD':1})
categ_df.incident_severity = categ_df.incident_severity.map({'Major Damage':3, 'Minor Damage':2, 'Total Loss':4, 'Trivial Damage':1})
categ_df.insured_sex = categ_df.insured_sex.map({'MALE':1, 'FEMALE':0})
categ_df.property_damage = categ_df.property_damage.map({'YES':1, 'NO':0})
categ_df.police_report_available = categ_df.police_report_available.map({'YES':1, 'NO':0})
categ_df.fraud_reported = categ_df.fraud_reported.map({'Y':1, 'N':0})


In [84]:
for col in categ_df.drop(columns=['policy_csl','insured_education_level','incident_severity','insured_sex','property_damage','insured_sex','police_report_available','fraud_reported']):
    categ_df = pd.get_dummies(categ_df,columns=[col],prefix=[col],drop_first=True)

In [85]:
categ_df

Unnamed: 0,policy_csl,insured_sex,insured_education_level,incident_severity,property_damage,police_report_available,fraud_reported,insured_occupation_armed-forces,insured_occupation_craft-repair,insured_occupation_exec-managerial,...,insured_relationship_wife,incident_type_Parked Car,incident_type_Single Vehicle Collision,incident_type_Vehicle Theft,collision_type_Rear Collision,collision_type_Side Collision,authorities_contacted_Fire,authorities_contacted_None,authorities_contacted_Other,authorities_contacted_Police
0,1.0,1,6,3,1,1,1,0,1,0,...,0,0,1,0,0,1,0,0,0,1
1,1.0,1,6,2,0,0,1,0,0,0,...,0,0,0,1,1,0,0,0,0,1
2,2.5,0,7,2,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,1.0,0,7,3,0,0,1,1,0,0,...,0,0,1,0,0,0,0,0,0,1
4,5.0,1,5,2,0,0,0,0,0,0,...,0,0,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,5.0,0,4,2,1,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0
996,2.5,0,7,3,1,0,0,0,0,0,...,1,0,1,0,1,0,1,0,0,0
997,1.0,0,4,2,0,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
998,5.0,1,5,3,0,1,0,0,0,0,...,1,0,1,0,1,0,0,0,1,0
