In [52]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer

In [53]:
# Tokens that pd.read_csv should load as NaN (passed via `na_values` below).
# NOTE(review): the list is applied to every column of every file, so a
# legitimate numeric -1 or -5 would also become NaN — confirm this is intended.
na_vals = ['-5', '-1', '???', 'MISSINGVAL', 'MISSEDDATA', 'MISSINGVALUE', 'NA', '?']

In [54]:
def _read_raw(path):
    """Read one raw CSV, mapping every token in ``na_vals`` to NaN."""
    with open(path, 'r') as handle:
        return pd.read_csv(handle, na_values=na_vals)


# The four raw training tables, each keyed by CustomerID.
train_claim = _read_raw('TrainData/init_data/Train_Claim.csv')
train_policy = _read_raw('TrainData/init_data/Train_Policy.csv')
train_vehicle = _read_raw('TrainData/init_data/Train_Vehicle.csv')
train_demographics = _read_raw('TrainData/init_data/Train_Demographics.csv')

In [55]:
# Per-column count of missing values for each raw table, in load order.
for raw_table in (train_claim, train_policy, train_vehicle, train_demographics):
    print(raw_table.isna().sum())

CustomerID                   0
DateOfIncident               0
TypeOfIncident               0
TypeOfCollission          5162
SeverityOfIncident           0
AuthoritiesContacted         0
IncidentState                0
IncidentCity                 0
IncidentAddress              0
IncidentTime                31
NumberOfVehicles             0
PropertyDamage           10459
BodilyInjuries               0
Witnesses                   46
PoliceReport              9805
AmountOfTotalClaim          50
AmountOfInjuryClaim          0
AmountOfPropertyClaim        0
AmountOfVehicleDamage        0
dtype: int64
InsurancePolicyNumber           0
CustomerLoyaltyPeriod           0
DateOfPolicyCoverage            0
InsurancePolicyState            0
Policy_CombinedSingleLimit      0
Policy_Deductible               0
PolicyAnnualPremium           141
UmbrellaLimit                   0
InsuredRelationship             0
CustomerID                      0
dtype: int64
CustomerID                  0
VehicleAttribut

In [56]:
# Distinct values of PropertyDamage, NaN included.
pd.unique(train_claim["PropertyDamage"])

array([nan, 'YES', 'NO'], dtype=object)

In [57]:
# Discard the two demographic columns that are not used as features.
train_demographics = train_demographics.drop(columns=['Country', 'InsuredZipCode'])

In [58]:
# The policy number is not used as a feature; joins go through CustomerID.
train_policy = train_policy.drop(columns=['InsurancePolicyNumber'])

In [281]:
# Reshape the long (CustomerID, VehicleAttribute, VehicleAttributeDetails)
# table into one row per customer with one column per attribute.
#
# The earlier approach one-hot encoded the attribute, summed per customer and
# then patched the detail strings in with a nested Python loop. That wrote
# strings into integer columns, left a literal 0 wherever a customer lacked an
# attribute, and branched on the truth value of a NumPy array, which raises for
# multi-element arrays and (on recent NumPy) for empty ones. A pivot performs
# the same reshaping directly; attributes a customer lacks are now NaN.
vehicle_long = (
    train_vehicle
    # A row without an attribute name has no column to land in.
    .dropna(subset=['VehicleAttribute'])
    # Keep the first row per (customer, attribute) so `pivot` never sees
    # duplicate entries; this mirrors the old "take the first match" rule.
    .drop_duplicates(subset=['CustomerID', 'VehicleAttribute'], keep='first')
)

vehicle_exp_df = vehicle_long.pivot(
    index='CustomerID',
    columns='VehicleAttribute',
    values='VehicleAttributeDetails',
)
# Remove the 'VehicleAttribute' label from the column axis so the frame has
# plain column names (VehicleID, VehicleMake, ...).
vehicle_exp_df.columns.name = None

vehicle = vehicle_exp_df.copy()
vehicle.head()

Unnamed: 0_level_0,VehicleID,VehicleMake,VehicleModel,VehicleYOM
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Cust10000,Vehicle26917,Audi,A5,2008
Cust10001,Vehicle15893,Audi,A5,2006
Cust10002,Vehicle5152,Volkswagen,Jetta,1999
Cust10003,Vehicle37363,Volkswagen,Jetta,2003
Cust10004,Vehicle28633,Toyota,CRV,2010


In [282]:
# The vehicle identifier is dropped; make, model and year are kept.
vehicle = vehicle.drop(columns=['VehicleID'])

In [283]:
# Cache the reshaped vehicle table (CustomerID index included) on disk.
vehicle.to_csv('TrainData/vehicle.csv')

In [6]:
# Reload the cached vehicle table; the first CSV column (CustomerID) is the index.
vehicle = pd.read_csv('TrainData/vehicle.csv', index_col=0)

In [7]:
# Attach policy, demographic and vehicle data to each claim row. Claims are the
# left side of every join, so no claim row is dropped.
train = (
    train_claim
    .merge(train_policy, how='left', on='CustomerID')
    .merge(train_demographics, how='left', on='CustomerID')
    .merge(vehicle, how='left', on='CustomerID')
)

train.head(5)

Unnamed: 0,CustomerID,DateOfIncident,TypeOfIncident,TypeOfCollission,SeverityOfIncident,AuthoritiesContacted,IncidentState,IncidentCity,IncidentAddress,IncidentTime,...,InsuredAge,InsuredGender,InsuredEducationLevel,InsuredOccupation,InsuredHobbies,CapitalGains,CapitalLoss,VehicleMake,VehicleModel,VehicleYOM
0,Cust10000,2015-02-03,Multi-vehicle Collision,Side Collision,Total Loss,Police,State7,City1,Location 1311,17.0,...,35,MALE,JD,armed-forces,movies,56700,-48500,Audi,A5,2008
1,Cust10001,2015-02-02,Multi-vehicle Collision,Side Collision,Total Loss,Police,State7,City5,Location 1311,10.0,...,36,MALE,JD,tech-support,cross-fit,70600,-48500,Audi,A5,2006
2,Cust10002,2015-01-15,Single Vehicle Collision,Side Collision,Minor Damage,Other,State8,City6,Location 2081,22.0,...,33,MALE,JD,armed-forces,polo,66400,-63700,Volkswagen,Jetta,1999
3,Cust10003,2015-01-19,Single Vehicle Collision,Side Collision,Minor Damage,Other,State9,City6,Location 2081,22.0,...,36,MALE,JD,armed-forces,polo,47900,-73400,Volkswagen,Jetta,2003
4,Cust10004,2015-01-09,Single Vehicle Collision,Rear Collision,Minor Damage,Fire,State8,City6,Location 1695,10.0,...,29,FEMALE,High School,exec-managerial,dancing,0,-41500,Toyota,CRV,2010


In [8]:
# Policy_CombinedSingleLimit holds two numbers separated by '/'; store each
# part as its own integer column and discard the combined text column.
limit_parts = train['Policy_CombinedSingleLimit'].map(lambda raw: raw.split('/'))
train['PolicyCombinedLimit'] = limit_parts.map(lambda parts: parts[0]).astype(int)
train['PolicySingleLimit'] = limit_parts.map(lambda parts: parts[1]).astype(int)
train = train.drop(columns='Policy_CombinedSingleLimit')

In [9]:
# Load the labels and attach them to the claims by customer.
with open('TrainData/init_data/Traindata_with_Target.csv', 'rb') as label_file:
    target = pd.read_csv(label_file)

# Encode the fraud flag as 1 ('Y') / 0 ('N'); any other value maps to NaN.
fraud_codes = {'Y': 1, 'N': 0}
target['ReportedFraud'] = target['ReportedFraud'].map(fraud_codes)
train = train.merge(target, how='left', on='CustomerID')
train.head(5)

Unnamed: 0,CustomerID,DateOfIncident,TypeOfIncident,TypeOfCollission,SeverityOfIncident,AuthoritiesContacted,IncidentState,IncidentCity,IncidentAddress,IncidentTime,...,InsuredOccupation,InsuredHobbies,CapitalGains,CapitalLoss,VehicleMake,VehicleModel,VehicleYOM,PolicyCombinedLimit,PolicySingleLimit,ReportedFraud
0,Cust10000,2015-02-03,Multi-vehicle Collision,Side Collision,Total Loss,Police,State7,City1,Location 1311,17.0,...,armed-forces,movies,56700,-48500,Audi,A5,2008,100,300,0
1,Cust10001,2015-02-02,Multi-vehicle Collision,Side Collision,Total Loss,Police,State7,City5,Location 1311,10.0,...,tech-support,cross-fit,70600,-48500,Audi,A5,2006,100,300,0
2,Cust10002,2015-01-15,Single Vehicle Collision,Side Collision,Minor Damage,Other,State8,City6,Location 2081,22.0,...,armed-forces,polo,66400,-63700,Volkswagen,Jetta,1999,500,1000,0
3,Cust10003,2015-01-19,Single Vehicle Collision,Side Collision,Minor Damage,Other,State9,City6,Location 2081,22.0,...,armed-forces,polo,47900,-73400,Volkswagen,Jetta,2003,500,1000,0
4,Cust10004,2015-01-09,Single Vehicle Collision,Rear Collision,Minor Damage,Fire,State8,City6,Location 1695,10.0,...,exec-managerial,dancing,0,-41500,Toyota,CRV,2010,100,300,0


In [10]:
# Drop the customer key before persisting; the joins above are its only use
# in the cells shown here.
train = train.drop(columns=['CustomerID'])
# Hand pandas the path instead of a text-mode handle: pandas then opens the
# file with newline='' and UTF-8, avoiding the doubled line endings a plain
# open(..., 'w') handle can produce on Windows and any dependence on the
# platform's default encoding.
train.to_csv('TrainData/merged_data.csv', index=False)

In [11]:
# Display the column labels of the merged frame.
train.columns

Index(['DateOfIncident', 'TypeOfIncident', 'TypeOfCollission',
       'SeverityOfIncident', 'AuthoritiesContacted', 'IncidentState',
       'IncidentCity', 'IncidentAddress', 'IncidentTime', 'NumberOfVehicles',
       'PropertyDamage', 'BodilyInjuries', 'Witnesses', 'PoliceReport',
       'AmountOfTotalClaim', 'AmountOfInjuryClaim', 'AmountOfPropertyClaim',
       'AmountOfVehicleDamage', 'CustomerLoyaltyPeriod',
       'DateOfPolicyCoverage', 'InsurancePolicyState', 'Policy_Deductible',
       'PolicyAnnualPremium', 'UmbrellaLimit', 'InsuredRelationship',
       'InsuredAge', 'InsuredGender', 'InsuredEducationLevel',
       'InsuredOccupation', 'InsuredHobbies', 'CapitalGains', 'CapitalLoss',
       'VehicleMake', 'VehicleModel', 'VehicleYOM', 'PolicyCombinedLimit',
       'PolicySingleLimit', 'ReportedFraud'],
      dtype='object')

In [12]:
# Parse the incident dates (via their string form) into datetime values.
incident_text = train['DateOfIncident'].astype(str)
train['DateOfIncident'] = pd.to_datetime(incident_text)

In [13]:
# Parse the policy coverage dates (via their string form) into datetime values.
coverage_text = train['DateOfPolicyCoverage'].astype(str)
train['DateOfPolicyCoverage'] = pd.to_datetime(coverage_text)

In [14]:
# Turn the year of manufacture into a datetime by parsing its string form.
yom_text = train['VehicleYOM'].astype(str)
train['VehicleYOM'] = pd.to_datetime(yom_text)

In [15]:
# Vehicle age: calendar year of the incident minus year of manufacture.
incident_year = train['DateOfIncident'].dt.year
train['VehicleAge'] = incident_year - train['VehicleYOM'].dt.year

In [16]:
# Policy age: calendar year of the incident minus year the coverage began.
coverage_year = train['DateOfPolicyCoverage'].dt.year
train['PolicyAge'] = train['DateOfIncident'].dt.year - coverage_year

In [17]:
# Preview the frame with the parsed dates and derived age columns.
train.head(5)

Unnamed: 0,DateOfIncident,TypeOfIncident,TypeOfCollission,SeverityOfIncident,AuthoritiesContacted,IncidentState,IncidentCity,IncidentAddress,IncidentTime,NumberOfVehicles,...,CapitalGains,CapitalLoss,VehicleMake,VehicleModel,VehicleYOM,PolicyCombinedLimit,PolicySingleLimit,ReportedFraud,VehicleAge,PolicyAge
0,2015-02-03,Multi-vehicle Collision,Side Collision,Total Loss,Police,State7,City1,Location 1311,17.0,3,...,56700,-48500,Audi,A5,2008-01-01,100,300,0,7,17
1,2015-02-02,Multi-vehicle Collision,Side Collision,Total Loss,Police,State7,City5,Location 1311,10.0,3,...,70600,-48500,Audi,A5,2006-01-01,100,300,0,9,15
2,2015-01-15,Single Vehicle Collision,Side Collision,Minor Damage,Other,State8,City6,Location 2081,22.0,1,...,66400,-63700,Volkswagen,Jetta,1999-01-01,500,1000,0,16,14
3,2015-01-19,Single Vehicle Collision,Side Collision,Minor Damage,Other,State9,City6,Location 2081,22.0,1,...,47900,-73400,Volkswagen,Jetta,2003-01-01,500,1000,0,12,10
4,2015-01-09,Single Vehicle Collision,Rear Collision,Minor Damage,Fire,State8,City6,Location 1695,10.0,1,...,0,-41500,Toyota,CRV,2010-01-01,100,300,0,5,19


In [18]:
# One-column table listing the dtype of every column in `train`.
datatypes = train.dtypes.to_frame(name='dtype')
datatypes

Unnamed: 0,dtype
DateOfIncident,datetime64[ns]
TypeOfIncident,object
TypeOfCollission,object
SeverityOfIncident,object
AuthoritiesContacted,object
IncidentState,object
IncidentCity,object
IncidentAddress,object
IncidentTime,float64
NumberOfVehicles,int64


In [19]:
# Convert every object-dtype column to pandas' categorical dtype.
is_object = datatypes['dtype'] == 'object'
object_cols = datatypes.loc[is_object].index.tolist()
train[object_cols] = train[object_cols].astype('category')

In [20]:
# Rebuild the dtype table so it reflects the categorical conversion.
datatypes = train.dtypes.to_frame(name='dtype')
datatypes

Unnamed: 0,dtype
DateOfIncident,datetime64[ns]
TypeOfIncident,category
TypeOfCollission,category
SeverityOfIncident,category
AuthoritiesContacted,category
IncidentState,category
IncidentCity,category
IncidentAddress,category
IncidentTime,float64
NumberOfVehicles,int64


In [21]:
# Numeric feature list: every non-categorical column except the label and two
# small count columns that are moved to the categorical list instead.
non_categorical = datatypes['dtype'] != 'category'
num_cols = datatypes.loc[non_categorical].index.tolist()
for excluded in ('ReportedFraud', 'Witnesses', 'NumberOfVehicles'):
    num_cols.remove(excluded)
num_cols

['DateOfIncident',
 'IncidentTime',
 'BodilyInjuries',
 'AmountOfTotalClaim',
 'AmountOfInjuryClaim',
 'AmountOfPropertyClaim',
 'AmountOfVehicleDamage',
 'CustomerLoyaltyPeriod',
 'DateOfPolicyCoverage',
 'Policy_Deductible',
 'PolicyAnnualPremium',
 'UmbrellaLimit',
 'InsuredAge',
 'CapitalGains',
 'CapitalLoss',
 'VehicleYOM',
 'PolicyCombinedLimit',
 'PolicySingleLimit',
 'VehicleAge',
 'PolicyAge']

In [22]:
# Categorical feature list: all category-dtype columns plus the two count
# columns that are treated as categories.
is_category = datatypes['dtype'] == 'category'
cat_cols = datatypes.loc[is_category].index.tolist()
cat_cols += ['Witnesses', 'NumberOfVehicles']
cat_cols

['TypeOfIncident',
 'TypeOfCollission',
 'SeverityOfIncident',
 'AuthoritiesContacted',
 'IncidentState',
 'IncidentCity',
 'IncidentAddress',
 'PropertyDamage',
 'PoliceReport',
 'InsurancePolicyState',
 'InsuredRelationship',
 'InsuredGender',
 'InsuredEducationLevel',
 'InsuredOccupation',
 'InsuredHobbies',
 'VehicleMake',
 'VehicleModel',
 'Witnesses',
 'NumberOfVehicles']

In [23]:
# Sanity check: both lists together cover every column except the label.
train.shape[1] - 1 == len(num_cols) + len(cat_cols)

True

In [24]:
# Names of the columns currently stored as datetime64[ns].
is_datetime = datatypes['dtype'] == 'datetime64[ns]'
dates = datatypes.loc[is_datetime].index.tolist()
dates

['DateOfIncident', 'DateOfPolicyCoverage', 'VehicleYOM']

In [25]:
# Express each date as a time offset from a fixed reference day.
reference_day = pd.Timestamp("1980-01-01")
train[dates] = train[dates] - reference_day

In [26]:
# Preview the date columns after conversion to offsets.
train.loc[:, dates].head(5)

Unnamed: 0,DateOfIncident,DateOfPolicyCoverage,VehicleYOM
0,12817 days,6872 days,10227 days
1,12816 days,7624 days,9497 days
2,12798 days,7713 days,6940 days
3,12802 days,9232 days,8401 days
4,12792 days,6142 days,10958 days


In [27]:
# Confirm the dtype of the converted date columns.
train.loc[:, dates].dtypes

DateOfIncident          timedelta64[ns]
DateOfPolicyCoverage    timedelta64[ns]
VehicleYOM              timedelta64[ns]
dtype: object

In [28]:
# Store the offsets as plain 64-bit integers so they can be imputed and scaled
# with the other numeric columns.
train[dates] = train[dates].astype(np.int64)

In [29]:
# Fill gaps in the categorical columns with each column's most frequent value.
cat_imputer = SimpleImputer(strategy='most_frequent').fit(train[cat_cols])
train[cat_cols] = cat_imputer.transform(train[cat_cols])

# Fill gaps in the numeric columns with each column's median.
num_imputer = SimpleImputer(strategy='median').fit(train[num_cols])
train[num_cols] = num_imputer.transform(train[num_cols])


In [30]:
# Preview the frame after imputation.
train.head(5)

Unnamed: 0,DateOfIncident,TypeOfIncident,TypeOfCollission,SeverityOfIncident,AuthoritiesContacted,IncidentState,IncidentCity,IncidentAddress,IncidentTime,NumberOfVehicles,...,CapitalGains,CapitalLoss,VehicleMake,VehicleModel,VehicleYOM,PolicyCombinedLimit,PolicySingleLimit,ReportedFraud,VehicleAge,PolicyAge
0,1.107389e+18,Multi-vehicle Collision,Side Collision,Total Loss,Police,State7,City1,Location 1311,17.0,3,...,56700.0,-48500.0,Audi,A5,8.836128e+17,100.0,300.0,0,7.0,17.0
1,1.107302e+18,Multi-vehicle Collision,Side Collision,Total Loss,Police,State7,City5,Location 1311,10.0,3,...,70600.0,-48500.0,Audi,A5,8.205408e+17,100.0,300.0,0,9.0,15.0
2,1.105747e+18,Single Vehicle Collision,Side Collision,Minor Damage,Other,State8,City6,Location 2081,22.0,1,...,66400.0,-63700.0,Volkswagen,Jetta,5.99616e+17,500.0,1000.0,0,16.0,14.0
3,1.106093e+18,Single Vehicle Collision,Side Collision,Minor Damage,Other,State9,City6,Location 2081,22.0,1,...,47900.0,-73400.0,Volkswagen,Jetta,7.258464e+17,500.0,1000.0,0,12.0,10.0
4,1.105229e+18,Single Vehicle Collision,Rear Collision,Minor Damage,Fire,State8,City6,Location 1695,10.0,1,...,0.0,-41500.0,Toyota,CRV,9.467712e+17,100.0,300.0,0,5.0,19.0


In [31]:
# Min-max scale every numeric feature column.
scaler = MinMaxScaler().fit(train[num_cols])
train[num_cols] = scaler.transform(train[num_cols])

train[num_cols].head(5)

Unnamed: 0,DateOfIncident,IncidentTime,BodilyInjuries,AmountOfTotalClaim,AmountOfInjuryClaim,AmountOfPropertyClaim,AmountOfVehicleDamage,CustomerLoyaltyPeriod,DateOfPolicyCoverage,Policy_Deductible,PolicyAnnualPremium,UmbrellaLimit,InsuredAge,CapitalGains,CapitalLoss,VehicleYOM,PolicyCombinedLimit,PolicySingleLimit,VehicleAge,PolicyAge
0,0.458333,0.73913,0.5,0.569408,0.625501,0.256485,0.577765,0.100418,0.350114,0.333333,0.742532,0.090909,0.355556,0.564179,0.563456,0.649966,0.0,0.0,0.35,0.68
1,0.444444,0.434783,1.0,0.533519,0.725408,0.250063,0.500862,0.236402,0.432058,0.333333,0.508226,0.090909,0.377778,0.702488,0.563456,0.550034,0.0,0.0,0.45,0.6
2,0.194444,0.956522,1.0,0.580335,0.542191,0.491339,0.546072,0.34728,0.441757,0.078,0.581576,0.090909,0.311111,0.660697,0.426643,0.2,1.0,1.0,0.8,0.56
3,0.25,0.956522,1.0,0.575873,0.55958,0.507098,0.530239,0.395397,0.607279,0.148,0.559371,0.090909,0.377778,0.476617,0.339334,0.4,1.0,1.0,0.6,0.4
4,0.111111,0.434783,1.0,0.465226,0.411608,0.305619,0.470378,0.238494,0.270568,0.0,0.569381,0.479988,0.222222,0.0,0.626463,0.750034,0.0,0.0,0.25,0.76


In [32]:
# One-hot encode the categorical columns into a dense indicator frame whose
# column names come from the fitted encoder.
one_hot = OneHotEncoder(handle_unknown='ignore')
encoded = one_hot.fit_transform(train[cat_cols])
indicator_names = one_hot.get_feature_names_out(cat_cols)
train_cat = pd.DataFrame(encoded.toarray(), columns=indicator_names)
train_cat.head(5)

Unnamed: 0,TypeOfIncident_Multi-vehicle Collision,TypeOfIncident_Parked Car,TypeOfIncident_Single Vehicle Collision,TypeOfIncident_Vehicle Theft,TypeOfCollission_Front Collision,TypeOfCollission_Rear Collision,TypeOfCollission_Side Collision,SeverityOfIncident_Major Damage,SeverityOfIncident_Minor Damage,SeverityOfIncident_Total Loss,...,VehicleModel_X5,VehicleModel_X6,Witnesses_0.0,Witnesses_1.0,Witnesses_2.0,Witnesses_3.0,NumberOfVehicles_1,NumberOfVehicles_2,NumberOfVehicles_3,NumberOfVehicles_4
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [33]:
# Store the 0/1 indicators as 64-bit integers rather than floats.
train_cat = train_cat.astype(np.int64)

In [34]:
# Preview the integer indicator frame.
train_cat.head(5)

Unnamed: 0,TypeOfIncident_Multi-vehicle Collision,TypeOfIncident_Parked Car,TypeOfIncident_Single Vehicle Collision,TypeOfIncident_Vehicle Theft,TypeOfCollission_Front Collision,TypeOfCollission_Rear Collision,TypeOfCollission_Side Collision,SeverityOfIncident_Major Damage,SeverityOfIncident_Minor Damage,SeverityOfIncident_Total Loss,...,VehicleModel_X5,VehicleModel_X6,Witnesses_0.0,Witnesses_1.0,Witnesses_2.0,Witnesses_3.0,NumberOfVehicles_1,NumberOfVehicles_2,NumberOfVehicles_3,NumberOfVehicles_4
0,1,0,0,0,0,0,1,0,0,1,...,0,0,1,0,0,0,0,0,1,0
1,1,0,0,0,0,0,1,0,0,1,...,0,0,0,1,0,0,0,0,1,0
2,0,0,1,0,0,0,1,0,1,0,...,0,0,0,0,0,1,1,0,0,0
3,0,0,1,0,0,0,1,0,1,0,...,0,0,0,0,0,1,1,0,0,0
4,0,0,1,0,0,1,0,0,1,0,...,0,0,0,1,0,0,1,0,0,0


In [35]:
# Separate frame holding only the scaled numeric features.
train_num = train.loc[:, num_cols]
train_num.head(5)

Unnamed: 0,DateOfIncident,IncidentTime,BodilyInjuries,AmountOfTotalClaim,AmountOfInjuryClaim,AmountOfPropertyClaim,AmountOfVehicleDamage,CustomerLoyaltyPeriod,DateOfPolicyCoverage,Policy_Deductible,PolicyAnnualPremium,UmbrellaLimit,InsuredAge,CapitalGains,CapitalLoss,VehicleYOM,PolicyCombinedLimit,PolicySingleLimit,VehicleAge,PolicyAge
0,0.458333,0.73913,0.5,0.569408,0.625501,0.256485,0.577765,0.100418,0.350114,0.333333,0.742532,0.090909,0.355556,0.564179,0.563456,0.649966,0.0,0.0,0.35,0.68
1,0.444444,0.434783,1.0,0.533519,0.725408,0.250063,0.500862,0.236402,0.432058,0.333333,0.508226,0.090909,0.377778,0.702488,0.563456,0.550034,0.0,0.0,0.45,0.6
2,0.194444,0.956522,1.0,0.580335,0.542191,0.491339,0.546072,0.34728,0.441757,0.078,0.581576,0.090909,0.311111,0.660697,0.426643,0.2,1.0,1.0,0.8,0.56
3,0.25,0.956522,1.0,0.575873,0.55958,0.507098,0.530239,0.395397,0.607279,0.148,0.559371,0.090909,0.377778,0.476617,0.339334,0.4,1.0,1.0,0.6,0.4
4,0.111111,0.434783,1.0,0.465226,0.411608,0.305619,0.470378,0.238494,0.270568,0.0,0.569381,0.479988,0.222222,0.0,0.626463,0.750034,0.0,0.0,0.25,0.76


In [36]:
import pickle

# Persist the categorical-indicator and numeric feature frames separately.
for out_path, frame in (
    ('TrainData/train_cat.pkl', train_cat),
    ('TrainData/train_num.pkl', train_num),
):
    with open(out_path, 'wb') as handle:
        pickle.dump(frame, handle)

In [50]:
# Assemble the modelling table: label first, then the one-hot indicators and
# the scaled numeric features.
# The label is taken from `train` rather than from `target`: when the notebook
# runs top to bottom, `target` is still the raw label file (CustomerID plus
# ReportedFraud, in that file's row order), so a positional concat with it
# could pair labels with the wrong rows and would carry CustomerID into the
# feature table. `train['ReportedFraud']` is already row-aligned with the
# frames derived from `train`.
train_full = pd.concat([train['ReportedFraud'], train_cat, train_num], axis=1)

In [51]:
# Persist the assembled modelling table.
with open('TrainData/train_full.pkl', 'wb') as handle:
    pickle.dump(train_full, handle)

In [39]:
from sklearn.preprocessing import LabelEncoder

# Fit one encoder per categorical column and keep them all, so each column's
# category -> integer mapping can later be inspected, inverted, or re-applied
# to other data. Re-fitting a single shared encoder column by column would
# retain only the mapping of the last column.
label_encoders = {col: LabelEncoder().fit(train[col]) for col in cat_cols}
for col, encoder in label_encoders.items():
    train[col] = encoder.transform(train[col])

# Keep `le` defined as before: it refers to the encoder fitted on the last
# categorical column (or an unfitted encoder when there are no such columns).
le = label_encoders[cat_cols[-1]] if cat_cols else LabelEncoder()

train.head()

Unnamed: 0,DateOfIncident,TypeOfIncident,TypeOfCollission,SeverityOfIncident,AuthoritiesContacted,IncidentState,IncidentCity,IncidentAddress,IncidentTime,NumberOfVehicles,...,CapitalGains,CapitalLoss,VehicleMake,VehicleModel,VehicleYOM,PolicyCombinedLimit,PolicySingleLimit,ReportedFraud,VehicleAge,PolicyAge
0,0.458333,0,2,2,4,4,0,279,0.73913,2,...,0.564179,0.563456,1,5,0.649966,0.0,0.0,0,0.35,0.68
1,0.444444,0,2,2,4,4,4,279,0.434783,2,...,0.702488,0.563456,1,5,0.550034,0.0,0.0,0,0.45,0.6
2,0.194444,2,2,1,3,5,5,980,0.956522,0,...,0.660697,0.426643,13,20,0.2,1.0,1.0,0,0.8,0.56
3,0.25,2,2,1,3,6,5,980,0.956522,0,...,0.476617,0.339334,13,20,0.4,1.0,1.0,0,0.6,0.4
4,0.111111,2,1,1,1,5,5,624,0.434783,0,...,0.0,0.626463,12,8,0.750034,0.0,0.0,0,0.25,0.76


In [40]:
# Persist the label-encoded frame.
# NOTE(review): this file is written to the working directory, while the other
# artifacts go under TrainData/ — confirm the location is intended.
with open('train_label_encoded.pkl', 'wb') as handle:
    pickle.dump(train, handle)

In [41]:
# Persist the label column on its own.
with open('TrainData/target.pkl', 'wb') as handle:
    pickle.dump(train['ReportedFraud'], handle)

In [42]:
import pickle
import pandas as pd
import numpy as np


def _unpickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Reload the artifacts written by the cells above.
train_full = _unpickle('TrainData/train_full.pkl')
target = _unpickle('TrainData/target.pkl')
train_cat = _unpickle('TrainData/train_cat.pkl')
train_num = _unpickle('TrainData/train_num.pkl')


In [43]:
# One-column table listing the dtype of every column in `train_full`.
datatypes = train_full.dtypes.to_frame(name='dtype')
datatypes

Unnamed: 0,dtype
TypeOfIncident_Multi-vehicle Collision,int64
TypeOfIncident_Parked Car,int64
TypeOfIncident_Single Vehicle Collision,int64
TypeOfIncident_Vehicle Theft,int64
TypeOfCollission_Front Collision,int64
...,...
VehicleYOM,float64
PolicyCombinedLimit,float64
PolicySingleLimit,float64
VehicleAge,float64


In [44]:
# Split the columns of `train_full` by stored dtype: int64 columns are treated
# as categorical, float64 columns as numeric.
is_int = datatypes['dtype'] == 'int64'
is_float = datatypes['dtype'] == 'float64'
cat_cols = datatypes.loc[is_int].index.tolist()
num_cols = datatypes.loc[is_float].index.tolist()

# Use the more compact category / float32 representations.
train_full[cat_cols] = train_full[cat_cols].astype('category')
train_full[num_cols] = train_full[num_cols].astype('float32')

train_full[cat_cols].dtypes

TypeOfIncident_Multi-vehicle Collision     category
TypeOfIncident_Parked Car                  category
TypeOfIncident_Single Vehicle Collision    category
TypeOfIncident_Vehicle Theft               category
TypeOfCollission_Front Collision           category
                                             ...   
Witnesses_3.0                              category
NumberOfVehicles_1                         category
NumberOfVehicles_2                         category
NumberOfVehicles_3                         category
NumberOfVehicles_4                         category
Length: 1147, dtype: object

In [45]:
# Overwrite the saved modelling table with the re-typed version.
with open('TrainData/train_full.pkl', 'wb') as handle:
    pickle.dump(train_full, handle)

In [46]:
# Store the numeric features as 32-bit floats.
train_num = train_num.astype(np.float32)

In [47]:
# Store every indicator column with the categorical dtype.
train_cat = train_cat.astype(pd.CategoricalDtype())

In [48]:
# Overwrite both feature pickles with the re-typed frames.
for out_path, frame in (
    ('TrainData/train_cat.pkl', train_cat),
    ('TrainData/train_num.pkl', train_num),
):
    with open(out_path, 'wb') as handle:
        pickle.dump(frame, handle)

In [49]:
# Convert the label to the categorical dtype and overwrite its saved copy.
target = target.astype(pd.CategoricalDtype())

with open('TrainData/target.pkl', 'wb') as handle:
    pickle.dump(target, handle)