In [1]:
import pandas as pd
import numpy as np

# Load the pickled train/test frames (presumably the output of an earlier
# preprocessing notebook -- confirm against the project layout).
# read_pickle takes a path directly, so no explicit file handle is needed.
# NOTE: unpickling executes arbitrary code; only load files from a trusted source.
train_data = pd.read_pickle('TrainData/train_data_prepr.pkl')
test_data = pd.read_pickle('TestData/test_data_prepr.pkl')

# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,CustomerID,ReportedFraud,DateOfIncident,TypeOfIncident,TypeOfCollission,SeverityOfIncident,AuthoritiesContacted,IncidentState,IncidentCity,IncidentAddress,...,InsuredAge,InsuredZipCode,InsuredGender,InsuredEducationLevel,InsuredOccupation,InsuredHobbies,CapitalGains,CapitalLoss,PolicyCombinedLimit,PolicySingleLimit
0,20065.0,N,2015-01-09,Multi-vehicle Collision,Rear Collision,Total Loss,Other,9,4,1404,...,58.0,471784,MALE,JD,sales,video-games,0.0,-42700.0,250.0,500.0
1,37589.0,N,2015-02-20,Single Vehicle Collision,Side Collision,Minor Damage,Ambulance,4,3,1675,...,34.0,431937,FEMALE,High School,transport-moving,polo,51300.0,0.0,100.0,300.0
2,24312.0,N,2015-01-14,Multi-vehicle Collision,Side Collision,Major Damage,Ambulance,5,2,1546,...,52.0,479320,FEMALE,PhD,machine-op-inspct,exercise,0.0,0.0,500.0,1000.0
3,5493.0,Y,2015-01-07,Multi-vehicle Collision,Side Collision,Major Damage,Ambulance,7,4,1413,...,25.0,607763,FEMALE,College,exec-managerial,exercise,47400.0,-56100.0,100.0,300.0
4,7704.0,Y,2015-02-26,Multi-vehicle Collision,Side Collision,Major Damage,Other,4,6,1367,...,27.0,441783,MALE,Masters,sales,chess,0.0,0.0,250.0,500.0


In [2]:
# Show the dtype of every column in the loaded training frame.
train_data.dtypes

CustomerID                float64
ReportedFraud              object
DateOfIncident           category
TypeOfIncident           category
TypeOfCollission         category
SeverityOfIncident       category
AuthoritiesContacted     category
IncidentState            category
IncidentCity             category
IncidentAddress          category
IncidentTime              float64
NumberOfVehicles         category
PropertyDamage           category
BodilyInjuries           category
Witnesses                category
PoliceReport               object
AmountOfTotalClaim        float64
AmountOfInjuryClaim       float64
AmountOfPropertyClaim     float64
AmountOfVehicleDamage     float64
InsurancePolicyNumber     float64
CustomerLoyaltyPeriod     float64
DateOfPolicyCoverage     category
InsurancePolicyState     category
Policy_Deductible         float64
PolicyAnnualPremium       float64
UmbrellaLimit             float64
InsuredRelationship      category
VehicleMake              category
VehicleModel  

In [3]:
# Every int64/float64 column is a candidate for scaling.
num_cols = train_data.select_dtypes(include=['int64', 'float64']).columns.tolist()

# These columns are numeric but are deliberately left unscaled.
# list.remove raises ValueError if one of them is not in the list.
for unscaled_col in ('InsurancePolicyNumber', 'CustomerID', 'IncidentTime'):
    num_cols.remove(unscaled_col)

num_cols

['AmountOfTotalClaim',
 'AmountOfInjuryClaim',
 'AmountOfPropertyClaim',
 'AmountOfVehicleDamage',
 'CustomerLoyaltyPeriod',
 'Policy_Deductible',
 'PolicyAnnualPremium',
 'UmbrellaLimit',
 'InsuredAge',
 'CapitalGains',
 'CapitalLoss',
 'PolicyCombinedLimit',
 'PolicySingleLimit']

In [4]:
from sklearn.preprocessing import StandardScaler

# Scale copies so the unscaled frames remain available unchanged.
train_data_scaled = train_data.copy()
test_data_scaled = test_data.copy()

# Fit on the training columns only, then apply the same fitted statistics
# to both frames so no test-set information enters the scaler.
scaler = StandardScaler().fit(train_data[num_cols])
train_data_scaled[num_cols] = scaler.transform(train_data[num_cols])
test_data_scaled[num_cols] = scaler.transform(test_data[num_cols])

# Preview the scaled training frame.
train_data_scaled.head()

Unnamed: 0,CustomerID,ReportedFraud,DateOfIncident,TypeOfIncident,TypeOfCollission,SeverityOfIncident,AuthoritiesContacted,IncidentState,IncidentCity,IncidentAddress,...,InsuredAge,InsuredZipCode,InsuredGender,InsuredEducationLevel,InsuredOccupation,InsuredHobbies,CapitalGains,CapitalLoss,PolicyCombinedLimit,PolicySingleLimit
0,20065.0,N,2015-01-09,Multi-vehicle Collision,Rear Collision,Total Loss,Other,9,4,1404,...,2.399207,471784,MALE,JD,sales,video-games,-0.834616,-0.636247,-0.151765,-0.290132
1,37589.0,N,2015-02-20,Single Vehicle Collision,Side Collision,Minor Damage,Ambulance,4,3,1675,...,-0.602204,431937,FEMALE,High School,transport-moving,polo,1.021569,0.893521,-1.081375,-0.983964
2,24312.0,N,2015-01-14,Multi-vehicle Collision,Side Collision,Major Damage,Ambulance,5,2,1546,...,1.648854,479320,FEMALE,PhD,machine-op-inspct,exercise,-0.834616,0.893521,1.397586,1.44445
3,5493.0,Y,2015-01-07,Multi-vehicle Collision,Side Collision,Major Damage,Ambulance,7,4,1413,...,-1.727734,607763,FEMALE,College,exec-managerial,exercise,0.880455,-1.116315,-1.081375,-0.983964
4,7704.0,Y,2015-02-26,Multi-vehicle Collision,Side Collision,Major Damage,Other,4,6,1367,...,-1.477616,441783,MALE,Masters,sales,chess,-0.834616,0.893521,-0.151765,-0.290132


In [5]:
# Show the column dtypes of the scaled training frame.
train_data_scaled.dtypes


CustomerID                float64
ReportedFraud              object
DateOfIncident           category
TypeOfIncident           category
TypeOfCollission         category
SeverityOfIncident       category
AuthoritiesContacted     category
IncidentState            category
IncidentCity             category
IncidentAddress          category
IncidentTime              float64
NumberOfVehicles         category
PropertyDamage           category
BodilyInjuries           category
Witnesses                category
PoliceReport               object
AmountOfTotalClaim        float64
AmountOfInjuryClaim       float64
AmountOfPropertyClaim     float64
AmountOfVehicleDamage     float64
InsurancePolicyNumber     float64
CustomerLoyaltyPeriod     float64
DateOfPolicyCoverage     category
InsurancePolicyState     category
Policy_Deductible         float64
PolicyAnnualPremium       float64
UmbrellaLimit             float64
InsuredRelationship      category
VehicleMake              category
VehicleModel  

In [6]:
# float64 columns that are cast to int64 in both frames.
int_casts = {
    'CustomerID': 'int64',
    'IncidentTime': 'int64',
    'InsurancePolicyNumber': 'int64',
}

# ReportedFraud is cast on the training frame only; the test frame gets the
# integer casts plus PoliceReport.
train_data_scaled = train_data_scaled.astype(
    {**int_casts, 'ReportedFraud': 'category', 'PoliceReport': 'category'}
)
test_data_scaled = test_data_scaled.astype(
    {**int_casts, 'PoliceReport': 'category'}
)

# Confirm the resulting dtypes.
train_data_scaled.dtypes

CustomerID                  int64
ReportedFraud            category
DateOfIncident           category
TypeOfIncident           category
TypeOfCollission         category
SeverityOfIncident       category
AuthoritiesContacted     category
IncidentState            category
IncidentCity             category
IncidentAddress          category
IncidentTime                int64
NumberOfVehicles         category
PropertyDamage           category
BodilyInjuries           category
Witnesses                category
PoliceReport             category
AmountOfTotalClaim        float64
AmountOfInjuryClaim       float64
AmountOfPropertyClaim     float64
AmountOfVehicleDamage     float64
InsurancePolicyNumber       int64
CustomerLoyaltyPeriod     float64
DateOfPolicyCoverage     category
InsurancePolicyState     category
Policy_Deductible         float64
PolicyAnnualPremium       float64
UmbrellaLimit             float64
InsuredRelationship      category
VehicleMake              category
VehicleModel  

In [7]:
import pickle
import os

# Destinations for the scaled frames and the scaler object.
train_out = 'TrainData/train_data_scaled.pkl'
test_out = 'TestData/test_data_scaled.pkl'
scaler_out = 'models/scaler.pkl'

# Create any missing output directory first: open(..., 'wb') does not create
# parent directories, so a fresh checkout without e.g. models/ would otherwise
# fail with FileNotFoundError at the very last step.
for out_path in (train_out, test_out, scaler_out):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Persist the scaled train/test frames.
train_data_scaled.to_pickle(train_out)
test_data_scaled.to_pickle(test_out)

# Persist the scaler object as well.
with open(scaler_out, 'wb') as f:
    pickle.dump(scaler, f)