## Preparing Data for Model

In [241]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import label_binarize

In [242]:
# Load the training sample; the first CSV column is used as the row index.
train_raw = pd.read_csv('data/train_sample.csv', index_col = 0)

In [243]:
# Load the test sample; the first CSV column is used as the row index.
test_raw = pd.read_csv('data/test_sample.csv', index_col = 0)

In [244]:
# Inspect column dtypes and non-null counts of the raw training data.
train_raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 575240 entries, 8068730 to 7306189
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    575240 non-null  int64  
 1   Gender                575240 non-null  object 
 2   Age                   575240 non-null  int64  
 3   Driving_License       575240 non-null  int64  
 4   Region_Code           575240 non-null  float64
 5   Previously_Insured    575240 non-null  int64  
 6   Vehicle_Age           575240 non-null  object 
 7   Vehicle_Damage        575240 non-null  object 
 8   Annual_Premium        575240 non-null  float64
 9   Policy_Sales_Channel  575240 non-null  float64
 10  Vintage               575240 non-null  int64  
 11  Response              575240 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 57.1+ MB


## Creating Numeric and Categorical Lists

#### Training Data

In [245]:
# Keep only the feature columns: the identifier and the Response column
# (used as the target further down) are removed.
train = train_raw.drop(columns=['id', 'Response'])

In [246]:
# Cast the two integer flag columns to category so the dtype-based selection
# below treats them as categorical rather than numeric.
train = train.astype({
    'Driving_License': 'category',
    'Previously_Insured': 'category',
})

In [247]:
# Partition the feature names by dtype: int/float columns are scaled later,
# all remaining columns (object and category) are one-hot encoded.
numeric_dtypes = [int, float]
numeric_columns = train.select_dtypes(include=numeric_dtypes).columns
categorical_columns = train.select_dtypes(exclude=numeric_dtypes).columns

#### Testing Data

In [248]:
# Remove the identifier column from the test features.
test = test_raw.drop(columns='id')

In [249]:
# Mirror the training-data casts so both frames carry the same dtypes.
test = test.astype({
    'Driving_License': 'category',
    'Previously_Insured': 'category',
})

## Splitting into Training and Testing

In [250]:
# Target vector, taken from the raw frame because `train` no longer has it.
y = train_raw.loc[:, 'Response']

In [251]:
# 70/30 hold-out split of the labelled data, stratified on the target and
# seeded so the split is reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    train,
    y,
    test_size=0.3,
    stratify=y,
    random_state=123,
)

## Scaling Data

#### Numeric

In [252]:
# Standardiser for the numeric columns; the cells below fit it first on the
# sub-training split and then again on the full training data.
scaler = StandardScaler()

In [253]:
# Learn the scaling statistics from the sub-training split only, then apply
# the same fitted scaler to both the sub-training and sub-testing splits.
scaler.fit(xtrain[numeric_columns])

xtrain_scaled = scaler.transform(xtrain[numeric_columns])
xtest_scaled = scaler.transform(xtest[numeric_columns])

# Overwrite the numeric columns with their standardised values.
xtrain[numeric_columns] = xtrain_scaled
xtest[numeric_columns] = xtest_scaled

In [254]:
# Refit the scaler on the full training data and reuse those statistics for
# the test data. Note this replaces the fit from the sub-training split.
scaler.fit(train[numeric_columns])

train_scaled = scaler.transform(train[numeric_columns])
test_scaled = scaler.transform(test[numeric_columns])

# Overwrite the numeric columns with their standardised values.
train[numeric_columns] = train_scaled
test[numeric_columns] = test_scaled

#### Categorical

In [255]:
# One-hot encode the five categorical features. Categories that were not seen
# during fit are encoded as all zeros (handle_unknown='ignore') instead of
# raising an error.
#
# sparse_threshold=0 forces a dense array. The cells below hand the result
# directly to pd.DataFrame, which needs a dense array; with the default
# threshold (0.3) ColumnTransformer switches to a sparse matrix as soon as the
# output density falls below it, i.e. as soon as enough category columns exist.
transformer = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'),
     ['Gender', 'Driving_License', 'Previously_Insured', 'Vehicle_Age',
      'Vehicle_Damage']),
    sparse_threshold=0)

Sub Training Set

In [256]:
# Fit the encoder on the sub-training split and wrap the encoded array in a
# frame that carries the split's row index, so the column-wise concat below
# lines the rows up.
transformed_xtrain = transformer.fit_transform(xtrain)
transformed_df = pd.DataFrame(
    transformed_xtrain,
    index=xtrain.index,
    columns=transformer.get_feature_names_out(),
)

# Replace the raw categorical columns with their one-hot encoded counterparts.
xtrain = pd.concat(
    [xtrain.drop(columns=categorical_columns), transformed_df],
    axis=1,
)

Sub Testing Set

In [257]:
# Encode the sub-testing split with the encoder already fitted on the
# sub-training split (transform only, no refit), keeping the split's row index.
transformed_xtest = transformer.transform(xtest)
transformed_df = pd.DataFrame(
    transformed_xtest,
    index=xtest.index,
    columns=transformer.get_feature_names_out(),
)

# Replace the raw categorical columns with their one-hot encoded counterparts.
xtest = pd.concat(
    [xtest.drop(columns=categorical_columns), transformed_df],
    axis=1,
)

Full Training Set

In [258]:
# Refit the encoder on the full training data and wrap the encoded array in a
# frame that shares the training row index.
transformed_X = transformer.fit_transform(train)
transformed_df = pd.DataFrame(
    transformed_X,
    index=train.index,
    columns=transformer.get_feature_names_out(),
)

# Scaled numeric columns followed by the one-hot columns; `train` itself is
# left untouched.
X_scaled = pd.concat(
    [train.drop(columns=categorical_columns), transformed_df],
    axis=1,
)

Full Testing Set

In [259]:
# Encode the test data with the encoder fitted on the full training data
# (transform only, no refit), keeping the test row index.
transformed_test = transformer.transform(test)
transformed_df = pd.DataFrame(
    transformed_test,
    index=test.index,
    columns=transformer.get_feature_names_out(),
)

# Scaled numeric columns followed by the one-hot columns; `test` itself is
# left untouched.
X_test_scaled = pd.concat(
    [test.drop(columns=categorical_columns), transformed_df],
    axis=1,
)

Checking the Scaled Data Sets

In [260]:
# Preview the scaled and one-hot encoded full training features.
X_scaled.head()

Unnamed: 0,Age,Region_Code,Annual_Premium,Policy_Sales_Channel,Vintage,onehotencoder__Gender_Female,onehotencoder__Gender_Male,onehotencoder__Driving_License_0,onehotencoder__Driving_License_1,onehotencoder__Previously_Insured_0,onehotencoder__Previously_Insured_1,onehotencoder__Vehicle_Age_1-2 Year,onehotencoder__Vehicle_Age_< 1 Year,onehotencoder__Vehicle_Age_> 2 Years,onehotencoder__Vehicle_Damage_No,onehotencoder__Vehicle_Damage_Yes
8068730,-0.961419,1.123724,-0.12129,0.73259,1.597757,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
9058636,1.173544,0.198514,-1.690631,0.214669,0.672021,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2243783,0.973391,0.121413,0.845263,0.214669,-0.741605,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2998772,-0.09409,0.815321,-0.443434,0.769584,1.122379,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
7800941,-0.961419,-0.109889,-0.224033,0.73259,-0.428856,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [261]:
# Confirm column dtypes and non-null counts of the full training features.
X_scaled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 575240 entries, 8068730 to 7306189
Data columns (total 16 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   Age                                   575240 non-null  float64
 1   Region_Code                           575240 non-null  float64
 2   Annual_Premium                        575240 non-null  float64
 3   Policy_Sales_Channel                  575240 non-null  float64
 4   Vintage                               575240 non-null  float64
 5   onehotencoder__Gender_Female          575240 non-null  float64
 6   onehotencoder__Gender_Male            575240 non-null  float64
 7   onehotencoder__Driving_License_0      575240 non-null  float64
 8   onehotencoder__Driving_License_1      575240 non-null  float64
 9   onehotencoder__Previously_Insured_0   575240 non-null  float64
 10  onehotencoder__Previously_Insured_1   575240 non-null  float64
 11

In [262]:
# Preview the scaled and one-hot encoded test features.
X_test_scaled.head()

Unnamed: 0,Age,Region_Code,Annual_Premium,Policy_Sales_Channel,Vintage,onehotencoder__Gender_Female,onehotencoder__Gender_Male,onehotencoder__Driving_License_0,onehotencoder__Driving_License_1,onehotencoder__Previously_Insured_0,onehotencoder__Previously_Insured_1,onehotencoder__Vehicle_Age_1-2 Year,onehotencoder__Vehicle_Age_< 1 Year,onehotencoder__Vehicle_Age_> 2 Years,onehotencoder__Vehicle_Damage_No,onehotencoder__Vehicle_Damage_Yes
1332833,-0.294243,-1.806106,0.473114,0.214669,1.059829,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
7355591,1.57385,0.121413,1.059377,-1.598053,-1.354593,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
3721611,0.639803,-1.497703,0.20772,0.214669,-1.629812,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
1567836,-0.894701,0.121413,1.224398,0.214669,-1.129413,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
887255,0.17278,-1.420602,-1.690631,0.806579,0.109072,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [263]:
# Confirm column dtypes and non-null counts of the test features.
X_test_scaled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 383493 entries, 1332833 to 6395893
Data columns (total 16 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   Age                                   383493 non-null  float64
 1   Region_Code                           383493 non-null  float64
 2   Annual_Premium                        383493 non-null  float64
 3   Policy_Sales_Channel                  383493 non-null  float64
 4   Vintage                               383493 non-null  float64
 5   onehotencoder__Gender_Female          383493 non-null  float64
 6   onehotencoder__Gender_Male            383493 non-null  float64
 7   onehotencoder__Driving_License_0      383493 non-null  float64
 8   onehotencoder__Driving_License_1      383493 non-null  float64
 9   onehotencoder__Previously_Insured_0   383493 non-null  float64
 10  onehotencoder__Previously_Insured_1   383493 non-null  float64
 11

In [264]:
# Preview the scaled and one-hot encoded sub-training split.
xtrain.head()

Unnamed: 0,Age,Region_Code,Annual_Premium,Policy_Sales_Channel,Vintage,onehotencoder__Gender_Female,onehotencoder__Gender_Male,onehotencoder__Driving_License_0,onehotencoder__Driving_License_1,onehotencoder__Previously_Insured_0,onehotencoder__Previously_Insured_1,onehotencoder__Vehicle_Age_1-2 Year,onehotencoder__Vehicle_Age_< 1 Year,onehotencoder__Vehicle_Age_> 2 Years,onehotencoder__Vehicle_Damage_No,onehotencoder__Vehicle_Damage_Yes
9586534,1.572883,1.509735,0.012464,0.214263,-0.867608,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
9909678,1.639598,0.969619,1.419521,0.214263,1.235226,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
3333964,-0.028294,0.120865,-1.690382,0.806398,-1.756305,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
98928,-0.628735,0.120865,0.343675,-1.599148,1.072507,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
11027248,0.83901,1.586895,1.478643,0.214263,1.523114,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [265]:
# Confirm column dtypes and non-null counts of the sub-training split.
xtrain.info()

<class 'pandas.core.frame.DataFrame'>
Index: 402668 entries, 9586534 to 10972564
Data columns (total 16 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   Age                                   402668 non-null  float64
 1   Region_Code                           402668 non-null  float64
 2   Annual_Premium                        402668 non-null  float64
 3   Policy_Sales_Channel                  402668 non-null  float64
 4   Vintage                               402668 non-null  float64
 5   onehotencoder__Gender_Female          402668 non-null  float64
 6   onehotencoder__Gender_Male            402668 non-null  float64
 7   onehotencoder__Driving_License_0      402668 non-null  float64
 8   onehotencoder__Driving_License_1      402668 non-null  float64
 9   onehotencoder__Previously_Insured_0   402668 non-null  float64
 10  onehotencoder__Previously_Insured_1   402668 non-null  float64
 1

In [266]:
# Preview the scaled and one-hot encoded sub-testing split.
xtest.head()

Unnamed: 0,Age,Region_Code,Annual_Premium,Policy_Sales_Channel,Vintage,onehotencoder__Gender_Female,onehotencoder__Gender_Male,onehotencoder__Driving_License_0,onehotencoder__Driving_License_1,onehotencoder__Previously_Insured_0,onehotencoder__Previously_Insured_1,onehotencoder__Vehicle_Age_1-2 Year,onehotencoder__Vehicle_Age_< 1 Year,onehotencoder__Vehicle_Age_> 2 Years,onehotencoder__Vehicle_Damage_No,onehotencoder__Vehicle_Damage_Yes
5659265,0.038422,-0.187773,0.038956,0.214263,1.135091,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1260204,1.506167,-1.422324,0.923281,0.214263,-0.054011,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
4673380,1.572883,0.198024,0.208602,0.214263,-1.243114,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2017647,-0.962313,-1.422324,0.911797,0.732381,0.108708,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
10007832,-0.295156,-1.499483,-1.690382,0.824902,0.797136,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [267]:
# Confirm column dtypes and non-null counts of the sub-testing split.
xtest.info()

<class 'pandas.core.frame.DataFrame'>
Index: 172572 entries, 5659265 to 10671390
Data columns (total 16 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   Age                                   172572 non-null  float64
 1   Region_Code                           172572 non-null  float64
 2   Annual_Premium                        172572 non-null  float64
 3   Policy_Sales_Channel                  172572 non-null  float64
 4   Vintage                               172572 non-null  float64
 5   onehotencoder__Gender_Female          172572 non-null  float64
 6   onehotencoder__Gender_Male            172572 non-null  float64
 7   onehotencoder__Driving_License_0      172572 non-null  float64
 8   onehotencoder__Driving_License_1      172572 non-null  float64
 9   onehotencoder__Previously_Insured_0   172572 non-null  float64
 10  onehotencoder__Previously_Insured_1   172572 non-null  float64
 1

## Changing Feature Names

In [268]:
# Characters to strip from feature names, applied in this order as literal
# (non-regex) substitutions.
# NOTE(review): presumably needed because a downstream library rejects
# '[', ']' and '<' in feature names - confirm against the modelling notebook.
replacements = {
    '[': '(',
    ']': ')',
    '<': 'less_than',
    '>': 'greater_than',
}

dfs = [xtrain, xtest, X_scaled, X_test_scaled]

for df in dfs:
    cleaned = df.columns
    for old, new in replacements.items():
        cleaned = cleaned.str.replace(old, new, regex=False)
    # Assigning to .columns renames the frame in place.
    df.columns = cleaned

## Saving the Data

In [269]:
# Persist every prepared feature and target table under scaled_data/.
# The folder is created first because DataFrame.to_csv does not create
# missing parent directories and would fail on a fresh checkout.
output_dir = Path('scaled_data')
output_dir.mkdir(parents=True, exist_ok=True)

# Hold-out split: features and targets.
xtrain.to_csv(output_dir / 'xtrain.csv')
xtest.to_csv(output_dir / 'xtest.csv')
ytrain.to_csv(output_dir / 'ytrain.csv')
ytest.to_csv(output_dir / 'ytest.csv')

# Full training and test features.
X_scaled.to_csv(output_dir / 'train_scaled.csv')
X_test_scaled.to_csv(output_dir / 'test_scaled.csv')