# Notebook 2
### Pipelines (Feature Engineering and Machine Learning Predictors)

#### Importing Libraries

In [59]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

##
from imblearn.over_sampling import SMOTE

#deployment
import joblib

#### Dropping id and Vintage features

In [20]:
df = pd.read_csv('./data/train.csv')
df.dropna(inplace=True)
df.drop(['id', 'Vintage'], axis = 1, inplace = True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 10 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Gender                381109 non-null  object 
 1   Age                   381109 non-null  int64  
 2   Driving_License       381109 non-null  int64  
 3   Region_Code           381109 non-null  float64
 4   Previously_Insured    381109 non-null  int64  
 5   Vehicle_Age           381109 non-null  object 
 6   Vehicle_Damage        381109 non-null  object 
 7   Annual_Premium        381109 non-null  float64
 8   Policy_Sales_Channel  381109 non-null  float64
 9   Response              381109 non-null  int64  
dtypes: float64(3), int64(4), object(3)
memory usage: 29.1+ MB


#### Splitting data

In [9]:
X, y = df.drop('Response', axis = 1), df['Response']

X_train, X_test, y_train, y_test = train_test_split( X, y,
                                                     test_size = 0.4,
                                                     random_state = 42,
                                                     stratify = y,
                                                     shuffle = True )

#### Creating Pipeline Functions

In [10]:
categ = ['Gender', 'Driving_License', 'Region_Code', 'Previously_Insured', 'Vehicle_Age','Vehicle_Damage']
numeri = ['Age','Annual_Premium', 'Policy_Sales_Channel']

In [11]:
def create_pipeline(categorical_features= categ,
                    numerical_features= numeri, 
                    model = GradientBoostingClassifier(random_state = 42),
                    use_ordinal_encoder = True,
                    ):
    ## ## ## ##
    if use_ordinal_encoder:
        cltr = ColumnTransformer(
            transformers = [
            ('StandardScaler',StandardScaler(), numerical_features),
            ('OrdinalEncoder', OrdinalEncoder(), categorical_features)
            ],
            remainder = 'drop')
    else:
        cltr = ColumnTransformer(
            transformers = [
                ('StandardScaler',StandardScaler(), numerical_features),
                ('OneHotEncoder', OneHotEncoder(drop='if_binary'), categorical_features)
            ],
            remainder = 'drop')
    
    ## ## ## ##
    pipeline = Pipeline([
        ('Transformer', cltr),
        ('Estimator', model)
    ])
    
    return pipeline

In [12]:
GBC_pipl = create_pipeline()
GBC_pipl

In [13]:
GBC_pipl.fit(X_train, y_train)

In [14]:
print(classification_report(GBC_pipl.predict(X_test),y_test) )

              precision    recall  f1-score   support

           0       1.00      0.88      0.93    152428
           1       0.00      0.50      0.00        16

    accuracy                           0.88    152444
   macro avg       0.50      0.69      0.47    152444
weighted avg       1.00      0.88      0.93    152444



#### Using SMOTENC to oversample minority class.

In [None]:
def create_transformer( categorical_features= categ, numerical_features= numeri, use_ordinal_encoder = True ):
    ## ## ## ##
    if use_ordinal_encoder:
        cltr = ColumnTransformer(
            transformers = [ ('StandardScaler',StandardScaler(), numerical_features), ('OrdinalEncoder', OrdinalEncoder(), categorical_features) ],
            remainder = 'drop')
    else:
        cltr = ColumnTransformer(
            transformers = [ ('StandardScaler',StandardScaler(), numerical_features), ('OneHotEncoder', OneHotEncoder(drop='if_binary'), categorical_features)],
            remainder = 'drop')
    ## ## ## ##
    pipeline = Pipeline([ ('Transformer', cltr) ])
    return pipeline

In [38]:
X

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel
0,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0
1,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0
2,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0
3,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0
4,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0
...,...,...,...,...,...,...,...,...,...
381104,Male,74,1,26.0,1,1-2 Year,No,30170.0,26.0
381105,Male,30,1,37.0,1,< 1 Year,No,40016.0,152.0
381106,Male,21,1,30.0,1,< 1 Year,No,35118.0,160.0
381107,Female,68,1,14.0,0,> 2 Years,Yes,44617.0,124.0


In [39]:
transf_pipl = create_transformer()
X_transf = transf_pipl.fit_transform(X, y)

# Get the feature names
feature_names = transf_pipl.named_steps['Transformer'].get_feature_names_out()

# Create dataframe with new feature names
X_transf = pd.DataFrame(X_transf, columns= feature_names)

X_transf

Unnamed: 0,StandardScaler__Age,StandardScaler__Annual_Premium,StandardScaler__Policy_Sales_Channel,OrdinalEncoder__Gender,OrdinalEncoder__Driving_License,OrdinalEncoder__Region_Code,OrdinalEncoder__Previously_Insured,OrdinalEncoder__Vehicle_Age,OrdinalEncoder__Vehicle_Damage
0,0.333777,0.574539,-1.587234,1.0,1.0,28.0,0.0,2.0,1.0
1,2.396751,0.172636,-1.587234,1.0,1.0,3.0,0.0,0.0,0.0
2,0.527181,0.449053,-1.587234,1.0,1.0,28.0,0.0,2.0,1.0
3,-1.148985,-0.113018,0.737321,1.0,1.0,11.0,1.0,1.0,0.0
4,-0.633242,-0.178259,0.737321,0.0,1.0,41.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
381104,2.267815,-0.022912,-1.587234,1.0,1.0,26.0,1.0,0.0,0.0
381105,-0.568774,0.549093,0.737321,1.0,1.0,37.0,1.0,1.0,0.0
381106,-1.148985,0.264543,0.884912,1.0,1.0,30.0,1.0,1.0,0.0
381107,1.881007,0.816389,0.220753,0.0,1.0,14.0,0.0,2.0,1.0


In [44]:
categorical_features = \
['OrdinalEncoder__Gender',
'OrdinalEncoder__Driving_License',
'OrdinalEncoder__Region_Code',
'OrdinalEncoder__Previously_Insured',
'OrdinalEncoder__Vehicle_Age',
'OrdinalEncoder__Vehicle_Damage']

numerical_features = \
    ['StandardScaler__Age',
     'StandardScaler__Annual_Premium',
     'StandardScaler__Policy_Sales_Channel']

In [45]:
X_resampled, y_resampled = SMOTE( sampling_strategy = 'minority',
                                  random_state = 42
                                  ).fit_resample(X_transf, y)

In [51]:
X_train_resampled, X_test_resampled, y_train_resampled, y_test_resampled = \
    train_test_split( X_resampled, y_resampled,
                      test_size = 0.4,
                      random_state = 42,
                      stratify = y_resampled, 
                      shuffle = True )

In [53]:
y_test_resampled

387224    1
501391    1
418076    1
185031    0
115572    0
         ..
609846    1
421491    1
106746    0
489559    1
298949    1
Name: Response, Length: 267520, dtype: int64

In [57]:
GBC_SMOTE = GradientBoostingClassifier(random_state = 42)
#GBC_SMOTE = create_pipeline(categorical_features= categorical_features, numerical_features= numerical_features)

cltr = \
ColumnTransformer([
    ('StandardScaler',StandardScaler(), numerical_features),
    ('OrdinalEncoder',OrdinalEncoder(), categorical_features)
])

GBC_pipl_Smote = Pipeline([
    #('ColumnTransformer', cltr),
    ('Estimator',GBC_SMOTE)
])

GBC_pipl_Smote.fit(X_train_resampled, y_train_resampled)
#GBC_SMOTE.fit(X_train_resampled, y_train_resampled)

print(classification_report( GBC_pipl_Smote.predict(X_test_resampled),
                            y_test_resampled) )

              precision    recall  f1-score   support

           0       0.69      0.94      0.79     97222
           1       0.96      0.75      0.84    170298

    accuracy                           0.82    267520
   macro avg       0.82      0.85      0.82    267520
weighted avg       0.86      0.82      0.83    267520



#### Save Model and Transformer

In [60]:
joblib.dump(value = GBC_pipl_Smote,
            filename = './models/GBC_SMOTE.sav')

#check:
temp_mdl = joblib.load('./models/GBC_SMOTE.sav')
temp_mdl

In [61]:
joblib.dump(value = transf_pipl,
            filename = './models/transf_pipl.sav')

#check:
temp_transf = joblib.load('./models/transf_pipl.sav')
temp_transf

In [63]:
df['Policy_Sales_Channel'].unique().tolist()

[26.0,
 152.0,
 160.0,
 124.0,
 14.0,
 13.0,
 30.0,
 156.0,
 163.0,
 157.0,
 122.0,
 19.0,
 22.0,
 15.0,
 154.0,
 16.0,
 52.0,
 155.0,
 11.0,
 151.0,
 125.0,
 25.0,
 61.0,
 1.0,
 86.0,
 31.0,
 150.0,
 23.0,
 60.0,
 21.0,
 121.0,
 3.0,
 139.0,
 12.0,
 29.0,
 55.0,
 7.0,
 47.0,
 127.0,
 153.0,
 78.0,
 158.0,
 89.0,
 32.0,
 8.0,
 10.0,
 120.0,
 65.0,
 4.0,
 42.0,
 83.0,
 136.0,
 24.0,
 18.0,
 56.0,
 48.0,
 106.0,
 54.0,
 93.0,
 116.0,
 91.0,
 45.0,
 9.0,
 145.0,
 147.0,
 44.0,
 109.0,
 37.0,
 140.0,
 107.0,
 128.0,
 131.0,
 114.0,
 118.0,
 159.0,
 119.0,
 105.0,
 135.0,
 62.0,
 138.0,
 129.0,
 88.0,
 92.0,
 111.0,
 113.0,
 73.0,
 36.0,
 28.0,
 35.0,
 59.0,
 53.0,
 148.0,
 133.0,
 108.0,
 64.0,
 39.0,
 94.0,
 132.0,
 46.0,
 81.0,
 103.0,
 90.0,
 51.0,
 27.0,
 146.0,
 63.0,
 96.0,
 40.0,
 66.0,
 100.0,
 95.0,
 123.0,
 98.0,
 75.0,
 69.0,
 130.0,
 134.0,
 49.0,
 97.0,
 38.0,
 17.0,
 110.0,
 80.0,
 71.0,
 117.0,
 58.0,
 20.0,
 76.0,
 104.0,
 87.0,
 84.0,
 137.0,
 126.0,
 68.0,
 67.0,
 101.0,


In [64]:
df['Region_Code'].unique().tolist()

[28.0,
 3.0,
 11.0,
 41.0,
 33.0,
 6.0,
 35.0,
 50.0,
 15.0,
 45.0,
 8.0,
 36.0,
 30.0,
 26.0,
 16.0,
 47.0,
 48.0,
 19.0,
 39.0,
 23.0,
 37.0,
 5.0,
 17.0,
 2.0,
 7.0,
 29.0,
 46.0,
 27.0,
 25.0,
 13.0,
 18.0,
 20.0,
 49.0,
 22.0,
 44.0,
 0.0,
 9.0,
 31.0,
 12.0,
 34.0,
 21.0,
 10.0,
 14.0,
 38.0,
 24.0,
 40.0,
 43.0,
 32.0,
 4.0,
 51.0,
 42.0,
 1.0,
 52.0]