# Final Machine Learning Pipeline

The aim of this notebook is to build the data-transformation pipeline and to check that both the pipeline and the classification model are reproducible.

In [2]:
# import libraries:
import os
import time
import pickle
import pandas as pd
import numpy as np
import warnings
from IPython.display import HTML, display
from datetime import timedelta
from pathlib import Path

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score


# Notebook-wide display and warning options:
# show pandas floats with 4 decimal places
pd.set_option('display.float_format', '{:.4f}'.format)
# print numpy arrays with 6 decimals and without scientific notation
np.set_printoptions(precision=6, suppress=True)
# silence FutureWarning messages only; other warning categories stay visible
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
# report the interpreter and key library versions used for this run:
import sys
import sklearn

print(
    sys.version,
    f'pandas {pd.__version__}',
    f'numpy {np.__version__}',
    f'scikit-learn {sklearn.__version__}',
    sep='\n',
)

3.9.20 (main, Oct  3 2024, 07:38:01) [MSC v.1929 64 bit (AMD64)]
pandas 2.2.3
numpy 1.25.0
scikit-learn 1.4.0


In [4]:
# Base directory: absolute path of the current working directory
# (strict=True raises if the path does not exist).
# The model and data files below are loaded relative to it.
BASE_DIR = Path().resolve(strict=True)

In [5]:
# load the classification model from its pickle file
# (unpickling can execute arbitrary code, so the file must come from a trusted source):
with open(BASE_DIR / 'model_pickle', 'rb') as file:
    classification_model = pickle.load(file)

In [6]:
# read the raw dataset from the base directory:
data = pd.read_csv(BASE_DIR / "raw_data.csv")

In [7]:
# work on a copy so that the raw frame `data` stays unchanged:
df = data.copy()
print(f"The shape of raw dataset is: {df.shape}")

The shape of raw dataset is: (15420, 33)


In [8]:
# drop the redundant row with index label 1516:
# NOTE(review): the reason this row is redundant is not shown in this notebook - confirm against the research notebook.
df = df.drop(index=1516)

In [9]:
# inspect column dtypes and non-null counts after the drop:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15419 entries, 0 to 15419
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Month                 15419 non-null  object
 1   WeekOfMonth           15419 non-null  int64 
 2   DayOfWeek             15419 non-null  object
 3   Make                  15419 non-null  object
 4   AccidentArea          15419 non-null  object
 5   DayOfWeekClaimed      15419 non-null  object
 6   MonthClaimed          15419 non-null  object
 7   WeekOfMonthClaimed    15419 non-null  int64 
 8   Sex                   15419 non-null  object
 9   MaritalStatus         15419 non-null  object
 10  Age                   15419 non-null  int64 
 11  Fault                 15419 non-null  object
 12  PolicyType            15419 non-null  object
 13  VehicleCategory       15419 non-null  object
 14  VehiclePrice          15419 non-null  object
 15  FraudFound_P          15419 non-null  int

In [14]:
# save df as csv after drop:
# df.to_csv(f'{BASE_DIR}/data_after_drop.csv', index=False)

#### Set the type of selected numeric columns to object:

In [15]:
# numerically coded columns that should be treated as categories:
num_cols_to_category = ['WeekOfMonth', 'WeekOfMonthClaimed', 'DriverRating', 'RepNumber', 'Deductible', 'Year']

# cast them to object dtype so that the dtype-based split below picks them up as categorical:
df = df.astype(dict.fromkeys(num_cols_to_category, 'object'))

In [16]:
# Partition the columns by dtype: object columns are categorical, all others numerical.
# Nothing is removed from num_col, so a numeric target column stays in it and is cast to float as well.
num_col = df.select_dtypes(exclude='object').columns.tolist()
cat_col = df.select_dtypes(include='object').columns.tolist()

# numerical -> float, categorical -> pandas category dtype:
df[num_col] = df[num_col].astype('float')
df[cat_col] = df[cat_col].astype('category')

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15419 entries, 0 to 15419
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   Month                 15419 non-null  category
 1   WeekOfMonth           15419 non-null  category
 2   DayOfWeek             15419 non-null  category
 3   Make                  15419 non-null  category
 4   AccidentArea          15419 non-null  category
 5   DayOfWeekClaimed      15419 non-null  category
 6   MonthClaimed          15419 non-null  category
 7   WeekOfMonthClaimed    15419 non-null  category
 8   Sex                   15419 non-null  category
 9   MaritalStatus         15419 non-null  category
 10  Age                   15419 non-null  float64 
 11  Fault                 15419 non-null  category
 12  PolicyType            15419 non-null  category
 13  VehicleCategory       15419 non-null  category
 14  VehiclePrice          15419 non-null  category
 15  FraudFo

In [17]:
# correct misspelled manufacturer names in the Make column:
# (the dict maps the misspelled category label to the corrected one)
make_corrections = {'Mecedes': 'Mercedes', 'Accura': 'Acura', 'Nisson': 'Nissan', 'Porche': 'Porsche'}
# Make is a categorical column at this point, so the category labels themselves are renamed
df['Make'] = df['Make'].cat.rename_categories(make_corrections)
# show the frequency of each make after the correction
df['Make'].value_counts()

Make
Pontiac      3837
Toyota       3121
Honda        2800
Mazda        2354
Chevrolet    1681
Acura         472
Ford          450
VW            283
Dodge         109
Saab          108
Mercury        83
Saturn         58
Nissan         30
BMW            15
Jaguar          6
Porsche         5
Mercedes        4
Ferrari         2
Lexus           1
Name: count, dtype: int64

In [18]:
# separate the target from the dataset:
# pop() removes FraudFound_P from df in place and returns it,
# so this cell cannot be re-run without rebuilding df first
target = df.pop('FraudFound_P')

In [19]:
# split the data into train and test subsets (test_size=0.2, stratified on the target,
# fixed random_state so the split is reproducible):
X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=0.2, random_state=42, stratify=target)

# report the shapes of the resulting subsets:
print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_test shape: {y_test.shape}')

X_train shape: (12335, 32)
X_test shape: (3084, 32)
y_trian shape: (12335,)
y_test shape: (3084,)


In [20]:
# repeat the train/validation split from the research notebook, to be sure that
# X_train and y_train are the same in both notebooks.
# NOTE: this cell rebinds X_train/y_train, so running it twice splits the training data again.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42, stratify=y_train)

# report the subsets produced by this split (the test subset is not touched here):
print(f'X_train shape: {X_train.shape}')
print(f'X_val shape: {X_val.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_val shape: {y_val.shape}')

X_train shape: (9251, 32)
X_test shape: (3084, 32)
y_trian shape: (9251,)
y_test shape: (3084,)


In [21]:
# Config:
# names of all 33 columns of the raw dataset, including the target FraudFound_P
# (not referenced by the cells visible in this notebook)
cols = ['Month', 'WeekOfMonth', 'DayOfWeek', 'Make', 'AccidentArea',
       'DayOfWeekClaimed', 'MonthClaimed', 'WeekOfMonthClaimed', 'Sex',
       'MaritalStatus', 'Age', 'Fault', 'PolicyType', 'VehicleCategory',
       'VehiclePrice', 'FraudFound_P', 'PolicyNumber', 'RepNumber',
       'Deductible', 'DriverRating', 'Days_Policy_Accident',
       'Days_Policy_Claim', 'PastNumberOfClaims', 'AgeOfVehicle',
       'AgeOfPolicyHolder', 'PoliceReportFiled', 'WitnessPresent', 'AgentType',
       'NumberOfSuppliments', 'AddressChange_Claim', 'NumberOfCars', 'Year',
       'BasePolicy']

# numerically coded columns treated as categories (same list as in the dtype-casting cell earlier)
num_cols_to_category = ['WeekOfMonth', 'WeekOfMonthClaimed', 'DriverRating', 'RepNumber', 'Deductible', 'Year']

# columns removed by the 'drop' step of the preprocessing pipeline
reduntant_cols = ['Year', 'RepNumber', 'PolicyType', 'PolicyNumber']


# columns passed to the one-hot encoder ('cat' branch of the column transformer)
categorical_cols  = ['Month', 'WeekOfMonth', 'DayOfWeek', 'Make', 'AccidentArea', 'DayOfWeekClaimed', 'MonthClaimed', 'WeekOfMonthClaimed', 'Sex', 
                      'MaritalStatus', 'Fault', 'VehicleCategory', 'VehiclePrice', 'DriverRating', 'Days_Policy_Accident', 
                      'Days_Policy_Claim', 'PastNumberOfClaims', 'AgeOfVehicle', 'AgeOfPolicyHolder', 'PoliceReportFiled', 'WitnessPresent', 
                      'AgentType', 'NumberOfSuppliments', 'AddressChange_Claim', 'NumberOfCars', 'BasePolicy', 'Deductible']

# columns passed to the standard scaler ('num' branch of the column transformer)
numerical_cols = ['Age']


# variables that receive the log(1 + x) transformation:
numerical_log_cols = ['Age']

# variables that get an additional '<name>_is_0' indicator column:
is_zero_cols = ['Age']

# variables whose values are replaced using make_mapping:
make_map = ['Make']

# mapping of each car make (corrected spelling) to one of three risk groups:
make_mapping = { 'Mercedes': 'high_risk_group', 'Acura': 'high_risk_group', 'Saturn': 'high_risk_group', 'Saab': 'high_risk_group',
                 'Ford': 'medium_risk_group', 'Mercury': 'medium_risk_group', 'BMW': 'medium_risk_group', 'Honda': 'medium_risk_group', 
                 'Toyota': 'medium_risk_group', 'Chevrolet': 'medium_risk_group', 'Pontiac': 'medium_risk_group', 'Mazda': 'medium_risk_group',
                 'Nissan': 'low_risk_group', 'VW': 'low_risk_group', 'Dodge': 'low_risk_group', 'Jaguar': 'low_risk_group', 'Lexus': 'low_risk_group',
                 'Ferrari': 'low_risk_group', 'Porsche' : 'low_risk_group' }

# variables selected by RFECV; used by the final 'select_columns' pipeline step.
# The names are output columns of the column transformer: the 'num__' / 'cat__' prefix
# is the name of the transformer branch that produced the column.
selected_cols = ['num__Age', 'cat__Month_Aug', 'cat__Month_Dec', 'cat__Month_Feb',
       'cat__Month_Jan', 'cat__Month_Jul', 'cat__Month_Jun', 'cat__Month_Mar',
       'cat__Month_May', 'cat__Month_Nov', 'cat__Month_Oct', 'cat__Month_Sep',
       'cat__WeekOfMonth_2', 'cat__WeekOfMonth_3', 'cat__WeekOfMonth_4',
       'cat__WeekOfMonth_5', 'cat__DayOfWeek_Monday',
       'cat__DayOfWeek_Saturday', 'cat__DayOfWeek_Sunday',
       'cat__DayOfWeek_Thursday', 'cat__DayOfWeek_Tuesday',
       'cat__DayOfWeek_Wednesday', 'cat__Make_infrequent_sklearn',
       'cat__AccidentArea_Urban', 'cat__DayOfWeekClaimed_Monday',
       'cat__DayOfWeekClaimed_Thursday', 'cat__DayOfWeekClaimed_Tuesday',
       'cat__DayOfWeekClaimed_Wednesday',
       'cat__DayOfWeekClaimed_infrequent_sklearn', 'cat__MonthClaimed_Aug',
       'cat__MonthClaimed_Dec', 'cat__MonthClaimed_Feb',
       'cat__MonthClaimed_Jan', 'cat__MonthClaimed_Jul',
       'cat__MonthClaimed_Jun', 'cat__MonthClaimed_Mar',
       'cat__MonthClaimed_May', 'cat__MonthClaimed_Nov',
       'cat__MonthClaimed_Oct', 'cat__MonthClaimed_Sep',
       'cat__WeekOfMonthClaimed_2', 'cat__WeekOfMonthClaimed_3',
       'cat__WeekOfMonthClaimed_4', 'cat__WeekOfMonthClaimed_5',
       'cat__Sex_Male', 'cat__MaritalStatus_Single',
       'cat__MaritalStatus_infrequent_sklearn', 'cat__Fault_Third Party',
       'cat__VehicleCategory_Sport', 'cat__VehicleCategory_infrequent_sklearn',
       'cat__VehiclePrice_30000 to 39000', 'cat__VehiclePrice_less than 20000',
       'cat__VehiclePrice_more than 69000',
       'cat__VehiclePrice_infrequent_sklearn', 'cat__DriverRating_2',
       'cat__DriverRating_3', 'cat__DriverRating_4',
       'cat__Days_Policy_Accident_infrequent_sklearn',
       'cat__Days_Policy_Claim_infrequent_sklearn',
       'cat__PastNumberOfClaims_2 to 4', 'cat__PastNumberOfClaims_more than 4',
       'cat__PastNumberOfClaims_none', 'cat__AgeOfVehicle_6 years',
       'cat__AgeOfVehicle_7 years', 'cat__AgeOfVehicle_more than 7',
       'cat__AgeOfVehicle_infrequent_sklearn',
       'cat__AgeOfPolicyHolder_36 to 40', 'cat__AgeOfPolicyHolder_41 to 50',
       'cat__AgeOfPolicyHolder_51 to 65',
       'cat__AgeOfPolicyHolder_infrequent_sklearn',
       'cat__PoliceReportFiled_infrequent_sklearn',
       'cat__NumberOfSuppliments_3 to 5',
       'cat__NumberOfSuppliments_more than 5', 'cat__NumberOfSuppliments_none',
       'cat__AddressChange_Claim_infrequent_sklearn',
       'cat__NumberOfCars_infrequent_sklearn', 'cat__BasePolicy_Collision',
       'cat__BasePolicy_Liability', 'cat__Deductible_infrequent_sklearn']

In [22]:
# DropColumn: pipeline step that removes a fixed set of columns:
class DropColumn(BaseEstimator, TransformerMixin):
    """Stateless transformer that drops the configured columns.

    Parameters
    ----------
    variables : list
        Labels of the columns to remove.

    Raises
    ------
    ValueError
        If ``variables`` is not a list.
    """

    def __init__(self, variables):
        if isinstance(variables, list):
            self.variables = variables
        else:
            raise ValueError('variables should be a list')

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` without the configured columns."""
        return X.copy().drop(self.variables, axis=1)

# LogTransformer: pipeline step that applies log(1 + x) to selected columns:
class LogTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that replaces each configured column by log(1 + column).

    Parameters
    ----------
    variables : list
        Labels of the columns to transform.

    Raises
    ------
    ValueError
        If ``variables`` is not a list.
    """

    def __init__(self, variables):
        if isinstance(variables, list):
            self.variables = variables
        else:
            raise ValueError('variables should be a list')

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns log-transformed."""
        result = X.copy()
        for name in self.variables:
            column = result[name]
            result[name] = np.log(1 + column)
        return result
        
# IsZeroIndicator: pipeline step that adds an "is zero" flag per selected column:
class IsZeroIndicator(BaseEstimator, TransformerMixin):
    """Stateless transformer that adds a ``<variable>_is_0`` column for each configured variable.

    The new column is 1 where the variable equals 0 and 0 where it differs from 0.

    Parameters
    ----------
    variables : list
        Labels of the columns to flag.

    Raises
    ------
    ValueError
        If ``variables`` is not a list.
    """

    def __init__(self, variables):
        if isinstance(variables, list):
            self.variables = variables
        else:
            raise ValueError('variables should be a list')

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` extended with the indicator columns."""
        result = X.copy()
        for name in self.variables:
            flag = f'{name}_is_0'
            # start from NaN so the flag column is created before the masked writes
            result[flag] = np.nan
            equals_zero = result[name] == 0
            differs_from_zero = result[name] != 0
            result.loc[equals_zero, flag] = 1
            result.loc[differs_from_zero, flag] = 0
        return result

# Mapper: pipeline step that replaces the values of selected columns via a mapping:
class Mapper(BaseEstimator, TransformerMixin):
    """Stateless transformer that applies ``Series.map(mappings)`` to each configured column.

    Parameters
    ----------
    variables : list
        Labels of the columns to map.
    mappings : mapping
        Passed unchanged to ``map``; the same mapping is used for every column.

    Raises
    ------
    ValueError
        If ``variables`` is not a list.
    """

    def __init__(self, variables, mappings):
        if isinstance(variables, list):
            self.variables = variables
            self.mappings = mappings
        else:
            raise ValueError('variables should be a list')

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns mapped."""
        mapped = X.copy()
        for name in self.variables:
            mapped[name] = mapped[name].map(self.mappings)
        return mapped

# SelectColumn: pipeline step that keeps only the most informative columns:
class SelectColumn(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the configured columns.

    Parameters
    ----------
    variables : list
        Labels of the columns to keep, in the order they should appear.

    Raises
    ------
    ValueError
        If ``variables`` is not a list.
    """

    def __init__(self, variables):
        if isinstance(variables, list):
            self.variables = variables
        else:
            raise ValueError('variables should be a list')

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` indexed by the configured column list."""
        return X[self.variables]

In [23]:
# column transformer: one-hot encode the categorical columns and standardise the
# numerical ones; columns listed in neither group are passed through unchanged.
# The output is requested as a pandas DataFrame so that later steps can select columns by name.
transformer = ColumnTransformer(
    transformers=[
        # 'cat' branch - categorical variables:
        ('cat',
         OneHotEncoder(categories="auto",
                       drop="first",
                       min_frequency=0.05,
                       handle_unknown="infrequent_if_exist",
                       sparse_output=False),
         categorical_cols),
        # 'num' branch - numerical variables:
        ('num', StandardScaler(), numerical_cols),
    ],
    remainder='passthrough',
    n_jobs=-1,
).set_output(transform="pandas")

# preprocessing pipeline; the steps run in the order listed:
prep = Pipeline(steps=[
    # 1. drop redundant columns
    ('drop', DropColumn(variables=reduntant_cols)),
    # 2. map car makes to risk groups
    ('mapper', Mapper(variables=make_map, mappings=make_mapping)),
    # 3. log-transform the numerical variables
    ('log_trans', LogTransformer(variables=numerical_log_cols)),
    # 4. add the is-zero indicator columns
    ('is_zero', IsZeroIndicator(variables=is_zero_cols)),
    # 5. encode / scale with the column transformer
    ('prep', transformer),
    # 6. keep only the most informative columns
    ('select_columns', SelectColumn(variables=selected_cols)),
])

In [24]:
# fit the preprocessor on the training subset only (no target is passed):
prep.fit(X_train)

In [25]:
# transform the training subset with the fitted preprocessor and preview the result:
X_train_prep = prep.transform(X_train)
X_train_prep.head()

Unnamed: 0,num__Age,cat__Month_Aug,cat__Month_Dec,cat__Month_Feb,cat__Month_Jan,cat__Month_Jul,cat__Month_Jun,cat__Month_Mar,cat__Month_May,cat__Month_Nov,...,cat__AgeOfPolicyHolder_infrequent_sklearn,cat__PoliceReportFiled_infrequent_sklearn,cat__NumberOfSuppliments_3 to 5,cat__NumberOfSuppliments_more than 5,cat__NumberOfSuppliments_none,cat__AddressChange_Claim_infrequent_sklearn,cat__NumberOfCars_infrequent_sklearn,cat__BasePolicy_Collision,cat__BasePolicy_Liability,cat__Deductible_infrequent_sklearn
8666,-0.5256,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
14490,-0.4649,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
2516,0.942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2888,0.693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4665,-0.7935,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# transform the test subset with the same fitted preprocessor (no re-fitting) and preview the result:
X_test_prep = prep.transform(X_test)
X_test_prep.head()

Unnamed: 0,num__Age,cat__Month_Aug,cat__Month_Dec,cat__Month_Feb,cat__Month_Jan,cat__Month_Jul,cat__Month_Jun,cat__Month_Mar,cat__Month_May,cat__Month_Nov,...,cat__AgeOfPolicyHolder_infrequent_sklearn,cat__PoliceReportFiled_infrequent_sklearn,cat__NumberOfSuppliments_3 to 5,cat__NumberOfSuppliments_more than 5,cat__NumberOfSuppliments_none,cat__AddressChange_Claim_infrequent_sklearn,cat__NumberOfCars_infrequent_sklearn,cat__BasePolicy_Collision,cat__BasePolicy_Liability,cat__Deductible_infrequent_sklearn
8923,-0.1904,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4276,-0.8677,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3409,-0.0451,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
10676,0.601,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3286,0.5368,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0


In [27]:
# Generate class predictions for the train and test sets:
y_train_pred = classification_model.predict(X_train_prep)
y_test_pred = classification_model.predict(X_test_prep)

# keep the second column of predict_proba for each set
# NOTE(review): presumably the probability of the positive class (FraudFound_P == 1) - confirm via classification_model.classes_
y_train_proba = classification_model.predict_proba(X_train_prep)[:,1]
y_test_proba = classification_model.predict_proba(X_test_prep)[:,1]

In [28]:
# accuracy and recall are computed from the predicted classes:
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
train_recall = recall_score(y_train, y_train_pred)
test_recall = recall_score(y_test, y_test_pred)

# ROC AUC is computed from the predicted probabilities:
train_roc_auc = roc_auc_score(y_train, y_train_proba)
test_roc_auc = roc_auc_score(y_test, y_test_proba)

# one row per set, one column per metric, for side-by-side comparison:
tmp = pd.DataFrame({
    'set': ['train', 'test'],
    'accuracy': [train_accuracy, test_accuracy],
    'recall': [train_recall, test_recall],
    'roc_auc': [train_roc_auc, test_roc_auc],
})

tmp

Unnamed: 0,set,accuracy,recall,roc_auc
0,train,0.6196,0.9476,0.8497
1,test,0.6196,0.9459,0.825


#### Notes:
- the model achieved exactly the same metrics as in the research notebook, which means that the pipeline and the model are reproducible;

# Create final model and save as pickle file:

In [29]:
# Final pipeline: chains the preprocessor fitted above with the loaded classification model,
# so that raw feature rows can be passed directly to predict().
# Nothing is re-fitted here; the pipeline reuses the existing objects.
final_pipeline = Pipeline([
    ('preprocessing', prep),
    ('model', classification_model)
])

In [30]:
# predict end-to-end from the raw (unprocessed) test features:
y_pred_ = final_pipeline.predict(X_test)

In [31]:
# test accuracy of the combined pipeline, for comparison with the test accuracy in the metrics table above:
accuracy_score(y_test, y_pred_)

0.6196498054474708

In [32]:
# # save as pickle file:
# with open(f'{BASE_DIR}/trained_pipeline-0.0.1.pkl', 'wb') as file:
#     pickle.dump(final_pipeline, file)

# TESTING:

In [35]:
# take the row with index label 0 as a one-row DataFrame (the list [0] keeps the 2-D shape).
# NOTE: .loc is label-based, so this relies on label 0 having landed in X_train after the splits.
test_data = X_train.loc[[0]]
test_data

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,...,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability


In [38]:
# serialise the sample row as a JSON string (a list with one record per row) for testing:
test_data_json = test_data.to_json(orient="records")
test_data_json

'[{"Month":"Dec","WeekOfMonth":5,"DayOfWeek":"Wednesday","Make":"Honda","AccidentArea":"Urban","DayOfWeekClaimed":"Tuesday","MonthClaimed":"Jan","WeekOfMonthClaimed":1,"Sex":"Female","MaritalStatus":"Single","Age":21.0,"Fault":"Policy Holder","PolicyType":"Sport - Liability","VehicleCategory":"Sport","VehiclePrice":"more than 69000","PolicyNumber":1.0,"RepNumber":12,"Deductible":300,"DriverRating":1,"Days_Policy_Accident":"more than 30","Days_Policy_Claim":"more than 30","PastNumberOfClaims":"none","AgeOfVehicle":"3 years","AgeOfPolicyHolder":"26 to 30","PoliceReportFiled":"No","WitnessPresent":"No","AgentType":"External","NumberOfSuppliments":"none","AddressChange_Claim":"1 year","NumberOfCars":"3 to 4","Year":1994,"BasePolicy":"Liability"}]'