# Pipeline

In Python, a pipeline refers to a series of interconnected steps that perform a specific data processing task. Each step in the pipeline typically takes input data, processes it, and outputs the transformed data to the next step in the pipeline until the final output is produced. Pipelines are commonly used in machine learning workflows to preprocess data before model training or to postprocess model predictions.

In [172]:
#import libraries
import numpy as np
#pandas 
import pandas as pd
#train_test_split
from sklearn.model_selection import train_test_split
#OnehotEncoder,
from sklearn.preprocessing import OneHotEncoder,StandardScaler
#ColumnTransformer
from sklearn.compose import ColumnTransformer
#pipeline
from sklearn.pipeline import Pipeline
#LinearRegression
from sklearn.linear_model import LinearRegression
#widgets
import ipywidgets as widgets
#display
from IPython.display import display
from sklearn.ensemble import RandomForestRegressor

# Loading the Dataset

In [173]:
df=pd.read_csv('/kaggle/input/insurance/insurance.csv')
#showing the dataset
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


# Getting the Preliminary Information

In [174]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


# Preprocessing the Dataset

In [175]:
def preprocess_inputs(df):
    df=df.copy()
    y=df['charges']
    x=df.drop('charges',axis=1)
    #train_test_split
    x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7)
    return x_train,x_test,y_train,y_test

In [176]:
x_train,x_test,y_train,y_test=preprocess_inputs(df)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(936, 6)
(402, 6)
(936,)
(402,)


# Building the Pipeline and Training

In [177]:
nominal_features=['sex','smoker','region']
nominal_transformer=Pipeline(steps=[
    ('onehot',OneHotEncoder(sparse=False,drop='if_binary'))
])



preprocessor=ColumnTransformer(transformers=[
    ('nominal',nominal_transformer,nominal_features)
],remainder='passthrough')
model=Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('scaler',StandardScaler()),
    ('regressor',RandomForestRegressor())
])





In [178]:
model.fit(x_train,y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('nominal',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(drop='if_binary',
                                                                                 sparse=False))]),
                                                  ['sex', 'smoker',
                                                   'region'])])),
                ('scaler', StandardScaler()),
                ('regressor', RandomForestRegressor())])

In [179]:
x_train.describe()

Unnamed: 0,age,bmi,children
count,936.0,936.0,936.0
mean,39.518162,30.737388,1.117521
std,14.005182,6.173574,1.227611
min,18.0,16.815,0.0
25%,27.0,26.22,0.0
50%,40.0,30.495,1.0
75%,51.0,34.865,2.0
max,64.0,53.13,5.0


In [180]:
{column:list(df[column].unique()) for column in df.select_dtypes('object')}

{'sex': ['female', 'male'],
 'smoker': ['yes', 'no'],
 'region': ['southwest', 'southeast', 'northwest', 'northeast']}

# Results

In [181]:
model.score(x_test,y_test)

0.8111420972555999

# Interactive Widget

# Slider for Age

In [190]:
age_widget=widgets.IntSlider(value=38,min=18,max=64,step=1,description='Age:')
children_widget=widgets.IntSlider(value=1,min=0,max=5,step=1,description='Children:')
bmi_widget=widgets.FloatSlider(value=30,min=15,max=54,step=0.01,description='BMI:')
sex_widget=widgets.ToggleButtons(options=['Female','Male'],
    description='Sex')
smoker_widget=widgets.ToggleButtons(options=['Yes','No'],
    description='Smoker')
location_widget=widgets.Dropdown(options=['northeast','northwest','southeast','southwest'],
                description='Region')

display(age_widget,children_widget,bmi_widget,sex_widget,smoker_widget,location_widget)

predict_btn=widgets.Button(description='Predict')
prediction_out=widgets.Output()
def make_prediction(btn):
    try:
        x=pd.DataFrame({'age':age_widget.value,
                       'sex':sex_widget.value,
                       'bmi':bmi_widget.value,
                       'smoker':smoker_widget.value,
                       'children':children_widget.value,
                       'region':region_widget.value
                       },index=[0])
        prediction=model.predict(x)
        with prediction_out:
            prediction_out.clear_output()
            print("Prediction",prediction[0])
    except Exception as e:
        ("Error",e)
    
predict_btn.on_click(make_prediction)

display(predict_btn,prediction_out)
print("Widget Displayed")

IntSlider(value=38, description='Age:', max=64, min=18)

IntSlider(value=1, description='Children:', max=5)

FloatSlider(value=30.0, description='BMI:', max=54.0, min=15.0, step=0.01)

ToggleButtons(description='Sex', options=('Female', 'Male'), value='Female')

ToggleButtons(description='Smoker', options=('Yes', 'No'), value='Yes')

Dropdown(description='Region', options=('northeast', 'northwest', 'southeast', 'southwest'), value='northeast'…

Button(description='Predict', style=ButtonStyle())

Output()

Widget Displayed


IntSlider(value=38, description='Age:', max=64, min=18)

IntSlider(value=1, description='Children:', max=5)

FloatSlider(value=30.0, description='BMI:', max=54.0, min=15.0, step=0.01)

ToggleButtons(description='Sex', options=('Female', 'Male'), value='Female')

ToggleButtons(description='Smoker', options=('Yes', 'No'), value='Yes')

Dropdown(description='Region', options=('northeast', 'northwest', 'southeast', 'southwest'), value='northeast'…

Button(description='Predict', style=ButtonStyle())

Output()

Widget Displayed


In [185]:
prediction_out=widgets.Output()


In [186]:
widgets.Button(description='Predict')

Button(description='Predict', style=ButtonStyle())

# Slider for Number of Children

# FloatSlider for Body Mass Index

In [187]:
widgets.FloatSlider()

FloatSlider(value=0.0)

# Toggle Button for Sex

# Toggle Button for Whether Smoker

# Drop Down for Location Button