In [1]:
import pandas as pd 
import numpy as np

In [2]:
data = pd.read_csv('./data/insurance.csv')

In [3]:
# Drop duplicate rows
data.drop_duplicates(inplace=True)

In [4]:
data

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86
...,...,...,...,...,...,...,...
1333,50,male,31.0,3,no,northwest,10600.55
1334,18,female,31.9,0,no,northeast,2205.98
1335,18,female,36.9,0,no,southeast,1629.83
1336,21,female,25.8,0,no,southwest,2007.95


In [11]:
X = data.drop('expenses', axis=1)

In [10]:
Y = data[['expenses']]

In [12]:
X

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.8,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.7,0,no,northwest
4,32,male,28.9,0,no,northwest
...,...,...,...,...,...,...
1333,50,male,31.0,3,no,northwest
1334,18,female,31.9,0,no,northeast
1335,18,female,36.9,0,no,southeast
1336,21,female,25.8,0,no,southwest


In [13]:
Y

Unnamed: 0,expenses
0,16884.92
1,1725.55
2,4449.46
3,21984.47
4,3866.86
...,...
1333,10600.55
1334,2205.98
1335,1629.83
1336,2007.95


In [14]:
categorical_columns = X.select_dtypes(include='object').columns

In [15]:
numerical_columns = X.select_dtypes(exclude='object').columns

In [16]:
numerical_columns

Index(['age', 'bmi', 'children'], dtype='object')

In [17]:
categorical_columns

Index(['sex', 'smoker', 'region'], dtype='object')

In [18]:
sex_categories = ['male', 'female']
smoker_categories = ['yes', 'no']
region_categories = ['southwest', 'southeast', 'northwest', 'northeast']

In [37]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

#pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [38]:
# numerical_pipeline=Pipeline(
#     steps=[    
#         ('imputer',SimpleImputer()),
#         ('scaler', StandardScaler())
#     ]
# )

# categorical_pipeline=Pipeline( 
#     steps=[
#         ('imputer',SimpleImputer(strategy='most_frequent')),
#         ('ordinalencoder',OrdinalEncoder(categories=[sex_categories,smoker_categories,region_categories]))
#     ]  
# )


numerical_pipeline = Pipeline(
    steps=[    
        ('imputer', SimpleImputer()),
        ('scaler', StandardScaler())
    ]
)

categorical_pipeline = Pipeline( 
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder', OrdinalEncoder())
    ]  
)

preprocessor = ColumnTransformer(
    [     
        ('numerical_columns', numerical_pipeline, numerical_columns),
        ('categorical_columns', categorical_pipeline, categorical_columns)
    ]
)

In [39]:
## Train test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=30)


In [40]:
preprocessor.fit_transform(X_train)

array([[ 0.08369308,  1.72593632, -0.0816607 ,  0.        ,  0.        ,
         1.        ],
       [-0.97952744,  0.48618749,  2.43546318,  1.        ,  0.        ,
         2.        ],
       [-0.6251206 ,  2.17675408,  0.75738059,  1.        ,  0.        ,
         2.        ],
       ...,
       [-0.69600197,  0.59889193, -0.920702  ,  1.        ,  1.        ,
         3.        ],
       [-1.47569701,  1.53272871, -0.920702  ,  0.        ,  0.        ,
         0.        ],
       [ 1.5722018 ,  0.84040144, -0.920702  ,  1.        ,  1.        ,
         2.        ]])

In [41]:
preprocessor.transform(X_test)

array([[ 0.79250676,  0.26077861, -0.0816607 ,  1.        ,  1.        ,
         0.        ],
       [-0.12895102,  0.55059003,  2.43546318,  1.        ,  1.        ,
         3.        ],
       [-0.97952744, -0.15783787,  1.59642188,  1.        ,  1.        ,
         3.        ],
       ...,
       [-0.41247649, -0.38324675, -0.0816607 ,  0.        ,  0.        ,
         2.        ],
       [ 0.93426949,  0.61499256,  1.59642188,  1.        ,  1.        ,
         1.        ],
       [ 1.21779497,  1.09801159,  0.75738059,  0.        ,  0.        ,
         2.        ]])

In [42]:
preprocessor.get_feature_names_out()

array(['numerical_columns__age', 'numerical_columns__bmi',
       'numerical_columns__children', 'categorical_columns__sex',
       'categorical_columns__smoker', 'categorical_columns__region'],
      dtype=object)

In [43]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [44]:
X_train

Unnamed: 0,numerical_columns__age,numerical_columns__bmi,numerical_columns__children,categorical_columns__sex,categorical_columns__smoker,categorical_columns__region
0,0.083693,1.725936,-0.081661,0.0,0.0,1.0
1,-0.979527,0.486187,2.435463,1.0,0.0,2.0
2,-0.625121,2.176754,0.757381,1.0,0.0,2.0
3,1.288676,-0.447649,-0.081661,1.0,0.0,2.0
4,-0.766883,-1.075574,1.596422,1.0,1.0,2.0
...,...,...,...,...,...,...
930,1.643083,-0.914568,-0.920702,0.0,0.0,3.0
931,-0.270714,-0.737461,-0.920702,0.0,0.0,0.0
932,-0.696002,0.598892,-0.920702,1.0,1.0,3.0
933,-1.475697,1.532729,-0.920702,0.0,0.0,0.0


NameError: name 'X_test' is not defined