In [1]:
# LOADING DATASET

import seaborn as sns
df=sns.load_dataset("tips")

In [2]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
import numpy as np
import pandas as pd
# Fill missing values in numerical columns
df.fillna(df.select_dtypes(include='number').mean(), inplace=True)

In [4]:
df = df.drop(columns=['smoker', 'day', 'time'])

In [5]:
df.head(10)

Unnamed: 0,total_bill,tip,sex,size
0,16.99,1.01,Female,2
1,10.34,1.66,Male,3
2,21.01,3.5,Male,3
3,23.68,3.31,Male,2
4,24.59,3.61,Female,4
5,25.29,4.71,Male,4
6,8.77,2.0,Male,2
7,26.88,3.12,Male,4
8,15.04,1.96,Male,2
9,14.78,3.23,Male,2


In [6]:
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

In [7]:
#Separate X & Y with (X -> all cols except Tip, y -> Tip Column)
X = df.drop("tip", axis=1)
y = df["tip"]

In [8]:
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

In [9]:
print("Numerical:",numerical_cols)
print("Categorical:",categorical_cols)

Numerical: Index(['total_bill', 'size'], dtype='object')
Categorical: Index(['sex'], dtype='object')


In [10]:
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Fill missing values with mean
    ('scaler', StandardScaler())                 # Scale numerical features
])

In [11]:
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing values
    ('encoder', OneHotEncoder(handle_unknown='ignore'))    # One-hot encode categories
])

In [12]:
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols)
])

In [13]:
pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

In [14]:
X_transformed = pipeline.fit_transform(X)

In [17]:
X_transformed

array([[-3.14711305e-01, -6.00192629e-01,  1.00000000e+00,
         0.00000000e+00],
       [-1.06323531e+00,  4.53382921e-01,  0.00000000e+00,
         1.00000000e+00],
       [ 1.37779900e-01,  4.53382921e-01,  0.00000000e+00,
         1.00000000e+00],
       [ 4.38315103e-01, -6.00192629e-01,  0.00000000e+00,
         1.00000000e+00],
       [ 5.40744704e-01,  1.50695847e+00,  1.00000000e+00,
         0.00000000e+00],
       [ 6.19536705e-01,  1.50695847e+00,  0.00000000e+00,
         1.00000000e+00],
       [-1.23995452e+00, -6.00192629e-01,  0.00000000e+00,
         1.00000000e+00],
       [ 7.98507107e-01,  1.50695847e+00,  0.00000000e+00,
         1.00000000e+00],
       [-5.34203307e-01, -6.00192629e-01,  0.00000000e+00,
         1.00000000e+00],
       [-5.63468908e-01, -6.00192629e-01,  0.00000000e+00,
         1.00000000e+00],
       [-1.07111451e+00, -6.00192629e-01,  0.00000000e+00,
         1.00000000e+00],
       [ 1.74175992e+00,  1.50695847e+00,  1.00000000e+00,
      