In [1]:
import numpy as np 
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline 
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV



In [2]:
import seaborn as sns

In [3]:
df = sns.load_dataset('tips')

In [4]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [5]:
X = df.iloc[:, 1:]
y = df['total_bill']

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state =42)

In [7]:
numerical_processor = Pipeline(
steps = [
    ("imputation_mean", SimpleImputer(missing_values = np.nan, strategy = 'mean')),
    ("scaler", StandardScaler())])

In [8]:
cateogrical_preprocessor = Pipeline(steps = [
    ("Imputation_Constant", SimpleImputer(fill_value="missing", strategy = 'constant')),
    ("one_hot_encoding", OneHotEncoder(handle_unknown="ignore")),
])

In [9]:
preprocessor = ColumnTransformer([("categorical", cateogrical_preprocessor,["sex", "smoker", "day", "time"]), ("numerical", numerical_processor, ["tip","size"])])

In [10]:
pipe=Pipeline(
    [("preprocessor",preprocessor),("regressor",RandomForestRegressor())]

)

In [11]:
from sklearn import set_config

In [12]:
set_config(display = 'diagram')

In [13]:
pipe

In [14]:
pipe.fit(X_train,y_train)

In [15]:
pipe.predict(X_test)

array([17.2621    , 13.53845333, 20.44263333, 29.54368   , 13.381685  ,
       14.30257167, 16.36155   , 16.21983833, 21.6147    , 21.52797333,
       19.0091    , 13.6003    , 10.45872   , 14.30257167, 11.758435  ,
       13.7112    , 21.2154    , 19.5584    , 14.483     , 27.6192    ,
       19.686     , 18.9096    , 20.0383    , 13.6003    , 24.7529    ,
       16.30493333, 13.02266667, 28.49      , 20.44263333, 24.7787    ,
       22.5438    , 12.9787    , 19.0516    , 18.5541    , 20.4722    ,
       21.78906667, 12.7015    , 28.6363    , 19.7191    , 14.10055   ,
       13.323     , 11.88731095, 16.227415  , 14.838     , 14.45956667,
       13.08146667, 18.777025  , 17.7183    , 11.05145   , 17.2331    ,
       14.2244    , 19.74576667, 26.2048    , 13.67089333, 21.5796    ,
       12.04579   , 26.4728    , 12.42125333, 18.9615    , 30.4645    ,
       32.7367    , 19.2011    , 26.3041    , 12.8044    , 13.02196667,
       19.0428    , 14.65145   , 14.6432    , 30.2321    , 20.98

In [16]:
import warnings 
warnings.filterwarnings('ignore')

In [17]:
## Hyper Parameter Tuning

param_grid = {"regressor__n_estimators": [200,500],
             "regressor__max_features": ["auto", "sqrt", "log2"],
             "regressor__max_depth": [4,5,6,7,8]}

In [18]:
grid_search = GridSearchCV(pipe, param_grid=param_grid, n_jobs=-1)

In [19]:
grid_search.fit(X_train,y_train)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [20]:
grid_search.best_params_

{'regressor__max_depth': 6,
 'regressor__max_features': 'sqrt',
 'regressor__n_estimators': 200}

In [21]:
pipe=Pipeline(
    [("preprocessor",preprocessor),("regressor",RandomForestRegressor(max_depth = 6, max_features = 'sqrt', n_estimators = 200))]

)

In [22]:
pipe.fit(X_train, y_train)

In [None]:
pipe.predict(X_test)