In [1]:
# Import necessary libraries
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd

In [2]:
# Load seaborn's 'tips' example dataset; every later cell derives from this frame
tips = sns.load_dataset(name='tips')

In [3]:
# Preview the raw data: a caption on stdout, then the first five rows as the cell output
print("Original 'tips' dataset:")
tips.head(n=5)

Original 'tips' dataset:


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
# Inspect each column's dtype to tell the categorical columns from the numeric ones
tips.dtypes

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object

In [5]:
# Separate the target column ('tip') from the feature columns; `tips` itself is left untouched
y = tips['tip'].copy()
X = tips.drop(columns='tip')

In [6]:
# Column groups used below: the first list is one-hot encoded, the second is standardized
categorical_features = [
    'sex',
    'smoker',
    'day',
    'time',
]
numerical_features = [
    'total_bill',
    'size',
]

# Transform Categorical Data

In [7]:
# pandas-only preview of one-hot encoding the categorical columns (result is displayed, not stored)
X[categorical_features].pipe(pd.get_dummies).astype(int)

Unnamed: 0,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
0,0,1,0,1,0,0,0,1,0,1
1,1,0,0,1,0,0,0,1,0,1
2,1,0,0,1,0,0,0,1,0,1
3,1,0,0,1,0,0,0,1,0,1
4,0,1,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...
239,1,0,0,1,0,0,1,0,0,1
240,0,1,1,0,0,0,1,0,0,1
241,1,0,1,0,0,0,1,0,0,1
242,1,0,0,1,0,0,1,0,0,1


In [8]:
# Split the feature frame by column group so each group can get its own transformer
num_features = X[numerical_features]
cat_features = X[categorical_features]

In [9]:
# Create a one-hot encoder that returns a dense array and learn the categories
# present in each categorical column
encoder_features = OneHotEncoder(sparse_output=False).fit(cat_features)
# Encode the same rows with the fitted encoder
encoded_features = encoder_features.transform(cat_features)

In [10]:
# Inspect the raw encoder output: an array of 0./1. floats with no column labels
encoded_features

array([[1., 0., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       ...,
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.]])

In [11]:
# Wrap the encoded array in a DataFrame with readable column names.
# - Column names are generated from the shared `categorical_features` list rather
#   than a second hard-coded copy, so the two cannot drift apart.
# - The index is taken from `cat_features` so each row keeps its original label
#   and stays label-aligned with `y` even if the source index is not 0..n-1.
encoded_features_df = pd.DataFrame(
    encoded_features,
    columns=encoder_features.get_feature_names_out(categorical_features),
    index=cat_features.index,
)

In [12]:
# Show the labelled one-hot frame to check the generated column names
encoded_features_df

Unnamed: 0,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
239,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
240,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
241,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
242,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


# Transform Numerical Data

In [13]:
# Standardizer for the numeric columns; the default centering and scaling are spelled out
scaler = StandardScaler(with_mean=True, with_std=True)

In [14]:
# Fit the scaler on the numeric columns and standardize them in one step.
# NOTE(review): the fit uses every row, and the train/test split only happens
# later in the notebook, so statistics from the future test rows influence the
# scaled training features. Consider splitting first and fitting on the
# training rows only.
scaled_features = scaler.fit_transform(num_features)

In [15]:
# Wrap the scaled array in a DataFrame whose columns are named by the scaler.
# Reusing `num_features.index` keeps each row's original label instead of
# silently replacing it with a fresh 0..n-1 range, so the frame stays
# label-aligned with `y` regardless of how the source frame is indexed.
scaled_features_df = pd.DataFrame(
    scaled_features,
    columns=scaler.get_feature_names_out(),
    index=num_features.index,
)

# Train-Test Split

In [16]:
# Assemble the model input: scaled numeric columns first, then the one-hot columns.
# NOTE: this rebinds `X`, which until now held the raw feature columns. The cell
# that slices the raw categorical/numeric columns out of `X` cannot be re-run
# after this one without first recreating `X` from `tips`.
X = pd.concat(objs=[scaled_features_df, encoded_features_df], axis='columns')

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
# Check that the training features and target have the same number of rows
X_train.shape, y_train.shape

((195, 12), (195,))

In [20]:
# Check that the test features and target have the same number of rows
X_test.shape, y_test.shape

((49, 12), (49,))