In [1]:
# Import necessary libraries
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

In [2]:
# Fetch seaborn's 'tips' example dataset; every later cell derives from this frame
tips = sns.load_dataset(name='tips')

In [3]:
# Pull out the regression target first, then keep every other column as a predictor
y = tips['tip']
X = tips.drop(columns=['tip'])

In [5]:
# Column groups routed to the one-hot encoder and the scaler respectively
categorical_features = [
    'sex',
    'smoker',
    'day',
    'time',
]
numeric_features = [
    'total_bill',
    'size',
]

In [6]:
# Build the preprocessing stage: standardize the numeric columns and one-hot
# encode the categorical ones.
# handle_unknown='ignore' makes the encoder emit all zeros for a category it did
# not see during fit instead of raising, so a level that is missing from the
# training split (or that shows up in new data scored with the saved pipeline)
# cannot break predict()/score().
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ])

In [7]:
# Chain the preprocessing stage and the linear regressor so both are fitted,
# applied, and later serialized as a single estimator
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
])

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [9]:
# Train the preprocessing stage and the regressor together, on the training split only
pipeline.fit(X=X_train, y=y_train)

In [10]:
# Evaluate the fitted pipeline on the held-out rows (reported below as R^2)
score = pipeline.score(X=X_test, y=y_test)

In [11]:
# Report the held-out score computed above, rounded to three decimals
print(f"Model R^2 score: {score:.3f}")

Model R^2 score: 0.437


In [12]:
import joblib

In [13]:
# Path the fitted pipeline is serialized to; it is a relative name, so it
# resolves against the kernel's current working directory
model_filename = 'tips_model_pipeline.joblib'

In [14]:
# Persist the whole fitted pipeline (preprocessing + model) and confirm the path
joblib.dump(value=pipeline, filename=model_filename)
print(f"Model saved to {model_filename}")

Model saved to tips_model_pipeline.joblib


In [15]:
# Reload the pipeline from disk to confirm the saved artifact round-trips.
# NOTE: joblib files are pickle-based, so only load files from a trusted source
# (here the file was written by this notebook a few cells earlier).
loaded_pipeline = joblib.load(model_filename)
print(f"Model loaded from {model_filename}")

Model loaded from tips_model_pipeline.joblib


In [16]:
# Predict tips for the held-out rows with the reloaded pipeline; the metrics
# further down are computed from these predictions
y_pred = loaded_pipeline.predict(X=X_test)

In [17]:
# Show the predicted tip for each test row (bare last expression, so the
# notebook renders the array's repr)
y_pred

array([2.91436632, 2.00292613, 3.96425583, 3.76380832, 2.14836306,
       2.67423448, 3.63934628, 2.29147245, 2.57207155, 2.45851225,
       2.90446763, 2.0573337 , 2.11817193, 2.35130838, 1.82976215,
       3.10830675, 2.95140176, 3.21602976, 2.56640892, 5.73957295,
       3.43490366, 3.22645102, 2.17139823, 1.94180002, 3.16394533,
       2.24547894, 2.14497574, 3.21025435, 3.20097595, 6.66803147,
       5.01111235, 1.57804024, 3.1909877 , 2.76652194, 2.98412862,
       3.85695724, 2.17008741, 5.46673999, 2.35586827, 2.99190732,
       2.03271177, 2.48465991, 3.44046814, 2.35532237, 1.92528104,
       0.87348926, 1.81911521, 3.04083954, 1.85436902])

# Metrics

- Mean Absolute Error $$\text{MAE} = \frac{1}{n}\sum_{i=1}^n|y_i - \hat y_i|$$
- Mean Squared Error $$\text{MSE} = \frac{1}{n}\sum_{i=1}^n(y_i - \hat y_i)^2$$
- Root Mean Squared Error $$\text{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^n(y_i - \hat y_i)^2}$$
- R-squared (coefficient of determination) $$R^2 = 1 - \frac{\sum_{i=1}^n(y_i - \hat y_i)^2}{\sum_{i=1}^n(y_i - \bar y)^2}$$
- Mean Absolute Percentage Error $$\text{MAPE} = \frac{100}{n}\sum_{i=1}^n\left|\frac{y_i - \hat y_i}{y_i}\right|$$

In [18]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
import numpy as np

In [19]:
# Score the held-out predictions with each regression metric listed above.
# MAPE is kept as a fraction here; it is scaled to a percentage only when printed.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mape = mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE

In [20]:
# Print each metric to four decimals; MAPE is multiplied by 100 so it reads as
# a percentage with two decimals
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R2): {r2:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape * 100:.2f}%")

Mean Absolute Error (MAE): 0.6671
Mean Squared Error (MSE): 0.7034
Root Mean Squared Error (RMSE): 0.8387
R-squared (R2): 0.4373
Mean Absolute Percentage Error (MAPE): 27.57%
