In [7]:
import joblib
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_validate, cross_val_score

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline

import plotly.express as px


import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.simplefilter(action="ignore")

In [8]:
df = px.data.gapminder()

In [9]:
df

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,iso_alpha,iso_num
0,Afghanistan,Asia,1952,28.801,8425333,779.445314,AFG,4
1,Afghanistan,Asia,1957,30.332,9240934,820.853030,AFG,4
2,Afghanistan,Asia,1962,31.997,10267083,853.100710,AFG,4
3,Afghanistan,Asia,1967,34.020,11537966,836.197138,AFG,4
4,Afghanistan,Asia,1972,36.088,13079460,739.981106,AFG,4
...,...,...,...,...,...,...,...,...
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306,ZWE,716
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786,ZWE,716
1701,Zimbabwe,Africa,1997,46.809,11404948,792.449960,ZWE,716
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623,ZWE,716


In [27]:
X = df[['year', 'pop', 'gdpPercap']]
y = df['lifeExp']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
models = [('LR', LinearRegression()),
          ('KNN', KNeighborsRegressor()),
          ('CART', DecisionTreeRegressor()),
          ('RF', RandomForestRegressor()),
          ('GBM', GradientBoostingRegressor()),
          ("XGBoost", XGBRegressor(objective='reg:squarederror')),
          ("LightGBM", LGBMRegressor(verbose= -100)),
          ("CatBoost", CatBoostRegressor(verbose=False))]

for name, regressor in models:
    rmse = np.mean(np.sqrt(-cross_val_score(regressor, X, y, cv=5, scoring="neg_root_mean_squared_error")))
    print(f"RMSE: {round(rmse, 4)} ({name})")

RMSE: 3.1986 (LR)
RMSE: 3.7445 (KNN)
RMSE: 3.0523 (CART)
RMSE: 2.7051 (RF)
RMSE: 2.6593 (GBM)
RMSE: 2.8408 (XGBoost)
RMSE: 2.7639 (LightGBM)
RMSE: 2.7405 (CatBoost)


In [30]:
gbm_params = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
}

rf_params = {
    'n_estimators': [100, 200, 500],
    'max_features': [5, 7, "sqrt"],
    'max_depth': [None, 10, 20, 30],
}

catboost_params = {
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [500, 1000, 1500],
}

models = [
    ("GBM", GradientBoostingRegressor(), gbm_params),
    ("RF", RandomForestRegressor(), rf_params),
    ('CatBoost', CatBoostRegressor(verbose=False), catboost_params)]

def hyperparameter_optimization(X, y, cv=5, scoring="neg_root_mean_squared_error"):
    print("Hyperparameter Optimization....")
    best_models = {}
    for name, regressor, params in models:
        print(f"########## {name} ##########")
        cv_results = cross_validate(regressor, X, y, cv=cv, scoring=scoring)
        mean_score = np.mean(np.sqrt(-cv_results['test_score']))
        print(f"RMSE (Before): {round(mean_score, 4)}")

        gs_best = GridSearchCV(regressor, params, cv=cv, n_jobs=-1, verbose=False, scoring=scoring).fit(X, y)
        final_regressor = regressor.set_params(**gs_best.best_params_)

        cv_results = cross_validate(final_regressor, X, y, cv=cv, scoring=scoring)
        mean_score = np.mean(np.sqrt(-cv_results['test_score']))
        print(f"RMSE (After): {round(mean_score, 4)}")
        print(f"{name} best params: {gs_best.best_params_}", end="\n\n")
        best_models[name] = final_regressor
    return best_models


In [31]:
best_models = hyperparameter_optimization(X_train, y_train)

Hyperparameter Optimization....
########## GBM ##########
RMSE (Before): 2.582
RMSE (After): 2.5479
GBM best params: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 300}

########## RF ##########
RMSE (Before): 2.5412
RMSE (After): 2.5293
RF best params: {'max_depth': 10, 'max_features': 5, 'n_estimators': 500}

########## CatBoost ##########
RMSE (Before): 2.5459
RMSE (After): 2.5143
CatBoost best params: {'depth': 8, 'iterations': 1000, 'learning_rate': 0.01}



In [32]:
from sklearn.ensemble import VotingRegressor

def voting_regressor(best_models, X, y):
    print("Voting Regressor...")
    voting_regressor = VotingRegressor(estimators=[
        ('GBM', best_models["GBM"]),
        ('RF', best_models["RF"]),
        ('CatBoost', best_models["CatBoost"])
    ]).fit(X, y)

    cv_results = cross_validate(voting_regressor, X, y, cv=5, scoring="neg_root_mean_squared_error")
    
    rmse_scores = np.sqrt(-cv_results['test_score'])
    print(f"RMSE: {rmse_scores.mean()}")
    return voting_regressor

In [33]:
voting_reg = voting_regressor(best_models, X_train, y_train)

Voting Regressor...
RMSE: 2.516459855547181


In [34]:
gbm_best = GradientBoostingRegressor(n_estimators=300, learning_rate=0.01, max_depth=5)
rf_best = RandomForestRegressor(n_estimators=500, max_features=5, max_depth=10)
catboost_best = CatBoostRegressor(depth=8, learning_rate=0.01, iterations=1000, verbose=False)

voting_regressor = VotingRegressor(estimators=[
    ('GBM', gbm_best),
    ('RF', rf_best),
    ('CatBoost', catboost_best)
])

pipeline = Pipeline([
    ('voting_regressor', voting_regressor)
])

pipeline.fit(X_train, y_train)

In [38]:
y_pred = pipeline.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

RMSE: 5.705895469370951


In [39]:
model_filename = 'gapminder.joblib'
joblib.dump(pipeline, model_filename)
print(f'Model saved as {model_filename}')

Model saved as gapminder.joblib


In [None]:
def get_pipeline():
    pipeline = joblib.load("wine_quality.joblib")
    return pipeline

In [None]:
pipeline = get_pipeline()

In [42]:
data = {
    'year': [1997],
    'pop': [11404948],
    'gdpPercap': [792.449960],
}

sample_df = pd.DataFrame(data)

In [43]:
pipeline.predict(sample_df)

array([48.54893832])