# Import Libraries
Import necessary libraries such as pandas, numpy, matplotlib, seaborn, and machine learning libraries like scikit-learn, XGBoost, and CatBoost.

In [11]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Load and Inspect Data
Load the dataset using pandas, display its shape, columns, and check for missing values.

In [12]:
# Step 2: Load & Inspect Dataset

# Read the inflation table from the CSV file in the working directory
data = pd.read_csv("global_inflation_data.csv")

# Report the frame's dimensions and its column labels
frame_shape = data.shape
print("Data Shape:", frame_shape)
print("\nData Columns:\n", data.columns)

# Count the missing entries in each column
missing_per_column = data.isna().sum()
print("\nMissing Values:\n", missing_per_column)


Data Shape: (196, 47)

Data Columns:
 Index(['country_name', 'indicator_name', '1980', '1981', '1982', '1983',
       '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992',
       '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001',
       '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010',
       '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019',
       '2020', '2021', '2022', '2023', '2024'],
      dtype='object')

Missing Values:
 country_name       0
indicator_name     0
1980              56
1981              52
1982              51
1983              51
1984              51
1985              51
1986              51
1987              49
1988              49
1989              49
1990              46
1991              41
1992              38
1993              27
1994              25
1995              24
1996              20
1997              17
1998              15
1999              14
2000              13
20

# Data Preprocessing
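Drop rows with a missing 2024 target or with more than five missing yearly values, fill the remaining gaps with column medians, and separate the features (1980–2023) from the target (2024).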

In [13]:
# Step 3: Preprocessing

# Column holding the prediction target, and how many missing feature values a
# row may have before it is discarded.
TARGET_COL = '2024'
MAX_MISSING_FEATURES = 5

# Feature columns: one per year from 1980 through 2023
year_cols = [str(year) for year in range(1980, 2024)]
numeric_cols = year_cols + [TARGET_COL]

# Work on an explicit copy so the column assignments below never write into a
# slice of another frame (avoids pandas' SettingWithCopyWarning).
data = data.copy()

# Coerce every year column -- features and target alike -- to numeric.
# Unparseable entries become NaN and fall under the missing-value rules below.
data[numeric_cols] = data[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Remove rows with missing target (2024)
data = data.dropna(subset=[TARGET_COL])

# Drop rows with more than MAX_MISSING_FEATURES missing feature values
missing_per_row = data[year_cols].isnull().sum(axis=1)
data = data.loc[missing_per_row <= MAX_MISSING_FEATURES].copy()

# Fill remaining missing values with column median
# NOTE(review): the medians are taken over every remaining row, i.e. before the
# train/test split performed later, so test rows influence the imputed values.
# Consider fitting the imputation on the training rows only.
data[year_cols] = data[year_cols].fillna(data[year_cols].median())

# Define features and target
X = data[year_cols]
y = data[TARGET_COL]

# Fail fast if imputation left gaps (a column whose values are all missing has
# no median to fill with).
assert not X.isnull().any().any(), "features still contain missing values after imputation"

print("Final shape after preprocessing:")
print("X:", X.shape)
print("y:", y.shape)


Final shape after preprocessing:
X: (141, 44)
y: (141,)


# Train/Test Split
Split the features and target into training (80%) and test (20%) sets using a fixed random seed so the partition is reproducible.

In [14]:
# Step 4: Train/Test Split

# Hold out 20% of the rows for testing; the fixed seed keeps the partition
# the same on every run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)


Training set size: (112, 44)
Test set size: (29, 44)


# Baseline Model
Predict the mean of the training target for every test sample and report RMSE and MAE as a reference point for the later models.

In [15]:
# Step 5: Baseline Model

# Constant predictor: every test row receives the mean of the training target
train_target_mean = y_train.mean()
y_pred_baseline = np.full(shape=y_test.shape, fill_value=train_target_mean)

# Score the baseline: RMSE is the square root of the MSE, MAE is computed directly
baseline_mse = mean_squared_error(y_test, y_pred_baseline)
rmse_baseline = np.sqrt(baseline_mse)
mae_baseline = mean_absolute_error(y_test, y_pred_baseline)

print(f"Baseline RMSE: {rmse_baseline:.4f}")
print(f"Baseline MAE: {mae_baseline:.4f}")


Baseline RMSE: 36.2398
Baseline MAE: 13.0289


# Ensemble Models: RF, XGBoost, CatBoost
Train Random Forest, XGBoost, and CatBoost models, make predictions on the test set, and evaluate each model with RMSE and MAE.

In [16]:
# Step 6: Train Individual Models

# Each regressor is built with 100 trees/iterations and seed 42, fitted on the
# training split, and then used to predict the test split.

# Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# XGBoost (verbosity=0 silences its log messages)
xgb = XGBRegressor(n_estimators=100, random_state=42, verbosity=0).fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

# CatBoost (verbose=0 silences its per-iteration output)
cat = CatBoostRegressor(iterations=100, random_state=42, verbose=0).fit(X_train, y_train)
y_pred_cat = cat.predict(X_test)

# Evaluation
def evaluate_model(y_true, y_pred, model_name):
    """Print the RMSE and MAE of a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.
    model_name : str
        Label placed in front of each printed metric.

    Returns
    -------
    None
        The metrics are only printed (four decimal places), followed by a
        blank line.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    root_mse = np.sqrt(squared_error)
    abs_error = mean_absolute_error(y_true, y_pred)

    print(f"{model_name} RMSE: {root_mse:.4f}")
    print(f"{model_name} MAE: {abs_error:.4f}")
    print()

# Report test-set errors for each individually trained model, in training order
for model_label, test_predictions in (
    ("Random Forest", y_pred_rf),
    ("XGBoost", y_pred_xgb),
    ("CatBoost", y_pred_cat),
):
    evaluate_model(y_test, test_predictions, model_label)


Random Forest RMSE: 12.9172
Random Forest MAE: 4.2158

XGBoost RMSE: 9.5396
XGBoost MAE: 3.4549

CatBoost RMSE: 23.0374
CatBoost MAE: 6.5220



# Stacking Ensemble
Combine the Random Forest, XGBoost, and CatBoost models in a stacking regressor with a linear-regression meta-model, and evaluate the ensemble using RMSE and MAE.

In [17]:
# Step 7: Ensemble Model - Stacking

# Base models: the three regressors trained in the previous step
base_learners = [('rf', rf), ('xgb', xgb), ('cat', cat)]

# Stacking regressor whose meta-model is a Linear Regression
meta_learner = LinearRegression()
stacking_model = StackingRegressor(
    final_estimator=meta_learner,
    estimators=base_learners,
)

# Fit on the training split, predict the test split, and report the errors
stacking_model.fit(X_train, y_train)
y_pred_stack = stacking_model.predict(X_test)
evaluate_model(y_test, y_pred_stack, "Stacking Regressor")


Stacking Regressor RMSE: 50.5461
Stacking Regressor MAE: 12.8916



# Hyperparameter Tuning
Tune Random Forest, XGBoost, and CatBoost with grid search and 5-fold cross-validation, then rebuild the stacking ensemble from the tuned Random Forest and XGBoost models.

In [18]:
# Step 8: Hyperparameter Tuning - Random Forest

# Candidate values for each Random Forest hyperparameter
param_grid_rf = {
    'n_estimators': [50, 100, 150],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive search over the grid with 5-fold cross-validation on the training split
grid_search_rf = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid_rf,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search_rf.fit(X_train, y_train)

# The search scores with negated MSE, so flip the sign before taking the root
cv_rmse_rf = np.sqrt(-grid_search_rf.best_score_)
print("Best Random Forest Parameters:", grid_search_rf.best_params_)
print("Best Random Forest Score:", cv_rmse_rf)

# Score the winning configuration on the held-out test split
best_rf = grid_search_rf.best_estimator_
y_pred_best_rf = best_rf.predict(X_test)
evaluate_model(y_test, y_pred_best_rf, "Tuned Random Forest")


Best Random Forest Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 150}
Best Random Forest Score: 18.545188072209495
Tuned Random Forest RMSE: 12.2708
Tuned Random Forest MAE: 4.3052



In [19]:
# Step 9: Hyperparameter Tuning - XGBoost

# Candidate values for each XGBoost hyperparameter
param_grid_xgb = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Exhaustive search over the grid with 5-fold cross-validation on the training split
grid_search_xgb = GridSearchCV(
    estimator=XGBRegressor(random_state=42, verbosity=0),
    param_grid=param_grid_xgb,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search_xgb.fit(X_train, y_train)

# The search scores with negated MSE, so flip the sign before taking the root
cv_rmse_xgb = np.sqrt(-grid_search_xgb.best_score_)
print("Best XGBoost Parameters:", grid_search_xgb.best_params_)
print("Best XGBoost Score:", cv_rmse_xgb)

# Score the winning configuration on the held-out test split
best_xgb = grid_search_xgb.best_estimator_
y_pred_best_xgb = best_xgb.predict(X_test)
evaluate_model(y_test, y_pred_best_xgb, "Tuned XGBoost")


Best XGBoost Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 1.0}
Best XGBoost Score: 12.342216647767778
Tuned XGBoost RMSE: 9.6498
Tuned XGBoost MAE: 3.7314



In [20]:
# Step 10: Hyperparameter Tuning - CatBoost

# Candidate values for each CatBoost hyperparameter
param_grid_cat = {
    'iterations': [100, 200],
    'depth': [6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'l2_leaf_reg': [1, 3, 5]
}

# Exhaustive search over the grid with 5-fold cross-validation on the training split
grid_search_cat = GridSearchCV(
    estimator=CatBoostRegressor(random_state=42, verbose=0),
    param_grid=param_grid_cat,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search_cat.fit(X_train, y_train)

# The search scores with negated MSE, so flip the sign before taking the root
cv_rmse_cat = np.sqrt(-grid_search_cat.best_score_)
print("Best CatBoost Parameters:", grid_search_cat.best_params_)
print("Best CatBoost Score:", cv_rmse_cat)

# Score the winning configuration on the held-out test split
best_cat = grid_search_cat.best_estimator_
y_pred_best_cat = best_cat.predict(X_test)
evaluate_model(y_test, y_pred_best_cat, "Tuned CatBoost")


Best CatBoost Parameters: {'depth': 6, 'iterations': 200, 'l2_leaf_reg': 1, 'learning_rate': 0.05}
Best CatBoost Score: 22.83101795044936
Tuned CatBoost RMSE: 23.3246
Tuned CatBoost MAE: 6.1382



In [21]:
# Step 11: Ensemble Model - Updated Stacking with Tuned Models

# Base models: only the tuned Random Forest and the tuned XGBoost estimators
base_learners_tuned = [('rf', best_rf), ('xgb', best_xgb)]

# Stacking regressor whose meta-model is a Linear Regression
stacking_model_tuned = StackingRegressor(
    final_estimator=LinearRegression(),
    estimators=base_learners_tuned,
)

# Fit on the training split, predict the test split, and report the errors
stacking_model_tuned.fit(X_train, y_train)
y_pred_stack_tuned = stacking_model_tuned.predict(X_test)
evaluate_model(y_test, y_pred_stack_tuned, "Updated Stacking Regressor")


Updated Stacking Regressor RMSE: 12.5296
Updated Stacking Regressor MAE: 4.8589

