In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from joblib import dump
import numpy as np

In [2]:
# Read the prepared dataset from disk and preview its first rows.
DATA_PATH = "../../Data/filled_data.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Country,ISO_Code,Year,Co2_MtCO2,Co2_Capita_tCO2,GDP,Population,Industry_on_GDP,HDI,Government_Expenditure_on_Education,Global_Climate_Risk_Index,Area_ha,Forest_Area_ha,Forest_Area_Percent,Deforest_Area_ha,Deforest_Percent,Energy_Capita_kWh,Energy_MWh,Renewable_Energy_MWh,Renewable_Energy_Percent
0,Afghanistan,AFG,2001.0,1.069,5.3e-05,2813572000.0,20284307.0,39.823426,0.355,18.405412,20.008285,65286400.0,1208440.0,1.852782,88.092712,0.00729,179.8919,3648983.0,1663936.0,45.6
1,Afghanistan,AFG,2002.0,1.341,6.3e-05,3825701000.0,21378117.0,23.810127,0.383,20.537858,17.59513,65286400.0,1208440.0,1.852782,178.947855,0.014808,155.69543,3328475.0,1258164.0,37.8
2,Afghanistan,AFG,2003.0,1.56,6.9e-05,4520947000.0,22733049.0,22.710864,0.392,17.413352,16.27154,65286400.0,1208440.0,1.852782,244.336255,0.020219,174.55792,3968234.0,1456342.0,36.7
3,Afghanistan,AFG,2004.0,1.237,5.3e-05,5224897000.0,23560654.0,26.22679,0.408,21.735324,18.795908,65286400.0,1208440.0,1.852782,201.322404,0.01666,157.75752,3716870.0,1642857.0,44.2
4,Afghanistan,AFG,2005.0,1.89,7.7e-05,6203257000.0,24404567.0,26.812099,0.417,13.073585,12.682471,65286400.0,1208440.0,1.852782,236.021558,0.019531,205.46812,5014361.0,1699868.0,33.9


In [3]:
# Name of the column the regressor is trained to predict.
TARGET_CORE = "Co2_MtCO2"

# Predictor columns, listed in the order they are selected from the DataFrame
# (and therefore the column order of the feature matrix).
FEATURE_CORE = [
    "Population",
    "GDP",
    "Industry_on_GDP",
    "Government_Expenditure_on_Education",
    "Global_Climate_Risk_Index",
    "HDI",
    "Renewable_Energy_Percent",
    "Deforest_Percent",
    "Energy_Capita_kWh",
]

In [4]:
# Full-dataset feature matrix and target. The per-country split below works on
# `df` directly; these two names are kept so any later cell using them still runs.
X = df[FEATURE_CORE]
y = df[TARGET_CORE]

train_list = []
test_list = []

# Split inside each country so that every country with at least two rows
# contributes rows to both the training set and the test set.
# NOTE(review): shuffle=True picks test rows at random within a country, so
# test years can fall between training years — confirm this is the intended
# evaluation (it is not a "predict future years" hold-out).
for country, group in df.groupby("Country"):
    n_rows = len(group)

    if n_rows < 2:
        # A single row cannot be divided: a fractional test_size of 1.0 is
        # rejected by train_test_split (floats must be strictly between 0 and 1).
        # Keep the lone row for training instead of crashing the whole cell.
        train_list.append(group)
        continue

    if n_rows < 5:
        # Small group: hold out exactly one row. An integer test_size is an
        # absolute row count, which avoids relying on 1 / n_rows rounding.
        test_size = 1
    else:
        # Regular group: hold out 20% of the rows.
        test_size = 0.2

    train_split, test_split = train_test_split(
        group,
        test_size=test_size,
        shuffle=True,
        random_state=42,  # same seed for every country -> reproducible split
    )
    train_list.append(train_split)
    test_list.append(test_split)

# Stitch the per-country pieces back together with a fresh 0..n-1 index.
df_train = pd.concat(train_list).reset_index(drop=True)
df_test = pd.concat(test_list).reset_index(drop=True)

X_train = df_train[FEATURE_CORE]
y_train = df_train[TARGET_CORE]

X_test = df_test[FEATURE_CORE]
y_test = df_test[TARGET_CORE]

In [5]:
# Learn the per-feature min/max from the training rows only, then apply that
# same fitted scaling to both splits so no test-set statistics are used.
scaler_x = MinMaxScaler()
scaler_x.fit(X_train)
X_train_scaled = scaler_x.transform(X_train)
X_test_scaled = scaler_x.transform(X_test)

In [6]:
# Random-forest settings; random_state pins the seed so refits are repeatable.
rf_params = {
    "n_estimators": 500,
    "max_depth": 6,
    "random_state": 42,
}
model_rf = RandomForestRegressor(**rf_params)
model_rf.fit(X_train_scaled, y_train)

# Score the fitted model on the held-out rows.
y_pred = model_rf.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same unit as the target column
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"\n RMSE: {rmse:.4f}, MAE: {mae:.4f},R2: {r2:.4f}")



 RMSE: 75.5493, MAE: 26.2923,R2: 0.9887


In [7]:
# Persist the fitted model and the fitted scaler side by side: inputs must be
# transformed with this same scaler before being passed to the saved model.
MODEL_FILE = "co2_model_rf.joblib"
SCALER_FILE = "scaler_x.joblib"
dump(model_rf, MODEL_FILE)
dump(scaler_x, SCALER_FILE)

['scaler_x.joblib']