In [1]:
from src.data.get_dataset import get_merged_datasets
dataframe = get_merged_datasets()

dataframe.shape

(3104, 204)

In [2]:
numeric_cols = dataframe.select_dtypes(include=['number']).columns.difference(['year']).tolist() #we don't want year to be a feature in our prediction model
#we want to limit our feature set to columns without any data leaks, since in real prediction we wouldn't know those values.
data_leak_list = ["co2", "ghg", "greenhouse_gas", "nitrous_oxide", "methane", "ch4", "n2o", "change"]

def is_data_leak(s):
    for data_leak in data_leak_list: 
        if data_leak in s:
            return True
        
    return False
feature_list = []    
for x in numeric_cols: 
    is_leak = is_data_leak(x)
    if not is_leak: 
        feature_list.append(x)
        
print(feature_list)

['agricultural_land_area_in_hectares', 'biofuel_cons_per_capita', 'biofuel_consumption', 'biofuel_elec_per_capita', 'biofuel_electricity', 'biofuel_share_elec', 'biofuel_share_energy', 'carbon_intensity_elec', 'coal_cons_per_capita', 'coal_consumption', 'coal_elec_per_capita', 'coal_electricity', 'coal_prod_per_capita', 'coal_production', 'coal_share_elec', 'coal_share_energy', 'cropland_area_in_hectares', 'electricity_demand', 'electricity_generation', 'electricity_share_energy', 'energy_per_capita', 'energy_per_gdp', 'forest_land_area_in_hectares', 'fossil_elec_per_capita', 'fossil_electricity', 'fossil_energy_per_capita', 'fossil_fuel_consumption', 'fossil_share_elec', 'fossil_share_energy', 'gas_consumption', 'gas_elec_per_capita', 'gas_electricity', 'gas_energy_per_capita', 'gas_prod_per_capita', 'gas_production', 'gas_share_elec', 'gas_share_energy', 'gdp', 'hydro_consumption', 'hydro_elec_per_capita', 'hydro_electricity', 'hydro_energy_per_capita', 'hydro_share_elec', 'hydro_sha

In [6]:
#checking correlations between various features and co2 data
dataframe[feature_list + ['co2']].corr()['co2'].sort_values(ascending=False) #Reduced feature list to 97, need to check if any of these are unrelated. 

co2                           1.000000
fossil_fuel_consumption       0.992006
fossil_electricity            0.990423
primary_energy_consumption    0.987311
electricity_generation        0.979749
                                ...   
hydro_share_elec             -0.112958
renewables_share_elec        -0.115668
oil_share_elec               -0.146162
nuclear_share_energy         -0.164512
oil_share_energy             -0.173952
Name: co2, Length: 98, dtype: float64

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

X = dataframe[feature_list]
Y = dataframe['co2']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

simple_imputer = SimpleImputer(strategy='mean')

#fit imputer on training set, and fit both train and test
simple_imputer.fit(X_train)

X_train = simple_imputer.transform(X_train)
X_test = simple_imputer.transform(X_test)

scalar = StandardScaler() #RobustScaler(with_scaling=True, with_centering=True) I tried the robust scalar to reduce outliers, it didn't make a difference. 
scalar.fit(X_train)
X_train = scalar.transform(X_train)
X_test = scalar.transform(X_test)

In [8]:
# Let's train a basic model for testing
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

linear_model = LinearRegression()
ridge_model = Ridge(random_state=42)
lasso_model = Lasso(random_state=42, tol=0.001) #reducing the tolerance so the model will converge
sgd_model = SGDRegressor(random_state=42)
decision_tree_model = DecisionTreeRegressor(random_state=42)
random_forest_model = RandomForestRegressor(random_state=42)

models = [linear_model, ridge_model, lasso_model, decision_tree_model, random_forest_model, sgd_model]
[x.fit(X_train, y_train) for x in models ] #train all models

[LinearRegression(),
 Ridge(random_state=42),
 Lasso(random_state=42, tol=0.001),
 DecisionTreeRegressor(random_state=42),
 RandomForestRegressor(random_state=42),
 SGDRegressor(random_state=42)]

In [9]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

training_predictions = [m.predict(X_train) for m in models]
mean_sq_errors = [mean_squared_error(y_train, y) for y in training_predictions]
r2_scores = [r2_score(y_train, y) for y in training_predictions]
adj_r2_scores = [((1 - score)*(len(y_train)-1)/(len(y_train)-X_test.shape[1]-1)) for score in r2_scores]
print("Train data scores")
[print(f" {models[i].__class__.__name__}: Mean Squared Error: {mean_sq_errors[i]}, R2_score {r2_scores[i]}, Adjusted R2 score: {adj_r2_scores[i]}") for i in range(0, len(models))]


Train data scores
 LinearRegression: Mean Squared Error: 295.94390614645977, R2_score 0.9994201275890725, Adjusted R2 score: 0.0006034563203027438
 Ridge: Mean Squared Error: 336.138434767967, R2_score 0.9993413704403906, Adjusted R2 score: 0.0006854165899163448
 Lasso: Mean Squared Error: 500.6714989649896, R2_score 0.9990189843981993, Adjusted R2 score: 0.0010209143495469438
 DecisionTreeRegressor: Mean Squared Error: 3.723102590841213e-36, R2_score 1.0, Adjusted R2 score: 0.0
 RandomForestRegressor: Mean Squared Error: 194.2770771175954, R2_score 0.9996193335468094, Adjusted R2 score: 0.00039614848503941095
 SGDRegressor: Mean Squared Error: 3367.786408166783, R2_score 0.9934011602082924, Adjusted R2 score: 0.00686722027799509


[None, None, None, None, None, None]

In [10]:

predictions = [m.predict(X_test) for m in models]
mean_sq_errors = [mean_squared_error(y_test, y_) for y_ in predictions]
r2_scores = [r2_score(y_test, y_) for y_ in predictions]
adj_r2_scores = [((1 - score)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)) for score in r2_scores]
print("Test data scores")
[print(f" {models[i].__class__.__name__}: Mean Squared Error: {mean_sq_errors[i]}, R2_score {r2_scores[i]}, Adjusted R2 score: {adj_r2_scores[i]}") for i in range(0, len(models))]


Test data scores
 LinearRegression: Mean Squared Error: 397.98040874000674, R2_score 0.9996263076353629, Adjusted R2 score: 0.0004430005087476551
 Ridge: Mean Squared Error: 433.076847900092, R2_score 0.9995933530701329, Adjusted R2 score: 0.000482067106152248
 Lasso: Mean Squared Error: 539.0103644798452, R2_score 0.999493884489681, Adjusted R2 score: 0.0005999839701678191
 DecisionTreeRegressor: Mean Squared Error: 6154.5432769806785, R2_score 0.9942210576703564, Adjusted R2 score: 0.006850753813344229
 RandomForestRegressor: Mean Squared Error: 739.8698048332967, R2_score 0.9993052831475622, Adjusted R2 score: 0.0008235649111117639
 SGDRegressor: Mean Squared Error: 4798.813677777677, R2_score 0.9954940494775126, Adjusted R2 score: 0.00534166218726992


[None, None, None, None, None, None]

In [None]:
#It seems lasso performed better than Random Forest Regression or gradient decent, so we might want to go with that going forward.
lasso_predictions = predictions[2]

In [None]:
#Seems like Lasso performs the best which is to be expected.

import matplotlib.pyplot as plt

# Plotting predicted vs actual values
plt.figure(figsize=(6, 6))
plt.scatter(y_test, lasso_predictions, alpha=0.5)
plt.xlabel('Actual CO2 Values')
plt.ylabel('Predicted CO2 Values')
plt.title('Predicted vs Actual CO2 Values')
plt.show()

In [None]:
# Assuming you have your actual and predicted values as follows:
actual_values = y_test
predicted_values = lasso_predictions

# Calculate residuals
residuals = actual_values - predicted_values

# Plotting the residuals
plt.figure(figsize=(10, 6))
plt.scatter(predicted_values, residuals, alpha=0.5)
plt.title('Residuals vs Predicted Values')
plt.xlabel('Predicted CO2 Values')
plt.ylabel('Residuals')
plt.axhline(y=0, color='r', linestyle='--')  # Adds a horizontal line at zero
plt.show()