In [None]:
import pandas as pd
from src.pipeline.select_features import get_feature_importance
from src.pipeline.pipeline import pipeline
from src.data.get_dataset import get_merged_datasets
# Load the merged dataset provided by the project's data module.
dataframe = get_merged_datasets()
# Drop the columns that should not be used as model features.
excluded_columns = ['year', 'country', 'iso_code']
dataframe.drop(columns=excluded_columns, inplace=True)

## Transform data and perform the train/test split

In [None]:
# Run the project pipeline on the merged dataframe and unpack its five results.
# NOTE(review): presumably train/test features, train/test targets and the
# feature names, in that order — confirm against src.pipeline.pipeline.
(
    x_train,
    x_test,
    y_train,
    y_test,
    feature_list,
) = pipeline(dataframe)

In [None]:
# Compute feature importances on the training data with three estimator types.
# NOTE(review): each result is presumably indexed by feature name and carries a
# single 'Importance' column — confirm against get_feature_importance.
importance_by_model = {
    model_type: get_feature_importance(x_train, y_train, feature_list, type=model_type)
    for model_type in ("Linear", "RandomForest", "Tree")
}
feature_importance_linear = importance_by_model["Linear"]
feature_importance_forest = importance_by_model["RandomForest"]
feature_importance_tree = importance_by_model["Tree"]

# Join the three results on their index, keeping only entries present in all.
index_join = dict(left_index=True, right_index=True, how='inner')
linear_and_tree = pd.merge(
    feature_importance_linear,
    feature_importance_tree,
    suffixes=("_linear", "_tree"),
    **index_join,
)
feature_importance = pd.merge(linear_and_tree, feature_importance_forest, **index_join)
# The forest column does not clash with the suffixed ones, so it keeps its
# original name after the merge; rename it to match the other two.
feature_importance = feature_importance.rename(columns={'Importance': 'Importance_forest'})

In [None]:
# Rebuild the feature list from the merged importance table, in sorted order.
feature_list = feature_importance.index.to_list()
feature_list.sort()

## Scale feature importance

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Only the magnitude of an importance matters here, so take absolute values
# first and then min-max scale every column.
scalar = MinMaxScaler()
feature_importance_abs = feature_importance.abs()
# The scaler returns a bare array; wrap it to restore the row and column labels.
feature_importance_abs_scaled = pd.DataFrame(
    scalar.fit_transform(feature_importance_abs),
    index=feature_importance_abs.index,
    columns=feature_importance_abs.columns,
)
feature_importance_abs_scaled

## Choosing features by scaled absolute importance

In [None]:
import matplotlib.pyplot as plt
# Cut-off applied to the scaled absolute importances.
suggested_cut_off = 0.05

# One group of bars per feature, one bar per importance column.
plot = feature_importance_abs_scaled.plot(kind='bar', figsize=(40, 20))
plot.set_ylabel('Scaled absolute importance')
# axhline spans the full width of the axes, so the cut-off line covers every
# bar, does not stretch the x-axis and does not depend on feature_list.
plot.axhline(y=suggested_cut_off, linewidth=2, color='r')
# Save through the figure that owns these axes rather than the "current" figure.
plot.get_figure().savefig('Feature_selection.png')

Inspect the cut-off value in the saved figure (`Feature_selection.png`). A cut-off of 0.05 seems reasonable.

### Get the set of selected features

In [None]:
# Boolean table: True where a scaled importance reaches the cut-off.
reaches_cut_off = feature_importance_abs_scaled >= suggested_cut_off
feature_index = feature_importance_abs_scaled.index

# Features that pass the cut-off, per importance column.
linear_top = feature_index[reaches_cut_off['Importance_linear']].tolist()
tree_top = feature_index[reaches_cut_off['Importance_tree']].tolist()
forest_top = feature_index[reaches_cut_off['Importance_forest']].tolist()

# Keep a feature if at least one of the three columns passes the cut-off.
top_features = set(linear_top + tree_top + forest_top)
len(top_features)

In [None]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Baseline regressors with default hyper-parameters; a fixed random_state is
# passed to every estimator constructed with one below.
linear_model = LinearRegression()
lasso_model = Lasso(random_state=42)
ridge_model = Ridge(random_state=42)
dec_tree_model = DecisionTreeRegressor(random_state=42)
random_forest_model = RandomForestRegressor(random_state=42)
models = [linear_model, lasso_model, ridge_model, dec_tree_model, random_forest_model]

# Keep only the selected features. top_features is a set, and the iteration
# order of a set of strings can change between kernel restarts (hash
# randomisation). Sorting pins the column order so the seeded models see the
# same design matrix on every run.
selected_columns = sorted(top_features)
x_train = x_train[selected_columns]
x_test = x_test[selected_columns]

# Fit every baseline model on the reduced training set.
for m in models:
    m.fit(x_train, y_train)

In [None]:
from sklearn.metrics import  mean_squared_error, r2_score, mean_absolute_error
from src.models.predict_model import  adjusted_r2
# Score every fitted model on the data it was trained on.
training_predictions = [m.predict(x_train) for m in models]
mean_sq_errors = [mean_squared_error(y_train, y) for y in training_predictions]
r2_scores = [r2_score(y_train, y) for y in training_predictions]
# adjusted_r2 is called with the R2 score, the number of rows and the number of
# feature columns.
adj_r2_scores = [adjusted_r2(score, len(x_train), len(x_train.columns)) for score in r2_scores]
mean_absolute_errors = [mean_absolute_error(y_train, y) for y in training_predictions]

print("Train data scores")
# Plain loop instead of a list comprehension: printing is a side effect, and the
# comprehension left a stray list of None values as the cell output.
for model, adj_r2, mae in zip(models, adj_r2_scores, mean_absolute_errors):
    print(f" {model.__class__.__name__}:, Adjusted R2 score: {adj_r2}, Mean absolute Error: {mae}")

In [None]:
from sklearn.metrics import  mean_squared_error, r2_score, mean_absolute_error
from src.models.predict_model import  adjusted_r2
# Score every fitted model on the held-out test data.
predictions = [m.predict(x_test) for m in models]
mean_sq_errors = [mean_squared_error(y_test, y) for y in predictions]
r2_scores = [r2_score(y_test, y) for y in predictions]
# adjusted_r2 is called with the R2 score, the number of rows and the number of
# feature columns.
adj_r2_scores = [adjusted_r2(score, len(x_test), len(x_test.columns)) for score in r2_scores]
mean_absolute_errors = [mean_absolute_error(y_test, y) for y in predictions]

print("Test data scores")
# Plain loop instead of a list comprehension: printing is a side effect, and the
# comprehension left a stray list of None values as the cell output.
for model, adj_r2, mae in zip(models, adj_r2_scores, mean_absolute_errors):
    print(f" {model.__class__.__name__}:, Adjusted R2 score: {adj_r2}, Mean absolute Error: {mae}")

In [None]:
from sklearn.model_selection import GridSearchCV

# One record per model: a label plus the best parameters found by the search.
best_params_records = []

# Linear regression: the only option searched is whether to fit an intercept.
grid = {'fit_intercept': [True, False]}
CV_lin = GridSearchCV(estimator=linear_model, param_grid=grid, cv=5, scoring='neg_median_absolute_error', n_jobs=-1)
CV_lin.fit(x_train, y_train)
best_params_records.append({'model': 'linear regression', **CV_lin.best_params_})

# Lasso and ridge are both searched over this same grid.
grid = {'alpha': [0.1, 2], 'max_iter': [100, 2000], 'tol': [0.01, 0.0001]}
CV_lasso = GridSearchCV(estimator=lasso_model, param_grid=grid, cv=5, scoring='neg_median_absolute_error', n_jobs=-1)
CV_lasso.fit(x_train, y_train)
# The label was previously misspelled 'lesso', which showed up in the results table.
best_params_records.append({'model': 'lasso', **CV_lasso.best_params_})
CV_ridge = GridSearchCV(estimator=ridge_model, param_grid=grid, cv=5, scoring='neg_median_absolute_error', n_jobs=-1)
CV_ridge.fit(x_train, y_train)
best_params_records.append({'model': 'ridge', **CV_ridge.best_params_})

In [None]:
# Random-forest search space: 4 * 2 * 4 * 3 * 3 * 2 = 576 combinations, each
# cross-validated on 5 folds, so this cell is slow to run.
grid = {
    'n_estimators': [100, 200, 500, 1000],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 20, 50, 100],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
}

CV_forest = GridSearchCV(
    estimator=random_forest_model,
    param_grid=grid,
    cv=5,
    scoring='neg_median_absolute_error',
    n_jobs=-1,
)
CV_forest.fit(x_train, y_train)

# Record the winning parameters next to the model label.
forest_record = {'model': 'Forest'}
forest_record.update(CV_forest.best_params_)
best_params_records.append(forest_record)

In [None]:
# Decision-tree search space.
# max_features=None replaces the former 'auto' option. 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3 for DecisionTreeRegressor; None selects
# the same behaviour (consider all features) on old and new versions alike.
grid = {'max_features': [None, 'sqrt', 'log2'],
               'max_depth': [10,20,50,100],
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1, 2, 4]}

CV_tree = GridSearchCV(estimator=dec_tree_model, param_grid=grid, cv=5, scoring='neg_median_absolute_error', n_jobs=-1)
CV_tree.fit(x_train, y_train)
best_params_records.append({'model': 'Tree', **CV_tree.best_params_})

# Collect the best parameters of every searched model into one table.
best_hyper_params = pd.DataFrame.from_records(best_params_records)
best_hyper_params

In [None]:
# Seems like random forest performs the best, so inspect its test predictions.
# This was previously the literal list [4], which is not a prediction array and
# makes the scatter plot fail because its length differs from y_test.
random_forest_preds = random_forest_model.predict(x_test)

import matplotlib.pyplot as plt

# Plotting predicted vs actual values
plt.figure(figsize=(6, 6))
plt.scatter(y_test, random_forest_preds, alpha=0.5)
plt.xlabel('Actual CO2 Values')
plt.ylabel('Predicted CO2 Values')
plt.title('Predicted vs Actual CO2 Values')
plt.show()

In [None]:
# Residual analysis for the random forest on the test set.
actual_values = y_test
predicted_values = random_forest_preds

# Residual = actual minus predicted.
residuals = actual_values - predicted_values

# Scatter the residuals against the predictions; the dotted red line marks a
# residual of zero.
residual_fig, residual_ax = plt.subplots(figsize=(20, 12))
residual_ax.scatter(predicted_values, residuals, alpha=0.5)
residual_ax.axhline(y=0, color='r', linestyle=':')
residual_ax.set_title('Residuals vs Predicted Values')
residual_ax.set_xlabel('Predicted CO2 Values')
residual_ax.set_ylabel('Residuals')
plt.show()

## Ensemble Modelling via Voting Regressor

In [None]:
from sklearn.ensemble import VotingRegressor
