# Tasks

1. Using the `codecarbon` library, track the carbon emissions of the following exploratory data analysis.
2. Compare the emissions of the different models at the bottom of the notebook.
3. Discuss possible improvements to reduce the carbon footprint of the models.

In [None]:
#!pip install numpy pandas seaborn plotly matplotlib scikit-learn tensorflow nbformat

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Configure the seaborn theme exactly once.
# A trailing bare `sns.set()` must not follow this call: `set()` is an alias of
# `set_theme()` with default arguments, so it would reset the palette to the
# library default and discard the colour-blind-friendly palette chosen here.
sns.set_theme(style='darkgrid', palette='colorblind')

In [None]:
# Display settings: never truncate columns, and cap printed rows at 21.
pd.options.display.max_columns = None
pd.options.display.max_rows = 21


In [None]:
df = pd.read_csv('https://github.com/tjisousa/green_it_practical/raw/main/notebooks/global-data-on-sustainable-energy.csv')
df

In [None]:
df.info()

In [None]:
df.describe().T

In [None]:
# Replace awkward raw headers with readable, unit-annotated column names.
# Columns that are not listed keep their original name.
df = df.rename(
    columns={
        "Density\\n(P/Km2)": "Density (P/km2)",
        "Value_co2_emissions_kt_by_country": "CO2 emissions value by country (kT)",
        "Access to clean fuels for cooking": "Access to clean fuels for cooking (% of population)",
        "gdp_growth": "GDP growth",
        "gdp_per_capita": "GDP per capita",
        "Renewable-electricity-generating-capacity-per-capita": "Renewable electricity Generating Capacity per capita",
    }
)


In [None]:
df

In [None]:
df.dtypes

In [None]:
# Strip thousands separators from the density column and convert it to a
# numeric dtype.
# - `astype(str)` keeps this cell safe to re-run: once the column is numeric,
#   the `.str` accessor would otherwise raise AttributeError.
# - `regex=False` treats the comma as a literal, independent of the pandas
#   version's default for `str.replace`.
df['Density (P/km2)'] = pd.to_numeric(
    df['Density (P/km2)'].astype(str).str.replace(',', '', regex=False)
)
df.dtypes

In [None]:
df.shape

In [None]:
# Keep only the first row for every (Entity, Year) pair, then renumber the
# index from zero so it is contiguous again.
df = (
    df.drop_duplicates(subset=['Entity', 'Year'], keep='first')
      .reset_index(drop=True)
      .copy()
)
df

In [None]:
# Remove the coordinate columns from the working frame.
df = df.drop(['Latitude', 'Longitude'], axis='columns').copy()

In [None]:
df

In [None]:
df_subset = df[['Entity', 'Year', 'Access to electricity (% of population)', 'Access to clean fuels for cooking (% of population)']]

In [None]:
df_subset

In [None]:
def plot_map(df, column, title):
    """
    Build an animated choropleth world map coloured by one column of `df`.

    Countries are located through the 'Entity' column (matched as country
    names) and the 'Year' column drives the animation frames. The colour
    scale uses the 'RdYlGn' palette and is fixed to the range 0-100, so the
    function is suited to percentage-like metrics.

    Parameters:
        df (DataFrame): Data containing 'Entity', 'Year' and `column`.
        column (str): Name of the column used as the colour metric. It is
            also used as the colour-bar title.
        title (str): Title text of the figure.

    Returns:
        fig: The Plotly figure object representing the choropleth map.
    """

    # Base figure: one choropleth frame per 'Year', colour range pinned to 0-100
    # so colours are comparable across frames.
    fig = px.choropleth(
        df,
        locations = 'Entity',
        locationmode = 'country names',
        color = column,
        hover_name = 'Entity',
        color_continuous_scale = 'RdYlGn',
        animation_frame = 'Year',
        range_color = [0, 100])

    # Style the base map (coastlines, land, country borders, ocean).
    fig.update_geos(
        showcoastlines = True,
        coastlinecolor = "Black",
        showland = True,
        landcolor = "white",
        showcountries = True,
        showocean = True,
        oceancolor = "LightBlue")

    # Figure-level layout: title, size, interaction modes, colour bar and
    # animation controls.
    # NOTE(review): the `geo` dict below repeats most of the update_geos()
    # settings but sets showcoastlines = False, whereas update_geos() set it to
    # True. Presumably this later call wins - confirm which one is intended.
    fig.update_layout(
        title_text = title,
        geo = dict(
            showframe = False,
            showcoastlines = False,
            projection_type = 'equirectangular',
            showland = True,
            landcolor = "white",
            showcountries = True,
            showocean = True,
            oceancolor = "LightBlue"),
        width = 1000,
        height = 850,
        dragmode = 'pan',
        hovermode = 'closest',
        coloraxis_colorbar = dict(
            title = column,
            title_font_size = 14,
            title_side = 'right',
            lenmode = 'pixels',
            len = 300,
            thicknessmode = 'pixels',
            thickness = 15),
        # NOTE(review): none of the three buttons below defines "args", so
        # nothing here distinguishes Play from Pause or Stop. How these entries
        # combine with the animation controls Plotly Express may already have
        # added for `animation_frame` should be verified in the rendered figure.
        updatemenus = [
            {"type": "buttons", "showactive": False, "x": 0.1, "y": 0.9, "buttons": [{"label": "Play", "method": "animate"}]},
            {"type": "buttons", "showactive": False, "x": 0.18, "y": 0.9, "buttons": [{"label": "Pause", "method": "animate"}]},
            {"type": "buttons", "showactive": False, "x": 0.26, "y": 0.9, "buttons": [{"label": "Stop", "method": "animate"}]}],
        sliders = [{"yanchor": "top", "xanchor": "left", "currentvalue": {"font": {"size": 20}}, "steps": []}])

    # Build one "animate" step per distinct year, in order of first appearance
    # in df['Year'].
    slider_steps = []

    for year in df['Year'].unique():
        step = {
            "args": [
                [year],
                {"frame": {"duration": 300, "redraw": False}, "mode": "immediate", "transition": {"duration": 0}}],
            "label": str(year),
            "method": "animate"}
        slider_steps.append(step)

    # NOTE(review): despite the variable name, the steps are stored under a
    # 'steps' key inside the second "args" element of the first button of the
    # first updatemenu, not in fig.layout.sliders[...].steps - confirm this is
    # intended. This line also requires buttons[0].args to already exist; this
    # function never sets it, so presumably it comes from the controls created
    # by px.choropleth - verify.
    fig.layout.updatemenus[0].buttons[0].args[1]['steps'] = slider_steps

    return fig

In [None]:
plot_map(df_subset, 'Access to electricity (% of population)', 'Access to Electricity (% of Population) Over Years')

In [None]:


plot_map(df_subset, 'Access to clean fuels for cooking (% of population)', 'Access to Clean Fuels for Cooking (% of Population) Over Years')

In [None]:
average_co2_emission_by_country = df.groupby('Entity')['CO2 emissions value by country (kT)'].mean()

In [None]:
top_5_countries = average_co2_emission_by_country.nlargest(5)

In [None]:
# Vertical bar chart of the five largest per-country mean CO2 emissions.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_5_countries.index, y=top_5_countries.values, ax=ax)
ax.set(
    xlabel='Country',
    ylabel='Average CO2 Emissions (kT x 1e6)',
    title='Top 5 Countries with Highest Average CO2 Emissions',
)

# Tilt the country names so longer labels stay readable.
plt.xticks(rotation=45, ha='center')

fig.tight_layout()
plt.show()


In [None]:
top_5_lowest_emissions = average_co2_emission_by_country.nsmallest(5)

In [None]:
# Horizontal bar chart of the five smallest per-country mean CO2 emissions.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_5_lowest_emissions.values, y=top_5_lowest_emissions.index, ax=ax)
ax.set(
    xlabel='Average CO2 Emissions (kT)',
    ylabel='Country',
    title='Top 5 Countries with the Lowest CO2 Emissions',
)

plt.show()

In [None]:
# Mean CO2 emissions over all countries for each year, returned as a flat
# two-column frame ('Year' plus the emissions column).
average_co2_by_year = (
    df.groupby('Year', as_index=False)['CO2 emissions value by country (kT)'].mean()
)

In [None]:
# Line chart of the all-country yearly mean CO2 emissions.
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(
    data=average_co2_by_year,
    x='Year',
    y='CO2 emissions value by country (kT)',
    color='black',
    ax=ax,
)
ax.set(
    title='Average Growth of CO2 Emissions Over the Years',
    xlabel='Year',
    ylabel='Average CO2 Emissions (kT)',
)

# One tick per year in the aggregated data.
plt.xticks(average_co2_by_year['Year'], rotation=0, ha='center')
# 2020 doesn't contain data and will be predicted later, so stop at 2019.
ax.set_xlim(2000, 2019)

fig.tight_layout()
plt.show()

In [None]:
china_data = df[df['Entity'] == 'China']
tuvalu_data = df[df['Entity'] == 'Tuvalu']

In [None]:
# China's yearly CO2 emissions against the all-country yearly mean.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    data=china_data,
    x='Year',
    y='CO2 emissions value by country (kT)',
    label='China',
    ax=ax,
)
sns.lineplot(
    data=average_co2_by_year,
    x='Year',
    y='CO2 emissions value by country (kT)',
    label='World Average',
    color='black',
    ax=ax,
)
ax.set(
    xlabel='Year',
    ylabel='CO2 Emissions (kT x 1e7)',
    title='CO2 Emissions Over the Years: China vs. World Average',
)

# One tick per year in the aggregated data; the x-range stops at 2019.
plt.xticks(average_co2_by_year['Year'], rotation=0, ha='center')
ax.set_xlim(2000, 2019)

fig.tight_layout()
plt.show()


In [None]:
# Tuvalu's yearly CO2 emissions against the all-country yearly mean.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    data=tuvalu_data,
    x='Year',
    y='CO2 emissions value by country (kT)',
    label='Tuvalu',
    ax=ax,
)
sns.lineplot(
    data=average_co2_by_year,
    x='Year',
    y='CO2 emissions value by country (kT)',
    label='World Average',
    color='black',
    ax=ax,
)
ax.set(
    xlabel='Year',
    ylabel='CO2 Emissions (kT)',
    title='CO2 Emissions Over the Years: Tuvalu vs. World Average',
)

# One tick per year in the aggregated data; the x-range stops at 2019.
plt.xticks(average_co2_by_year['Year'], rotation=0, ha='center')
ax.set_xlim(2000, 2019)

fig.tight_layout()
plt.show()


## Models

### Tasks
1. Read the **CodeCarbon** [documentation](https://mlco2.github.io/codecarbon/)
2. Start to estimate your impact ([help link](https://github.com/mlco2/codecarbon/tree/master?tab=readme-ov-file#start-to-estimate-your-impact-))
3. Instantiate an `EmissionsTracker` object and call its `start()` method to begin tracking and estimating the CO₂ emissions of the compute section **of each model**.
4. Fit each model, then call the tracker's `stop()` method to end the tracking.

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

In [None]:
target = 'Primary energy consumption per capita (kWh/person)'

In [None]:
features = [
    'Access to electricity (% of population)',
    'GDP per capita',
    'Financial flows to developing countries (US $)',
    'Renewable electricity Generating Capacity per capita',
    'Electricity from fossil fuels (TWh)']

In [None]:
ml_subset = df.dropna(subset=[target])

In [None]:
x = ml_subset[features]
y = ml_subset[target]

In [None]:
imputer = SimpleImputer(strategy='mean')
x_imputed = imputer.fit_transform(x)

In [None]:
# Split the raw features first, then fit the imputer on the training rows only.
# Fitting the imputer on the full data set before splitting lets test-set
# statistics (the column means) leak into the training features.
# With the same random_state and row count, the rows assigned to train and test
# are the same as when splitting the pre-imputed array.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 42)
x_train = imputer.fit_transform(x_train_raw)  # means learned from training rows only
x_test = imputer.transform(x_test_raw)        # test rows reuse the training means

In [None]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

Random Forest

In [None]:
# Exhaustive 5-fold grid search over 3 x 3 x 3 = 27 Random Forest settings,
# scored by negated mean squared error.
rf_param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
}
rf_model = RandomForestRegressor(random_state=42)
rf_grid_search = GridSearchCV(
    rf_model,
    rf_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
rf_grid_search.fit(x_train_scaled, y_train)

# Hyper-parameter combination with the best mean cross-validated score.
best_rf_params = rf_grid_search.best_params_

Gradient Boosting

In [None]:
# Exhaustive 5-fold grid search over 3 x 3 x 3 = 27 Gradient Boosting settings,
# scored by negated mean squared error.
gb_param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
}
gb_model = GradientBoostingRegressor(random_state=42)
gb_grid_search = GridSearchCV(
    gb_model,
    gb_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
gb_grid_search.fit(x_train_scaled, y_train)

# Hyper-parameter combination with the best mean cross-validated score.
best_gb_params = gb_grid_search.best_params_

Initializing Models

In [None]:
random_forest_model = RandomForestRegressor(**best_rf_params, random_state=42)
gradient_boosting_model = GradientBoostingRegressor(**best_gb_params, random_state=42)
linear_regression_model = LinearRegression()

Training

In [None]:
random_forest_model.fit(x_train_scaled, y_train)
gradient_boosting_model.fit(x_train_scaled, y_train)
linear_regression_model.fit(x_train_scaled, y_train)

Predictions and Evaluation

In [None]:
rforest_predictions = random_forest_model.predict(x_test_scaled)
linreg_predictions = linear_regression_model.predict(x_test_scaled)
gradboost_predictions = gradient_boosting_model.predict(x_test_scaled)

In [None]:
# Test-set mean squared error for each model (Random Forest, Linear Regression,
# Gradient Boosting, in that order).
rf_mse, lr_mse, gb_mse = [
    mean_squared_error(y_test, model_predictions)
    for model_predictions in (rforest_predictions, linreg_predictions, gradboost_predictions)
]

In [None]:
# Test-set R-squared for each model (Random Forest, Linear Regression,
# Gradient Boosting, in that order).
rf_r2, lr_r2, gb_r2 = [
    r2_score(y_test, model_predictions)
    for model_predictions in (rforest_predictions, linreg_predictions, gradboost_predictions)
]

In [None]:
# Summary table: one row per model with its test-set MSE and R-squared.
results = pd.DataFrame(
    [
        ('Random Forest', rf_mse, rf_r2),
        ('Linear Regression', lr_mse, lr_r2),
        ('Gradient Boosting', gb_mse, gb_r2),
    ],
    columns=['Model', 'MSE', 'R-squared'],
)

In [None]:
results

### Question

How can we effectively evaluate and balance the environmental impact, in terms of CO₂ emissions, of the different models against their performance improvements, especially when the increase in computational resources leads to only minor enhancements in model efficiency or accuracy?