# Energy Dataset

## Importing Dataset

Importing the dataset from the Github repository of the project

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the raw energy dataset (CSV) in the project's GitHub repository.
url = (
    'https://github.com/up841068/energy-economic-growth/raw/main/'
    'raw_data/renewable-energy-data-scrapping.csv'
)
# Download and parse the CSV into a DataFrame.
df_energy = pd.read_csv(url)

In [3]:
df_energy.tail(3)

Unnamed: 0,country,year,iso_code,population,gdp,biofuel_cons_change_pct,biofuel_cons_change_twh,biofuel_cons_per_capita,biofuel_consumption,biofuel_elec_per_capita,...,solar_share_elec,solar_share_energy,wind_cons_change_pct,wind_cons_change_twh,wind_consumption,wind_elec_per_capita,wind_electricity,wind_energy_per_capita,wind_share_elec,wind_share_energy
21887,Zimbabwe,2019,ZWE,15354606.0,,,,,,24.748,...,0.364,,,,,0.0,0.0,,0.0,
21888,Zimbabwe,2020,ZWE,15669663.0,,,,,,22.336,...,0.395,,,,,0.0,0.0,,0.0,
21889,Zimbabwe,2021,ZWE,15993525.0,,,,,,23.76,...,0.498,,,,,0.0,0.0,,0.0,


## Filtering the rows and columns

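Selecting the identifier columns (`country`, `year`, `iso_code`) and the energy indicators used in the analysis: renewable, fossil and nuclear electricity, gas/oil production, greenhouse gas emissions and net electricity imports.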

In [4]:
# Columns retained for the analysis: row identifiers followed by the energy indicators.
columns_to_keep = [
    'country', 'year', 'iso_code',
    'biofuel_electricity', 'hydro_electricity', 'solar_electricity', 'wind_electricity',
    'coal_electricity', 'fossil_electricity', 'gas_production', 'greenhouse_gas_emissions',
    'low_carbon_electricity', 'net_elec_imports', 'nuclear_electricity', 'oil_production',
]
# Every column of the raw frame that is not listed above gets dropped.
columns_to_drop = [name for name in df_energy.columns if name not in columns_to_keep]
df_energy_filtered1 = df_energy.drop(columns=columns_to_drop)

In [5]:
df_energy_filtered1.head(3)

Unnamed: 0,country,year,iso_code,biofuel_electricity,coal_electricity,fossil_electricity,gas_production,greenhouse_gas_emissions,hydro_electricity,low_carbon_electricity,net_elec_imports,nuclear_electricity,oil_production,solar_electricity,wind_electricity
0,Afghanistan,1900,AFG,,,,0.0,,,,,,,,
1,Afghanistan,1901,AFG,,,,0.0,,,,,,,,
2,Afghanistan,1902,AFG,,,,0.0,,,,,,,,


## Cleaning rows from years previous to 2000

In [6]:
# Keep only the observations from the year 2000 onwards. reset_index() is called
# without drop=True, so the previous row labels are preserved in an 'index' column.
df_energy_filtered2 = df_energy_filtered1.loc[df_energy_filtered1['year'] > 1999].reset_index()

In [7]:
df_energy_filtered2.shape

(6457, 16)

## Cleaning rows related to regions (not countries)

Removing the data points related to regions, as we intend to do the analysis on countries

In [8]:
# The notebook assumes that every country has an 'iso_code' while aggregated regions
# do not, so rows with a missing 'iso_code' are discarded.
df_energy_filtered_countries_only = df_energy_filtered2[df_energy_filtered2['iso_code'].notna()]

In [9]:
df_energy_filtered_countries_only.shape

(4814, 16)

## Creating column with total production of energy

**Note:** this step is not used for now. It was meant as a fallback in case the individual energy sources, taken separately, contained too many zeros/NaNs.

In [10]:
# df_energy_filtered_countries_only.loc[:, 'total_renewable_electricity'] = df_energy_filtered_countries_only[['biofuel_electricity', 'hydro_electricity', 'other_renewable_exc_biofuel_electricity', 'solar_electricity', 'wind_electricity']].sum(axis=1)

In [11]:
# df_energy_filtered_countries_only.head()

In [12]:
# count_sum_equals_zero = (df_energy_filtered_countries_only.total_renewable_electricity == 0).sum()/len(df_energy_filtered)
# count_sum_equals_zero

## Analyzing missing values

What fraction of the values is NaN for each feature?

In [13]:
df_energy_filtered_countries_only.isnull().sum()/len(df_energy_filtered_countries_only)

index                       0.000000
country                     0.000000
year                        0.000000
iso_code                    0.000000
biofuel_electricity         0.047570
coal_electricity            0.043000
fossil_electricity          0.020980
gas_production              0.190694
greenhouse_gas_emissions    0.020980
hydro_electricity           0.029290
low_carbon_electricity      0.020773
net_elec_imports            0.020980
nuclear_electricity         0.021811
oil_production              0.190694
solar_electricity           0.027005
wind_electricity            0.029082
dtype: float64

In [14]:
## creates a dictionary with all features and the respective amount of Zeros for each
# countries = df_energy_filtered.country.unique()
# missing_values_dict = {}
# for country in countries:
#     percentage_missing_values = ((df_energy_filtered.country == country) & (df_energy_filtered.total_renewable_electricity == 0)).sum()\
#     /len(df_energy_filtered[df_energy_filtered.country == country])
#     missing_values_dict[country] = percentage_missing_values
# missing_values_dict

# Economic Growth Dataset

## Importing Dataset

Importing the dataset from the Github repository of the project

In [15]:
# Location of the World Development Indicators workbook in the project's GitHub repository.
url = (
    'https://github.com/up841068/energy-economic-growth/raw/main/'
    'raw_data/World_Development_Indicators.xlsx'
)
# Download and parse the Excel file into a DataFrame.
df_gdp = pd.read_excel(url)

In [16]:
df_gdp.head(3)

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,1990 [YR1990],2000 [YR2000],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016],...,1969 [YR1969],1968 [YR1968],1967 [YR1967],1966 [YR1966],1965 [YR1965],1964 [YR1964],1963 [YR1963],1962 [YR1962],1961 [YR1961],1960 [YR1960]
0,GDP per capita (constant 2015 US$),NY.GDP.PCAP.KD,Brazil,BRA,6086.08487,6745.865881,9216.14336,9183.470768,8783.225984,8426.853352,...,3304.860245,3095.225632,2892.687723,2850.904774,2746.261344,2758.866607,2746.517052,2811.630814,2717.004894,2578.432125
1,GDP per capita (constant 2015 US$),NY.GDP.PCAP.KD,Afghanistan,AFG,..,..,608.386715,602.516979,592.476537,590.427739,...,..,..,..,..,..,..,..,..,..,..
2,GDP per capita (constant 2015 US$),NY.GDP.PCAP.KD,Albania,ALB,1606.296047,1960.881946,3780.698202,3855.759734,3952.802538,4090.371657,...,..,..,..,..,..,..,..,..,..,..


## Filtering the rows and columns

In [17]:
# The series name/code columns are removed; only the country identifiers and the
# per-year value columns remain.
df_gdp_filtered = df_gdp.drop(['Series Name', 'Series Code'], axis=1)

In [18]:
df_gdp_filtered.shape

(266, 65)

## Flattening the years to be in one column (instead of one column per year)

Reshaping the table from wide to long format: each year column becomes a separate row for its country.<br>
The long format is what we will merge with the Energy dataset.

In [19]:
# Wide -> long: one row per (country, year column), with the cell value stored in 'GDP_per_capita'.
df_gdp_filtered_flattened = df_gdp_filtered.melt(
    id_vars=['Country Name', 'Country Code'],
    var_name='Year',
    value_name='GDP_per_capita',
)

In [20]:
df_gdp_filtered_flattened.head(3)

Unnamed: 0,Country Name,Country Code,Year,GDP_per_capita
0,Brazil,BRA,1990 [YR1990],6086.08487
1,Afghanistan,AFG,1990 [YR1990],..
2,Albania,ALB,1990 [YR1990],1606.296047


## Turning 'Year' column into an int

Removing the brackets string in the Year column and turning it into an integer

In [21]:
# Pull the first four-digit run out of labels such as '1990 [YR1990]' and store it as an integer.
year_labels = df_gdp_filtered_flattened['Year']
df_gdp_filtered_flattened['Year'] = year_labels.str.extract(r'(\d{4})', expand=False).astype(int)

In [22]:
df_gdp_filtered_flattened.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16758 entries, 0 to 16757
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Country Name    16758 non-null  object
 1   Country Code    16758 non-null  object
 2   Year            16758 non-null  int64 
 3   GDP_per_capita  16758 non-null  object
dtypes: int64(1), object(3)
memory usage: 523.8+ KB


Renaming the columns so they match the ones in the Energy dataset, easing the merge

In [23]:
# Align the merge-key column names with the ones used by the Energy dataset.
df_gdp_filtered_flattened = df_gdp_filtered_flattened.rename(
    columns={"Year": "year", "Country Code": "iso_code"}
)

# Merging the features and target datasets

In [24]:
# Left join: every energy row is kept; GDP is attached where (year, iso_code) matches
# and left as NaN otherwise.
full_df = df_energy_filtered_countries_only.merge(
    df_gdp_filtered_flattened,
    how='left',
    on=['year', 'iso_code'],
)

In [25]:
# Remove the leftover 'index' helper column and the GDP dataset's 'Country Name' column.
full_df = full_df.drop(columns=['index', 'Country Name'])

In [26]:
full_df.sample(8)

Unnamed: 0,country,year,iso_code,biofuel_electricity,coal_electricity,fossil_electricity,gas_production,greenhouse_gas_emissions,hydro_electricity,low_carbon_electricity,net_elec_imports,nuclear_electricity,oil_production,solar_electricity,wind_electricity,GDP_per_capita
542,Bosnia and Herzegovina,2015,BIH,0.0,9.36,9.47,0.0,7.89,5.49,5.5,-2.13,0.0,0.0,0.01,0.0,4599.901029
1571,French Polynesia,2005,PYF,0.0,0.0,0.42,0.0,0.29,0.18,0.18,0.0,0.0,0.0,0.0,0.0,23276.11317
2223,Japan,2014,JPN,22.86,341.57,892.18,33.156,583.07,83.16,136.53,0.0,0.0,8.097,22.95,5.22,34386.905855
3516,Puerto Rico,2013,PRI,0.0,0.0,20.2,0.0,14.14,0.08,0.29,0.0,0.0,0.0,0.04,0.17,29426.157603
4229,Taiwan,2007,TWN,1.9,129.6,191.89,3.936,141.38,4.42,47.3,0.0,40.54,0.219,0.0,0.44,
3706,Saint Pierre and Miquelon,2004,SPM,0.0,0.0,0.05,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
703,Burundi,2021,BDI,0.01,0.0,0.12,,0.09,0.22,0.24,0.1,0.0,,0.01,0.0,261.01938
3394,Paraguay,2003,PRY,0.0,0.0,0.02,0.0,1.24,51.25,51.25,-45.18,0.0,0.0,0.0,0.0,4015.500749


# Treating the raw dataset

## Removing the countries with 'NaN's in 'GDP_per_capita' (target)

In [27]:
# Names of the countries/territories with at least one row whose GDP is NaN after the merge.
# NOTE(review): the notebook assumes that removing them does not affect the model - confirm.
regions_to_remove = full_df.loc[full_df['GDP_per_capita'].isna(), 'country'].unique()
regions_to_remove

array(['Antarctica', 'Cook Islands', 'Falkland Islands', 'French Guiana',
       'Guadeloupe', 'Martinique', 'Montserrat', 'Netherlands Antilles',
       'Niue', 'Reunion', 'Saint Helena', 'Saint Pierre and Miquelon',
       'Taiwan', 'Western Sahara'], dtype=object)

In [28]:
# Keep only the rows whose target is not NaN and renumber the rows from 0.
full_df_gdpclean1 = full_df[full_df['GDP_per_capita'].notna()].reset_index(drop=True)
full_df_gdpclean1

Unnamed: 0,country,year,iso_code,biofuel_electricity,coal_electricity,fossil_electricity,gas_production,greenhouse_gas_emissions,hydro_electricity,low_carbon_electricity,net_elec_imports,nuclear_electricity,oil_production,solar_electricity,wind_electricity,GDP_per_capita
0,Afghanistan,2000,AFG,0.00,0.00,0.16,2.495,0.12,0.31,0.31,0.10,0.0,0.0,0.00,0.0,..
1,Afghanistan,2001,AFG,0.00,0.00,0.09,0.542,0.07,0.50,0.50,0.10,0.0,0.0,0.00,0.0,..
2,Afghanistan,2002,AFG,0.00,0.00,0.13,0.542,0.10,0.56,0.56,0.10,0.0,0.0,0.00,0.0,359.766343
3,Afghanistan,2003,AFG,0.00,0.00,0.31,0.217,0.24,0.63,0.63,0.10,0.0,0.0,0.00,0.0,363.101481
4,Afghanistan,2004,AFG,0.00,0.00,0.33,0.217,0.24,0.56,0.56,0.10,0.0,0.0,0.00,0.0,354.033913
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4505,Zimbabwe,2017,ZWE,0.32,3.02,3.05,,2.67,3.97,4.30,2.22,0.0,,0.01,0.0,1421.24077
4506,Zimbabwe,2018,ZWE,0.39,3.69,3.73,,3.27,5.05,5.46,1.02,0.0,,0.02,0.0,1462.59028
4507,Zimbabwe,2019,ZWE,0.38,3.62,3.66,,3.19,4.17,4.58,1.11,0.0,,0.03,0.0,1342.989587
4508,Zimbabwe,2020,ZWE,0.35,3.37,3.41,,2.96,3.81,4.19,1.98,0.0,,0.03,0.0,1213.117058


## Removing the countries with missing data in 'GDP_per_capita' (target)

Removing the countries that have more than 9 missing GDP values (marked as `..` in the source data)

In [29]:
# Countries in order of first appearance.
total_countries = full_df_gdpclean1['country'].unique()

# A country is flagged when more than 9 of its rows carry the '..' placeholder as GDP.
no_gdp_countries = [
    name
    for name in total_countries
    if (
        (full_df_gdpclean1['GDP_per_capita'] == '..')
        & (full_df_gdpclean1['country'] == name)
    ).sum() > 9
]

no_gdp_countries

['British Virgin Islands',
 'Djibouti',
 'Eritrea',
 'Faroe Islands',
 'Gibraltar',
 'New Caledonia',
 'North Korea',
 'Somalia',
 'Turks and Caicos Islands',
 'Venezuela']

In [30]:
# Discard every row belonging to a flagged country and renumber the rows from 0.
full_df_gdpclean2 = full_df_gdpclean1.loc[
    ~full_df_gdpclean1['country'].isin(no_gdp_countries)
].reset_index(drop=True)
full_df_gdpclean2

Unnamed: 0,country,year,iso_code,biofuel_electricity,coal_electricity,fossil_electricity,gas_production,greenhouse_gas_emissions,hydro_electricity,low_carbon_electricity,net_elec_imports,nuclear_electricity,oil_production,solar_electricity,wind_electricity,GDP_per_capita
0,Afghanistan,2000,AFG,0.00,0.00,0.16,2.495,0.12,0.31,0.31,0.10,0.0,0.0,0.00,0.0,..
1,Afghanistan,2001,AFG,0.00,0.00,0.09,0.542,0.07,0.50,0.50,0.10,0.0,0.0,0.00,0.0,..
2,Afghanistan,2002,AFG,0.00,0.00,0.13,0.542,0.10,0.56,0.56,0.10,0.0,0.0,0.00,0.0,359.766343
3,Afghanistan,2003,AFG,0.00,0.00,0.31,0.217,0.24,0.63,0.63,0.10,0.0,0.0,0.00,0.0,363.101481
4,Afghanistan,2004,AFG,0.00,0.00,0.33,0.217,0.24,0.56,0.56,0.10,0.0,0.0,0.00,0.0,354.033913
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4285,Zimbabwe,2017,ZWE,0.32,3.02,3.05,,2.67,3.97,4.30,2.22,0.0,,0.01,0.0,1421.24077
4286,Zimbabwe,2018,ZWE,0.39,3.69,3.73,,3.27,5.05,5.46,1.02,0.0,,0.02,0.0,1462.59028
4287,Zimbabwe,2019,ZWE,0.38,3.62,3.66,,3.19,4.17,4.58,1.11,0.0,,0.03,0.0,1342.989587
4288,Zimbabwe,2020,ZWE,0.35,3.37,3.41,,2.96,3.81,4.19,1.98,0.0,,0.03,0.0,1213.117058


## Input value in missing data in target column

Imputing the missing GDP values for the countries that have fewer than 10 of them

In [31]:
# Row labels of every observation whose target still holds the '..' placeholder.
nan_target_indexes = full_df_gdpclean2.loc[full_df_gdpclean2['GDP_per_capita'] == '..'].index
nan_target_indexes

Int64Index([   0,    1,   65,   66,  241,  396,  617,  750,  751,  752,  753,
             754,  755,  992, 1037, 1060, 1105, 1279, 1368, 1391, 1502, 1547,
            1569, 1592, 1593, 1790, 1923, 1968, 2122, 2189, 2300, 2323, 2478,
            2714, 2715, 2716, 2717, 2780, 2891, 2892, 3131, 3154, 3221, 3354,
            3530, 3553, 3624, 3625, 3626, 3627, 3628, 3629, 3652, 3741, 3785,
            3982, 3983, 4114, 4115, 4135, 4243, 4244, 4245],
           dtype='int64')

In [32]:
# Getting the parameters for calculating the average GDP to be inputed (MANUAL CALCULATION FOR THE 'get_parameters_input_gdp_mean' FUNCTION

# index = item.index[0]
# country = item.loc[index, 'country'] # Country of the input row
# year = item.loc[index, 'year'] # Year of the input row
# years = np.arange(year-5, year+6, 1) # Range of years used to calculate the mean
# years = np.delete(years, np.where(years == year))
# index, country, year, years

In [33]:
# Function for getting the parameters for calculating the average GDP to be imputed

def get_parameters_input_gdp_mean(nan_index, window=6, df=None):
    """Return the country and the neighbouring years used to impute one row.

    Parameters
    ----------
    nan_index : row label of the observation whose GDP is missing.
    window : number of years taken on each side of the row's year (default 6,
        i.e. the previously hard-coded +/- 6 years).
    df : DataFrame holding 'country' and 'year' columns. When omitted, the
        notebook-level ``full_df_gdpclean2`` is used, as before.

    Returns
    -------
    (country, years) : the row's country and an array with the years
        ``year - window .. year + window``, excluding the row's own year.
    """
    if df is None:
        df = full_df_gdpclean2
    country = df.loc[nan_index, 'country']  # Country of the row to impute
    year = df.loc[nan_index, 'year']  # Year of the row to impute
    years = np.arange(year - window, year + window + 1, 1)  # Symmetric window around the year
    years = years[years != year]  # The row's own year carries no information
    return country, years

In [34]:
# Function for calculating the mean value to be imputed

def calculate_input_gdp_mean(country, years):
    """Average the GDP values found for `country` over the given `years`.

    For each year, the matching row of ``full_df_gdpclean2`` is looked up; years
    without a row are skipped. The value is passed through ``pd.to_numeric`` with
    ``errors='coerce'`` and is only used when the result is exactly of type
    ``float``. The return value is ``np.mean`` of the collected values.

    NOTE(review): the exact-type check appears to also skip NumPy scalars (for
    example the NaN obtained from coercing '..', or a mean written by an earlier
    imputation) - confirm this is the intended selection before changing it.
    """
    same_country = full_df_gdpclean2['country'] == country
    collected = []
    for single_year in years:
        matches = full_df_gdpclean2.loc[
            same_country & (full_df_gdpclean2['year'] == single_year),
            'GDP_per_capita',
        ]
        if matches.empty:
            continue
        # .item() requires exactly one matching row for this country/year.
        numeric_value = pd.to_numeric(matches.item(), errors='coerce')
        if type(numeric_value) is float:
            collected.append(numeric_value)

    return np.mean(collected)

In [35]:
# Iterating over the rows that have a missing target and imputing the mean GDP of the
# neighbouring years returned by get_parameters_input_gdp_mean.
for nan_target_index in nan_target_indexes:
    country, years = get_parameters_input_gdp_mean(nan_target_index)
    # Read the year straight from the row instead of rebuilding it from a fixed
    # position in `years`, which only works for one specific window size.
    year = full_df_gdpclean2.at[nan_target_index, 'year']
    input_gdp_mean = calculate_input_gdp_mean(country, years)
    if pd.isna(input_gdp_mean):
        # No usable neighbouring value: report it and leave the row untouched
        # rather than failing inside round().
        print(f'No GDP values available to input for {country} in {year}')
        continue
    print(f'Inputed {round(input_gdp_mean)} GDP mean for {country} in {year}')
    full_df_gdpclean2.at[nan_target_index, 'GDP_per_capita'] = input_gdp_mean

Inputed 368 GDP mean for Afghanistan in 2000
Inputed 378 GDP mean for Afghanistan in 2001
Inputed 12943 GDP mean for American Samoa in 2000
Inputed 12958 GDP mean for American Samoa in 2001
Inputed 45228 GDP mean for Austria in 2022
Inputed 42023 GDP mean for Belgium in 2022
Inputed 7938 GDP mean for Bulgaria in 2022
Inputed 96180 GDP mean for Cayman Islands in 2000
Inputed 96232 GDP mean for Cayman Islands in 2001
Inputed 95211 GDP mean for Cayman Islands in 2002
Inputed 92416 GDP mean for Cayman Islands in 2003
Inputed 89837 GDP mean for Cayman Islands in 2004
Inputed 87930 GDP mean for Cayman Islands in 2005
Inputed 13533 GDP mean for Croatia in 2022
Inputed 26960 GDP mean for Cyprus in 2022
Inputed 19402 GDP mean for Czechia in 2022
Inputed 56386 GDP mean for Denmark in 2022
Inputed 19769 GDP mean for Estonia in 2022
Inputed 45361 GDP mean for Finland in 2022
Inputed 37583 GDP mean for France in 2022
Inputed 42485 GDP mean for Germany in 2022
Inputed 18382 GDP mean for Greece in 20

In [36]:
# Check that no target value is missing any more. Coercing to numeric turns the '..'
# placeholder into NaN, so this catches both leftover placeholders and NaN values
# (e.g. a mean computed from an empty set of neighbouring years).
remaining_gdp = pd.to_numeric(full_df_gdpclean2['GDP_per_capita'], errors='coerce')
check_nan_target_indexes = full_df_gdpclean2[remaining_gdp.isna()].index
if check_nan_target_indexes.empty:
    print('Inputing succeded!')
else:
    print('not succeded')

Inputing succeded!


In [37]:
full_df_gdpclean2

Unnamed: 0,country,year,iso_code,biofuel_electricity,coal_electricity,fossil_electricity,gas_production,greenhouse_gas_emissions,hydro_electricity,low_carbon_electricity,net_elec_imports,nuclear_electricity,oil_production,solar_electricity,wind_electricity,GDP_per_capita
0,Afghanistan,2000,AFG,0.00,0.00,0.16,2.495,0.12,0.31,0.31,0.10,0.0,0.0,0.00,0.0,368.187174
1,Afghanistan,2001,AFG,0.00,0.00,0.09,0.542,0.07,0.50,0.50,0.10,0.0,0.0,0.00,0.0,378.391401
2,Afghanistan,2002,AFG,0.00,0.00,0.13,0.542,0.10,0.56,0.56,0.10,0.0,0.0,0.00,0.0,359.766343
3,Afghanistan,2003,AFG,0.00,0.00,0.31,0.217,0.24,0.63,0.63,0.10,0.0,0.0,0.00,0.0,363.101481
4,Afghanistan,2004,AFG,0.00,0.00,0.33,0.217,0.24,0.56,0.56,0.10,0.0,0.0,0.00,0.0,354.033913
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4285,Zimbabwe,2017,ZWE,0.32,3.02,3.05,,2.67,3.97,4.30,2.22,0.0,,0.01,0.0,1421.24077
4286,Zimbabwe,2018,ZWE,0.39,3.69,3.73,,3.27,5.05,5.46,1.02,0.0,,0.02,0.0,1462.59028
4287,Zimbabwe,2019,ZWE,0.38,3.62,3.66,,3.19,4.17,4.58,1.11,0.0,,0.03,0.0,1342.989587
4288,Zimbabwe,2020,ZWE,0.35,3.37,3.41,,2.96,3.81,4.19,1.98,0.0,,0.03,0.0,1213.117058


At this point, we have the full dataset with all target values filled.

## Treating 'NaN's from the features

In [38]:
# Flag the countries with more than 20 missing feature values in total, counted
# over all of the country's rows and all of the feature columns listed below.

columns = [
    'biofuel_electricity', 'hydro_electricity', 'solar_electricity', 'wind_electricity',
    'coal_electricity', 'fossil_electricity', 'gas_production', 'greenhouse_gas_emissions',
    'low_carbon_electricity', 'net_elec_imports', 'nuclear_electricity', 'oil_production',
]
total_countries = full_df_gdpclean2['country'].unique()

no_energy_countries = [
    name
    for name in total_countries
    if full_df_gdpclean2.loc[full_df_gdpclean2['country'] == name, columns].isna().sum().sum() > 20
]
print(no_energy_countries)

['Bermuda', 'Chile', 'Micronesia (country)', 'Northern Mariana Islands', 'Tuvalu']


In [39]:
# Remove every row of the countries flagged for having more than 20 missing feature
# values, then renumber the rows from 0.
full_df_gdp_feat_clean = full_df_gdpclean2.loc[
    ~full_df_gdpclean2['country'].isin(no_energy_countries)
].reset_index(drop=True)
full_df_gdp_feat_clean

Unnamed: 0,country,year,iso_code,biofuel_electricity,coal_electricity,fossil_electricity,gas_production,greenhouse_gas_emissions,hydro_electricity,low_carbon_electricity,net_elec_imports,nuclear_electricity,oil_production,solar_electricity,wind_electricity,GDP_per_capita
0,Afghanistan,2000,AFG,0.00,0.00,0.16,2.495,0.12,0.31,0.31,0.10,0.0,0.0,0.00,0.0,368.187174
1,Afghanistan,2001,AFG,0.00,0.00,0.09,0.542,0.07,0.50,0.50,0.10,0.0,0.0,0.00,0.0,378.391401
2,Afghanistan,2002,AFG,0.00,0.00,0.13,0.542,0.10,0.56,0.56,0.10,0.0,0.0,0.00,0.0,359.766343
3,Afghanistan,2003,AFG,0.00,0.00,0.31,0.217,0.24,0.63,0.63,0.10,0.0,0.0,0.00,0.0,363.101481
4,Afghanistan,2004,AFG,0.00,0.00,0.33,0.217,0.24,0.56,0.56,0.10,0.0,0.0,0.00,0.0,354.033913
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4181,Zimbabwe,2017,ZWE,0.32,3.02,3.05,,2.67,3.97,4.30,2.22,0.0,,0.01,0.0,1421.24077
4182,Zimbabwe,2018,ZWE,0.39,3.69,3.73,,3.27,5.05,5.46,1.02,0.0,,0.02,0.0,1462.59028
4183,Zimbabwe,2019,ZWE,0.38,3.62,3.66,,3.19,4.17,4.58,1.11,0.0,,0.03,0.0,1342.989587
4184,Zimbabwe,2020,ZWE,0.35,3.37,3.41,,2.96,3.81,4.19,1.98,0.0,,0.03,0.0,1213.117058


In [40]:
# Replace the remaining NaN values in the columns below with 0.
# Rationale stated by the notebook: these gaps are 2021 values of series whose
# historical values are all zero - NOTE(review): not verified here.
for feature_name in [
    'biofuel_electricity',
    'hydro_electricity',
    'gas_production',
    'oil_production',
    'coal_electricity',
    'fossil_electricity',
    'greenhouse_gas_emissions',
    'net_elec_imports',
    'nuclear_electricity',
]:
    full_df_gdp_feat_clean[feature_name] = full_df_gdp_feat_clean[feature_name].fillna(0)

In [41]:
full_df_gdp_feat_clean.isnull().sum()

country                     0
year                        0
iso_code                    0
biofuel_electricity         0
coal_electricity            0
fossil_electricity          0
gas_production              0
greenhouse_gas_emissions    0
hydro_electricity           0
low_carbon_electricity      0
net_elec_imports            0
nuclear_electricity         0
oil_production              0
solar_electricity           0
wind_electricity            0
GDP_per_capita              0
dtype: int64

In [42]:
(full_df_gdp_feat_clean == 0).sum()/len(full_df_gdp_feat_clean)

country                     0.000000
year                        0.000000
iso_code                    0.000000
biofuel_electricity         0.489011
coal_electricity            0.618490
fossil_electricity          0.034161
gas_production              0.584568
greenhouse_gas_emissions    0.006211
hydro_electricity           0.248447
low_carbon_electricity      0.172480
net_elec_imports            0.387721
nuclear_electricity         0.838748
oil_production              0.561873
solar_electricity           0.621596
wind_electricity            0.645007
GDP_per_capita              0.000000
dtype: float64

In [43]:
# full_df.to_csv('/home/pedroabisamara/code/final_project/cleaned_dataset.csv')

# Creating two versions of the treated dataset

Creates two versions of the dataset: one that keeps the zero values (*base*), and one in which zeros in the features are replaced by 0.0001 (*final*)

In [44]:
# Each version gets its own copy of the cleaned frame. Plain assignment would bind both
# names to the same DataFrame object, so an in-place edit made through one name
# (such as replacing zeros in the *final* version) would also change the other
# version and the cleaned source frame.
full_cleaned_df_base = full_df_gdp_feat_clean.copy()
full_cleaned_df_final = full_df_gdp_feat_clean.copy()

# Pre-processing the features

## Splitting the *base* dataset into training and testing

In [45]:
from sklearn.model_selection import train_test_split

In [46]:
# Creates X and y for base dataset (JUST USE THIS IF WE ARE SEPPARATING X FROM Y)
# X_base = full_cleaned_df_base[['biofuel_electricity', 'hydro_electricity', 'solar_electricity', 'wind_electricity']]
# y_base = full_cleaned_df_base['GDP_per_capita']

full_cleaned_df_base.head(3)

Unnamed: 0,country,year,iso_code,biofuel_electricity,coal_electricity,fossil_electricity,gas_production,greenhouse_gas_emissions,hydro_electricity,low_carbon_electricity,net_elec_imports,nuclear_electricity,oil_production,solar_electricity,wind_electricity,GDP_per_capita
0,Afghanistan,2000,AFG,0.0,0.0,0.16,2.495,0.12,0.31,0.31,0.1,0.0,0.0,0.0,0.0,368.187174
1,Afghanistan,2001,AFG,0.0,0.0,0.09,0.542,0.07,0.5,0.5,0.1,0.0,0.0,0.0,0.0,378.391401
2,Afghanistan,2002,AFG,0.0,0.0,0.13,0.542,0.1,0.56,0.56,0.1,0.0,0.0,0.0,0.0,359.766343


In [47]:
# Split base X and y into training and testing datasets (JUST USE THIS IF WE ARE SEPPARATING X FROM Y)
# X_base_train, X_base_test, y_base_train, y_base_test = train_test_split(X_base, y_base, test_size=0.2, random_state=0)

# Split the complete base frame (features, target and reference columns together):
# 20% of the rows go to the test set, with a fixed random_state.
full_cleaned_df_base_train, full_cleaned_df_base_test = train_test_split(
    full_cleaned_df_base,
    test_size=0.2,
    random_state=0,
)

## Scaling *base* dataset

In [48]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.compose import ColumnTransformer

In [49]:
# # Step 0 - Instanciate Standard Scaler (JUST USE THIS IF WE ARE SEPPARATING X FROM Y)
# mm_scaler_base = MinMaxScaler()

# # Step 1- Fit the scaler to the features
# mm_scaler_base.fit(X_base_train)

# # 2-Scale/Transform; <-> apply the transformation and store it in a df
# X_base_train_scaled = pd.DataFrame(mm_scaler_base.transform(X_base_train), columns=X_base_train.columns)

In [50]:
# Column transformer for the base dataset: the listed feature columns are standardised,
# every other column (reference columns and target) is passed through unchanged.

columns_to_scale = [
    'biofuel_electricity', 'hydro_electricity', 'solar_electricity', 'wind_electricity',
    'coal_electricity', 'fossil_electricity', 'gas_production', 'greenhouse_gas_emissions',
    'low_carbon_electricity', 'net_elec_imports', 'nuclear_electricity', 'oil_production',
]
ct_base = ColumnTransformer(
    transformers=[("scale", StandardScaler(), columns_to_scale)],
    remainder="passthrough",
)

In [51]:
# Fit the transformer on the base training split, transform that split, and wrap the
# result in a DataFrame labelled with the transformer's output feature names.

full_cleaned_df_base_train_scaled = pd.DataFrame(
    data=ct_base.fit_transform(full_cleaned_df_base_train),
    columns=ct_base.get_feature_names_out(),
)

In [52]:
full_cleaned_df_base_train_scaled.head(3)

Unnamed: 0,scale__biofuel_electricity,scale__hydro_electricity,scale__solar_electricity,scale__wind_electricity,scale__coal_electricity,scale__fossil_electricity,scale__gas_production,scale__greenhouse_gas_emissions,scale__low_carbon_electricity,scale__net_elec_imports,scale__nuclear_electricity,scale__oil_production,remainder__country,remainder__year,remainder__iso_code,remainder__GDP_per_capita
0,-0.228221,0.360233,-0.114169,-0.139168,-0.149637,-0.207675,-0.242276,-0.188327,0.048207,-3.127102,-0.196459,-0.294759,Paraguay,2020,PRY,6095.391375
1,-0.220828,-0.171372,-0.105097,-0.076304,-0.149637,-0.205162,-0.242276,-0.188823,-0.172107,0.029803,-0.196459,-0.294759,Kenya,2021,KEN,1704.966576
2,-0.122257,-0.101737,-0.114169,-0.108777,-0.149637,-0.203312,-0.242276,-0.187264,-0.169939,-0.128379,-0.196459,-0.294759,Uruguay,2014,URY,15649.41434


## In *final* dataset, replacing Zeros in features, by 0.0001

In [53]:
# Smallest non-zero value across the four renewable electricity columns
# (zeros are masked out before taking the minimum).
min_value = (
    full_cleaned_df_final[['biofuel_electricity', 'hydro_electricity', 'solar_electricity', 'wind_electricity']]
    .where(lambda frame: frame != 0)
    .min()
    .min()
)

In [54]:
min_value

0.006

In [55]:
# Replace every exact 0 in the feature columns with 0.0001; the frame is updated in place.
zero_replaced_columns = [
    'biofuel_electricity', 'hydro_electricity', 'solar_electricity', 'wind_electricity',
    'coal_electricity', 'fossil_electricity', 'gas_production', 'greenhouse_gas_emissions',
    'low_carbon_electricity', 'net_elec_imports', 'nuclear_electricity', 'oil_production',
]
full_cleaned_df_final[zero_replaced_columns] = (
    full_cleaned_df_final[zero_replaced_columns].replace(0, 0.0001)
)

In [56]:
(full_cleaned_df_final == 0).sum()

country                     0
year                        0
iso_code                    0
biofuel_electricity         0
coal_electricity            0
fossil_electricity          0
gas_production              0
greenhouse_gas_emissions    0
hydro_electricity           0
low_carbon_electricity      0
net_elec_imports            0
nuclear_electricity         0
oil_production              0
solar_electricity           0
wind_electricity            0
GDP_per_capita              0
dtype: int64

## Splitting the *final* dataset into training and testing

In [57]:
# Creates X and y for final dataset (JUST USE THIS IF WE ARE SEPPARATING X FROM Y)
# X_final = full_cleaned_df_final[['biofuel_electricity', 'hydro_electricity', 'other_renewable_exc_biofuel_electricity', 'solar_electricity', 'wind_electricity']]
# y_final = full_cleaned_df_final['GDP_per_capita']

full_cleaned_df_final.head(3)

Unnamed: 0,country,year,iso_code,biofuel_electricity,coal_electricity,fossil_electricity,gas_production,greenhouse_gas_emissions,hydro_electricity,low_carbon_electricity,net_elec_imports,nuclear_electricity,oil_production,solar_electricity,wind_electricity,GDP_per_capita
0,Afghanistan,2000,AFG,0.0001,0.0001,0.16,2.495,0.12,0.31,0.31,0.1,0.0001,0.0001,0.0001,0.0001,368.187174
1,Afghanistan,2001,AFG,0.0001,0.0001,0.09,0.542,0.07,0.5,0.5,0.1,0.0001,0.0001,0.0001,0.0001,378.391401
2,Afghanistan,2002,AFG,0.0001,0.0001,0.13,0.542,0.1,0.56,0.56,0.1,0.0001,0.0001,0.0001,0.0001,359.766343


In [58]:
# Split final X and y into training and testing datasets (JUST USE THIS IF WE ARE SEPPARATING X FROM Y)
# X_final_train, X_final_test, y_final_train, y_final_test = train_test_split(X_final, y_final, test_size=0.2, random_state=0)

# Split the complete final frame (features, target and reference columns together):
# 20% of the rows go to the test set, with a fixed random_state.
full_cleaned_df_final_train, full_cleaned_df_final_test = train_test_split(
    full_cleaned_df_final,
    test_size=0.2,
    random_state=0,
)

## Scaling *final* dataset

In [59]:
# # Step 0 - Instanciate Standard Scaler (JUST USE THIS IF WE ARE SEPPARATING X FROM Y)
# mm_scaler_final = MinMaxScaler()

# # Step 1- Fit the scaler to the features
# mm_scaler_final.fit(X_final_train)

# # 2-Scale/Transform; <-> apply the transformation and store it in a df
# X_final_train_scaled = pd.DataFrame(mm_scaler_final.transform(X_final_train), columns=X_final_train.columns)

In [60]:
# Column transformer for the final dataset: the listed feature columns are standardised,
# every other column (reference columns and target) is passed through unchanged.

columns_to_scale = [
    'biofuel_electricity', 'hydro_electricity', 'solar_electricity', 'wind_electricity',
    'coal_electricity', 'fossil_electricity', 'gas_production', 'greenhouse_gas_emissions',
    'low_carbon_electricity', 'net_elec_imports', 'nuclear_electricity', 'oil_production',
]
ct_final = ColumnTransformer(
    transformers=[("scale", StandardScaler(), columns_to_scale)],
    remainder="passthrough",
)

In [61]:
# Fit the transformer on the final training split, transform that split, and wrap the
# result in a DataFrame labelled with the transformer's output feature names.

full_cleaned_df_final_train_scaled = pd.DataFrame(
    data=ct_final.fit_transform(full_cleaned_df_final_train),
    columns=ct_final.get_feature_names_out(),
)

In [62]:
# Apply the already-fitted transformer to the final test split (transform only, no
# refit) and label the columns with the transformer's output feature names.
full_cleaned_final_test = pd.DataFrame(
    data=ct_final.transform(full_cleaned_df_final_test),
    columns=ct_final.get_feature_names_out(),
)
full_cleaned_final_test

Unnamed: 0,scale__biofuel_electricity,scale__hydro_electricity,scale__solar_electricity,scale__wind_electricity,scale__coal_electricity,scale__fossil_electricity,scale__gas_production,scale__greenhouse_gas_emissions,scale__low_carbon_electricity,scale__net_elec_imports,scale__nuclear_electricity,scale__oil_production,remainder__country,remainder__year,remainder__iso_code,remainder__GDP_per_capita
0,-0.235608,-0.178649,-0.113268,-0.139167,-0.149637,-0.19268,-0.242276,-0.178545,-0.218052,0.014103,-0.196458,-0.294758,Sri Lanka,2005,LKA,2348.033855
1,-0.233156,-0.209765,-0.114166,-0.139167,-0.149637,-0.204334,-0.236386,-0.189213,-0.233346,0.053357,-0.196458,-0.138522,Gabon,2016,GAB,7038.577254
2,-0.235608,-0.147784,-0.113268,-0.139167,-0.118798,-0.181331,-0.242276,-0.164084,-0.202819,-0.304515,-0.196458,-0.294758,Bosnia and Herzegovina,2014,BIH,4403.653938
3,0.159896,-0.154434,-0.112361,-0.139167,-0.057607,0.183483,0.240792,0.090324,-0.186163,0.680475,-0.196458,-0.079385,Thailand,2010,THA,5082.256116
4,-0.235608,-0.201484,-0.114166,-0.139167,-0.149637,-0.196767,-0.217688,-0.185102,-0.229383,-0.025173,-0.196458,-0.261094,Cote d'Ivoire,2010,CIV,1640.315029
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
833,-0.234388,-0.220681,-0.114166,-0.139167,-0.149637,-0.20762,-0.242276,-0.192261,-0.238795,0.042139,-0.196458,-0.294758,Belize,2005,BLZ,6301.915865
834,-0.235608,-0.217168,-0.114166,-0.139167,-0.138632,-0.191742,-0.242276,-0.178332,-0.237123,0.177883,-0.196458,-0.294758,Moldova,2009,MDA,2102.710936
835,-0.235608,-0.214533,-0.114166,-0.139167,-0.149637,-0.206957,-0.241965,-0.191623,-0.235823,0.025311,-0.196458,-0.294758,Afghanistan,2004,AFG,354.033913
836,-0.235608,-0.15343,-0.112361,-0.139167,-0.149637,-0.200026,-0.242276,-0.18496,-0.205544,0.016336,-0.196458,-0.294758,Cameroon,2020,CMR,1419.328204


In [63]:
# Display the train frame (defined earlier in the notebook) so its columns can
# be compared by eye with the transformed test frame shown above.
full_cleaned_df_final_train_scaled

Unnamed: 0,scale__biofuel_electricity,scale__hydro_electricity,scale__solar_electricity,scale__wind_electricity,scale__coal_electricity,scale__fossil_electricity,scale__gas_production,scale__greenhouse_gas_emissions,scale__low_carbon_electricity,scale__net_elec_imports,scale__nuclear_electricity,scale__oil_production,remainder__country,remainder__year,remainder__iso_code,remainder__GDP_per_capita
0,-0.228227,0.360233,-0.114166,-0.139167,-0.149637,-0.207675,-0.242276,-0.188327,0.048207,-3.127107,-0.196458,-0.294758,Paraguay,2020,PRY,6095.391375
1,-0.220835,-0.171372,-0.105103,-0.076307,-0.149637,-0.205162,-0.242276,-0.188823,-0.172107,0.029798,-0.196458,-0.294758,Kenya,2021,KEN,1704.966576
2,-0.122264,-0.101738,-0.114166,-0.10878,-0.149637,-0.203312,-0.242276,-0.187264,-0.16994,-0.128384,-0.196458,-0.294758,Uruguay,2014,URY,15649.41434
3,-0.233156,-0.216917,-0.114166,-0.139167,-0.149637,-0.20773,-0.239479,-0.192297,-0.236876,0.060088,-0.196458,-0.093956,Congo,2006,COG,2228.585458
4,-0.235608,-0.15067,-0.114166,-0.139167,-0.115974,-0.080925,0.558685,-0.099118,-0.204306,-0.084631,-0.196458,-0.24997,Uzbekistan,2013,UZB,2487.244416
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3343,-0.235608,-0.108513,-0.114166,-0.139167,-0.149637,-0.207841,-0.242276,-0.191659,-0.1835,-0.030782,-0.196458,-0.27941,Democratic Republic of Congo,2016,COD,476.814885
3344,-0.235608,-0.221183,-0.112361,-0.139167,-0.149637,-0.20762,-0.242276,-0.192261,-0.238981,0.014103,-0.196458,-0.294758,Samoa,2016,WSM,4329.777682
3345,-0.217138,-0.221558,-0.114166,-0.139167,-0.149637,-0.205135,-0.242276,-0.189922,-0.238362,0.014103,-0.196458,-0.294758,Guyana,2017,GUY,6038.273456
3346,-0.217138,-0.198975,-0.114166,-0.139167,-0.149637,-0.200854,-0.143735,-0.187618,-0.227216,0.014103,-0.196458,-0.286824,Myanmar,2001,MMR,353.95157


In [64]:
# Order the train rows chronologically; sort_values sorts ascending by default
# and keeps the original index labels attached to each row.
full_cleaned_final = full_cleaned_df_final_train_scaled.sort_values(
    by='remainder__year'
)

In [65]:
# Display the year-sorted frame to confirm the ordering by 'remainder__year'.
full_cleaned_final

Unnamed: 0,scale__biofuel_electricity,scale__hydro_electricity,scale__solar_electricity,scale__wind_electricity,scale__coal_electricity,scale__fossil_electricity,scale__gas_production,scale__greenhouse_gas_emissions,scale__low_carbon_electricity,scale__net_elec_imports,scale__nuclear_electricity,scale__oil_production,remainder__country,remainder__year,remainder__iso_code,remainder__GDP_per_capita
2173,-0.235608,-0.178775,-0.114166,-0.139167,-0.149637,-0.207758,-0.242276,-0.192084,-0.218176,0.014103,-0.196458,-0.228896,Cameroon,2000,CMR,1137.624523
2203,-0.191263,0.135897,-0.114166,-0.137506,-0.144183,-0.068775,0.278992,-0.098054,-0.023312,0.150959,-0.113018,0.289962,Argentina,2000,ARG,10672.722035
1545,-0.226995,1.837108,-0.114166,-0.139167,0.38865,1.302087,7.452639,0.99362,1.535788,-1.562116,1.50943,4.324612,Russia,2000,RUS,5323.662598
2199,-0.235608,-0.221558,-0.114166,-0.139167,-0.149637,-0.207786,-0.242276,-0.192403,-0.23929,0.014103,-0.196458,-0.294758,Guinea-Bissau,2000,GNB,563.624309
906,-0.214674,0.165884,-0.114166,-0.137922,-0.028456,0.049415,-0.232526,0.022664,-0.046346,0.389914,-0.196458,-0.253664,Turkey,2000,TUR,6454.593119
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
856,0.743929,-0.221308,0.06637,0.649338,-0.137886,-0.192625,-0.242276,-0.170429,-0.06034,0.211539,-0.196458,-0.294758,Denmark,2022,DNK,56386.120691
2052,0.975571,-0.220681,1.459014,0.715117,-0.098379,-0.01766,-0.242276,-0.037908,0.082511,-0.464941,-0.138092,-0.294758,Netherlands,2022,NLD,47301.904915
728,1.48814,-0.045654,-0.086957,0.326691,-0.139996,-0.186495,-0.242276,-0.158732,0.160469,1.425388,0.152631,-0.294758,Finland,2022,FIN,45361.232992
1954,0.604697,0.014319,2.858916,2.435765,-0.124901,0.085231,-0.242276,0.026952,0.867045,-2.202697,0.617481,-0.294758,Spain,2022,ESP,26748.793053


In [66]:
# Write the year-sorted frame to the working directory.
# to_csv's default index=True also writes the row index as the first column.
full_cleaned_final.to_csv(path_or_buf='full_cleaned_final.csv')

In [67]:
# Order the base train rows chronologically; sort_values sorts ascending by
# default and keeps the original index labels attached to each row.
full_cleaned_base = full_cleaned_df_base_train_scaled.sort_values(
    by='remainder__year'
)

In [68]:
# Write the year-sorted base frame to the working directory.
# to_csv's default index=True also writes the row index as the first column.
full_cleaned_base.to_csv(path_or_buf='full_cleaned_base.csv')

In [69]:
# Export the test frame as it was BEFORE being passed through ct_final.
# This cell used to write to 'full_cleaned_final_test.csv', the same path the
# next cell writes the transformed frame to, so this export was always
# overwritten. It now gets its own file so both versions survive.
# NOTE(review): if the pre-transform export is not needed, delete this cell.
full_cleaned_df_final_test.to_csv('full_cleaned_final_test_unscaled.csv')

In [70]:
# Write the transformed test frame to the working directory.
# to_csv's default index=True also writes the row index as the first column.
full_cleaned_final_test.to_csv(path_or_buf='full_cleaned_final_test.csv')