# Energy Dataset

## Importing Dataset

Importing the dataset from the Github repository of the project

In [1]:
import pandas as pd
import numpy as np

In [2]:
url = 'https://github.com/up841068/energy-economic-growth/raw/main/raw_data/renewable-energy-data-scrapping.csv'

df_energy = pd.read_csv(url)

In [3]:
df_energy.tail(3)

Unnamed: 0,country,year,iso_code,population,gdp,biofuel_cons_change_pct,biofuel_cons_change_twh,biofuel_cons_per_capita,biofuel_consumption,biofuel_elec_per_capita,...,solar_share_elec,solar_share_energy,wind_cons_change_pct,wind_cons_change_twh,wind_consumption,wind_elec_per_capita,wind_electricity,wind_energy_per_capita,wind_share_elec,wind_share_energy
21887,Zimbabwe,2019,ZWE,15354606.0,,,,,,24.748,...,0.364,,,,,0.0,0.0,,0.0,
21888,Zimbabwe,2020,ZWE,15669663.0,,,,,,22.336,...,0.395,,,,,0.0,0.0,,0.0,
21889,Zimbabwe,2021,ZWE,15993525.0,,,,,,23.76,...,0.498,,,,,0.0,0.0,,0.0,


## Filtering the rows and columns

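Selecting the identifier columns (country, year, ISO code) and the energy features used in the analysis: renewable, fossil and nuclear electricity, fuel production, consumption and emissions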

In [4]:
# Identifier columns plus the energy features retained for the analysis;
# every other column of the raw energy frame is discarded.
columns_to_keep = [
    'country', 'year', 'iso_code',
    'biofuel_electricity', 'hydro_electricity', 'solar_electricity', 'wind_electricity',
    'coal_electricity', 'fossil_electricity', 'gas_production', 'greenhouse_gas_emissions',
    'low_carbon_electricity', 'net_elec_imports', 'nuclear_electricity',
    'oil_production', 'primary_energy_consumption', 'hydro_consumption',
    'per_capita_electricity', 'biofuel_elec_per_capita', 'hydro_elec_per_capita',
    'solar_elec_per_capita', 'wind_elec_per_capita',
]
# Dropping (rather than selecting) keeps the surviving columns in the raw frame's own order.
columns_to_drop = [name for name in df_energy.columns if name not in columns_to_keep]
df_energy_filtered1 = df_energy.drop(columns=columns_to_drop)

In [5]:
df_energy_filtered1.head(3)

Unnamed: 0,country,year,iso_code,biofuel_elec_per_capita,biofuel_electricity,coal_electricity,fossil_electricity,gas_production,greenhouse_gas_emissions,hydro_consumption,...,low_carbon_electricity,net_elec_imports,nuclear_electricity,oil_production,per_capita_electricity,primary_energy_consumption,solar_elec_per_capita,solar_electricity,wind_elec_per_capita,wind_electricity
0,Afghanistan,1900,AFG,,,,,0.0,,,...,,,,,,,,,,
1,Afghanistan,1901,AFG,,,,,0.0,,,...,,,,,,,,,,
2,Afghanistan,1902,AFG,,,,,0.0,,,...,,,,,,,,,,


## Removing rows from years before 2000

In [6]:
# Keep only the rows with year > 1999.
# NOTE: reset_index() is called without drop=True, so the previous row labels are kept
# as a new 'index' column (it is removed again after the merge with the GDP data).
df_energy_filtered2 = df_energy_filtered1[df_energy_filtered1.year > 1999].reset_index()

In [7]:
df_energy_filtered2.shape

(6457, 23)

## Cleaning rows related to regions (not countries)

Removing the data points related to regions, as we intend to do the analysis on countries

In [8]:
# All countries have a respective 'iso_code', so we remove the rows that do not have one
# (these are treated as regions/aggregates rather than countries)
df_energy_filtered_countries_only = df_energy_filtered2.dropna(subset=['iso_code'])

In [9]:
df_energy_filtered_countries_only.shape

(4814, 23)

## Creating column with total production of energy

This section is not used for now. It was meant as a fallback in case keeping the renewable sources separated produced too many zeros/NaNs.

In [10]:
# df_energy_filtered_countries_only.loc[:, 'total_renewable_electricity'] = df_energy_filtered_countries_only[['biofuel_electricity', 'hydro_electricity', 'other_renewable_exc_biofuel_electricity', 'solar_electricity', 'wind_electricity']].sum(axis=1)

In [11]:
# df_energy_filtered_countries_only.head()

In [12]:
# count_sum_equals_zero = (df_energy_filtered_countries_only.total_renewable_electricity == 0).sum()/len(df_energy_filtered)
# count_sum_equals_zero

## Analyzing missing values

How many NaNs we have for each feature?

In [13]:
df_energy_filtered_countries_only.isnull().sum()/len(df_energy_filtered_countries_only)

index                         0.000000
country                       0.000000
year                          0.000000
iso_code                      0.000000
biofuel_elec_per_capita       0.047570
biofuel_electricity           0.047570
coal_electricity              0.043000
fossil_electricity            0.020980
gas_production                0.190694
greenhouse_gas_emissions      0.020980
hydro_consumption             0.640424
hydro_elec_per_capita         0.029290
hydro_electricity             0.029290
low_carbon_electricity        0.020773
net_elec_imports              0.020980
nuclear_electricity           0.021811
oil_production                0.190694
per_capita_electricity        0.020773
primary_energy_consumption    0.067096
solar_elec_per_capita         0.027005
solar_electricity             0.027005
wind_elec_per_capita          0.029082
wind_electricity              0.029082
dtype: float64

In [14]:
## creates a dictionary with all features and the respective amount of Zeros for each
# countries = df_energy_filtered.country.unique()
# missing_values_dict = {}
# for country in countries:
#     percentage_missing_values = ((df_energy_filtered.country == country) & (df_energy_filtered.total_renewable_electricity == 0)).sum()\
#     /len(df_energy_filtered[df_energy_filtered.country == country])
#     missing_values_dict[country] = percentage_missing_values
# missing_values_dict

# Economic Growth Dataset

## Importing Dataset

Importing the dataset from the Github repository of the project

In [15]:
url = 'https://github.com/up841068/energy-economic-growth/raw/main/raw_data/World_Development_Indicators.xlsx'

df_gdp = pd.read_excel(url)

In [16]:
df_gdp.head(3)

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,1990 [YR1990],2000 [YR2000],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016],...,1969 [YR1969],1968 [YR1968],1967 [YR1967],1966 [YR1966],1965 [YR1965],1964 [YR1964],1963 [YR1963],1962 [YR1962],1961 [YR1961],1960 [YR1960]
0,GDP per capita (constant 2015 US$),NY.GDP.PCAP.KD,Brazil,BRA,6086.08487,6745.865881,9216.14336,9183.470768,8783.225984,8426.853352,...,3304.860245,3095.225632,2892.687723,2850.904774,2746.261344,2758.866607,2746.517052,2811.630814,2717.004894,2578.432125
1,GDP per capita (constant 2015 US$),NY.GDP.PCAP.KD,Afghanistan,AFG,..,..,608.386715,602.516979,592.476537,590.427739,...,..,..,..,..,..,..,..,..,..,..
2,GDP per capita (constant 2015 US$),NY.GDP.PCAP.KD,Albania,ALB,1606.296047,1960.881946,3780.698202,3855.759734,3952.802538,4090.371657,...,..,..,..,..,..,..,..,..,..,..


## Filtering the rows and columns

In [17]:
df_gdp_filtered = df_gdp.drop(columns=['Series Name', 'Series Code'])

In [18]:
df_gdp_filtered.shape

(266, 65)

## Flattening the years to be in one column (instead of one column per year)

Reshaping the per-year columns into rows, so that each row holds one country and one year.<br>
We will use this to merge with the Energy dataset

In [19]:
df_gdp_filtered_flattened = pd.melt(df_gdp_filtered, id_vars=['Country Name', 'Country Code'], var_name='Year', value_name='GDP_per_capita')

In [20]:
df_gdp_filtered_flattened.head(3)

Unnamed: 0,Country Name,Country Code,Year,GDP_per_capita
0,Brazil,BRA,1990 [YR1990],6086.08487
1,Afghanistan,AFG,1990 [YR1990],..
2,Albania,ALB,1990 [YR1990],1606.296047


## Turning 'Year' column into an int

Removing the brackets string in the Year column and turning it into an integer

In [21]:
# Keep the first 4-digit group of labels such as '1990 [YR1990]' and store it as an integer.
# expand=False makes str.extract return a Series (the default returns a one-column DataFrame),
# and astype(str) lets the cell be re-run after 'Year' has already been converted to int.
df_gdp_filtered_flattened['Year'] = (
    df_gdp_filtered_flattened['Year']
    .astype(str)
    .str.extract(r'(\d{4})', expand=False)
    .astype(int)
)

In [22]:
df_gdp_filtered_flattened.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16758 entries, 0 to 16757
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Country Name    16758 non-null  object
 1   Country Code    16758 non-null  object
 2   Year            16758 non-null  int64 
 3   GDP_per_capita  16758 non-null  object
dtypes: int64(1), object(3)
memory usage: 523.8+ KB


Renaming the columns so they match the ones in the Energy dataset, easing the merge

In [23]:
df_gdp_filtered_flattened.rename(columns={"Year": "year", "Country Code": "iso_code"}, inplace=True)

# Merging the features and target datasets

In [24]:
full_df = pd.merge(df_energy_filtered_countries_only, df_gdp_filtered_flattened, on=['year', 'iso_code'], how='left')

In [25]:
full_df.drop(columns=['index', 'Country Name'], inplace=True)

In [26]:
full_df.sample(8)

Unnamed: 0,country,year,iso_code,biofuel_elec_per_capita,biofuel_electricity,coal_electricity,fossil_electricity,gas_production,greenhouse_gas_emissions,hydro_consumption,...,net_elec_imports,nuclear_electricity,oil_production,per_capita_electricity,primary_energy_consumption,solar_elec_per_capita,solar_electricity,wind_elec_per_capita,wind_electricity,GDP_per_capita
3032,Netherlands,2012,NLD,428.947,7.2,24.21,86.69,684.028,53.47,0.286,...,17.11,3.91,17.583,6140.493,1059.124,11.319,0.19,296.688,4.98,44242.973526
4588,United States,2016,USA,191.803,62.76,1239.15,2656.96,7273.561,1755.35,706.105,...,66.5,805.69,6310.358,12492.153,25723.152,167.69,54.87,693.713,226.99,57292.538783
3272,Norway,2013,NOR,78.73,0.4,0.04,2.85,1079.304,4.75,349.273,...,-5.01,0.0,964.477,26230.809,526.881,0.0,0.0,370.03,1.88,73046.905029
211,Aruba,2016,ABW,0.0,0.0,0.0,0.76,0.0,0.53,,...,0.0,0.0,0.0,8580.418,5.246,95.338,0.01,1239.394,0.13,28852.237067
966,Congo,2020,COG,10.522,0.06,0.0,2.87,,1.5,,...,0.0,0.0,183.236,671.673,,0.0,0.0,0.0,0.0,1586.318297
1932,Haiti,2012,HTI,0.0,0.0,0.0,0.83,0.0,0.59,,...,0.0,0.0,0.0,103.873,10.494,0.0,0.0,0.0,0.0,1348.191835
2297,Kiribati,2000,KIR,0.0,0.0,0.0,0.01,0.0,0.01,,...,0.0,0.0,0.0,112.562,0.145,0.0,0.0,0.0,0.0,1477.077622
237,Australia,2020,AUS,123.88,3.18,133.21,186.92,1459.506,138.89,37.874,...,0.0,0.0,221.672,9774.425,1593.595,929.098,23.85,864.821,22.2,58117.452027


# Treating the raw dataset

## Removing the countries with 'NaN's in 'GDP_per_capita' (target)

In [27]:
# Countries/territories whose rows have no GDP value after the merge
# (author's note: the countries removed here do not affect the model)
regions_to_remove = full_df.loc[full_df['GDP_per_capita'].isna(), 'country'].unique()
regions_to_remove

array(['Antarctica', 'Cook Islands', 'Falkland Islands', 'French Guiana',
       'Guadeloupe', 'Martinique', 'Montserrat', 'Netherlands Antilles',
       'Niue', 'Reunion', 'Saint Helena', 'Saint Pierre and Miquelon',
       'Taiwan', 'Western Sahara'], dtype=object)

In [28]:
full_df_gdpclean1 = full_df.dropna(subset=['GDP_per_capita']).reset_index(drop=True)
full_df_gdpclean1

Unnamed: 0,country,year,iso_code,biofuel_elec_per_capita,biofuel_electricity,coal_electricity,fossil_electricity,gas_production,greenhouse_gas_emissions,hydro_consumption,...,net_elec_imports,nuclear_electricity,oil_production,per_capita_electricity,primary_energy_consumption,solar_elec_per_capita,solar_electricity,wind_elec_per_capita,wind_electricity,GDP_per_capita
0,Afghanistan,2000,AFG,0.000,0.00,0.00,0.16,2.495,0.12,,...,0.10,0.0,0.0,24.050,5.914,0.000,0.00,0.0,0.0,..
1,Afghanistan,2001,AFG,0.000,0.00,0.00,0.09,0.542,0.07,,...,0.10,0.0,0.0,29.967,4.664,0.000,0.00,0.0,0.0,..
2,Afghanistan,2002,AFG,0.000,0.00,0.00,0.13,0.542,0.10,,...,0.10,0.0,0.0,32.857,4.428,0.000,0.00,0.0,0.0,359.766343
3,Afghanistan,2003,AFG,0.000,0.00,0.00,0.31,0.217,0.24,,...,0.10,0.0,0.0,41.510,5.208,0.000,0.00,0.0,0.0,363.101481
4,Afghanistan,2004,AFG,0.000,0.00,0.00,0.33,0.217,0.24,,...,0.10,0.0,0.0,37.786,4.810,0.000,0.00,0.0,0.0,354.033913
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4505,Zimbabwe,2017,ZWE,21.693,0.32,3.02,3.05,,2.67,,...,2.22,0.0,,498.268,45.936,0.678,0.01,0.0,0.0,1421.24077
4506,Zimbabwe,2018,ZWE,25.910,0.39,3.69,3.73,,3.27,,...,1.02,0.0,,610.542,47.502,1.329,0.02,0.0,0.0,1462.59028
4507,Zimbabwe,2019,ZWE,24.748,0.38,3.62,3.66,,3.19,,...,1.11,0.0,,536.647,49.427,1.954,0.03,0.0,0.0,1342.989587
4508,Zimbabwe,2020,ZWE,22.336,0.35,3.37,3.41,,2.96,,...,1.98,0.0,,485.014,,1.915,0.03,0.0,0.0,1213.117058


## Removing the countries with too many '..' placeholders in 'GDP_per_capita' (target)

Removing the countries that have more than 9 missing GDP values (marked as '..')

In [29]:
total_countries = full_df_gdpclean1['country'].unique()

# Number of '..' placeholders in the target for each country; sort=False keeps the
# countries in order of first appearance, the same order unique() yields.
is_placeholder = full_df_gdpclean1['GDP_per_capita'] == '..'
placeholders_per_country = is_placeholder.groupby(full_df_gdpclean1['country'], sort=False).sum()

# Countries with more than 9 placeholder rows
no_gdp_countries = placeholders_per_country[placeholders_per_country > 9].index.tolist()

no_gdp_countries

['British Virgin Islands',
 'Djibouti',
 'Eritrea',
 'Faroe Islands',
 'Gibraltar',
 'New Caledonia',
 'North Korea',
 'Somalia',
 'Turks and Caicos Islands',
 'Venezuela']

In [30]:
full_df_gdpclean2 = full_df_gdpclean1[~full_df_gdpclean1['country'].isin(no_gdp_countries)].reset_index(drop=True)
full_df_gdpclean2

Unnamed: 0,country,year,iso_code,biofuel_elec_per_capita,biofuel_electricity,coal_electricity,fossil_electricity,gas_production,greenhouse_gas_emissions,hydro_consumption,...,net_elec_imports,nuclear_electricity,oil_production,per_capita_electricity,primary_energy_consumption,solar_elec_per_capita,solar_electricity,wind_elec_per_capita,wind_electricity,GDP_per_capita
0,Afghanistan,2000,AFG,0.000,0.00,0.00,0.16,2.495,0.12,,...,0.10,0.0,0.0,24.050,5.914,0.000,0.00,0.0,0.0,..
1,Afghanistan,2001,AFG,0.000,0.00,0.00,0.09,0.542,0.07,,...,0.10,0.0,0.0,29.967,4.664,0.000,0.00,0.0,0.0,..
2,Afghanistan,2002,AFG,0.000,0.00,0.00,0.13,0.542,0.10,,...,0.10,0.0,0.0,32.857,4.428,0.000,0.00,0.0,0.0,359.766343
3,Afghanistan,2003,AFG,0.000,0.00,0.00,0.31,0.217,0.24,,...,0.10,0.0,0.0,41.510,5.208,0.000,0.00,0.0,0.0,363.101481
4,Afghanistan,2004,AFG,0.000,0.00,0.00,0.33,0.217,0.24,,...,0.10,0.0,0.0,37.786,4.810,0.000,0.00,0.0,0.0,354.033913
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4285,Zimbabwe,2017,ZWE,21.693,0.32,3.02,3.05,,2.67,,...,2.22,0.0,,498.268,45.936,0.678,0.01,0.0,0.0,1421.24077
4286,Zimbabwe,2018,ZWE,25.910,0.39,3.69,3.73,,3.27,,...,1.02,0.0,,610.542,47.502,1.329,0.02,0.0,0.0,1462.59028
4287,Zimbabwe,2019,ZWE,24.748,0.38,3.62,3.66,,3.19,,...,1.11,0.0,,536.647,49.427,1.954,0.03,0.0,0.0,1342.989587
4288,Zimbabwe,2020,ZWE,22.336,0.35,3.37,3.41,,2.96,,...,1.98,0.0,,485.014,,1.915,0.03,0.0,0.0,1213.117058


## Imputing missing values in the target column

Imputing the GDP for the countries that have fewer than 10 missing values

In [31]:
# All the indexes of rows that have missing data
nan_target_indexes = full_df_gdpclean2[full_df_gdpclean2['GDP_per_capita'] == '..'].index
nan_target_indexes

Int64Index([   0,    1,   65,   66,  241,  396,  617,  750,  751,  752,  753,
             754,  755,  992, 1037, 1060, 1105, 1279, 1368, 1391, 1502, 1547,
            1569, 1592, 1593, 1790, 1923, 1968, 2122, 2189, 2300, 2323, 2478,
            2714, 2715, 2716, 2717, 2780, 2891, 2892, 3131, 3154, 3221, 3354,
            3530, 3553, 3624, 3625, 3626, 3627, 3628, 3629, 3652, 3741, 3785,
            3982, 3983, 4114, 4115, 4135, 4243, 4244, 4245],
           dtype='int64')

In [32]:
# Getting the parameters for calculating the average GDP to be inputed (MANUAL CALCULATION FOR THE 'get_parameters_input_gdp_mean' FUNCTION

# index = item.index[0]
# country = item.loc[index, 'country'] # Country of the input row
# year = item.loc[index, 'year'] # Year of the input row
# years = np.arange(year-5, year+6, 1) # Range of years used to calculate the mean
# years = np.delete(years, np.where(years == year))
# index, country, year, years

In [33]:
# Function for getting the parameters for calculating the average GDP to be inputed

def get_parameters_input_gdp_mean(nan_index, window=6):
    """Return the country and the neighbouring years used to impute one missing GDP value.

    Parameters
    ----------
    nan_index
        Index label of the row in ``full_df_gdpclean2`` whose GDP is missing.
    window : int, default 6
        Number of years taken on each side of the row's own year.

    Returns
    -------
    country
        Value of the row's 'country' column.
    years : numpy.ndarray
        The years ``year - window`` ... ``year + window`` in ascending order,
        without the row's own year (``2 * window`` entries).
    """
    country = full_df_gdpclean2.loc[nan_index, 'country']  # Country of the input row
    year = full_df_gdpclean2.loc[nan_index, 'year']  # Year of the input row
    # Build the two halves of the window directly instead of creating the full
    # range and deleting the middle element afterwards.
    years = np.concatenate((
        np.arange(year - window, year),
        np.arange(year + 1, year + window + 1),
    ))
    return country, years

In [34]:
# Function for calculating the mean value to be inputed

def calculate_input_gdp_mean(country, years):
    """Average the GDP values stored in ``full_df_gdpclean2`` for ``country`` over ``years``.

    Years with no row for the country are skipped. Only values whose parsed type is
    exactly the built-in ``float`` are averaged.

    Returns the ``np.mean`` of the collected values.
    """
    same_country = full_df_gdpclean2['country'] == country
    collected = []
    for yr in years:
        match = full_df_gdpclean2.loc[same_country & (full_df_gdpclean2['year'] == yr), 'GDP_per_capita']
        if match.empty:
            continue
        parsed = pd.to_numeric(match.item(), errors='coerce')
        # NOTE(review): this exact-type check appears to be what keeps the '..' placeholder
        # out of the mean (pd.to_numeric seems to hand back a NumPy scalar NaN for it);
        # it presumably also skips ints and NumPy floats -- confirm before relaxing it.
        if type(parsed) is float:
            collected.append(parsed)

    return np.mean(collected)

In [35]:
# Iterating over the rows that have a missing target value and imputing the mean GDP
# of the neighbouring years returned by get_parameters_input_gdp_mean
for nan_target_index in nan_target_indexes:
    country, years = get_parameters_input_gdp_mean(nan_target_index)
    # Read the year from the row itself instead of rebuilding it as `years[5] + 1`,
    # which only works while the window is exactly 6 years on each side.
    year = full_df_gdpclean2.at[nan_target_index, 'year']
    input_gdp_mean = calculate_input_gdp_mean(country, years)
    if pd.isna(input_gdp_mean):
        # No usable neighbouring value: round(nan) would raise, so report the row and
        # leave its '..' placeholder in place for the follow-up check to catch.
        print(f'No GDP values available to input for {country} in {year}')
        continue
    print(f'Inputed {round(input_gdp_mean)} GDP mean for {country} in {year}')
    full_df_gdpclean2.at[nan_target_index, 'GDP_per_capita'] = input_gdp_mean

Inputed 368 GDP mean for Afghanistan in 2000
Inputed 378 GDP mean for Afghanistan in 2001
Inputed 12943 GDP mean for American Samoa in 2000
Inputed 12958 GDP mean for American Samoa in 2001
Inputed 45228 GDP mean for Austria in 2022
Inputed 42023 GDP mean for Belgium in 2022
Inputed 7938 GDP mean for Bulgaria in 2022
Inputed 96180 GDP mean for Cayman Islands in 2000
Inputed 96232 GDP mean for Cayman Islands in 2001
Inputed 95211 GDP mean for Cayman Islands in 2002
Inputed 92416 GDP mean for Cayman Islands in 2003
Inputed 89837 GDP mean for Cayman Islands in 2004
Inputed 87930 GDP mean for Cayman Islands in 2005
Inputed 13533 GDP mean for Croatia in 2022
Inputed 26960 GDP mean for Cyprus in 2022
Inputed 19402 GDP mean for Czechia in 2022
Inputed 56386 GDP mean for Denmark in 2022
Inputed 19769 GDP mean for Estonia in 2022
Inputed 45361 GDP mean for Finland in 2022
Inputed 37583 GDP mean for France in 2022
Inputed 42485 GDP mean for Germany in 2022
Inputed 18382 GDP mean for Greece in 20

In [36]:
# Verify that no '..' placeholder is left in the target column
check_nan_target_indexes = full_df_gdpclean2.index[full_df_gdpclean2['GDP_per_capita'] == '..']
print('Inputing succeded!' if check_nan_target_indexes.empty else 'not succeded')

Inputing succeded!


In [37]:
full_df_gdpclean2

Unnamed: 0,country,year,iso_code,biofuel_elec_per_capita,biofuel_electricity,coal_electricity,fossil_electricity,gas_production,greenhouse_gas_emissions,hydro_consumption,...,net_elec_imports,nuclear_electricity,oil_production,per_capita_electricity,primary_energy_consumption,solar_elec_per_capita,solar_electricity,wind_elec_per_capita,wind_electricity,GDP_per_capita
0,Afghanistan,2000,AFG,0.000,0.00,0.00,0.16,2.495,0.12,,...,0.10,0.0,0.0,24.050,5.914,0.000,0.00,0.0,0.0,368.187174
1,Afghanistan,2001,AFG,0.000,0.00,0.00,0.09,0.542,0.07,,...,0.10,0.0,0.0,29.967,4.664,0.000,0.00,0.0,0.0,378.391401
2,Afghanistan,2002,AFG,0.000,0.00,0.00,0.13,0.542,0.10,,...,0.10,0.0,0.0,32.857,4.428,0.000,0.00,0.0,0.0,359.766343
3,Afghanistan,2003,AFG,0.000,0.00,0.00,0.31,0.217,0.24,,...,0.10,0.0,0.0,41.510,5.208,0.000,0.00,0.0,0.0,363.101481
4,Afghanistan,2004,AFG,0.000,0.00,0.00,0.33,0.217,0.24,,...,0.10,0.0,0.0,37.786,4.810,0.000,0.00,0.0,0.0,354.033913
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4285,Zimbabwe,2017,ZWE,21.693,0.32,3.02,3.05,,2.67,,...,2.22,0.0,,498.268,45.936,0.678,0.01,0.0,0.0,1421.24077
4286,Zimbabwe,2018,ZWE,25.910,0.39,3.69,3.73,,3.27,,...,1.02,0.0,,610.542,47.502,1.329,0.02,0.0,0.0,1462.59028
4287,Zimbabwe,2019,ZWE,24.748,0.38,3.62,3.66,,3.19,,...,1.11,0.0,,536.647,49.427,1.954,0.03,0.0,0.0,1342.989587
4288,Zimbabwe,2020,ZWE,22.336,0.35,3.37,3.41,,2.96,,...,1.98,0.0,,485.014,,1.915,0.03,0.0,0.0,1213.117058


At this point, we have the full dataset with all target values filled

## Treating 'NaN's from the features

In [38]:
# Listing the countries whose feature columns hold more than 20 NaN cells in total

columns = ['biofuel_electricity', 'hydro_electricity', 'solar_electricity', 'wind_electricity',
           'coal_electricity', 'fossil_electricity', 'gas_production', 'greenhouse_gas_emissions',
           'low_carbon_electricity', 'net_elec_imports', 'nuclear_electricity', 'oil_production',
           'primary_energy_consumption','hydro_consumption','per_capita_electricity','biofuel_elec_per_capita',
           'hydro_elec_per_capita','solar_elec_per_capita','wind_elec_per_capita']
total_countries = full_df_gdpclean2['country'].unique()

# NaN cells per row, then summed per country; sort=False keeps first-appearance order
nan_cells_per_row = full_df_gdpclean2[columns].isna().sum(axis=1)
nan_cells_per_country = nan_cells_per_row.groupby(full_df_gdpclean2['country'], sort=False).sum()
no_energy_countries = nan_cells_per_country[nan_cells_per_country > 20].index.tolist()
print(no_energy_countries)

['Afghanistan', 'Albania', 'American Samoa', 'Angola', 'Antigua and Barbuda', 'Armenia', 'Aruba', 'Bahamas', 'Bahrain', 'Barbados', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brunei', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Cape Verde', 'Cayman Islands', 'Central African Republic', 'Chad', 'Chile', 'Comoros', 'Congo', 'Costa Rica', "Cote d'Ivoire", 'Cuba', 'Democratic Republic of Congo', 'Dominica', 'Dominican Republic', 'East Timor', 'El Salvador', 'Equatorial Guinea', 'Eswatini', 'Ethiopia', 'Fiji', 'French Polynesia', 'Gabon', 'Gambia', 'Georgia', 'Ghana', 'Greenland', 'Grenada', 'Guam', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras', 'Jamaica', 'Jordan', 'Kenya', 'Kiribati', 'Kyrgyzstan', 'Laos', 'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Macao', 'Madagascar', 'Malawi', 'Maldives', 'Mali', 'Malta', 'Mauritania', 'Mauritius', 'Micronesia (country)', 'Moldova', 'Mongolia', 'Montenegro', 'Mozambique', 'M

In [39]:
# Dropping countries that have more than 20 NaN values in its features (>25%)
full_df_gdp_feat_clean = full_df_gdpclean2[~full_df_gdpclean2['country'].isin(no_energy_countries)].reset_index(drop=True)
full_df_gdp_feat_clean

Unnamed: 0,country,year,iso_code,biofuel_elec_per_capita,biofuel_electricity,coal_electricity,fossil_electricity,gas_production,greenhouse_gas_emissions,hydro_consumption,...,net_elec_imports,nuclear_electricity,oil_production,per_capita_electricity,primary_energy_consumption,solar_elec_per_capita,solar_electricity,wind_elec_per_capita,wind_electricity,GDP_per_capita
0,Algeria,2000,DZA,0.000,0.00,0.00,23.84,918.886,11.88,0.160,...,-0.07,0.0,776.733,776.289,299.946,0.000,0.00,0.000,0.00,3138.231048
1,Algeria,2001,DZA,0.000,0.00,0.00,24.96,864.725,12.41,0.203,...,-0.05,0.0,764.684,802.218,310.813,0.000,0.00,0.000,0.00,3188.207271
2,Algeria,2002,DZA,0.000,0.00,0.00,25.93,889.416,12.89,0.166,...,-0.02,0.0,824.497,821.826,321.951,0.000,0.00,0.000,0.00,3321.638849
3,Algeria,2003,DZA,0.000,0.00,0.00,27.55,938.576,13.69,0.769,...,0.01,0.0,918.828,867.547,336.922,0.000,0.00,0.000,0.00,3512.9002
4,Algeria,2004,DZA,0.000,0.00,0.00,29.14,920.549,14.47,0.723,...,0.01,0.0,971.698,904.024,349.778,0.000,0.00,0.000,0.00,3612.754164
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1703,Vietnam,2017,VNM,0.851,0.08,62.61,103.56,95.207,73.73,233.126,...,0.71,0.0,161.076,2037.582,967.944,0.106,0.01,3.722,0.35,2903.02829
1704,Vietnam,2018,VNM,1.370,0.13,83.85,123.98,96.596,90.54,223.535,...,1.62,0.0,144.328,2203.988,1086.148,1.054,0.10,5.163,0.49,3090.772241
1705,Vietnam,2019,VNM,1.357,0.13,111.18,154.78,98.430,114.72,175.403,...,1.25,0.0,133.073,2374.481,1205.405,54.815,5.25,7.517,0.72,3288.353073
1706,Vietnam,2020,VNM,1.345,0.13,114.76,149.96,88.394,113.94,192.722,...,1.51,0.0,116.578,2435.625,1172.967,112.366,10.86,11.071,1.07,3352.060157


In [40]:
# Replace the NaNs still left in these feature columns with 0
# (author's note: values from 2021 whose historical values are all zero)
zero_fill_columns = [
    'biofuel_electricity', 'hydro_electricity', 'gas_production', 'oil_production',
    'coal_electricity', 'fossil_electricity', 'greenhouse_gas_emissions', 'net_elec_imports',
    'nuclear_electricity', 'primary_energy_consumption', 'hydro_consumption',
    'per_capita_electricity', 'biofuel_elec_per_capita', 'hydro_elec_per_capita',
    'solar_elec_per_capita', 'wind_elec_per_capita',
]
for column_name in zero_fill_columns:
    full_df_gdp_feat_clean[column_name] = full_df_gdp_feat_clean[column_name].fillna(0)


In [41]:
full_df_gdp_feat_clean.isnull().sum()

country                       0
year                          0
iso_code                      0
biofuel_elec_per_capita       0
biofuel_electricity           0
coal_electricity              0
fossil_electricity            0
gas_production                0
greenhouse_gas_emissions      0
hydro_consumption             0
hydro_elec_per_capita         0
hydro_electricity             0
low_carbon_electricity        0
net_elec_imports              0
nuclear_electricity           0
oil_production                0
per_capita_electricity        0
primary_energy_consumption    0
solar_elec_per_capita         0
solar_electricity             0
wind_elec_per_capita          0
wind_electricity              0
GDP_per_capita                0
dtype: int64

In [42]:
(full_df_gdp_feat_clean == 0).sum()/len(full_df_gdp_feat_clean)

country                       0.000000
year                          0.000000
iso_code                      0.000000
biofuel_elec_per_capita       0.257026
biofuel_electricity           0.257026
coal_electricity              0.288056
fossil_electricity            0.011710
gas_production                0.279274
greenhouse_gas_emissions      0.000585
hydro_consumption             0.140515
hydro_elec_per_capita         0.138173
hydro_electricity             0.138173
low_carbon_electricity        0.077283
net_elec_imports              0.217213
nuclear_electricity           0.617681
oil_production                0.294496
per_capita_electricity        0.000000
primary_energy_consumption    0.016393
solar_elec_per_capita         0.432670
solar_electricity             0.432670
wind_elec_per_capita          0.322600
wind_electricity              0.322600
GDP_per_capita                0.000000
dtype: float64

In [43]:
# full_df.to_csv('/home/pedroabisamara/code/final_project/cleaned_dataset.csv')

# Creating two versions of the treated dataset

Creating the version that keeps the zero values (*base*) and the version that has 0.0001 in place of the zero values (*final*)

In [44]:
# Take independent copies: plain assignment would bind both names to the very same
# DataFrame object, so any change made to one version would also show up in the other
# (and in full_df_gdp_feat_clean).
full_cleaned_df_base = full_df_gdp_feat_clean.copy()
full_cleaned_df_final = full_df_gdp_feat_clean.copy()

# Pre-processing the features

## Splitting the *base* dataset into training and testing

In [45]:
from sklearn.model_selection import train_test_split

In [46]:
# Creates X and y for base dataset (JUST USE THIS IF WE ARE SEPPARATING X FROM Y)
# X_base = full_cleaned_df_base[['biofuel_electricity', 'hydro_electricity', 'solar_electricity', 'wind_electricity']]
# y_base = full_cleaned_df_base['GDP_per_capita']

full_cleaned_df_base.head(3)

Unnamed: 0,country,year,iso_code,biofuel_elec_per_capita,biofuel_electricity,coal_electricity,fossil_electricity,gas_production,greenhouse_gas_emissions,hydro_consumption,...,net_elec_imports,nuclear_electricity,oil_production,per_capita_electricity,primary_energy_consumption,solar_elec_per_capita,solar_electricity,wind_elec_per_capita,wind_electricity,GDP_per_capita
0,Algeria,2000,DZA,0.0,0.0,0.0,23.84,918.886,11.88,0.16,...,-0.07,0.0,776.733,776.289,299.946,0.0,0.0,0.0,0.0,3138.231048
1,Algeria,2001,DZA,0.0,0.0,0.0,24.96,864.725,12.41,0.203,...,-0.05,0.0,764.684,802.218,310.813,0.0,0.0,0.0,0.0,3188.207271
2,Algeria,2002,DZA,0.0,0.0,0.0,25.93,889.416,12.89,0.166,...,-0.02,0.0,824.497,821.826,321.951,0.0,0.0,0.0,0.0,3321.638849


In [47]:
# Split base X and y into training and testing datasets (JUST USE THIS IF WE ARE SEPPARATING X FROM Y)
# X_base_train, X_base_test, y_base_train, y_base_test = train_test_split(X_base, y_base, test_size=0.2, random_state=0)

# Split full base into training and testing datasets
full_cleaned_df_base_train, full_cleaned_df_base_test = train_test_split(full_cleaned_df_base, test_size=0.2, random_state=0)

## Scaling *base* dataset

In [48]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.compose import ColumnTransformer

In [49]:
# # Step 0 - Instantiate MinMax Scaler (JUST USE THIS IF WE ARE SEPARATING X FROM Y)
# mm_scaler_base = MinMaxScaler()

# # Step 1- Fit the scaler to the features
# mm_scaler_base.fit(X_base_train)

# # 2-Scale/Transform; <-> apply the transformation and store it in a df
# X_base_train_scaled = pd.DataFrame(mm_scaler_base.transform(X_base_train), columns=X_base_train.columns)

In [50]:
# Build a ColumnTransformer that min-max scales only the feature columns.
# Every column not listed below (the country/year/iso_code reference columns
# and the GDP_per_capita target) is passed through unchanged.

columns_to_scale = [
    'biofuel_electricity',
    'hydro_electricity',
    'solar_electricity',
    'wind_electricity',
    'coal_electricity',
    'fossil_electricity',
    'gas_production',
    'greenhouse_gas_emissions',
    'low_carbon_electricity',
    'net_elec_imports',
    'nuclear_electricity',
    'oil_production',
    'primary_energy_consumption',
    'hydro_consumption',
    'per_capita_electricity',
    'biofuel_elec_per_capita',
    'hydro_elec_per_capita',
    'solar_elec_per_capita',
    'wind_elec_per_capita',
]
ct_base = ColumnTransformer(
    transformers=[("scale", MinMaxScaler(), columns_to_scale)],
    remainder="passthrough",
)

In [51]:
# Fit the scaler on the base training split and transform that same split.
# The passthrough columns contain strings, so the transformer returns a single
# object-dtype array and every DataFrame column would be stored as `object`.
# infer_objects() restores numeric dtypes for the columns that hold only numbers.
# Note: the resulting frame has a fresh 0..n-1 index, not the original row labels.

full_cleaned_df_base_train_scaled = pd.DataFrame(
    ct_base.fit_transform(full_cleaned_df_base_train),
    columns=ct_base.get_feature_names_out(),
).infer_objects()

In [52]:
# Preview the first three rows of the scaled base training set
full_cleaned_df_base_train_scaled.head(n=3)

Unnamed: 0,scale__biofuel_electricity,scale__hydro_electricity,scale__solar_electricity,scale__wind_electricity,scale__coal_electricity,scale__fossil_electricity,scale__gas_production,scale__greenhouse_gas_emissions,scale__low_carbon_electricity,scale__net_elec_imports,...,scale__hydro_consumption,scale__per_capita_electricity,scale__biofuel_elec_per_capita,scale__hydro_elec_per_capita,scale__solar_elec_per_capita,scale__wind_elec_per_capita,remainder__country,remainder__year,remainder__iso_code,remainder__GDP_per_capita
0,0.032366,0.000242,0.009358,0.008496,0.000388,0.004902,0.0,0.003623,0.014178,0.664561,...,0.000247,0.106057,0.193654,0.000683,0.251483,0.153792,Belgium,2015,BEL,41008.296719
1,0.056317,0.003889,6.1e-05,0.01086,0.023296,0.05521,0.077896,0.042938,0.025993,0.59126,...,0.004162,0.109178,0.061387,0.002,0.0003,0.035815,United Kingdom,2008,GBR,44007.549277
2,0.000647,0.002716,0.0,0.0,0.001017,0.001042,5e-06,0.001052,0.003234,0.510613,...,0.002944,0.131589,0.021636,0.042835,0.0,0.0,Slovenia,2006,SVN,20280.151905


## In *final* dataset, replacing Zeros in features, by 0.0001

In [53]:
# Find the smallest non-zero value among the four renewable production columns
renewable_production = full_cleaned_df_final[['biofuel_electricity', 'hydro_electricity', 'solar_electricity', 'wind_electricity']]
# Boolean-mask indexing turns the zero entries into NaN ...
nonzero_production = renewable_production[renewable_production != 0]
# ... which min() skips: first the per-column minimum, then the minimum of those
min_value = nonzero_production.min().min()

In [54]:
# Display the smallest non-zero production value computed in the previous cell
min_value

0.006

In [55]:
# Replace every exact zero in the feature columns of the final dataset by 0.0001.
# The reference columns (country, year, iso_code) and the target are left untouched.
zero_fill_columns = [
    'biofuel_electricity', 'hydro_electricity', 'solar_electricity', 'wind_electricity',
    'coal_electricity', 'fossil_electricity', 'gas_production', 'greenhouse_gas_emissions',
    'low_carbon_electricity', 'net_elec_imports', 'nuclear_electricity', 'oil_production',
    'primary_energy_consumption', 'hydro_consumption', 'per_capita_electricity',
    'biofuel_elec_per_capita', 'hydro_elec_per_capita', 'solar_elec_per_capita',
    'wind_elec_per_capita',
]
features_without_zeros = full_cleaned_df_final[zero_fill_columns].replace(0, 0.0001)
full_cleaned_df_final[zero_fill_columns] = features_without_zeros

In [56]:
# Sanity check: number of exact zeros remaining in each column of the final dataset
full_cleaned_df_final.eq(0).sum()

country                       0
year                          0
iso_code                      0
biofuel_elec_per_capita       0
biofuel_electricity           0
coal_electricity              0
fossil_electricity            0
gas_production                0
greenhouse_gas_emissions      0
hydro_consumption             0
hydro_elec_per_capita         0
hydro_electricity             0
low_carbon_electricity        0
net_elec_imports              0
nuclear_electricity           0
oil_production                0
per_capita_electricity        0
primary_energy_consumption    0
solar_elec_per_capita         0
solar_electricity             0
wind_elec_per_capita          0
wind_electricity              0
GDP_per_capita                0
dtype: int64

## Splitting the *final* dataset into training and testing

In [57]:
# Alternative kept for reference: build X and y for the final dataset
# (ONLY NEEDED IF WE ARE SEPARATING X FROM Y)
# X_final = full_cleaned_df_final[['biofuel_electricity', 'hydro_electricity', 'other_renewable_exc_biofuel_electricity', 'solar_electricity', 'wind_electricity']]
# y_final = full_cleaned_df_final['GDP_per_capita']

# Preview the first three rows of the final dataset
full_cleaned_df_final.head(n=3)

Unnamed: 0,country,year,iso_code,biofuel_elec_per_capita,biofuel_electricity,coal_electricity,fossil_electricity,gas_production,greenhouse_gas_emissions,hydro_consumption,...,net_elec_imports,nuclear_electricity,oil_production,per_capita_electricity,primary_energy_consumption,solar_elec_per_capita,solar_electricity,wind_elec_per_capita,wind_electricity,GDP_per_capita
0,Algeria,2000,DZA,0.0001,0.0001,0.0001,23.84,918.886,11.88,0.16,...,-0.07,0.0001,776.733,776.289,299.946,0.0001,0.0001,0.0001,0.0001,3138.231048
1,Algeria,2001,DZA,0.0001,0.0001,0.0001,24.96,864.725,12.41,0.203,...,-0.05,0.0001,764.684,802.218,310.813,0.0001,0.0001,0.0001,0.0001,3188.207271
2,Algeria,2002,DZA,0.0001,0.0001,0.0001,25.93,889.416,12.89,0.166,...,-0.02,0.0001,824.497,821.826,321.951,0.0001,0.0001,0.0001,0.0001,3321.638849


In [58]:
# Alternative kept for reference: split X and y separately
# (ONLY NEEDED IF WE ARE SEPARATING X FROM Y)
# X_final_train, X_final_test, y_final_train, y_final_test = train_test_split(X_final, y_final, test_size=0.2, random_state=0)

# Split the full final dataset: 20% of the rows go to the test set,
# with a fixed random_state so the split is reproducible
full_cleaned_df_final_train, full_cleaned_df_final_test = train_test_split(
    full_cleaned_df_final,
    test_size=0.2,
    random_state=0,
)

## Scaling *final* dataset

In [59]:
# # Step 0 - Instantiate MinMax Scaler (JUST USE THIS IF WE ARE SEPARATING X FROM Y)
# mm_scaler_final = MinMaxScaler()

# # Step 1- Fit the scaler to the features
# mm_scaler_final.fit(X_final_train)

# # 2-Scale/Transform; <-> apply the transformation and store it in a df
# X_final_train_scaled = pd.DataFrame(mm_scaler_final.transform(X_final_train), columns=X_final_train.columns)

In [60]:
# Build a ColumnTransformer that min-max scales only the feature columns.
# Every column not listed below (the country/year/iso_code reference columns
# and the GDP_per_capita target) is passed through unchanged.

columns_to_scale = [
    'biofuel_electricity',
    'hydro_electricity',
    'solar_electricity',
    'wind_electricity',
    'coal_electricity',
    'fossil_electricity',
    'gas_production',
    'greenhouse_gas_emissions',
    'low_carbon_electricity',
    'net_elec_imports',
    'nuclear_electricity',
    'oil_production',
    'primary_energy_consumption',
    'hydro_consumption',
    'per_capita_electricity',
    'biofuel_elec_per_capita',
    'hydro_elec_per_capita',
    'solar_elec_per_capita',
    'wind_elec_per_capita',
]
ct_final = ColumnTransformer(
    transformers=[("scale", MinMaxScaler(), columns_to_scale)],
    remainder="passthrough",
)

In [61]:
# Fit the scaler on the final training split and transform that same split.
# The passthrough columns contain strings, so the transformer returns a single
# object-dtype array and every DataFrame column would be stored as `object`.
# infer_objects() restores numeric dtypes for the columns that hold only numbers.
# Note: the resulting frame has a fresh 0..n-1 index, not the original row labels.

full_cleaned_df_final_train_scaled = pd.DataFrame(
    ct_final.fit_transform(full_cleaned_df_final_train),
    columns=ct_final.get_feature_names_out(),
).infer_objects()

In [62]:
# Scale the final test split with the already-fitted transformer: transform()
# only, so the scaling parameters are not re-estimated on this split.
# The string passthrough columns make the transformer output an object-dtype
# array; infer_objects() restores numeric dtypes for the all-numeric columns.
full_cleaned_final_test = pd.DataFrame(
    ct_final.transform(full_cleaned_df_final_test),
    columns=ct_final.get_feature_names_out(),
).infer_objects()
full_cleaned_final_test

Unnamed: 0,scale__biofuel_electricity,scale__hydro_electricity,scale__solar_electricity,scale__wind_electricity,scale__coal_electricity,scale__fossil_electricity,scale__gas_production,scale__greenhouse_gas_emissions,scale__low_carbon_electricity,scale__net_elec_imports,...,scale__hydro_consumption,scale__per_capita_electricity,scale__biofuel_elec_per_capita,scale__hydro_elec_per_capita,scale__solar_elec_per_capita,scale__wind_elec_per_capita,remainder__country,remainder__year,remainder__iso_code,remainder__GDP_per_capita
0,0.003471,0.043777,0.0,0.008938,0.012423,0.030969,0.000721,0.023723,0.0228,0.531399,...,0.045708,0.055346,0.003104,0.018466,0.0,0.024177,Turkey,2012,TUR,9506.996382
1,0.0,0.000166,0.0,0.0,0.0,0.005852,0.087025,0.003529,0.000077,0.510907,...,0.000178,0.016748,0.0,0.000158,0.0,0.0,Algeria,2006,DZA,3783.314204
2,0.041958,0.329717,0.000275,0.005659,0.431234,0.420371,0.06317,0.419847,0.175369,0.459787,...,0.357223,0.037758,0.002151,0.007974,0.000064,0.000877,China,2006,CHN,3800.759082
3,0.000294,0.002618,0.0,0.000076,0.0,0.000357,0.0,0.000232,0.001245,0.522438,...,0.002781,0.045466,0.009313,0.039095,0.0,0.007303,Latvia,2009,LVA,11237.108929
4,0.006473,0.005394,0.000458,0.003142,0.000273,0.000816,0.0,0.000691,0.003682,0.539405,...,0.005392,0.064939,0.107301,0.042191,0.034153,0.157576,Croatia,2021,HRV,15166.413288
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
337,0.117342,0.087599,0.006422,0.041839,0.147426,0.158864,0.039893,0.153266,0.069366,0.5444,...,0.091453,0.014382,0.006197,0.002183,0.001523,0.006684,India,2012,IND,1337.475763
338,0.0,0.0,0.0,0.0,0.0,0.002968,0.063114,0.002057,0.0,0.490342,...,0.000003,0.053187,0.0,0.0,0.0,0.0,Turkmenistan,2012,TKM,5066.231736
339,0.004884,0.017659,0.0,0.000946,0.001322,0.002509,0.004585,0.002191,0.00978,0.51032,...,0.019323,0.17678,0.078642,0.134151,0.0,0.046066,New Zealand,2006,NZL,35624.554072
340,0.0,0.0,0.0,0.0,0.0,0.00072,0.0,0.000615,0.0,0.51032,...,0.0,0.070661,0.0,0.0,0.0,0.0,Cyprus,2003,CYP,23930.564453


In [63]:
# Display the scaled final training set
full_cleaned_df_final_train_scaled

Unnamed: 0,scale__biofuel_electricity,scale__hydro_electricity,scale__solar_electricity,scale__wind_electricity,scale__coal_electricity,scale__fossil_electricity,scale__gas_production,scale__greenhouse_gas_emissions,scale__low_carbon_electricity,scale__net_elec_imports,...,scale__hydro_consumption,scale__per_capita_electricity,scale__biofuel_elec_per_capita,scale__hydro_elec_per_capita,scale__solar_elec_per_capita,scale__wind_elec_per_capita,remainder__country,remainder__year,remainder__iso_code,remainder__GDP_per_capita
0,0.032366,0.000242,0.009357,0.008496,0.000388,0.004902,0.0,0.003623,0.014178,0.664561,...,0.000247,0.106057,0.193654,0.000683,0.251483,0.153792,Belgium,2015,BEL,41008.296719
1,0.056317,0.003889,0.000061,0.01086,0.023296,0.05521,0.077896,0.042938,0.025993,0.59126,...,0.004162,0.109178,0.061387,0.002,0.000299,0.035815,United Kingdom,2008,GBR,44007.549277
2,0.000647,0.002716,0.0,0.0,0.001017,0.001042,0.000005,0.001052,0.003234,0.510613,...,0.002944,0.131589,0.021636,0.042835,0.0,0.0,Slovenia,2006,SVN,20280.151905
3,0.0,0.0,0.0,0.0,0.0,0.027544,0.057659,0.019546,0.0,0.51032,...,0.0,0.12001,0.0,0.0,0.0,0.0,Saudi Arabia,2002,SAU,15512.694346
4,0.0,0.0,0.0,0.0,0.0,0.007655,0.0,0.005149,0.0,0.51032,...,0.0,0.14628,0.0,0.0,0.0,0.0,Singapore,2010,SGP,48752.938074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1361,0.000235,0.002043,0.0,0.000076,0.0,0.000373,0.0,0.000238,0.000975,0.528755,...,0.002212,0.038398,0.007188,0.029433,0.0,0.007046,Latvia,2006,LVA,11895.275058
1362,0.000235,0.012083,0.0,0.0,0.004701,0.00675,0.011494,0.005983,0.008294,0.494969,...,0.013006,0.051703,0.000764,0.018496,0.0,0.0,Romania,2007,ROU,7480.519159
1363,0.33349,0.236761,0.005565,0.183313,0.324665,0.495899,0.66084,0.428621,0.45452,0.783915,...,0.25121,0.22897,0.071506,0.023953,0.00536,0.118915,United States,2011,USA,53394.861838
1364,0.0,0.0,0.000061,0.0,0.004225,0.006174,0.0,0.005296,0.000007,0.601249,...,0.0,0.080991,0.0,0.0,0.002466,0.0,Hong Kong,2019,HKG,44192.380232


In [64]:
# Order the scaled final training set chronologically (earliest year first)
full_cleaned_final = full_cleaned_df_final_train_scaled.sort_values(
    by='remainder__year',
    ascending=True,
)

In [65]:
# Display the year-sorted, scaled final training set
full_cleaned_final

Unnamed: 0,scale__biofuel_electricity,scale__hydro_electricity,scale__solar_electricity,scale__wind_electricity,scale__coal_electricity,scale__fossil_electricity,scale__gas_production,scale__greenhouse_gas_emissions,scale__low_carbon_electricity,scale__net_elec_imports,...,scale__hydro_consumption,scale__per_capita_electricity,scale__biofuel_elec_per_capita,scale__hydro_elec_per_capita,scale__solar_elec_per_capita,scale__wind_elec_per_capita,remainder__country,remainder__year,remainder__iso_code,remainder__GDP_per_capita
452,0.001059,0.024022,0.00003,0.0,0.000427,0.001823,0.00614,0.00144,0.011168,0.51076,...,0.026251,0.018246,0.001818,0.019452,0.000236,0.0,Colombia,2000,COL,4003.840324
1277,0.01171,0.000106,0.00003,0.001266,0.004548,0.014668,0.065683,0.010902,0.002413,0.64921,...,0.000121,0.09841,0.049571,0.000212,0.000581,0.016213,Netherlands,2000,NLD,40456.942237
1120,0.0,0.0,0.0,0.0,0.0,0.005402,0.009762,0.003224,0.0,0.51032,...,0.0,0.276118,0.0,0.0,0.0,0.0,Kuwait,2000,KWT,31657.763247
339,0.002765,0.018302,0.0,0.000183,0.001026,0.001897,0.006717,0.001693,0.009629,0.51032,...,0.020815,0.174031,0.048283,0.150747,0.0,0.009667,New Zealand,2000,NZL,30863.457123
1275,0.0,0.004403,0.0,0.0,0.001832,0.006847,0.054513,0.004868,0.002035,0.519721,...,0.005008,0.030635,0.0,0.00561,0.0,0.0,Uzbekistan,2000,UZB,1276.760315
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
497,0.001706,0.002413,0.001345,0.000015,0.000601,0.000667,0.0,0.000682,0.003329,0.519794,...,0.0,0.109652,0.054181,0.036154,0.191878,0.001465,Slovenia,2022,SVN,23231.243427
540,0.003825,0.00034,0.001315,0.002425,0.0,0.000187,0.0,0.000175,0.001091,0.573999,...,0.0,0.026013,0.09361,0.003931,0.144544,0.179564,Lithuania,2022,LTU,16606.769394
951,0.031248,0.001536,0.007676,0.000991,0.006924,0.007756,0.0,0.007654,0.014507,0.406096,...,0.0,0.142279,0.200403,0.004648,0.22111,0.019237,Czechia,2022,CZE,19402.359402
1352,0.040134,0.014224,0.100214,0.094341,0.001427,0.018873,0.0,0.013405,0.062471,0.365185,...,0.0,0.104868,0.056794,0.009497,0.636974,0.403901,Spain,2022,ESP,26748.793053


In [66]:
# Export the year-sorted, scaled final training set (index column included)
full_cleaned_final.to_csv(path_or_buf='full_cleaned_final.csv')

In [67]:
# Order the scaled base training set chronologically (earliest year first)
full_cleaned_base = full_cleaned_df_base_train_scaled.sort_values(
    by='remainder__year',
    ascending=True,
)

In [68]:
# Export the year-sorted, scaled base training set (index column included)
full_cleaned_base.to_csv(path_or_buf='full_cleaned_base.csv')

In [69]:
# Export the UNSCALED final test split under its own file name.
# It was previously written to 'full_cleaned_final_test.csv', the same path the
# following cell uses for the scaled test set (full_cleaned_final_test), so this
# export was silently overwritten.
full_cleaned_df_final_test.to_csv('full_cleaned_final_test_unscaled.csv')

In [70]:
# Export the scaled final test set (index column included)
full_cleaned_final_test.to_csv(path_or_buf='full_cleaned_final_test.csv')