# Energy Dataset

## Importing Dataset

Importing the dataset from the Github repository of the project

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the raw energy CSV in the project's GitHub repository
url = 'https://github.com/up841068/energy-economic-growth/raw/main/raw_data/renewable-energy-data-scrapping.csv'

# Download the file and parse it into a DataFrame
df_energy = pd.read_csv(url)

In [3]:
df_energy.tail(3)

Unnamed: 0,country,year,iso_code,population,gdp,biofuel_cons_change_pct,biofuel_cons_change_twh,biofuel_cons_per_capita,biofuel_consumption,biofuel_elec_per_capita,...,solar_share_elec,solar_share_energy,wind_cons_change_pct,wind_cons_change_twh,wind_consumption,wind_elec_per_capita,wind_electricity,wind_energy_per_capita,wind_share_elec,wind_share_energy
21887,Zimbabwe,2019,ZWE,15354606.0,,,,,,24.748,...,0.364,,,,,0.0,0.0,,0.0,
21888,Zimbabwe,2020,ZWE,15669663.0,,,,,,22.336,...,0.395,,,,,0.0,0.0,,0.0,
21889,Zimbabwe,2021,ZWE,15993525.0,,,,,,23.76,...,0.498,,,,,0.0,0.0,,0.0,


## Filtering the rows and columns

Selecting the columns that contain information about renewable energy production 

In [4]:
# columns_to_keep = ['country', 'year', 'iso_code', 'biofuel_electricity', 'hydro_electricity', 'solar_electricity', 'wind_electricity','coal_electricity',
#                    'fossil_electricity', 'gas_production', 'greenhouse_gas_emissions', 'low_carbon_electricity', 'net_elec_imports', 'nuclear_electricity',
#                    'oil_production','primary_energy_consumption','hydro_consumption','per_capita_electricity','biofuel_elec_per_capita','hydro_elec_per_capita',
#                    'solar_elec_per_capita','wind_elec_per_capita']

# Identifier columns plus the electricity features kept for the analysis
columns_to_keep = [
    'country', 'year', 'iso_code',
    'net_elec_imports', 'per_capita_electricity',
    'biofuel_elec_per_capita', 'hydro_elec_per_capita',
    'solar_elec_per_capita', 'wind_elec_per_capita',
]

# Every column that is not whitelisted above gets dropped
columns_to_drop = [col for col in df_energy.columns if col not in columns_to_keep]
df_energy_filtered1 = df_energy.drop(columns=columns_to_drop)

In [5]:
df_energy_filtered1.head(3)

Unnamed: 0,country,year,iso_code,biofuel_elec_per_capita,hydro_elec_per_capita,net_elec_imports,per_capita_electricity,solar_elec_per_capita,wind_elec_per_capita
0,Afghanistan,1900,AFG,,,,,,
1,Afghanistan,1901,AFG,,,,,,
2,Afghanistan,1902,AFG,,,,,,


## Cleaning rows from years previous to 2000

In [6]:
# Keep only the rows strictly after the threshold year (i.e. 2000 onwards)
years_threshold = 1999
is_after_threshold = df_energy_filtered1['year'] > years_threshold
# reset_index() without drop=True stores the previous row labels in a new 'index' column
df_energy_filtered2 = df_energy_filtered1.loc[is_after_threshold].reset_index()

In [7]:
df_energy_filtered2.shape

(6457, 10)

## Cleaning rows related to regions (not countries)

Removing the data points related to regions, as we intend to do the analysis on countries

In [8]:
# Every country has an 'iso_code'; rows without one are treated as regions and removed
has_iso_code = df_energy_filtered2['iso_code'].notna()
df_energy_filtered_countries_only = df_energy_filtered2[has_iso_code]

In [9]:
df_energy_filtered_countries_only.shape

(4814, 10)

## Creating column with total production of energy

This step is currently not used. It was meant as a fallback in case the separate energy sources contained too many zeros/NaNs.

In [10]:
# df_energy_filtered_countries_only.loc[:, 'total_renewable_electricity'] = df_energy_filtered_countries_only[['biofuel_electricity', 'hydro_electricity', 'other_renewable_exc_biofuel_electricity', 'solar_electricity', 'wind_electricity']].sum(axis=1)

In [11]:
# df_energy_filtered_countries_only.head()

In [12]:
# count_sum_equals_zero = (df_energy_filtered_countries_only.total_renewable_electricity == 0).sum()/len(df_energy_filtered)
# count_sum_equals_zero

## Analyzing missing values

How many NaNs we have for each feature?

In [13]:
# Share of missing values per column: the mean of the boolean NaN mask
df_energy_filtered_countries_only.isna().mean()

index                      0.000000
country                    0.000000
year                       0.000000
iso_code                   0.000000
biofuel_elec_per_capita    0.047570
hydro_elec_per_capita      0.029290
net_elec_imports           0.020980
per_capita_electricity     0.020773
solar_elec_per_capita      0.027005
wind_elec_per_capita       0.029082
dtype: float64

In [14]:
## creates a dictionary with all features and the respective amount of Zeros for each
# countries = df_energy_filtered.country.unique()
# missing_values_dict = {}
# for country in countries:
#     percentage_missing_values = ((df_energy_filtered.country == country) & (df_energy_filtered.total_renewable_electricity == 0)).sum()\
#     /len(df_energy_filtered[df_energy_filtered.country == country])
#     missing_values_dict[country] = percentage_missing_values
# missing_values_dict

# Economic Growth Dataset

## Importing Dataset

Importing the dataset from the Github repository of the project

In [15]:
# Location of the World Development Indicators workbook in the project's GitHub repository
url = 'https://github.com/up841068/energy-economic-growth/raw/main/raw_data/World_Development_Indicators.xlsx'

# Download the workbook and parse it into a DataFrame
df_gdp = pd.read_excel(url)

In [16]:
df_gdp.head(1)

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,1990 [YR1990],2000 [YR2000],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016],...,1969 [YR1969],1968 [YR1968],1967 [YR1967],1966 [YR1966],1965 [YR1965],1964 [YR1964],1963 [YR1963],1962 [YR1962],1961 [YR1961],1960 [YR1960]
0,GDP per capita (constant 2015 US$),NY.GDP.PCAP.KD,Brazil,BRA,6086.08487,6745.865881,9216.14336,9183.470768,8783.225984,8426.853352,...,3304.860245,3095.225632,2892.687723,2850.904774,2746.261344,2758.866607,2746.517052,2811.630814,2717.004894,2578.432125


## Filtering the rows and columns

In [17]:
# Drop the series descriptor columns; the country identifiers and year columns remain
df_gdp_filtered = df_gdp.drop(['Series Name', 'Series Code'], axis='columns')

In [18]:
df_gdp_filtered.shape

(266, 65)

## Flattening the years to be in one column (instead of one column per year)

Turning the year columns into rows, so that each row holds one country-year pair.<br>
We will use this to merge with the Energy dataset

In [19]:
# Wide -> long: one row per (country, year column), with the value stored in 'GDP_per_capita'
df_gdp_filtered_flattened = df_gdp_filtered.melt(
    id_vars=['Country Name', 'Country Code'],
    var_name='Year',
    value_name='GDP_per_capita',
)

In [20]:
df_gdp_filtered_flattened.head(3)

Unnamed: 0,Country Name,Country Code,Year,GDP_per_capita
0,Brazil,BRA,1990 [YR1990],6086.08487
1,Afghanistan,AFG,1990 [YR1990],..
2,Albania,ALB,1990 [YR1990],1606.296047


## Turning 'Year' column into an int

Removing the brackets string in the Year column and turning it into an integer

In [21]:
# Year labels look like '1990 [YR1990]'; keep the first 4-digit run as an integer.
# astype(str) lets this cell be re-run after the column has already been converted,
# and expand=False makes extract() return a Series instead of a one-column DataFrame.
df_gdp_filtered_flattened['Year'] = (
    df_gdp_filtered_flattened['Year']
    .astype(str)
    .str.extract(r'(\d{4})', expand=False)
    .astype(int)
)

In [22]:
df_gdp_filtered_flattened.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16758 entries, 0 to 16757
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Country Name    16758 non-null  object
 1   Country Code    16758 non-null  object
 2   Year            16758 non-null  int64 
 3   GDP_per_capita  16758 non-null  object
dtypes: int64(1), object(3)
memory usage: 523.8+ KB


Renaming the columns so they match the ones in the Energy dataset, easing the merge

In [23]:
# Give the key columns the same names as in the energy table so both can be merged on them
df_gdp_filtered_flattened = df_gdp_filtered_flattened.rename(
    columns={"Year": "year", "Country Code": "iso_code"}
)

# Merging the features and target datasets

In [24]:
# Left join: every energy row is kept; rows without a GDP match get NaN in the GDP-side columns
full_df = df_energy_filtered_countries_only.merge(
    df_gdp_filtered_flattened,
    how='left',
    on=['year', 'iso_code'],
)

In [25]:
# 'index' is left over from the earlier reset_index(); the GDP-side 'Country Name' is
# redundant next to 'country'. Rebinding (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to run more than once.
full_df = full_df.drop(columns=['index', 'Country Name'], errors='ignore')

In [26]:
full_df.sample(8)

Unnamed: 0,country,year,iso_code,biofuel_elec_per_capita,hydro_elec_per_capita,net_elec_imports,per_capita_electricity,solar_elec_per_capita,wind_elec_per_capita,GDP_per_capita
2990,Nauru,2014,NRU,0.0,0.0,0.0,1823.985,0.0,0.0,7626.994179
122,Antarctica,2013,ATA,,,,,,,
842,Chad,2006,TCD,0.965,0.0,0.0,12.541,0.0,0.0,662.885503
1699,Gibraltar,2000,GIB,,0.0,0.0,0.0,0.0,0.0,..
1896,Guinea-Bissau,2020,GNB,0.0,0.0,0.0,39.686,0.0,0.0,604.595146
796,Cayman Islands,2004,CYM,0.0,0.0,0.0,10814.151,0.0,0.0,..
4064,South Sudan,2020,SSD,0.0,0.0,0.0,50.914,0.943,0.0,..
423,Belize,2006,BLZ,34.633,623.387,0.21,796.551,0.0,0.0,6400.181438


# Treating the raw dataset

## Removing the countries with 'NaN's in 'GDP_per_capita' (target)

In [27]:
# The countries we remove here do not affect our model

# A NaN target means the left merge found no GDP row for that iso_code/year pair
target_is_nan = full_df['GDP_per_capita'].isna()
regions_to_remove = full_df.loc[target_is_nan, 'country'].unique()
regions_to_remove

array(['Antarctica', 'Cook Islands', 'Falkland Islands', 'French Guiana',
       'Guadeloupe', 'Martinique', 'Montserrat', 'Netherlands Antilles',
       'Niue', 'Reunion', 'Saint Helena', 'Saint Pierre and Miquelon',
       'Taiwan', 'Western Sahara'], dtype=object)

In [28]:
# Discard the rows whose target is NaN and renumber the remaining rows from 0
target_present = full_df['GDP_per_capita'].notna()
full_df_gdpclean1 = full_df[target_present].reset_index(drop=True)
full_df_gdpclean1

Unnamed: 0,country,year,iso_code,biofuel_elec_per_capita,hydro_elec_per_capita,net_elec_imports,per_capita_electricity,solar_elec_per_capita,wind_elec_per_capita,GDP_per_capita
0,Afghanistan,2000,AFG,0.000,15.862,0.10,24.050,0.000,0.0,..
1,Afghanistan,2001,AFG,0.000,25.395,0.10,29.967,0.000,0.0,..
2,Afghanistan,2002,AFG,0.000,26.666,0.10,32.857,0.000,0.0,359.766343
3,Afghanistan,2003,AFG,0.000,27.821,0.10,41.510,0.000,0.0,363.101481
4,Afghanistan,2004,AFG,0.000,23.776,0.10,37.786,0.000,0.0,354.033913
...,...,...,...,...,...,...,...,...,...,...
4505,Zimbabwe,2017,ZWE,21.693,269.132,2.22,498.268,0.678,0.0,1421.24077
4506,Zimbabwe,2018,ZWE,25.910,335.499,1.02,610.542,1.329,0.0,1462.59028
4507,Zimbabwe,2019,ZWE,24.748,271.580,1.11,536.647,1.954,0.0,1342.989587
4508,Zimbabwe,2020,ZWE,22.336,243.145,1.98,485.014,1.915,0.0,1213.117058


## Removing the countries with missing data in 'GDP_per_capita' (target)

Removing the countries that have more than 40% of their GDP values missing

In [29]:
# Flag the countries whose target holds the '..' placeholder in more than 40% of the
# years covered (years_threshold + 1 .. LAST_YEAR).
LAST_YEAR = 2022               # upper bound of the year range used for the threshold
MAX_MISSING_GDP_SHARE = 0.40   # tolerated share of years without a GDP value

total_countries = full_df_gdpclean1['country'].unique()
# Threshold for missing values of GDP in a country: 23 years * 0.40 = 9.2
max_missing_values = (LAST_YEAR - years_threshold) * MAX_MISSING_GDP_SHARE

# Number of '..' targets per country, kept in order of first appearance
missing_gdp_per_country = (
    (full_df_gdpclean1['GDP_per_capita'] == '..')
    .groupby(full_df_gdpclean1['country'], sort=False)
    .sum()
)
no_gdp_countries = missing_gdp_per_country[missing_gdp_per_country > max_missing_values].index.tolist()

no_gdp_countries

['British Virgin Islands',
 'Djibouti',
 'Eritrea',
 'Faroe Islands',
 'Gibraltar',
 'New Caledonia',
 'North Korea',
 'Somalia',
 'Turks and Caicos Islands',
 'Venezuela']

In [30]:
# Keep only the countries that were not flagged for too many missing GDP values
rows_to_keep = ~full_df_gdpclean1['country'].isin(no_gdp_countries)
full_df_gdpclean2 = full_df_gdpclean1.loc[rows_to_keep].reset_index(drop=True)
full_df_gdpclean2

Unnamed: 0,country,year,iso_code,biofuel_elec_per_capita,hydro_elec_per_capita,net_elec_imports,per_capita_electricity,solar_elec_per_capita,wind_elec_per_capita,GDP_per_capita
0,Afghanistan,2000,AFG,0.000,15.862,0.10,24.050,0.000,0.0,..
1,Afghanistan,2001,AFG,0.000,25.395,0.10,29.967,0.000,0.0,..
2,Afghanistan,2002,AFG,0.000,26.666,0.10,32.857,0.000,0.0,359.766343
3,Afghanistan,2003,AFG,0.000,27.821,0.10,41.510,0.000,0.0,363.101481
4,Afghanistan,2004,AFG,0.000,23.776,0.10,37.786,0.000,0.0,354.033913
...,...,...,...,...,...,...,...,...,...,...
4285,Zimbabwe,2017,ZWE,21.693,269.132,2.22,498.268,0.678,0.0,1421.24077
4286,Zimbabwe,2018,ZWE,25.910,335.499,1.02,610.542,1.329,0.0,1462.59028
4287,Zimbabwe,2019,ZWE,24.748,271.580,1.11,536.647,1.954,0.0,1342.989587
4288,Zimbabwe,2020,ZWE,22.336,243.145,1.98,485.014,1.915,0.0,1213.117058


## Imputing missing values in the target column

Imputing the missing GDP values of the remaining countries (those with at most 40% of their GDP values missing)

In [31]:
# Row labels whose target still holds the '..' placeholder
target_is_placeholder = full_df_gdpclean2['GDP_per_capita'] == '..'
nan_target_indexes = full_df_gdpclean2[target_is_placeholder].index
nan_target_indexes

Index([   0,    1,   65,   66,  241,  396,  617,  750,  751,  752,  753,  754,
        755,  992, 1037, 1060, 1105, 1279, 1368, 1391, 1502, 1547, 1569, 1592,
       1593, 1790, 1923, 1968, 2122, 2189, 2300, 2323, 2478, 2714, 2715, 2716,
       2717, 2780, 2891, 2892, 3131, 3154, 3221, 3354, 3530, 3553, 3624, 3625,
       3626, 3627, 3628, 3629, 3652, 3741, 3785, 3982, 3983, 4114, 4115, 4135,
       4243, 4244, 4245],
      dtype='int64')

In [32]:
# name_values = full_df_gdpclean2.iloc[nan_target_indexes]['iso_code'].values
# np.unique(name_values)

In [33]:
# Getting the parameters for calculating the average GDP to be inputed (MANUAL CALCULATION FOR THE 'get_parameters_input_gdp_mean' FUNCTION

# index = item.index[0]
# country = item.loc[index, 'country'] # Country of the input row
# year = item.loc[index, 'year'] # Year of the input row
# years = np.arange(year-5, year+6, 1) # Range of years used to calculate the mean
# years = np.delete(years, np.where(years == year))
# index, country, year, years

In [34]:
# Function for getting the parameters for calculating the average GDP to be imputed

def get_parameters_input_gdp_mean(nan_index, window=6, df=None):
    """Return the country and the neighbouring years used to impute one row.

    Parameters
    ----------
    nan_index : label
        Row label (in ``df``) of the observation whose GDP is missing.
    window : int, default 6
        Number of years taken on each side of the row's own year.
    df : pandas.DataFrame, optional
        Table with 'country' and 'year' columns. When omitted, the notebook's
        ``full_df_gdpclean2`` is used.

    Returns
    -------
    country : object
        Value of the 'country' column for that row.
    years : numpy.ndarray
        Ascending years from ``year - window`` to ``year + window``, with the
        row's own year left out.
    """
    if df is None:
        df = full_df_gdpclean2
    country = df.loc[nan_index, 'country']  # Country of the input row
    year = df.loc[nan_index, 'year']  # Year of the input row
    years = np.arange(year - window, year + window + 1, 1)  # Range of years used to calculate the mean
    years = years[years != year]  # the row's own year never contributes to its own mean
    return country, years

In [35]:
# Function for calculating the mean value to be imputed

def calculate_input_gdp_mean(country, years):
    """Average the GDP values of ``country`` over the given ``years``.

    Reads from the notebook's ``full_df_gdpclean2``. Years without a row for the
    country are skipped. Only values whose exact type is the built-in ``float``
    after ``pd.to_numeric(..., errors='coerce')`` are averaged.
    NOTE(review): that exact-type test is presumably what skips the '..'
    placeholder and values written back as NumPy floats by earlier imputations;
    confirm before relaxing it.

    The mean is printed and returned; with no usable value ``np.mean`` of an
    empty list is returned.
    """
    country_rows = full_df_gdpclean2['country'] == country
    usable_values = []
    for single_year in years:
        matches = full_df_gdpclean2.loc[
            country_rows & (full_df_gdpclean2['year'] == single_year),
            'GDP_per_capita',
        ]
        if matches.empty:
            continue
        candidate = pd.to_numeric(matches.item(), errors='coerce')
        if type(candidate) is float:
            usable_values.append(candidate)

    input_gdp_mean = np.mean(usable_values)
    print(input_gdp_mean)
    return input_gdp_mean

In [36]:
len(full_df_gdpclean2.country.unique())

195

In [37]:
# Iterating over the rows that have missing values in the target and imputing the mean calculated from +/- 6 years
# USE THIS IN CASE WE ARE GETTING VALUES FROM PREVIOUS TO 2000
# import math
# for nan_target_index in nan_target_indexes:
#     try:
#         country, years = get_parameters_input_gdp_mean(nan_target_index)
#         input_gdp_mean = calculate_input_gdp_mean(country, years)
#         if math.isnan(input_gdp_mean):
#             full_df_gdpclean2 = full_df_gdpclean2[~(full_df_gdpclean2['country'] == country)]
#             print(f'Removed {country} as it did not have anough GDP data to fill missing values with calculated mean')
#         else:
#             full_df_gdpclean2.at[nan_target_index, 'GDP_per_capita'] = input_gdp_mean
#             print(f'Inputed {round(input_gdp_mean)} GDP mean for {country} in {years[5] + 1}')
#     except:
#         pass

for nan_target_index in nan_target_indexes:
    country, years = get_parameters_input_gdp_mean(nan_target_index)
    input_gdp_mean = calculate_input_gdp_mean(country, years)
    # Read the year from the row itself instead of rebuilding it from the layout of `years`
    year = full_df_gdpclean2.at[nan_target_index, 'year']
    if np.isnan(input_gdp_mean):
        # No usable neighbouring value: round(nan) would raise, so leave the placeholder untouched
        print(f'Could not compute a GDP mean for {country} in {year}; value left unchanged')
        continue
    print(f'Inputed {round(input_gdp_mean)} GDP mean for {country} in {year}')
    full_df_gdpclean2.at[nan_target_index, 'GDP_per_capita'] = input_gdp_mean

368.18717431310307
Inputed 368 GDP mean for Afghanistan in 2000
378.3914014845572
Inputed 378 GDP mean for Afghanistan in 2001
12943.46728182228
Inputed 12943 GDP mean for American Samoa in 2000
12957.825448914155
Inputed 12958 GDP mean for American Samoa in 2001
45227.96197798152
Inputed 45228 GDP mean for Austria in 2022
42023.06833039157
Inputed 42023 GDP mean for Belgium in 2022
7937.525962808315
Inputed 7938 GDP mean for Bulgaria in 2022
96179.76349915199
Inputed 96180 GDP mean for Cayman Islands in 2000
96231.73853750998
Inputed 96232 GDP mean for Cayman Islands in 2001
95211.0490583366
Inputed 95211 GDP mean for Cayman Islands in 2002
92415.78615612001
Inputed 92416 GDP mean for Cayman Islands in 2003
89837.17042149467
Inputed 89837 GDP mean for Cayman Islands in 2004
87930.40186099533
Inputed 87930 GDP mean for Cayman Islands in 2005
13533.319255092947
Inputed 13533 GDP mean for Croatia in 2022
26960.341145833332
Inputed 26960 GDP mean for Cyprus in 2022
19402.359402287457
Inpu

In [38]:
# Check that no target is left without a number: neither the '..' placeholder nor a NaN
target_not_numeric = pd.to_numeric(full_df_gdpclean2['GDP_per_capita'], errors='coerce').isna()
check_nan_target_indexes = full_df_gdpclean2[target_not_numeric].index
if check_nan_target_indexes.empty:
    print('Inputing succeeded!')
else:
    print('Not succeeded :(')

Inputing succeeded!


In [39]:
len(full_df_gdpclean2)

4290

At this point, we have the full dataset with all target values filled

## Treating 'NaN's from the features

In [40]:
# Checking countries that have more than 25% NaN values in their features

# feat_columns = ['biofuel_electricity', 'hydro_electricity', 'solar_electricity', 'wind_electricity',
#            'coal_electricity', 'fossil_electricity', 'gas_production', 'greenhouse_gas_emissions',
#            'low_carbon_electricity', 'net_elec_imports', 'nuclear_electricity', 'oil_production',
#            'primary_energy_consumption','hydro_consumption','per_capita_electricity','biofuel_elec_per_capita',
#            'hydro_elec_per_capita','solar_elec_per_capita','wind_elec_per_capita']

feat_columns = ['net_elec_imports','per_capita_electricity','biofuel_elec_per_capita', 'hydro_elec_per_capita','solar_elec_per_capita','wind_elec_per_capita']

# A country may miss at most 25% of its (years x features) data points:
# 23 years * 6 features * 0.25 = 34.5, so 35 or more NaNs flag the country
MAX_NAN_FEATURE_SHARE = 0.25
max_nan_values = (2022 - years_threshold) * len(feat_columns) * MAX_NAN_FEATURE_SHARE

total_countries = full_df_gdpclean2['country'].unique()

# NaNs summed over all feature columns, per country (order of first appearance)
nan_per_country = (
    full_df_gdpclean2[feat_columns]
    .isna()
    .sum(axis=1)
    .groupby(full_df_gdpclean2['country'], sort=False)
    .sum()
)
no_energy_countries = nan_per_country[nan_per_country > max_nan_values].index.tolist()
print(no_energy_countries)

['Micronesia (country)', 'Northern Mariana Islands', 'Tuvalu']


In [41]:
# Remove the countries flagged above, i.e. those with more than 34.5 missing feature values (>25%)
has_enough_features = ~full_df_gdpclean2['country'].isin(no_energy_countries)
full_df_gdp_feat_clean = full_df_gdpclean2.loc[has_enough_features].reset_index(drop=True)
full_df_gdp_feat_clean

Unnamed: 0,country,year,iso_code,biofuel_elec_per_capita,hydro_elec_per_capita,net_elec_imports,per_capita_electricity,solar_elec_per_capita,wind_elec_per_capita,GDP_per_capita
0,Afghanistan,2000,AFG,0.000,15.862,0.10,24.050,0.000,0.0,368.187174
1,Afghanistan,2001,AFG,0.000,25.395,0.10,29.967,0.000,0.0,378.391401
2,Afghanistan,2002,AFG,0.000,26.666,0.10,32.857,0.000,0.0,359.766343
3,Afghanistan,2003,AFG,0.000,27.821,0.10,41.510,0.000,0.0,363.101481
4,Afghanistan,2004,AFG,0.000,23.776,0.10,37.786,0.000,0.0,354.033913
...,...,...,...,...,...,...,...,...,...,...
4225,Zimbabwe,2017,ZWE,21.693,269.132,2.22,498.268,0.678,0.0,1421.24077
4226,Zimbabwe,2018,ZWE,25.910,335.499,1.02,610.542,1.329,0.0,1462.59028
4227,Zimbabwe,2019,ZWE,24.748,271.580,1.11,536.647,1.954,0.0,1342.989587
4228,Zimbabwe,2020,ZWE,22.336,243.145,1.98,485.014,1.915,0.0,1213.117058


In [42]:
full_df_gdp_feat_clean.isnull().sum()

country                     0
year                        0
iso_code                    0
biofuel_elec_per_capita    45
hydro_elec_per_capita       1
net_elec_imports            1
per_capita_electricity      0
solar_elec_per_capita       0
wind_elec_per_capita        0
GDP_per_capita              0
dtype: int64

In [43]:
# Filling remaining NaN values that were supposed to be zero (values from 2021 with all
# historical values equal to zero). Only the two columns listed below are filled.
# NOTE(review): 'biofuel_elec_per_capita' is not filled here, so its NaNs remain - confirm this is intended.
columns_filled_with_zero = ['net_elec_imports', 'hydro_elec_per_capita']
for column_name in columns_filled_with_zero:
    full_df_gdp_feat_clean[column_name] = full_df_gdp_feat_clean[column_name].fillna(0)


In [44]:
full_df_gdp_feat_clean.isnull().sum()

country                     0
year                        0
iso_code                    0
biofuel_elec_per_capita    45
hydro_elec_per_capita       0
net_elec_imports            0
per_capita_electricity      0
solar_elec_per_capita       0
wind_elec_per_capita        0
GDP_per_capita              0
dtype: int64

In [45]:
(full_df_gdp_feat_clean == 0).sum()/len(full_df_gdp_feat_clean)

country                    0.000000
year                       0.000000
iso_code                   0.000000
biofuel_elec_per_capita    0.483688
hydro_elec_per_capita      0.251064
net_elec_imports           0.391017
per_capita_electricity     0.005437
solar_elec_per_capita      0.623404
wind_elec_per_capita       0.645390
GDP_per_capita             0.000000
dtype: float64

# Creating two versions of the treated dataset

Creates the version that keeps the zero values (*base*), and the one that has 0.0001 in place of the zero values (*final*)

In [46]:
# Independent copies: with plain assignment all three names would refer to the same
# DataFrame, so the zero replacement applied to *final* would also change *base*
# and full_df_gdp_feat_clean.
full_cleaned_df_base = full_df_gdp_feat_clean.copy()
full_cleaned_df_final = full_df_gdp_feat_clean.copy()

# Pre-processing the features

## Splitting the *base* dataset into training and testing

In [47]:
from sklearn.model_selection import train_test_split

In [48]:
# Creates X and y for base dataset (JUST USE THIS IF WE ARE SEPPARATING X FROM Y)
# X_base = full_cleaned_df_base[['biofuel_electricity', 'hydro_electricity', 'solar_electricity', 'wind_electricity']]
# y_base = full_cleaned_df_base['GDP_per_capita']

full_cleaned_df_base.head(3)

Unnamed: 0,country,year,iso_code,biofuel_elec_per_capita,hydro_elec_per_capita,net_elec_imports,per_capita_electricity,solar_elec_per_capita,wind_elec_per_capita,GDP_per_capita
0,Afghanistan,2000,AFG,0.0,15.862,0.1,24.05,0.0,0.0,368.187174
1,Afghanistan,2001,AFG,0.0,25.395,0.1,29.967,0.0,0.0,378.391401
2,Afghanistan,2002,AFG,0.0,26.666,0.1,32.857,0.0,0.0,359.766343


In [49]:
# Split base X and y into training and testing datasets (JUST USE THIS IF WE ARE SEPPARATING X FROM Y)
# X_base_train, X_base_test, y_base_train, y_base_test = train_test_split(X_base, y_base, test_size=0.2, random_state=0)

# Random 80/20 row split of the whole base table (features, target and reference columns together)
full_cleaned_df_base_train, full_cleaned_df_base_test = train_test_split(
    full_cleaned_df_base,
    test_size=0.2,
    random_state=0,  # fixed seed so the split is reproducible
)

## Scaling *base* dataset

In [50]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.compose import ColumnTransformer

In [51]:
# # Step 0 - Instanciate Standard Scaler (JUST USE THIS IF WE ARE SEPPARATING X FROM Y)
# mm_scaler_base = MinMaxScaler()

# # Step 1- Fit the scaler to the features
# mm_scaler_base.fit(X_base_train)

# # 2-Scale/Transform; <-> apply the transformation and store it in a df
# X_base_train_scaled = pd.DataFrame(mm_scaler_base.transform(X_base_train), columns=X_base_train.columns)

In [52]:
# Column transformer that min-max scales the feature columns only; every other column
# (target and the reference columns country/year/iso_code) passes through unchanged
columns_to_scale = [
    'net_elec_imports', 'per_capita_electricity', 'biofuel_elec_per_capita',
    'hydro_elec_per_capita', 'solar_elec_per_capita', 'wind_elec_per_capita',
]

ct_base = ColumnTransformer(
    transformers=[("scale", MinMaxScaler(), columns_to_scale)],
    remainder="passthrough",
)

In [53]:
# Fit the scaler on the base training rows and transform them in one step,
# then rebuild a DataFrame labelled with the transformer's output column names
base_train_values = ct_base.fit_transform(full_cleaned_df_base_train)
full_cleaned_df_base_train_scaled = pd.DataFrame(
    base_train_values,
    columns=ct_base.get_feature_names_out(),
)

In [54]:
# Apply the scaling already fitted on the training rows to the base test rows (no refit)
base_test_values = ct_base.transform(full_cleaned_df_base_test)
full_cleaned_df_base_test_scaled = pd.DataFrame(
    base_test_values,
    columns=ct_base.get_feature_names_out(),
)
full_cleaned_df_base_test_scaled

Unnamed: 0,scale__net_elec_imports,scale__per_capita_electricity,scale__biofuel_elec_per_capita,scale__hydro_elec_per_capita,scale__solar_elec_per_capita,scale__wind_elec_per_capita,remainder__country,remainder__year,remainder__iso_code,remainder__GDP_per_capita
0,0.536047,0.002145,0.001219,0.001495,0.0,0.0,Myanmar,2005,MMR,556.14213
1,0.546138,0.117254,0.237075,0.000131,0.0,0.391127,Denmark,2008,DNK,53345.357618
2,0.579054,0.026593,0.118448,0.003626,0.170534,0.119794,Luxembourg,2017,LUX,107142.127556
3,0.848921,0.085069,0.052733,0.019821,0.010554,0.034101,Italy,2009,ITA,31615.270815
4,0.534099,0.001772,0.000774,0.00226,0.002583,0.0,Uganda,2019,UGA,922.023177
...,...,...,...,...,...,...,...,...,...,...
841,0.536047,0.001728,0.0,0.001819,0.00159,0.0,Guinea,2015,GIN,756.425594
842,0.533264,0.031745,0.0,0.00342,0.002431,0.003812,Egypt,2012,EGY,3288.794355
843,0.536047,0.012035,0.00367,0.004446,0.004728,0.005343,Sri Lanka,2017,LKA,4440.511975
844,0.674809,0.223554,0.972477,0.057935,0.015085,0.328848,Finland,2018,FIN,45627.919895


In [55]:
full_cleaned_df_base_train_scaled.head(3)

Unnamed: 0,scale__net_elec_imports,scale__per_capita_electricity,scale__biofuel_elec_per_capita,scale__hydro_elec_per_capita,scale__solar_elec_per_capita,scale__wind_elec_per_capita,remainder__country,remainder__year,remainder__iso_code,remainder__GDP_per_capita
0,0.536047,0.066621,0.0,0.0,0.0,0.0,Saint Kitts and Nevis,2011,KNA,17653.987617
1,0.535908,0.020171,0.0,0.000377,0.0,0.000914,Tunisia,2003,TUN,3021.626688
2,0.540571,0.001457,0.0,0.000157,0.001398,0.0,Burkina Faso,2017,BFA,671.251794


## In *final* dataset, replacing Zeros in features, by 0.0001

In [56]:
# Smallest non-zero value currently present in the per-capita electricity columns
per_capita_columns = [
    'per_capita_electricity', 'biofuel_elec_per_capita', 'hydro_elec_per_capita',
    'solar_elec_per_capita', 'wind_elec_per_capita',
]
per_capita_values = full_cleaned_df_final[per_capita_columns]
# Zeros are masked to NaN so that min() skips them
min_value = per_capita_values.where(per_capita_values != 0).min().min()

In [57]:
min_value

0.009

In [58]:
# Replacing Zeros by 0.0001
# The replacement is done on a fresh copy so it cannot reach any other name that
# refers to the same DataFrame object.
zero_replaced_columns = [
    'net_elec_imports', 'per_capita_electricity', 'biofuel_elec_per_capita',
    'hydro_elec_per_capita', 'solar_elec_per_capita', 'wind_elec_per_capita',
]
full_cleaned_df_final = full_cleaned_df_final.copy()
full_cleaned_df_final[zero_replaced_columns] = full_cleaned_df_final[zero_replaced_columns].replace(0, 0.0001)

In [59]:
(full_cleaned_df_final == 0).sum()

country                    0
year                       0
iso_code                   0
biofuel_elec_per_capita    0
hydro_elec_per_capita      0
net_elec_imports           0
per_capita_electricity     0
solar_elec_per_capita      0
wind_elec_per_capita       0
GDP_per_capita             0
dtype: int64

## Splitting the *final* dataset into training and testing

In [60]:
# Creates X and y for final dataset (JUST USE THIS IF WE ARE SEPPARATING X FROM Y)
# X_final = full_cleaned_df_final[['biofuel_electricity', 'hydro_electricity', 'other_renewable_exc_biofuel_electricity', 'solar_electricity', 'wind_electricity']]
# y_final = full_cleaned_df_final['GDP_per_capita']

full_cleaned_df_final.head(3)

Unnamed: 0,country,year,iso_code,biofuel_elec_per_capita,hydro_elec_per_capita,net_elec_imports,per_capita_electricity,solar_elec_per_capita,wind_elec_per_capita,GDP_per_capita
0,Afghanistan,2000,AFG,0.0001,15.862,0.1,24.05,0.0001,0.0001,368.187174
1,Afghanistan,2001,AFG,0.0001,25.395,0.1,29.967,0.0001,0.0001,378.391401
2,Afghanistan,2002,AFG,0.0001,26.666,0.1,32.857,0.0001,0.0001,359.766343


In [61]:
# Split final X and y into training and testing datasets (JUST USE THIS IF WE ARE SEPPARATING X FROM Y)
# X_final_train, X_final_test, y_final_train, y_final_test = train_test_split(X_final, y_final, test_size=0.2, random_state=0)

# Random 80/20 row split of the whole final table, with the same seed as the base split
full_cleaned_df_final_train, full_cleaned_df_final_test = train_test_split(
    full_cleaned_df_final,
    test_size=0.2,
    random_state=0,
)

## Scaling *final* dataset

In [62]:
# # Step 0 - Instanciate Standard Scaler (JUST USE THIS IF WE ARE SEPPARATING X FROM Y)
# mm_scaler_final = MinMaxScaler()

# # Step 1- Fit the scaler to the features
# mm_scaler_final.fit(X_final_train)

# # 2-Scale/Transform; <-> apply the transformation and store it in a df
# X_final_train_scaled = pd.DataFrame(mm_scaler_final.transform(X_final_train), columns=X_final_train.columns)

In [63]:
# Column transformer for the final dataset: min-max scale the feature columns only and
# pass the target and the reference columns (country/year/iso_code) through unchanged
columns_to_scale = [
    'net_elec_imports', 'per_capita_electricity', 'biofuel_elec_per_capita',
    'hydro_elec_per_capita', 'solar_elec_per_capita', 'wind_elec_per_capita',
]
ct_final = ColumnTransformer(
    transformers=[("scale", MinMaxScaler(), columns_to_scale)],
    remainder="passthrough",
)

In [64]:
# Fit the scaler on the final training rows and transform them in one step,
# then rebuild a DataFrame labelled with the transformer's output column names
final_train_values = ct_final.fit_transform(full_cleaned_df_final_train)
full_cleaned_df_final_train_scaled = pd.DataFrame(
    final_train_values,
    columns=ct_final.get_feature_names_out(),
)

full_cleaned_df_final_train_scaled.head(3)

Unnamed: 0,scale__net_elec_imports,scale__per_capita_electricity,scale__biofuel_elec_per_capita,scale__hydro_elec_per_capita,scale__solar_elec_per_capita,scale__wind_elec_per_capita,remainder__country,remainder__year,remainder__iso_code,remainder__GDP_per_capita
0,0.536048,0.066621,0.0,0.0,0.0,0.0,Saint Kitts and Nevis,2011,KNA,17653.987617
1,0.535908,0.020171,0.0,0.000377,0.0,0.000914,Tunisia,2003,TUN,3021.626688
2,0.540571,0.001457,0.0,0.000157,0.001398,0.0,Burkina Faso,2017,BFA,671.251794


In [65]:
# Apply the scaling already fitted on the training rows to the final test rows (no refit)
final_test_values = ct_final.transform(full_cleaned_df_final_test)
full_cleaned_df_final_test_scaled = pd.DataFrame(
    final_test_values,
    columns=ct_final.get_feature_names_out(),
)
full_cleaned_df_final_test_scaled.head(3)

Unnamed: 0,scale__net_elec_imports,scale__per_capita_electricity,scale__biofuel_elec_per_capita,scale__hydro_elec_per_capita,scale__solar_elec_per_capita,scale__wind_elec_per_capita,remainder__country,remainder__year,remainder__iso_code,remainder__GDP_per_capita
0,0.536048,0.002145,0.001219,0.001495,0.0,0.0,Myanmar,2005,MMR,556.14213
1,0.546138,0.117254,0.237075,0.000131,0.0,0.391127,Denmark,2008,DNK,53345.357618
2,0.579054,0.026593,0.118448,0.003626,0.170534,0.119794,Luxembourg,2017,LUX,107142.127556


In [66]:
# Order the scaled final training rows chronologically (ascending is the default)
full_cleaned_final = full_cleaned_df_final_train_scaled.sort_values(by='remainder__year')

In [67]:
full_cleaned_final

Unnamed: 0,scale__net_elec_imports,scale__per_capita_electricity,scale__biofuel_elec_per_capita,scale__hydro_elec_per_capita,scale__solar_elec_per_capita,scale__wind_elec_per_capita,remainder__country,remainder__year,remainder__iso_code,remainder__GDP_per_capita
2208,0.55936,0.034168,0.001102,0.011572,0.0,0.000145,Turkey,2000,TUR,6454.593119
444,0.536048,0.001117,0.0,0.000805,0.0,0.0,Haiti,2000,HTI,1332.247467
2224,0.536048,0.000683,0.0,0.000099,0.0,0.0,Cambodia,2000,KHM,488.002702
190,0.536048,0.012917,0.000935,0.014431,0.0,0.0,Peru,2000,PER,3279.395593
193,0.536395,0.001247,0.002049,0.001478,0.0,0.0,Tanzania,2000,TZA,551.656616
...,...,...,...,...,...,...,...,...,...,...
688,0.560195,0.088971,0.020003,0.010943,0.589289,0.32478,Greece,2022,GRC,18382.157564
3340,0.600418,0.080559,0.160995,0.017754,0.276316,0.401262,Portugal,2022,PRT,20575.123491
943,0.598051,0.1275,0.168238,0.096938,0.282305,0.240757,Austria,2022,AUT,45227.961978
2066,0.834586,0.083224,0.129346,0.012184,0.434208,0.104739,Italy,2022,ITA,31082.902914


In [68]:
# Write the scaled *final* training rows (sorted by year) to a CSV in the working directory
full_cleaned_final.to_csv('full_cleaned_final.csv')

In [69]:
# Order the scaled base training rows chronologically (ascending is the default)
full_cleaned_base = full_cleaned_df_base_train_scaled.sort_values(by='remainder__year')

In [70]:
# Write the scaled *base* training rows (sorted by year) to a CSV in the working directory
full_cleaned_base.to_csv('full_cleaned_base.csv')

In [71]:
# Write the *final* test split to a CSV in the working directory.
# NOTE(review): this is the unscaled split; the exported training CSVs are scaled and the
# scaled counterpart is full_cleaned_df_final_test_scaled - confirm which one is intended.
full_cleaned_df_final_test.to_csv('full_cleaned_final_test.csv')

In [1]:
# Same export as the previous cell. It needs the earlier cells to have run in the same
# kernel session: the recorded NameError comes from executing it on its own.
# NOTE(review): looks like a leftover duplicate - confirm whether it can be removed.
full_cleaned_df_final_test.to_csv('full_cleaned_final_test.csv') #new test

NameError: name 'full_cleaned_df_final_test' is not defined