# Energy Dataset

## Importing Dataset

In [25]:
import pandas as pd

url = 'https://github.com/up841068/energy-economic-growth/raw/main/raw_data/renewable-energy-data-scrapping.csv'

df_energy = pd.read_csv(url)

In [26]:
df_energy.year.unique()

array([1900, 1901, 1902, 1903, 1904, 1905, 1906, 1907, 1908, 1909, 1910,
       1911, 1912, 1913, 1914, 1915, 1916, 1917, 1918, 1919, 1920, 1921,
       1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932,
       1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943,
       1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954,
       1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965,
       1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976,
       1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987,
       1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998,
       1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,
       2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020,
       2021, 2022])

## Filtering the rows and columns

Selecting the columns that contain information about renewable energy sources 

In [29]:
columns_to_keep = ['country', 'year', 'iso_code', 'biofuel_electricity', 'hydro_electricity', 'other_renewable_exc_biofuel_electricity', 'solar_electricity', 'wind_electricity']
columns_to_drop = list(set(df_energy.columns) - set(columns_to_keep))
df_energy_filtered = df_energy.drop(columns=columns_to_drop)

In [30]:
df_energy_filtered.head(3)

Unnamed: 0,country,year,iso_code,biofuel_electricity,hydro_electricity,other_renewable_exc_biofuel_electricity,solar_electricity,wind_electricity
0,Afghanistan,1900,AFG,,,,,
1,Afghanistan,1901,AFG,,,,,
2,Afghanistan,1902,AFG,,,,,


In [31]:
df_energy_filtered.head(3)

Unnamed: 0,country,year,iso_code,biofuel_electricity,hydro_electricity,other_renewable_exc_biofuel_electricity,solar_electricity,wind_electricity
0,Afghanistan,1900,AFG,,,,,
1,Afghanistan,1901,AFG,,,,,
2,Afghanistan,1902,AFG,,,,,


## Cleaning rows from years previous to 2000

In [32]:
df_energy_filtered = df_energy_filtered[df_energy_filtered.year > 1999].reset_index()

In [33]:
df_energy_filtered.shape

(6457, 9)

## Cleaning rows related to regions (not countries)

In [34]:
df_energy_filtered_countries_only = df_energy_filtered.dropna(subset=['iso_code'])

In [35]:
df_energy_filtered_countries_only.shape

(4814, 9)

## Creating column with total production of energy

In [None]:
# df_energy_filtered_countries_only.loc[:, 'total_renewable_electricity'] = df_energy_filtered_countries_only[['biofuel_electricity', 'hydro_electricity', 'other_renewable_exc_biofuel_electricity', 'solar_electricity', 'wind_electricity']].sum(axis=1)

In [38]:
# df_energy_filtered_countries_only.head()

In [39]:
# count_sum_equals_zero = (df_energy_filtered_countries_only.total_renewable_electricity == 0).sum()/len(df_energy_filtered)
# count_sum_equals_zero

## Analyzing missing values

In [37]:
df_energy_filtered_countries_only.isnull().sum()/len(df_energy_filtered_countries_only)

index                                      0.000000
country                                    0.000000
year                                       0.000000
iso_code                                   0.000000
biofuel_electricity                        0.047570
hydro_electricity                          0.029290
other_renewable_exc_biofuel_electricity    0.052971
solar_electricity                          0.027005
wind_electricity                           0.029082
dtype: float64

In [40]:
# countries = df_energy_filtered.country.unique()

In [None]:
# missing_values_dict = {}
# for country in countries:
#     percentage_missing_values = ((df_energy_filtered.country == country) & (df_energy_filtered.total_renewable_electricity == 0)).sum()\
#     /len(df_energy_filtered[df_energy_filtered.country == country])
#     missing_values_dict[country] = percentage_missing_values

In [None]:
# missing_values_dict

# Economic Growth Dataset

## Importing Dataset

In [41]:
url = 'https://github.com/up841068/energy-economic-growth/raw/main/raw_data/World_Development_Indicators.xlsx'

df_gdp = pd.read_excel(url)

In [42]:
df_gdp.head(3)

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,1990 [YR1990],2000 [YR2000],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016],...,1969 [YR1969],1968 [YR1968],1967 [YR1967],1966 [YR1966],1965 [YR1965],1964 [YR1964],1963 [YR1963],1962 [YR1962],1961 [YR1961],1960 [YR1960]
0,GDP per capita (constant 2015 US$),NY.GDP.PCAP.KD,Brazil,BRA,6086.08487,6745.865881,9216.14336,9183.470768,8783.225984,8426.853352,...,3304.860245,3095.225632,2892.687723,2850.904774,2746.261344,2758.866607,2746.517052,2811.630814,2717.004894,2578.432125
1,GDP per capita (constant 2015 US$),NY.GDP.PCAP.KD,Afghanistan,AFG,..,..,608.386715,602.516979,592.476537,590.427739,...,..,..,..,..,..,..,..,..,..,..
2,GDP per capita (constant 2015 US$),NY.GDP.PCAP.KD,Albania,ALB,1606.296047,1960.881946,3780.698202,3855.759734,3952.802538,4090.371657,...,..,..,..,..,..,..,..,..,..,..


## Filtering the rows and columns

In [43]:
df_gdp_filtered = df_gdp.drop(columns=['Series Name', 'Series Code'])

In [44]:
df_gdp_filtered.shape

(266, 65)

## Flattening the years to be in one column (instead of one column per year)

In [45]:
df_gdp_filtered_flattened = pd.melt(df_gdp_filtered, id_vars=['Country Name', 'Country Code'], var_name='Year', value_name='GDP_per_capita')

In [46]:
df_gdp_filtered_flattened

Unnamed: 0,Country Name,Country Code,Year,GDP_per_capita
0,Brazil,BRA,1990 [YR1990],6086.08487
1,Afghanistan,AFG,1990 [YR1990],..
2,Albania,ALB,1990 [YR1990],1606.296047
3,Algeria,DZA,1990 [YR1990],3153.476864
4,American Samoa,ASM,1990 [YR1990],..
...,...,...,...,...
16753,Sub-Saharan Africa,SSF,1960 [YR1960],1135.306164
16754,Sub-Saharan Africa (excluding high income),SSA,1960 [YR1960],1134.964321
16755,Sub-Saharan Africa (IDA & IBRD countries),TSS,1960 [YR1960],1135.306164
16756,Upper middle income,UMC,1960 [YR1960],1155.832773


## Turning 'Year' column into an int

In [47]:
df_gdp_filtered_flattened['Year'] = df_gdp_filtered_flattened['Year'].str.extract(r'(\d{4})').astype(int)

In [48]:
df_gdp_filtered_flattened.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16758 entries, 0 to 16757
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Country Name    16758 non-null  object
 1   Country Code    16758 non-null  object
 2   Year            16758 non-null  int64 
 3   GDP_per_capita  16758 non-null  object
dtypes: int64(1), object(3)
memory usage: 523.8+ KB


In [49]:
df_gdp_filtered_flattened.rename(columns={"Year": "year", "Country Code": "iso_code"}, inplace=True)

# Merging the features and target datasets

In [73]:
full_df = pd.merge(df_energy_filtered_countries_only, df_gdp_filtered_flattened, on=['year', 'iso_code'], how='left')

In [76]:
full_df.drop(columns=['index', 'Country Name'], inplace=True)

KeyError: "['index', 'Country Name'] not found in axis"

In [60]:
full_df.sample(8)

Unnamed: 0,country,year,iso_code,biofuel_electricity,hydro_electricity,other_renewable_exc_biofuel_electricity,solar_electricity,wind_electricity,GDP_per_capita
4122,Sudan,2011,SDN,0.15,6.4,0.0,0.0,0.0,2381.370361
4728,Western Sahara,2002,ESH,0.0,0.0,0.0,0.0,0.0,
369,Barbados,2019,BRB,0.0,0.0,0.0,0.04,0.0,17168.145687
3495,Portugal,2015,PRT,3.1,8.66,0.2,0.8,11.61,19250.106538
2687,Malta,2013,MLT,0.01,0.0,0.0,0.03,0.0,22071.03161
4666,Vanuatu,2006,VUT,0.0,0.0,0.0,0.0,0.0,2669.776623
2253,Kazakhstan,2000,KAZ,0.0,7.53,,0.0,0.0,4446.452533
2113,Iraq,2016,IRQ,0.0,3.34,0.0,0.06,0.0,4903.823396


In [78]:
full_df['country'][full_df.GDP_per_capita.isna()].unique()

array(['Antarctica', 'Cook Islands', 'Falkland Islands', 'French Guiana',
       'Guadeloupe', 'Martinique', 'Montserrat', 'Netherlands Antilles',
       'Niue', 'Reunion', 'Saint Helena', 'Saint Pierre and Miquelon',
       'Taiwan', 'Western Sahara'], dtype=object)

In [62]:
full_df.isnull().sum()/len(full_df)

country                                    0.000000
year                                       0.000000
iso_code                                   0.000000
biofuel_electricity                        0.047570
hydro_electricity                          0.029290
other_renewable_exc_biofuel_electricity    0.052971
solar_electricity                          0.027005
wind_electricity                           0.029082
GDP_per_capita                             0.063149
dtype: float64

In [64]:
(full_df == 0).sum()/len(full_df)

country                                    0.000000
year                                       0.000000
iso_code                                   0.000000
biofuel_electricity                        0.498130
hydro_electricity                          0.285002
other_renewable_exc_biofuel_electricity    0.845035
solar_electricity                          0.630245
wind_electricity                           0.637516
GDP_per_capita                             0.000000
dtype: float64

In [71]:
full_df.to_csv('/home/pedroabisamara/code/final_project/cleaned_dataset.csv')