In [145]:
#Data manipulation libraries
import pandas as pd
import numpy as np

## 1. The power plants data

In [146]:
# Load the global power plant database CSV.
# Source: http://datasets.wri.org/dataset/globalpowerplantdatabase
POWERPLANTS_CSV = "data/globalpowerplantdatabasev120/global_power_plant_database.csv"

powerplants = pd.read_csv(POWERPLANTS_CSV)
powerplants.head()

Unnamed: 0,country,country_long,name,gppd_idnr,capacity_mw,latitude,longitude,primary_fuel,other_fuel1,other_fuel2,...,url,geolocation_source,wepp_id,year_of_capacity_data,generation_gwh_2013,generation_gwh_2014,generation_gwh_2015,generation_gwh_2016,generation_gwh_2017,estimated_generation_gwh
0,AFG,Afghanistan,Kajaki Hydroelectric Power Plant Afghanistan,GEODB0040538,33.0,32.322,65.119,Hydro,,,...,http://globalenergyobservatory.org,GEODB,1009793.0,2017.0,,,,,,
1,AFG,Afghanistan,Mahipar Hydroelectric Power Plant Afghanistan,GEODB0040541,66.0,34.556,69.4787,Hydro,,,...,http://globalenergyobservatory.org,GEODB,1009795.0,2017.0,,,,,,
2,AFG,Afghanistan,Naghlu Dam Hydroelectric Power Plant Afghanistan,GEODB0040534,100.0,34.641,69.717,Hydro,,,...,http://globalenergyobservatory.org,GEODB,1009797.0,2017.0,,,,,,
3,AFG,Afghanistan,Nangarhar (Darunta) Hydroelectric Power Plant ...,GEODB0040536,11.55,34.4847,70.3633,Hydro,,,...,http://globalenergyobservatory.org,GEODB,1009787.0,2017.0,,,,,,
4,AFG,Afghanistan,Northwest Kabul Power Plant Afghanistan,GEODB0040540,42.0,34.5638,69.1134,Gas,,,...,http://globalenergyobservatory.org,GEODB,,2017.0,,,,,,


In [147]:
# Rename so that "country" holds the values of the former "country_long"
# column, while the former "country" column becomes "country_short".
powerplants = powerplants.rename(
    columns={"country": "country_short", "country_long": "country"}
)

In [148]:
# Drop the columns we do not need.
unused_cols = ["url", "gppd_idnr", "geolocation_source", "source"]
powerplants = powerplants.drop(columns=unused_cols)

In [149]:
# investigate the other_fuel columns

In [150]:
# Count the rows that have a non-missing value in each other_fuel column.
n_other_fuel1 = powerplants["other_fuel1"].notna().sum()
n_other_fuel2 = powerplants["other_fuel2"].notna().sum()
n_other_fuel3 = powerplants["other_fuel3"].notna().sum()

print(
    f"more than 1 fuel type: {n_other_fuel1}"
    f"\nmore than 2 fuel types {n_other_fuel2}"
    f"\nmore than 3 fuel types {n_other_fuel3}\n"
)

more than 1 fuel type: 1963
more than 2 fuel types 303
more than 3 fuel types 104



In [151]:
# Drop the second and third additional-fuel columns; other_fuel1 is kept.
powerplants = powerplants.drop(columns=["other_fuel2", "other_fuel3"])

In [152]:
# With only one additional-fuel column left, drop the numeric suffix.
powerplants = powerplants.rename(columns={"other_fuel1": "other_fuel"})

## 2. World data set

In [153]:
# Load the countries-of-the-world dataset.
COUNTRIES_CSV = "data/countries-of-the-world.csv"
countries = pd.read_csv(COUNTRIES_CSV)

In [154]:
# Lower-case every column label.
countries.columns = countries.columns.map(str.lower)

In [155]:
# Remove leading/trailing whitespace from the country names.
countries["country"] = countries["country"].str.strip()

In [156]:
# Most numeric columns were read as strings because they use a decimal comma
# instead of a decimal point. Convert them to real numbers.
#
# The columns are selected by dtype rather than by position, so the cell keeps
# working if the column order changes and is safe to re-run: columns that are
# already numeric are skipped instead of failing on the `.str` accessor.
label_cols = {"country", "region"}  # genuine text columns, must stay strings
str_cols = [
    col
    for col in countries.columns
    if col not in label_cols and not pd.api.types.is_numeric_dtype(countries[col])
]

for col in str_cols:
    # regex=False: "," is replaced literally, independent of the pandas version default.
    countries[col] = pd.to_numeric(countries[col].str.replace(",", ".", regex=False))
    


In [157]:
# Display the dtype of every column (read-only check, nothing is modified).
countries.dtypes

country                                object
region                                 object
population                              int64
area (sq. mi.)                          int64
pop. density (per sq. mi.)            float64
coastline (coast/area ratio)          float64
net migration                         float64
infant mortality (per 1000 births)    float64
gdp ($ per capita)                    float64
literacy (%)                          float64
phones (per 1000)                     float64
arable (%)                            float64
crops (%)                             float64
other (%)                             float64
climate                               float64
birthrate                             float64
deathrate                             float64
agriculture                           float64
industry                              float64
service                               float64
dtype: object

## 3. Checking if countries in both sets have same name

In [158]:
# Left-join the country data onto the plants. Rows whose "population" is
# missing after the join are treated as plants whose country name had no match;
# count those plants per country.
powerplants_m = pd.merge(powerplants, countries, how="left", on="country")
without_population = powerplants_m["population"].isna()
powerplants_m.loc[without_population, "country"].value_counts()

United States of America            8686
South Korea                          128
Myanmar                               34
North Korea                           31
Bosnia and Herzegovina                20
Syrian Arab Republic                  18
Democratic Republic of the Congo      15
Cote DIvoire                           8
Congo                                  8
Trinidad and Tobago                    6
Brunei Darussalam                      4
Montenegro                             3
Kosovo                                 2
Antarctica                             2
Gambia                                 2
Central African Republic               2
Name: country, dtype: int64

In [159]:
# Look up country names in the countries dataset by substring.
# NOTE: the pattern is left empty here, which matches every name.
name_matches = countries["country"].str.contains("")
countries.loc[name_matches, "country"]

0         Afghanistan
1             Albania
2             Algeria
3      American Samoa
4             Andorra
            ...      
222         West Bank
223    Western Sahara
224             Yemen
225            Zambia
226          Zimbabwe
Name: country, Length: 227, dtype: object

In [160]:
# Harmonise country names between the two datasets.
# Keys are the spellings found in the countries dataset, values are the
# spellings used in the power plant dataset. Names are matched exactly.
country_renames = {
    "United States": "United States of America",
    "Korea, North": "North Korea",
    "Korea, South": "South Korea",
    "Burma": "Myanmar",
    "Bosnia & Herzegovina": "Bosnia and Herzegovina",
    "Congo, Dem. Rep.": "Democratic Republic of the Congo",
    "Congo, Repub. of the": "Congo",
    "Syria": "Syrian Arab Republic",
    "Cote d'Ivoire": "Cote DIvoire",
    "Trinidad & Tobago": "Trinidad and Tobago",
    "Brunei": "Brunei Darussalam",
    "Gambia, The": "Gambia",
    "Central African Rep.": "Central African Republic",
}

# A single dictionary-based replace substitutes the repeated per-name
# assignments; names that are not in the mapping are left unchanged.
countries["country"] = countries["country"].replace(country_renames)

In [161]:
# List the countries of the countries dataset that do not occur in the
# power plant dataset.
all_power_countr = powerplants["country"].unique().tolist()
not_in_powerplants = ~countries["country"].isin(all_power_countr)
countries.loc[not_in_powerplants, "country"].to_numpy()

array(['American Samoa', 'Andorra', 'Anguilla', 'Antigua & Barbuda',
       'Aruba', 'Bahamas, The', 'Barbados', 'Belize', 'Bermuda',
       'British Virgin Is.', 'Cayman Islands', 'Chad', 'Comoros',
       'Cook Islands', 'Dominica', 'East Timor', 'Faroe Islands',
       'French Polynesia', 'Gaza Strip', 'Gibraltar', 'Greenland',
       'Grenada', 'Guadeloupe', 'Guam', 'Guernsey', 'Haiti', 'Hong Kong',
       'Isle of Man', 'Jersey', 'Kiribati', 'Liechtenstein', 'Macau',
       'Maldives', 'Malta', 'Marshall Islands', 'Martinique', 'Mayotte',
       'Micronesia, Fed. St.', 'Monaco', 'Montserrat', 'Nauru',
       'Netherlands Antilles', 'New Caledonia', 'N. Mariana Islands',
       'Palau', 'Puerto Rico', 'Reunion', 'Saint Helena',
       'Saint Kitts & Nevis', 'Saint Lucia', 'St Pierre & Miquelon',
       'Saint Vincent and the Grenadines', 'Samoa', 'San Marino',
       'Sao Tome & Principe', 'Seychelles', 'Solomon Islands', 'Somalia',
       'Suriname', 'Tonga', 'Turks & Caicos Is', 

In [162]:
# Three countries could not be found in the countries dataset:
# Montenegro, Kosovo and Antarctica. Their power plants are dropped.
unmatched_countries = ["Montenegro", "Kosovo", "Antarctica"]
unmatched_rows = powerplants.index[powerplants["country"].isin(unmatched_countries)]
powerplants = powerplants.drop(index=unmatched_rows)

In [163]:
# Attach the country-level columns to every power plant.
# validate="many_to_one" makes pandas raise a MergeError if a country name
# occurs more than once in `countries`; without it such duplicates would
# silently multiply the matching power plant rows.
powerplants = powerplants.merge(
    right=countries,
    how="left",
    on="country",
    validate="many_to_one",
)

In [164]:
# Persist the merged dataset. The row index is written as well (to_csv
# default), so the file starts with an unnamed index column.
OUTPUT_CSV = "data/powerplants.csv"
powerplants.to_csv(OUTPUT_CSV)