#### Importing the data and the necessary libraries

In [1]:
import pandas as pd

# Load the three raw CSV exports; the header row of each file is fixed up in
# the sections below.
co2, industrial, gdp = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../source_data/co2_emissions.csv',
        '../source_data/industrial_production.csv',
        '../source_data/gdp.csv',
    )
]

# CO2

#### Choosing relevant columns and renaming

In [2]:
# Row 0 of the raw frame holds the real column labels; promote it to the
# header, naming the second column 'Country', then discard that row.
columnsco2 = [
    'Country' if position == 1 else label
    for position, label in enumerate(co2.iloc[0])
]
co2.columns = columnsco2
co2 = co2.drop(0, axis=0)
co2.head(5)

Unnamed: 0,Region/Country/Area,Country,Year,Series,Value,Footnotes,Source
1,1,"Total, all countries or areas",1975,Emissions (thousand metric tons of carbon diox...,16853532,,Carbon Dioxide Information Analysis Center (CD...
2,1,"Total, all countries or areas",1985,Emissions (thousand metric tons of carbon diox...,19864139,,Carbon Dioxide Information Analysis Center (CD...
3,1,"Total, all countries or areas",1995,Emissions (thousand metric tons of carbon diox...,23120435,,Carbon Dioxide Information Analysis Center (CD...
4,1,"Total, all countries or areas",2005,Emissions (thousand metric tons of carbon diox...,29490014,,Carbon Dioxide Information Analysis Center (CD...
5,1,"Total, all countries or areas",2010,Emissions (thousand metric tons of carbon diox...,33472376,,Carbon Dioxide Information Analysis Center (CD...


In [3]:
# Distinct values of the descriptive columns, to decide which ones to keep.
co2_series_values = set(co2['Series'])
co2_footnote_values = set(co2['Footnotes'])
co2_source_values = set(co2['Source'])
print(co2_series_values, co2_footnote_values, co2_source_values)

(set(['Emissions (thousand metric tons of carbon dioxide)', 'Emissions per capita (metric tons of carbon dioxide)']), set([nan, 'For statistical purposes, the data for China do not include those for the Hong Kong Special Administrative Region (Hong Kong SAR), Macao Special Administrative Region (Macao SAR) and Taiwan Province of China.', 'Including San Marino.', 'Including overseas territories.', 'Including Monaco.']), set(['Carbon Dioxide Information Analysis Center (CDIAC) of the Oak Ridge National Laboratory, Oak Ridge, Tennessee, U.S.A., database on national CO2 emission estimates, last accessed March 2017.']))


In [4]:
# Remove the 'Footnotes' column from the CO2 frame.
co2 = co2.drop('Footnotes', axis=1)

#### Checking dtypes

In [5]:
# Preview the first five rows before converting column types.
co2.head(5)

Unnamed: 0,Region/Country/Area,Country,Year,Series,Value,Source
1,1,"Total, all countries or areas",1975,Emissions (thousand metric tons of carbon diox...,16853532,Carbon Dioxide Information Analysis Center (CD...
2,1,"Total, all countries or areas",1985,Emissions (thousand metric tons of carbon diox...,19864139,Carbon Dioxide Information Analysis Center (CD...
3,1,"Total, all countries or areas",1995,Emissions (thousand metric tons of carbon diox...,23120435,Carbon Dioxide Information Analysis Center (CD...
4,1,"Total, all countries or areas",2005,Emissions (thousand metric tons of carbon diox...,29490014,Carbon Dioxide Information Analysis Center (CD...
5,1,"Total, all countries or areas",2010,Emissions (thousand metric tons of carbon diox...,33472376,Carbon Dioxide Information Analysis Center (CD...


In [6]:
# Cast the identifier columns to integers. 'Value' is left untouched here:
# it contains comma thousands separators that break a direct numeric cast,
# so it is converted later, after the table has been split by series.
co2 = co2.astype({'Year': 'int64', 'Region/Country/Area': 'int64'})

#### Separating data by series

In [7]:
# Split the rows into per-capita emissions and everything else (the absolute
# totals); 'Series' is redundant within each half, so it is dropped.
co2_is_per_capita = co2['Series'] == 'Emissions per capita (metric tons of carbon dioxide)'
co2_per_capita = co2[co2_is_per_capita].drop(['Series'], axis=1)
co2_absolut = co2[~co2_is_per_capita].drop(['Series'], axis=1)

In [8]:
# Remove the 'Source' column from both halves.
co2_per_capita = co2_per_capita.drop('Source', axis=1)
co2_absolut = co2_absolut.drop('Source', axis=1)

#### dtypes for value

In [10]:
# Per-capita values are cast to float directly.
co2_per_capita = co2_per_capita.astype({'Value': 'float64'})

# Absolute values have their comma separators removed, then are cast to int.
co2_absolut = co2_absolut.assign(
    Value=co2_absolut['Value'].str.replace(',', '')
).astype({'Value': 'int64'})

#### joining tables

In [45]:
# Left join: keep every per-capita row and attach the matching absolute value
# for the same area code, year and country.
co2_join_keys = ['Region/Country/Area', 'Year', 'Country']
co2_complete = co2_per_capita.merge(co2_absolut, how='left', on=co2_join_keys)

In [47]:
# Rename the two value columns by their merge-suffixed names rather than
# overwriting every label by position: 'Value_x' comes from the left
# (per-capita) frame and 'Value_y' from the right (absolute) frame, so a change
# in column order can no longer silently attach the wrong label.
co2_complete = co2_complete.rename(
    columns={'Value_x': 'per_capita', 'Value_y': 'thousand_tons'}
)

In [72]:
# Preview the first five rows of the merged CO2 table.
co2_complete.head(5)

Unnamed: 0,Region/Country/Area,Country,Year,per_capita,thousand_tons
0,1,"Total, all countries or areas",1975,4.1,16853532
1,1,"Total, all countries or areas",1985,4.1,19864139
2,1,"Total, all countries or areas",1995,4.0,23120435
3,1,"Total, all countries or areas",2005,4.5,29490014
4,1,"Total, all countries or areas",2010,4.8,33472376


In [81]:
# Write the merged CO2 table to disk (to_csv also writes the index by default).
co2_clean_path = '../clean_data/co2_clean.csv'
co2_complete.to_csv(co2_clean_path)

# GDP

#### Columns

In [58]:
# Row 0 of the raw frame holds the real column labels. Copy it into a plain
# list before editing: `gdp.iloc[0]` is a Series labelled by the old column
# names, so `gdp_cols[1] = ...` on it would rely on integer keys being treated
# as positions and could write through to `gdp`. With a list, index 1 is
# unambiguously the second column, as in the CO2 and industrial sections.
gdp_cols = list(gdp.iloc[0])
gdp_cols[1] = 'Country'
gdp.columns = gdp_cols
# The header row is now redundant as data.
gdp.drop(0, axis=0, inplace=True)
gdp.head()

Unnamed: 0,Region/Country/Area,Country,Year,Series,Value,Footnotes,Source
1,1,"Total, all countries or areas",1985,GDP in current prices (millions of US dollars),13518851,,"United Nations Statistics Division, New York, ..."
2,1,"Total, all countries or areas",1995,GDP in current prices (millions of US dollars),31084222,,"United Nations Statistics Division, New York, ..."
3,1,"Total, all countries or areas",2005,GDP in current prices (millions of US dollars),47550129,,"United Nations Statistics Division, New York, ..."
4,1,"Total, all countries or areas",2010,GDP in current prices (millions of US dollars),66145612,,"United Nations Statistics Division, New York, ..."
5,1,"Total, all countries or areas",2015,GDP in current prices (millions of US dollars),74757288,,"United Nations Statistics Division, New York, ..."


In [60]:
# Distinct values of the descriptive columns, to decide which ones to keep.
gdp_series_values = set(gdp['Series'])
gdp_footnote_values = set(gdp['Footnotes'])
gdp_source_values = set(gdp['Source'])
print(gdp_series_values, gdp_footnote_values, gdp_source_values)

(set(['GDP in constant 2010 prices (millions of US dollars)', 'GDP per capita (US dollars)', 'GDP real rates of growth (percent)', 'GDP in current prices (millions of US dollars)']), set([nan, 'Including Kosovo and Metohija.', 'Tanzania mainland only, excluding Zanzibar.', 'For statistical purposes, the data for China do not include those for the Hong Kong Special Administrative Region (Hong Kong SAR), Macao Special Administrative Region (Macao SAR) and Taiwan Province of China.', 'Does not incorporate export of binational hydroelectric power at market prices.', 'Data compiled in accordance with the System of National Accounts 1968 (1968 SNA).', 'Excluding northern Cyprus.', 'Including Western Sahara.', 'Excluding Kosovo and Metohija.', 'Including French Guiana, Guadeloupe, Martinique and R\xe9union.', 'Excludes the temporarily occupied territory of the Autonomous Republic of Crimea and Sevastopol.', 'Does not incorporate value added generated by binational hydroelectric plants.']), se

In [61]:
# Remove the 'Footnotes' and 'Source' columns in a single pass.
gdp = gdp.drop(['Footnotes', 'Source'], axis=1)

#### dtypes

In [63]:
# Remove the comma separators from 'Value', then cast the numeric columns.
gdp = gdp.assign(Value=gdp['Value'].str.replace(',', '')).astype(
    {'Region/Country/Area': 'int64', 'Year': 'int64', 'Value': 'float64'}
)

#### separating tables

In [64]:
# One frame per series of interest: real growth rate, per-capita GDP and GDP
# in current prices. Rows of any other series are not selected.
gdp_series = gdp['Series']
gdp_growth = gdp.loc[gdp_series.eq('GDP real rates of growth (percent)')]
gdp_capita = gdp.loc[gdp_series.eq('GDP per capita (US dollars)')]
gdp_abs = gdp.loc[gdp_series.eq('GDP in current prices (millions of US dollars)')]

#### join tables

In [68]:
# Inner join of growth and per-capita rows on area code, year and country.
gdp_complete0 = gdp_growth.merge(
    gdp_capita,
    how='inner',
    on=['Region/Country/Area', 'Year', 'Country'],
)

In [69]:
# Inner join of the intermediate table with the current-prices rows on the
# same three keys.
gdp_complete = gdp_complete0.merge(
    gdp_abs,
    how='inner',
    on=['Region/Country/Area', 'Year', 'Country'],
)

In [76]:
# After the two merges, the growth and per-capita columns carry the '_x' and
# '_y' suffixes, and the current-prices columns keep their plain names. Drop
# the three 'Series' label columns by name (this raises if the layout is not
# as expected) and give the value columns descriptive names, instead of
# assigning throwaway placeholder labels by position and dropping them.
gdp_complete = gdp_complete.drop(['Series_x', 'Series_y', 'Series'], axis=1)
gdp_complete = gdp_complete.rename(
    columns={
        'Value_x': 'Real growth rate',
        'Value_y': 'Per capita',
        'Value': 'In millions',
    }
)

In [79]:
# Preview the first five rows of the merged GDP table.
gdp_complete.head(5)

Unnamed: 0,Region/Country/Area,Country,Year,Real growth rate,Per capita,In millions
0,1,"Total, all countries or areas",1985,3.5,2776.0,13518851.0
1,1,"Total, all countries or areas",1995,3.1,5406.0,31084222.0
2,1,"Total, all countries or areas",2005,3.9,7270.0,47550129.0
3,1,"Total, all countries or areas",2010,4.3,9508.0,66145612.0
4,1,"Total, all countries or areas",2015,2.8,10128.0,74757288.0


In [80]:
# Write the merged GDP table to disk (to_csv also writes the index by default).
gdp_clean_path = '../clean_data/gdp_clean.csv'
gdp_complete.to_csv(gdp_clean_path)

# Industrial production

In [83]:
# Preview the raw frame: the real column labels sit in row 0.
industrial.head(5)

Unnamed: 0,[T25.],Index of Industrial Production,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,Region/Country/Area,,Series,Year,Value,Footnotes,Source
1,8,Albania,Index of industrial production: Total industry...,2007,118.5,,"United Nations Statistics Division, New York, ..."
2,8,Albania,Index of industrial production: Total industry...,2008,142.7,,"United Nations Statistics Division, New York, ..."
3,8,Albania,Index of industrial production: Total industry...,2009,148.3,,"United Nations Statistics Division, New York, ..."
4,8,Albania,Index of industrial production: Total industry...,2010,200.2,,"United Nations Statistics Division, New York, ..."


#### Columns

In [84]:
# Promote row 0 to the header, naming the second column 'Country', then
# discard that row.
industrialcols = [
    'Country' if position == 1 else label
    for position, label in enumerate(industrial.iloc[0])
]
industrial.columns = industrialcols
industrial = industrial.drop(0, axis=0)
industrial.head(5)

Unnamed: 0,Region/Country/Area,Country,Series,Year,Value,Footnotes,Source
1,8,Albania,Index of industrial production: Total industry...,2007,118.5,,"United Nations Statistics Division, New York, ..."
2,8,Albania,Index of industrial production: Total industry...,2008,142.7,,"United Nations Statistics Division, New York, ..."
3,8,Albania,Index of industrial production: Total industry...,2009,148.3,,"United Nations Statistics Division, New York, ..."
4,8,Albania,Index of industrial production: Total industry...,2010,200.2,,"United Nations Statistics Division, New York, ..."
5,8,Albania,Index of industrial production: Total industry...,2011,237.6,,"United Nations Statistics Division, New York, ..."


In [87]:
# Distinct series labels present in the industrial production data.
industrial_series_values = set(industrial['Series'])
print(industrial_series_values)

set(['Index of industrial production: Metal products and machinery (Index Base: 2005=100)', 'Index of industrial production: Metal products (Index base: 2005=100)', 'Index of industrial production: Water and waste management (Index base: 2005=100)', 'Index of industrial production: Mining (Index base: 2005=100)', 'Index of industrial production: Total industry - Mining; manufacturing; electricity, gas and water (Index base: 2005=100)', 'Index of industrial production: Machinery (Index base: 2005=100)', 'Index of industrial production: Textiles, wearing apparel, leather, footwear (Index base: 2005=100)', 'Index of industrial production: Food, beverages and tobacco (Index base: 2005=100)', 'Index of industrial production: Miscellaneous manufacturing industries (Index base: 2005=100)', 'Index of industrial production: Manufacturing (Index base: 2005=100)', 'Index of industrial production: Electricity, gas and water (Index base: 2005=100)', 'Index of industrial production: Electricity, gas

In [88]:
# Distinct footnote texts attached to the industrial production rows.
industrial_footnote_values = set(industrial['Footnotes'])
print(industrial_footnote_values)

set([nan, 'Twelve months ending 31 March of the year stated.;Including water and waste management.', 'The indices are based on ISIC Rev. 3.;Twelve months beginning 1 July of the year stated.;Data refers to textiles only.', 'Data refers to chemicals and pharmaceutical products only.', 'Excluding petroleum and pharmaceutical products.', 'Twelve months ending 30 June of the year stated.', 'The indices are based on ISIC Rev. 3.;Data refers to fabricated metal products only.', 'The indices are based on ISIC Rev. 3.;Data refers to electricity and water only.', 'Data refers to electricity, steam and air conditioning supply only.', 'The indices are based on ISIC Rev. 3.;Data refers to electricity only.', 'Data refers to food only.', 'The indices are based on ISIC Rev. 3.;Data refers to textiles only.', 'The indices are based on ISIC Rev. 3.;Including basic metals.', 'The indices are based on ISIC Rev. 3.;Twelve months beginning 1 July of the year stated.;Data refers to electricity only.', 'Bre

In [89]:
# Distinct source attributions in the industrial production data.
industrial_source_values = set(industrial['Source'])
print(industrial_source_values)

set(['United Nations Statistics Division, New York, Environment and energy statistics branch, Industrial and Energy Statistics Section, last accessed November 2015.'])


In [90]:
# Remove the 'Source' column from the industrial frame.
industrial = industrial.drop(['Source'], axis=1)

In [91]:
# Preview the first five rows after dropping 'Source'.
industrial.head(5)

Unnamed: 0,Region/Country/Area,Country,Series,Year,Value,Footnotes
1,8,Albania,Index of industrial production: Total industry...,2007,118.5,
2,8,Albania,Index of industrial production: Total industry...,2008,142.7,
3,8,Albania,Index of industrial production: Total industry...,2009,148.3,
4,8,Albania,Index of industrial production: Total industry...,2010,200.2,
5,8,Albania,Index of industrial production: Total industry...,2011,237.6,


In [92]:
# Write the industrial production table to disk (to_csv also writes the index
# by default).
industrial_clean_path = '../clean_data/industrial_clean.csv'
industrial.to_csv(industrial_clean_path)