In [1]:
import json

with open("init.txt") as jf:
    inp = json.load(jf)

accessString = "postgresql://" + inp["username"] + ":" + inp["password"] + "@" + inp["hostname"] + "/" + inp["database"]

from sqlalchemy import create_engine
engine = create_engine(accessString, echo = False)



# Fullfilling requiremnt: Creating a view

In [2]:
engine.execute(
    """
    DROP VIEW IF EXISTS staged_view;
    CREATE VIEW staged_view AS 
    SELECT * FROM
    master_gini as mg, master_wb as wb, master_un as un, master_misc as misc, code_year
    WHERE
    mg.g_id = wb.wb_id
    AND wb.wb_id = un.un_id
    AND misc.misc_id = un.un_id
    AND un.un_id = code_year.id;
    """
)

<sqlalchemy.engine.result.ResultProxy at 0x127ea9284e0>

In [3]:
columns = engine.execute("SELECT * FROM staged_view LIMIT 10").keys()
print(columns)

['country', 'gini_disp', 'gini_disp_se', 'gini_mkt', 'gini_mkt_se', 'g_id', 'corruptionSE', 'ruleOfLawSE', 'voice', 'regQual', 'govEffect', 'govEffectSE', 'stabilitySE', 'voiceSE', 'corruption', 'stability', 'regQualSE', 'ruleOfLaw', 'INCOMEPC_growth', 'INCOMEPC', 'trade', 'industrial', 'manu', 'agri', 'urban', 'wb_id', 'HDI', 'EDU_IDX', 'Income_Index', 'life', 'un_id', 'kaopen', 'polity2', 'misc_id', 'code', 'year', 'id']


### We're interested in ineuqality. Let's try to see which countries, focusing specifically on the years after 2000, have had the highest average inequality. And to avoid small sample we enforce that there must be at least 10 samples available

In [4]:
res1 = engine.execute("""
    SELECT country, AVG(gini_disp), AVG(gini_mkt)
    FROM staged_view
    WHERE year > 2000
    GROUP BY country
    HAVING COUNT(gini_mkt) > 10
    ORDER BY AVG(gini_disp) DESC LIMIT 10
""")

In [5]:
for x in res1:
    print(x)

('Namibia', 61.379999999999995, 69.14000000000001)
('South Africa', 58.70714285714286, 68.32857142857144)
('Haiti', 54.25454545454545, 57.75454545454546)
('Zambia', 53.94285714285714, 60.60714285714287)
('Honduras', 50.67999999999999, 51.346666666666664)
('Colombia', 50.27333333333333, 52.19333333333332)
('Sri Lanka', 49.733333333333334, 46.220000000000006)
('Cape Verde', 49.10000000000001, 54.335714285714296)
('Peru', 48.54666666666667, 51.22666666666667)
('Rwanda', 48.43076923076923, 55.20769230769231)


### Are there years in which inequality was particularly bad?

In [6]:
res1 = engine.execute("""
    SELECT year, AVG(gini_disp), AVG(gini_mkt)
    FROM staged_view
    GROUP BY year
    ORDER BY AVG(gini_disp) DESC
""")

In [7]:
for x in res1:
    print(x)

(2004, 39.876623376623385, 47.209090909090904)
(2005, 39.77594936708861, 47.07721518987341)
(2002, 39.761437908496724, 47.05816993464051)
(2003, 39.70774193548387, 47.01612903225806)
(2006, 39.650943396226396, 46.93081761006291)
(2000, 39.60137931034484, 47.0144827586207)
(2007, 39.496855345911946, 46.72578616352205)
(1998, 39.32589928057553, 46.80143884892086)
(1996, 39.27703703703702, 46.84592592592591)
(2008, 39.24193548387098, 46.66387096774194)
(2009, 39.02885906040272, 46.638255033557044)
(2010, 38.617605633802825, 46.52253521126759)
(2011, 38.15909090909089, 46.27348484848485)
(2012, 37.93095238095238, 46.215873015873015)
(2013, 37.78157894736842, 46.54298245614034)
(2014, 37.136363636363626, 46.53636363636364)
(2015, 36.74285714285716, 46.73095238095239)
(2016, 35.61666666666667, 46.020370370370365)


### There is a very clear trend that recent years have seen inequality decrease!! That's certainly a promising trend

### Are poor countries more likely to be inequal? Let's run some queries to get a better understanding of how inequality relates to income per capita. We will also:

## Fullfill a requirement: Use common table expression

In [8]:
res1 = engine.execute(""" 
    WITH very_inequal (cntr, gdsp, gmkt, ipc, ipcg, edu) AS
    (SELECT code, AVG(gini_disp), AVG(gini_mkt), AVG("INCOMEPC"), AVG("INCOMEPC_growth"), AVG("EDU_IDX")
    FROM staged_view
    GROUP BY code
    HAVING AVG(gini_disp) > (SELECT AVG(av_gini_d) FROM (SELECT AVG(gini_disp) as av_gini_d FROM staged_view GROUP BY CODE) AS foo)
    OR AVG(gini_disp) > (SELECT AVG(av_gini_m) FROM (SELECT AVG(gini_mkt) as av_gini_m FROM staged_view GROUP BY CODE) AS foo)),
    
    foo2 (val) AS (SELECT AVG("INCOMEPC") FROM staged_view)
    
    SELECT cntr, cc.cntr_name, "ipc", gdsp, gmkt
    FROM very_inequal, country_code as cc 
    WHERE ipc > (SELECT val FROM foo2)
    AND cntr = cc.code
    ORDER BY ipc DESC;
""")

In [9]:
for x in res1:
    print(x)

('QAT', 'Qatar', 56250.07510555556, 39.786666666666655, 44.32666666666666)
('HKG', 'Hong Kong, China (SAR)', 31496.908352777784, 40.68333333333333, 45.48333333333333)
('HKG', 'Hong Kong', 31496.908352777784, 40.68333333333333, 45.48333333333333)
('BHS', 'Bahamas', 28213.136286666668, 44.81666666666667, 47.41666666666666)
('BRB', 'Barbados', 14427.370280722222, 46.65833333333333, 48.85833333333334)
('KNA', 'St. Kitts and Nevis', 14071.701238055553, 41.400000000000006, 42.34444444444444)
('KNA', 'Saint Kitts and Nevis', 14071.701238055553, 41.400000000000006, 42.34444444444444)
('TTO', 'Trinidad and Tobago', 13880.134483388889, 41.857142857142854, 43.87142857142857)
('ATG', 'Antigua and Barbuda', 12829.91816988889, 48.1, 47.1)
('GNQ', 'Equatorial Guinea', 11976.0258655, 50.5, 55.4)
('SYC', 'Seychelles', 11071.781235055554, 41.3875, 45.3)


### We see that there are around 10 or so countries who have above average income per capita but and above average inequality. Hong Kong, interestingly, being chief among them

## Now let's put things back in a dataframe so we can run some advanced analytics!

In [10]:
import pandas as pd
columns = engine.execute("SELECT * FROM staged_view LIMIT 10").keys()
data = engine.execute("SELECT * FROM staged_view")
rows = []
for x in data:
    rows.append(list(x))
staging_master = pd.DataFrame(rows, columns = columns)

In [11]:
import numpy as np
import linearmodels
from linearmodels import PanelOLS
from linearmodels import OLS
import statsmodels.api as sm
from linearmodels import IV2SLS 
from linearmodels import IVGMM

In [12]:
staging_master.columns

Index(['country', 'gini_disp', 'gini_disp_se', 'gini_mkt', 'gini_mkt_se',
       'g_id', 'corruptionSE', 'ruleOfLawSE', 'voice', 'regQual', 'govEffect',
       'govEffectSE', 'stabilitySE', 'voiceSE', 'corruption', 'stability',
       'regQualSE', 'ruleOfLaw', 'INCOMEPC_growth', 'INCOMEPC', 'trade',
       'industrial', 'manu', 'agri', 'urban', 'wb_id', 'HDI', 'EDU_IDX',
       'Income_Index', 'life', 'un_id', 'kaopen', 'polity2', 'misc_id', 'code',
       'year', 'id'],
      dtype='object')

In [13]:
staging_master = staging_master.set_index(["code", "year"])

In [14]:
master = staging_master.drop(columns = ["un_id", "misc_id", "wb_id", "g_id", "country", "id"])

In [15]:
master = master.apply(pd.to_numeric)
master["constant"] = 1

## Let's do some regressions using the linearmodels library!

# New technology!

In [16]:
mod = OLS(master.gini_disp, master[["INCOMEPC_growth", "INCOMEPC", "constant"]])
res = mod.fit()
print(res.summary)

                            OLS Estimation Summary                            
Dep. Variable:              gini_disp   R-squared:                      0.2684
Estimator:                        OLS   Adj. R-squared:                 0.2678
No. Observations:                2391   F-statistic:                    861.03
Date:                Sun, Dec 08 2019   P-value (F-stat)                0.0000
Time:                        16:58:39   Distribution:                  chi2(2)
Cov. Estimator:                robust                                         
                                                                              
                                Parameter Estimates                                
                 Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
-----------------------------------------------------------------------------------
INCOMEPC_growth    -0.1937     0.0365    -5.3002     0.0000     -0.2653     -0.1221
INCOMEPC           -0.0003  8.92

Inputs contain missing values. Dropping rows with missing observations.


## increased income per capita and growth are both associated with a decrease inequality. 

In [17]:
master.columns

Index(['gini_disp', 'gini_disp_se', 'gini_mkt', 'gini_mkt_se', 'corruptionSE',
       'ruleOfLawSE', 'voice', 'regQual', 'govEffect', 'govEffectSE',
       'stabilitySE', 'voiceSE', 'corruption', 'stability', 'regQualSE',
       'ruleOfLaw', 'INCOMEPC_growth', 'INCOMEPC', 'trade', 'industrial',
       'manu', 'agri', 'urban', 'HDI', 'EDU_IDX', 'Income_Index', 'life',
       'kaopen', 'polity2', 'constant'],
      dtype='object')

In [18]:
mod = OLS(master.gini_disp, master[["INCOMEPC_growth", "HDI", "EDU_IDX", "INCOMEPC", "trade", "industrial", "life", "govEffect", "voice", "constant"]])
res = mod.fit()
print(res.summary)

                            OLS Estimation Summary                            
Dep. Variable:              gini_disp   R-squared:                      0.3503
Estimator:                        OLS   Adj. R-squared:                 0.3477
No. Observations:                2208   F-statistic:                    1584.9
Date:                Sun, Dec 08 2019   P-value (F-stat)                0.0000
Time:                        16:58:42   Distribution:                  chi2(9)
Cov. Estimator:                robust                                         
                                                                              
                                Parameter Estimates                                
                 Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
-----------------------------------------------------------------------------------
INCOMEPC_growth    -0.1746     0.0381    -4.5840     0.0000     -0.2492     -0.0999
HDI                 36.421     6

Inputs contain missing values. Dropping rows with missing observations.


## Countries with higher life expectancies have less inequality, and higher voice and accountability leads to more inequality. Education itself seems to decrease ineqaulity quite a lot.

## let's try with gini_mkt

In [19]:
mod = OLS(master.gini_mkt, master[["INCOMEPC_growth", "HDI", "EDU_IDX", "INCOMEPC", "trade", "industrial", "life", "govEffect", "voice", "constant"]])
res = mod.fit()
print(res.summary)

                            OLS Estimation Summary                            
Dep. Variable:               gini_mkt   R-squared:                      0.1521
Estimator:                        OLS   Adj. R-squared:                 0.1487
No. Observations:                2208   F-statistic:                    259.86
Date:                Sun, Dec 08 2019   P-value (F-stat)                0.0000
Time:                        16:58:43   Distribution:                  chi2(9)
Cov. Estimator:                robust                                         
                                                                              
                                Parameter Estimates                                
                 Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
-----------------------------------------------------------------------------------
INCOMEPC_growth    -0.1591     0.0360    -4.4245     0.0000     -0.2295     -0.0886
HDI                 36.658     5

Inputs contain missing values. Dropping rows with missing observations.


## horrendous R^2, it seems gini_mkt is hard to pin down, though we do have a lot of significant predictors. 

## Now in standard econometric fashion ,we will use PANEL methodology, that is, use dummies for every country and year. 

In [20]:
mod = PanelOLS(master.gini_disp, master[["INCOMEPC_growth", "HDI", "EDU_IDX", "INCOMEPC", "trade", "industrial", "life", "govEffect", "voice", "constant"]], entity_effects = True, time_effects = True)
res = mod.fit()
print(res.summary)

                          PanelOLS Estimation Summary                           
Dep. Variable:              gini_disp   R-squared:                        0.0556
Estimator:                   PanelOLS   R-squared (Between):             -0.3267
No. Observations:                2208   R-squared (Within):              -0.1922
Date:                Sun, Dec 08 2019   R-squared (Overall):             -0.3388
Time:                        16:58:45   Log-likelihood                   -3327.5
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      13.215
Entities:                         161   P-value                           0.0000
Avg Obs:                       13.714   Distribution:                  F(9,2021)
Min Obs:                       1.0000                                           
Max Obs:                       18.000   F-statistic (robust):             13.215
                            

Inputs contain missing values. Dropping rows with missing observations.
  labels = self._frame.index.labels
  return list(index.levels[1][index.labels[1]].unique())
  return np.asarray(self._frame.index.labels[0])[:, None]
  return np.asarray(self._frame.index.labels[1])[:, None]
  return list(index.levels[0][index.labels[0]].unique())


## Very interestingly, it seems our regression gets worse! This is in the sense that it seems most of our variables are captured by time-fixed and entity fixed variation - that is to say our normal regression is picking up on differences across countries not within. We need perhaps, better models

In [21]:
#let's put it in a csv for future reference

master.to_csv("masterData12_8.csv")