# WARNING

This notebook is not tested, and I gave up on solving this problem using Python. Ask your classmates to solve this problem for you in Stata or whatever, but at the moment with the big lack of statistics packages for Python just don't do it.

In [68]:
import warnings
import patsy

import pandas as pd
import numpy as np

from statsmodels.regression.linear_model import WLS

## Natural Experiments

### 1. Replication of Tables

#### Table 3

In [69]:
df = pd.read_stata("data/acemoglu_et_al-2011.dta")

years_to_keep = [1700, 1750, 1800, 1850, 1875, 1900]
df = df.query("year in @years_to_keep")

In [3]:
def fit_regression(df, cluster="id", weight_col=None, query=None):
    """Fit TWFE regression with controls."""
    df = df if query is None else df.query(query)
    
    controls = [f"fpresence{year}" for year in (1750, 1800, 1850, 1875, 1900)]
    formula = "urbrate ~ C(id) + C(year) + " + " + ".join(controls)
    y, X = patsy.dmatrices(formula, df, return_type="dataframe")

    weights = 1.0 if weight_col is None else df.loc[y.index, weight_col].values
    
    model = WLS(y, X, weights=weights).fit(
        cov_type="cluster",
        cov_kwds={"groups": df.loc[y.index, cluster]},
        hasconst=True,
    )
    return model, X

In [4]:
def model_to_table(model, X):
    """Transform model summary to data frame."""
    model_html = model.summary().tables[1].as_html()
    results = pd.read_html(model_html, header=0, index_col=0)[0]
    df = results.loc[results.index.str.startswith("fpresence"), ["coef", "std err"]]
    df = df.reset_index().melt(id_vars="index").sort_values("index")
    df = df.set_index(["index", "variable"])
    df.loc[("n_obs", "n_obs"), "value"] = model.nobs
    df.loc[("n_states", "n_states"), "value"] = model.n_groups
    to_test = [f"fpresence{year}" for year in (1850, 1875, 1900)]
    to_test_loc = [X.columns.get_loc(var) for var in to_test]
    r_matrix = np.zeros(model.params.shape)
    r_matrix[to_test_loc] = 1
    df.loc[("f-test", "p-value"), "value"] = model.f_test(r_matrix).pvalue
    return df

In [5]:
def produce_table3(supress_warnings=False):
    warnings_filter = "ignore" if supress_warnings else "default"
    
    combinations = {
        ("West-Elbe", "Weighted"): {"weight_col": "totalpop1750", "query": "westelbe == 1"},
        ("West-Elbe", "Unweighted"): {"query": "westelbe == 1"},
        ("All", "Weighted"): {"weight_col": "totalpop1750"},
        ("All", "Unweighted"): {}
    }

    dfs = {}
    for key, kwargs in combinations.items():
        with warnings.catch_warnings():
            warnings.simplefilter(warnings_filter)
            model, X = fit_regression(df, cluster="id", **kwargs)
            result = model_to_table(model, X)
        dfs[key] = result

    table3 = pd.concat(dfs, axis=1).droplevel(2, axis=1).round(3)
    return table3

In [6]:
table3 = produce_table3(supress_warnings=True)
table3

Unnamed: 0_level_0,Unnamed: 1_level_0,West-Elbe,West-Elbe,All,All
Unnamed: 0_level_1,Unnamed: 1_level_1,Weighted,Unweighted,Weighted,Unweighted
index,variable,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
fpresence1750,coef,-0.491,-0.252,-0.488,-0.197
fpresence1750,std err,0.276,0.192,0.26,0.181
fpresence1800,coef,-0.247,-0.042,-0.268,-0.047
fpresence1800,std err,0.25,0.171,0.251,0.197
fpresence1850,coef,-0.16,0.033,-0.221,-0.024
fpresence1850,std err,0.278,0.171,0.276,0.201
fpresence1875,coef,0.402,0.354,0.266,0.252
fpresence1875,std err,0.362,0.328,0.335,0.331
fpresence1900,coef,0.634,0.529,0.503,0.506
fpresence1900,std err,0.453,0.446,0.416,0.468


#### Table 6

In [7]:
df["yearsref"]

0       0.00
1       0.00
2       1.75
3      50.25
4      75.25
       ...  
199     0.00
200     0.00
201     7.50
202    18.50
206    37.25
Name: yearsref, Length: 114, dtype: float32

In [None]:
controls = [f"fpresence{year}" for year in (1750, 1800, 1850, 1875, 1900)]
formula = "urbrate ~ C(id) + C(year) + " + " + ".join(controls)

In [41]:
query = "westelbe == 1"

In [42]:
df = df.query(query)

In [46]:
weight_col = None # "totalpop1750"
formula = "urbrate ~ yearsref + yr1700-yr1900 + C(id) + C(year)"
cluster = "id"

In [47]:
y, X = patsy.dmatrices(formula, df, return_type="dataframe")

weights = 1.0 if weight_col is None else df.loc[y.index, weight_col].values

model = WLS(y, X, weights=weights).fit(
    cov_type="cluster",
    cov_kwds={"groups": df.loc[y.index, cluster]},
    hasconst=True,
)

In [48]:
model.summary()



0,1,2,3
Dep. Variable:,urbrate,R-squared:,0.879
Model:,WLS,Adj. R-squared:,0.839
Method:,Least Squares,F-statistic:,28.79
Date:,"Fri, 18 Jun 2021",Prob (F-statistic):,1.87e-06
Time:,21:26:35,Log-Likelihood:,-224.72
No. Observations:,74,AIC:,487.4
Df Residuals:,55,BIC:,531.2
Df Model:,18,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,15.5753,4.853,3.209,0.001,6.063,25.087
C(id)[T.2],3.1363,2.491,1.259,0.208,-1.746,8.018
C(id)[T.3],-12.4978,2.346,-5.328,0.000,-17.096,-7.900
C(id)[T.4],-14.8326,0.629,-23.573,0.000,-16.066,-13.599
C(id)[T.5],2.3193,3.532,0.657,0.511,-4.604,9.243
C(id)[T.6],-8.8627,3.646,-2.431,0.015,-16.009,-1.717
C(id)[T.7],-1.3584,2.190,-0.620,0.535,-5.650,2.933
C(id)[T.10],-9.0469,3.736,-2.422,0.015,-16.369,-1.725
C(id)[T.11],-13.2351,1.397,-9.476,0.000,-15.973,-10.498

0,1,2,3
Omnibus:,39.243,Durbin-Watson:,1.479
Prob(Omnibus):,0.0,Jarque-Bera (JB):,219.646
Skew:,1.342,Prob(JB):,2.02e-48
Kurtosis:,11.002,Cond. No.,2.01e+17


### Addition of ``coal100`` Variable

In [72]:
coal = pd.read_excel("data/coal.xlsx")
coal = coal.drop("Unnamed: 0", axis=1).rename(columns={"region": "name"})

df = df.merge(coal, on="name")

In [None]:
## re-run regression