In [1]:
import numpy as np
import pandas as pd
import scipy as sp
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import mean_squared_error, log_loss
import sklearn
import os
from matplotlib.pyplot import hist
# import more functions or modules if you need them !!

In [12]:
# Load the dataset from a pickle file; the path is relative to the working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
oil_df = pd.read_pickle('../Data/data/comprehensive_new.pkl')

In [13]:
# Display the loaded frame as the cell output
oil_df

Unnamed: 0,onset2COWCS,onsetUCS,coup,periregular,numcode,year,ecgrowth,logmountain,ethnic_fractionalization,religion_fractionalization,...,valoilres_binarize,valoilres_public_diff,valoilres_public_binarize,oilpop_diff,oilpop_binarize,valoilres_impute_diff,valoilres_impute_binarize,oilpop_impute_diff,oilpop_impute_binarize,milexp_pergdpSIPRI_diff
0,0.00001,,,,4,1929.0,,0.041836,0.007693,0.002717,...,,,,,,,,,,
1,0.00001,,,0.00001,4,1930.0,,0.041836,0.007693,0.002717,...,,,,,,,,,,
2,0.00001,,,0.00001,4,1931.0,,0.041836,0.007693,0.002717,...,,,,,,,,,,
3,0.00001,,,0.00001,4,1932.0,,0.041836,0.007693,0.002717,...,,,,,,,,,,
4,0.00001,,,0.00001,4,1933.0,,0.041836,0.007693,0.002717,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17915,0.00001,0.00001,0.00001,,894,2004.0,0.038210,-0.016094,0.007808,0.007359,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
17916,0.00001,0.00001,0.00001,,894,2005.0,0.038433,-0.016094,0.007808,0.007359,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
17917,,0.00001,0.00001,,894,2006.0,0.045788,-0.016094,0.007808,0.007359,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
17918,,0.00001,0.00001,,894,2007.0,0.045470,-0.016094,0.007808,0.007359,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [2]:
# ATT and ATE AIPTW
def att_aiptw(Q0, Q1, g, A, Y, prob_t=None):
    """
    Double ML (AIPTW) estimator for the ATT.
    Uses the ATT specific scores, see equation 3.9 of https://www.econstor.eu/bitstream/10419/149795/1/869216953.pdf

    Args:
    Q0: predicted outcomes with the treatment set to 0
    Q1: predicted outcomes with the treatment set to 1 (accepted for a signature
        parallel to ate_aiptw; not used by the ATT score)
    g: predicted propensity scores
    A: observed treatments
    Y: observed outcomes
    prob_t: marginal probability of treatment; defaults to the mean of A

    Return: aiptw estimate of the ATT and its standard error
    """
    sample_size = Y.shape[0]

    # fall back to the empirical treated share when no marginal probability is supplied
    treated_share = A.mean() if prob_t is None else prob_t

    # un-normalised ATT score: treated residuals minus odds-weighted control residuals
    residual0 = Y - Q0
    odds = g / (1 - g)
    score = A * residual0 - (1 - A) * odds * residual0

    tau_hat = score.mean() / treated_share

    # influence curve and the standard error derived from it
    phi = (score - tau_hat * A) / treated_share
    std_hat = np.std(phi) / np.sqrt(sample_size)

    return tau_hat, std_hat

def ate_aiptw(Q0, Q1, g, A, Y, prob_t=None):
    """
    Double ML (AIPTW) estimator for the ATE.

    Args:
    Q0: predicted outcomes with the treatment set to 0
    Q1: predicted outcomes with the treatment set to 1
    g: predicted propensity scores
    A: observed treatments
    Y: observed outcomes
    prob_t: accepted for a signature parallel to att_aiptw; not used here

    Return: aiptw estimate of the ATE and its standard error
    """
    sample_size = Y.shape[0]

    # per-observation AIPTW score; its mean is the point estimate
    score = Q1 - Q0 + A * (Y - Q1) / g - (1 - A) * (Y - Q0) / (1 - g)
    tau_hat = score.mean()

    # centred score is the influence curve; its spread gives the standard error
    phi = score - tau_hat
    std_hat = np.std(phi) / np.sqrt(sample_size)

    return tau_hat, std_hat

In [3]:
# Conditional outcome models (Q models)
def make_linear_Q_model():
    ''' Factory for the linear q model: builds a new, unfitted LinearRegression on every call (for use in k-folding)'''
    linear_q = LinearRegression()
    return linear_q

def make_Q_model():
    ''' Factory for the general ML q model: builds a new, unfitted random forest regressor on every call (for use in k-folding)'''
    forest_settings = dict(n_estimators=500, max_depth=None, random_state=RANDOM_SEED)
    return RandomForestRegressor(**forest_settings)
# One example: RandomForestRegressor(random_state=RANDOM_SEED, n_estimators=500, max_depth=None)

In [4]:
# Propensity score models (g models)
def make_g_model():
    ''' A function that returns a new, unfitted g model for computing propensity scores'''
    # Seeded like the Q model and the fold splitters, so that re-running the
    # notebook reproduces the same propensity scores.
    return RandomForestClassifier(n_estimators=100, max_depth=5, random_state=RANDOM_SEED)
# One example: RandomForestClassifier(n_estimators=100, max_depth=5)

In [5]:
# Functions for K-fold cross-fitting
def treatment_k_fold_fit_and_predict(make_model, X:pd.DataFrame, A:np.array, n_splits:int):
    '''
    Implements K fold cross-fitting for the model predicting the treatment A. 
    That is, 
    1. Split data into K folds (stratified on A)
    2. For each fold j, the model is fit on the other K-1 folds
    3. The fitted model is used to make predictions for each data point in fold j
    Returns an array containing the predicted probability of the second class
    (column 1 of predict_proba) for every row, in the row order of X.

    Args:
    make_model: function that returns sklearn model (which implements fit and predict_proba)
    X: dataframe of variables to adjust for
    A: array (or pandas Series) of treatments, in the same row order as X
    n_splits: number of splits to use
    '''

    # Plain ndarray view of the treatments so numpy arrays and pandas Series
    # (with any index) are handled identically.
    A_values = np.asarray(A)
    predictions = np.full(A_values.shape, np.nan, dtype=float)
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)
    
    # kf.split yields *positional* row indices, so rows must be selected with
    # .iloc / ndarray indexing. Label-based .loc only coincides with positions
    # when the index is the default 0..n-1 range (e.g. not after dropna()).
    for train_index, test_index in kf.split(X, A_values):
        X_train = X.iloc[train_index]
        A_train = A_values[train_index]
        g = make_model()
        g.fit(X_train, A_train)

        # get out-of-fold predictions for the held-out split
        predictions[test_index] = g.predict_proba(X.iloc[test_index])[:, 1]
    
    # sanity check that every row received an out-of-fold prediction
    assert np.isnan(predictions).sum() == 0
    return predictions

def outcome_k_fold_fit_and_predict(make_model, X:pd.DataFrame, y:np.array, A:np.array, n_splits:int, output_type:str):
    '''
    Implements K fold cross-fitting for the model predicting the outcome Y. 
    That is, 
    1. Split data into K folds
    2. For each fold j, the model is fit on the other K-1 folds
    3. The fitted model is used to make predictions for each data point in fold j
    Returns two arrays containing the predictions for all units untreated, all units treated  

    Args:
    make_model: function that returns sklearn model (that implements fit and either predict_proba or predict)
    X: dataframe of variables to adjust for
    y: array (or pandas Series) of outcomes, in the same row order as X
    A: array (or pandas Series) of treatments
    n_splits: number of splits to use
    output_type: type of outcome, "binary" or "continuous"

    Raises:
    ValueError: if output_type is neither "binary" nor "continuous"
    '''

    if output_type == 'binary':
        kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)
    elif output_type == 'continuous':
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)
    else:
        # previously fell through and failed later with a NameError on `kf`
        raise ValueError(f"output_type must be 'binary' or 'continuous', got {output_type!r}")

    # Plain ndarray view of the outcomes so numpy arrays and pandas Series
    # (with any index) are handled identically.
    y_values = np.asarray(y)
    predictions0 = np.full(y_values.shape, np.nan, dtype=float)
    predictions1 = np.full(y_values.shape, np.nan, dtype=float)

    # include the treatment as input feature
    X_w_treatment = X.copy()
    X_w_treatment["A"] = A

    # for predicting effect under treatment / control status for each data point 
    X0 = X_w_treatment.copy()
    X0["A"] = 0
    X1 = X_w_treatment.copy()
    X1["A"] = 1

    # kf.split yields *positional* row indices, so rows must be selected with
    # .iloc / ndarray indexing. Label-based .loc only coincides with positions
    # when the index is the default 0..n-1 range (e.g. not after dropna()).
    for train_index, test_index in kf.split(X_w_treatment, y_values):
        X_train = X_w_treatment.iloc[train_index]
        y_train = y_values[train_index]
        q = make_model()
        q.fit(X_train, y_train)

        X0_test = X0.iloc[test_index]
        X1_test = X1.iloc[test_index]
        if output_type == 'binary':
            # probability of the second class (column 1 of predict_proba)
            predictions0[test_index] = q.predict_proba(X0_test)[:, 1]
            predictions1[test_index] = q.predict_proba(X1_test)[:, 1]
        else:
            predictions0[test_index] = q.predict(X0_test)
            predictions1[test_index] = q.predict(X1_test)

    # every row must have received an out-of-fold prediction
    assert np.isnan(predictions0).sum() == 0
    assert np.isnan(predictions1).sum() == 0
    return predictions0, predictions1

In [None]:
# variables that are left untransformed
untransformed_lst = [
    'onset2COWCS',
    'onsetUCS',
    'coup',
    'periregular',
    'numcode',
    'year',
    'ecgrowth',
    'logmountain',
    'ethnic_fractionalization',
    'religion_fractionalization',
    'language_fractionalization',
    'leg_british',
    'no_transition',
]

# variables whose percentage change is taken directly
pct_change_lst = [
    'pop_maddison',
    'democracy',
    'wildcat',
    'out_regdisaster',
    'valoilres',         # value of oil reserves
    'valoilres_public',  # value of oil reserves from public data
    'oilpop',            # oil reserves per capita in million barrels per 1000 persons
    'valoilres_impute',  # value of oilpop_impute (multiply by crude oil price)
    'oilpop_impute',     # oil reserves per capita - imputed
]

# variables whose first difference is taken directly
diff_lst = ['milexp_pergdpSIPRI']

# variables that are exponentiated first, then turned into a percentage change
log_lst = ['logpopdens']

In [15]:
# Columns selected from oil_df for the analysis, in display order.
all_vars = [
    # columns carrying the `_diff` suffix
    'popdens_diff',
    'milexp_pergdpSIPRI_diff',
    'pop_maddison_diff',
    'democracy_diff',
    'wildcat_diff',
    'out_regdisaster_diff',
    'valoilres_diff',
    'valoilres_public_diff',
    'oilpop_diff',
    'valoilres_impute_diff',
    'oilpop_impute_diff',
    # columns without a suffix
    'onset2COWCS',
    'onsetUCS',
    'coup',
    'periregular',
    'numcode',
    'year',
    'ecgrowth',
    'logmountain',
    'ethnic_fractionalization',
    'religion_fractionalization',
    'language_fractionalization',
    'leg_british',
    'no_transition',
]

In [16]:
# Display only the columns listed in all_vars
oil_df[all_vars]

Unnamed: 0,popdens_diff,milexp_pergdpSIPRI_diff,pop_maddison_diff,democracy_diff,wildcat_diff,out_regdisaster_diff,valoilres_diff,valoilres_public_diff,oilpop_diff,valoilres_impute_diff,...,periregular,numcode,year,ecgrowth,logmountain,ethnic_fractionalization,religion_fractionalization,language_fractionalization,leg_british,no_transition
0,,,,,,,,,,,...,,4,1929.0,,0.041836,0.007693,0.002717,0.006141,0.00001,
1,,,,,,-1.000000e+00,,,,,...,0.00001,4,1930.0,,0.041836,0.007693,0.002717,0.006141,0.00001,0.00001
2,,,,0.000000,,0.000000e+00,,,,,...,0.00001,4,1931.0,,0.041836,0.007693,0.002717,0.006141,0.00001,0.00001
3,,,,0.000000,,0.000000e+00,,,,,...,0.00001,4,1932.0,,0.041836,0.007693,0.002717,0.006141,0.00001,0.00001
4,,,,0.000000,,3.566128e+10,,,,,...,0.00001,4,1933.0,,0.041836,0.007693,0.002717,0.006141,0.00001,0.00001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17915,0.000149,,0.015008,0.000000,0.0,8.259656e-01,0.0,0.0,0.0,0.0,...,,894,2004.0,0.038210,-0.016094,0.007808,0.007359,0.008734,0.01000,
17916,0.000139,,0.013990,0.000000,0.0,1.678100e+00,0.0,0.0,0.0,0.0,...,,894,2005.0,0.038433,-0.016094,0.007808,0.007359,0.008734,0.01000,
17917,0.000154,,0.015553,0.000000,0.0,-8.735424e-01,0.0,0.0,0.0,0.0,...,,894,2006.0,0.045788,-0.016094,0.007808,0.007359,0.008734,0.01000,
17918,0.000166,,0.016760,0.000000,0.0,8.096983e-01,0.0,0.0,0.0,0.0,...,,894,2007.0,0.045470,-0.016094,0.007808,0.007359,0.008734,0.01000,


In [20]:
# Count missing values in each selected column
oil_df[all_vars].isna().sum()

popdens_diff                   7595
milexp_pergdpSIPRI_diff       15905
pop_maddison_diff              7595
democracy_diff                 8842
wildcat_diff                   4829
out_regdisaster_diff            303
valoilres_diff                10047
valoilres_public_diff         10985
oilpop_diff                    9995
valoilres_impute_diff         10047
oilpop_impute_diff             9995
onset2COWCS                    3008
onsetUCS                       6072
coup                           5065
periregular                   12162
numcode                           0
year                              0
ecgrowth                       8579
logmountain                    5396
ethnic_fractionalization       2800
religion_fractionalization     1200
language_fractionalization     2080
leg_british                    3120
no_transition                 12162
dtype: int64