# Directory, Libraries and Data

In [1]:
#Move into the course folder so that the relative path "stackoverflow.csv" used when loading the data resolves.
#NOTE(review): hard-coded absolute path under /content/drive - presumably needs Google Drive mounted in Colab first; confirm before running elsewhere.
%cd /content/drive/MyDrive/Business Analyst course/Econometrics and Causal Inference/Matching

/content/drive/MyDrive/Business Analyst course/Econometrics and Causal Inference/Matching


In [2]:
#install library
!pip install CausalInference

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting CausalInference
  Downloading CausalInference-0.1.3-py3-none-any.whl (51 kB)
[K     |████████████████████████████████| 51 kB 3.4 MB/s 
[?25hInstalling collected packages: CausalInference
Successfully installed CausalInference-0.1.3


In [3]:
#Import Libraries 
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as ss
from causalinference import CausalModel

In [4]:
#Loading Data
#The file name is relative, so it is looked up in the current working directory
df = pd.read_csv(filepath_or_buffer = "stackoverflow.csv")
#Preview the first five rows
df.head(n = 5)

Unnamed: 0,Country,Salary,YearsCodedJob,OpenSource,Hobby,CompanySizeNumber,Remote,CareerSatisfaction,Data_scientist,Database_administrator,...,Developer_with_stats_math_background,DevOps,Embedded_developer,Graphic_designer,Graphics_programming,Machine_learning_specialist,Mobile_developer,Quality_assurance_engineer,Systems_administrator,Web_developer
0,United Kingdom,100000.0,20,0,1,5000,Remote,8,0,0,...,0,0,1,0,0,0,0,0,0,0
1,United States,130000.0,20,1,1,1000,Remote,9,0,0,...,0,1,1,0,0,0,0,1,0,1
2,United States,175000.0,16,0,1,10000,Not remote,7,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Germany,64516.12903,4,0,0,1000,Not remote,9,0,0,...,0,0,0,0,0,0,0,0,0,1
4,India,6636.323594,1,0,1,5000,Not remote,5,0,0,...,0,0,0,0,0,0,0,0,0,1


# Data Analysis

In [5]:
#Picking variables
#Select the analysis columns by name instead of by position (previously the first 8 columns).
#A positional slice silently picks the wrong columns if the CSV column order changes
#or if this cell is re-run after the dummy expansion; a name-based selection raises a KeyError instead.
selected_columns = ['Country', 'Salary', 'YearsCodedJob', 'OpenSource',
                    'Hobby', 'CompanySizeNumber', 'Remote', 'CareerSatisfaction']
df = df[selected_columns]
#Show only the header row to confirm which columns were kept
df.head(0)

Unnamed: 0,Country,Salary,YearsCodedJob,OpenSource,Hobby,CompanySizeNumber,Remote,CareerSatisfaction


In [6]:
#Transforming character variables
#One-hot encode the text columns; drop_first removes one level per variable so the
#dummies are not perfectly collinear.
#dtype = int keeps the dummies as 0/1 integers on every pandas version: newer pandas
#returns booleans by default, and a frame mixing bool and numeric columns converts to
#an object array, which the matching estimator cannot do linear algebra on.
df = pd.get_dummies(df, drop_first = True, dtype = int)
df.head(1)

Unnamed: 0,Salary,YearsCodedJob,OpenSource,Hobby,CompanySizeNumber,CareerSatisfaction,Country_Germany,Country_India,Country_United Kingdom,Country_United States,Remote_Remote
0,100000.0,20,0,1,5000,8,0,0,1,0,1


In [7]:
#Looking at Group means
#Average of every other column, split by the remote-work dummy (Remote_Remote)
df.groupby(by = 'Remote_Remote').mean()

Unnamed: 0_level_0,Salary,YearsCodedJob,OpenSource,Hobby,CompanySizeNumber,CareerSatisfaction,Country_Germany,Country_India,Country_United Kingdom,Country_United States
Remote_Remote,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,70201.175042,7.142857,0.332736,0.76051,2187.325563,7.551106,0.142857,0.096035,0.189878,0.480175
1,87400.737001,10.12,0.443478,0.766957,1712.756522,7.855652,0.069565,0.097391,0.121739,0.662609


In [8]:
#T-tests
#stating which variables to test
continuous = ['Salary', 'YearsCodedJob']

#Where to store results
stat = {}
pvalue = {}

#Boolean masks for the two groups, built once and reused for every variable
not_remote = df['Remote_Remote'] == 0
is_remote = df['Remote_Remote'] == 1

#Loop
for x in continuous:
    #Select the tested column directly and drop missing values of THAT column only.
    #(Blanking rows with where() and then calling dropna() on the whole frame would also
    #discard rows that are merely missing a value in some unrelated column.)
    group1 = df.loc[not_remote, x].dropna()
    group2 = df.loc[is_remote, x].dropna()
    #Two-sample t-test with scipy's default settings (pooled / equal-variance form)
    stat[x], pvalue[x] = ss.ttest_ind(group1, group2)

#One row per tested variable, one column holding the p-value
ttests = pd.DataFrame.from_dict(pvalue, orient = 'index')
ttests.columns = ['pvalue']
print(ttests)

                     pvalue
Salary         1.057708e-22
YearsCodedJob  3.637316e-30


# Matching

In [9]:
#Isolate y, treat, confounders
#Outcome: career satisfaction score
y = df['CareerSatisfaction'].to_numpy()
#Treatment indicator, forced to 0/1 integers even if the dummy column is stored as bool
treat = df['Remote_Remote'].to_numpy(dtype = int)
#Covariate matrix: every remaining column. An explicit float dtype guarantees a numeric
#array; a plain .values on a frame mixing bool and numeric columns yields an object array.
confounders = df.drop(columns = ["Remote_Remote",
                                 "CareerSatisfaction"]).to_numpy(dtype = float)

In [10]:
#Matching
#Build the causal model from outcome, treatment indicator and covariate matrix
model = CausalModel(y, treat, confounders)
#Nearest-neighbour matching with the library defaults written out
#(inverse-variance weighting, one match per unit) plus the bias adjustment
model.est_via_matching(weights = 'inv', matches = 1, bias_adj = True)
#Print the table of treatment effect estimates
print(model.estimates)

  return np.linalg.lstsq(X, Y)[0][1:]  # don't need intercept coef



Treatment Effect Estimates: Matching

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE      0.114      0.130      0.881      0.378     -0.140      0.368
           ATC      0.106      0.136      0.778      0.436     -0.161      0.372
           ATT      0.187      0.142      1.321      0.187     -0.090      0.464



# Robustness check

In [11]:
#Remove 1 confounder
#Same covariate set as the main specification, minus "Hobby".
#An explicit float dtype guarantees a numeric matrix; a plain .values on a frame mixing
#bool and numeric columns yields an object array.
confounders = df.drop(columns = ["Remote_Remote",
                                 "CareerSatisfaction",
                                 "Hobby"]).to_numpy(dtype = float)

In [12]:
#Matching
#Re-fit the causal model on whatever covariate matrix `confounders` currently holds
model = CausalModel(y, treat, confounders)
#Nearest-neighbour matching with the library defaults written out
#(inverse-variance weighting, one match per unit) plus the bias adjustment
model.est_via_matching(weights = 'inv', matches = 1, bias_adj = True)
#Print the table of treatment effect estimates
print(model.estimates)


Treatment Effect Estimates: Matching

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE      0.148      0.131      1.128      0.259     -0.109      0.406
           ATC      0.140      0.138      1.013      0.311     -0.131      0.411
           ATT      0.220      0.137      1.602      0.109     -0.049      0.488

