# Chapter 7

In [5]:
rm(list=ls())
library(foreign)

# Load raw data
dataR <- read.csv("Angrist_et_al_AER_2006.csv",head=TRUE,sep=",")

# Subset data, keeping if age >= 9 & age <= 25 & checkid == 1
dataS <- dataR[dataR$age >= 9 & dataR$age <= 25 & dataR$checkid == 1,]

# Fix NA
dataS$read[is.na(dataS$read)] <- 0

# having prepped the data for use, now attach the dataset locally
attach(dataS)

sex <- sex_name


The following objects are masked from dataS (pos = 3):

    age, checkid, match_i, match_i7, match_ic, match_ic7, math,
    mathcens1, mathcens10, phone, read, readcens1, readcens10,
    sex_name, vouch0, X

The following objects are masked from dataS (pos = 4):

    age, checkid, match_i, match_i7, match_ic, match_ic7, math,
    mathcens1, mathcens10, phone, read, readcens1, readcens10,
    sex_name, vouch0, X



In [6]:
# Generate a variable ("observed") indicating whether or not the unit is observed (r_i=1)
observed <- 1 - (read == 0)

# Use logistic regression to predict probabilities of being observed
probobs <- glm(observed~(vouch0*sex)+(vouch0*phone)+(vouch0*age),family=binomial(link="logit"))$fitted

# Compare distributions of predicted probabilities across experimental conditions
# Check to make sure that there are no zero predicted probabilities in either condition
summary(probobs[vouch0==0])

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
0.005258 0.090593 0.295256 0.302246 0.413661 0.887591 

In [7]:
summary(probobs[vouch0==1])

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
0.006938 0.237711 0.449397 0.375784 0.503739 0.872052 

In [8]:
# Generate weights: inverse of predicted probability of being observed
wt <- 1/probobs

In [9]:
# Restrict analysis to observed subjects.
sel_valid <- observed == 1
table(sel_valid)


sel_valid
FALSE  TRUE 
 2319  1223 

In [10]:
# Coefficients for unweighted regression (restricting analysis to observed subjects)
lm(read~vouch0,subset=sel_valid)$coefficients

In [11]:
# Coefficients for IPW regression (restricting analysis to observed subjects)
lm(read~vouch0,weights=wt,subset=sel_valid)$coefficients


## Stata

In [10]:
import delim Angrist_et_al_AER_2006, clear

// Subset data, keeping if age >= 9 & age <= 25 & checkid == 1
keep if age >= 9 & age <= 25 & checkid == 1




(15 vars, 4044 obs)

(502 observations deleted)


In [11]:
// Fix NA
//install dm67_4.pkg
qui nmissing

global mis_var=r(varlist) 

qui foreach var of varlist $mis_var {
replace `var' = 0 if missing(`var')
}

rename sex_name sex





. rename sex_name sex


In [12]:
// Generate a variable ("observed") indicating 
// whether or not the unit is observed (r_i=1)
gen observed = 1 - (read == 0)

recast int vouch0 sex age phone

gen vouch0sex = vouch0*sex
gen vouch0phone = vouch0*phone
gen vouch0age = vouch0*age

// Use logistic regression to predict probabilities of being observed
qui glm observed vouch0 sex phone age vouch0sex vouch0phone vouch0age, family(binomial)

predict probobs








(option mu assumed; predicted mean observed)


In [13]:
// Compare distributions of predicted probabilities across experimental conditions
// Check to make sure that there are no zero predicted probabilities in either condition
tabstat probobs, by(vouch0) stat(min p25 med mean p75 max) nototal


// Generate weights: inverse of predicted probability of being observed
gen wt=1/probobs



Summary for variables: probobs
     by categories of: vouch0 

  vouch0 |       min       p25       p50      mean       p75       max
---------+------------------------------------------------------------
       0 |  .0052578  .0905925  .2952563  .3022464  .4136614   .887591
       1 |  .0069381  .2377109  .4493973  .3757839  .5037394  .8720517
----------------------------------------------------------------------



In [14]:
// Restrict analysis to observed subjects.
gen sel_valid = observed == 1
tab sel_valid




  sel_valid |      Freq.     Percent        Cum.
------------+-----------------------------------
          0 |      2,319       65.47       65.47
          1 |      1,223       34.53      100.00
------------+-----------------------------------
      Total |      3,542      100.00


In [15]:
//Coefficients for unweighted regression (restricting analysis to observed subjects)

qui reg read vouch0 if sel_valid == 1
mat unweight = e(b)
mat li unweight





unweight[1,2]
       vouch0      _cons
y1  .68273776  46.920815


In [16]:
// Coefficients for IPW regression (restricting analysis to observed subjects)
qui reg read vouch0 [iw=wt] if sel_valid == 1
mat weighted = e(b)
mat li weighted





weighted[1,2]
       vouch0      _cons
y1  .72303035  46.437818
