<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Init" data-toc-modified-id="Init-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Init</a></span><ul class="toc-item"><li><span><a href="#Helper-functions" data-toc-modified-id="Helper-functions-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Helper functions</a></span></li></ul></li><li><span><a href="#Examine-survey" data-toc-modified-id="Examine-survey-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Examine survey</a></span></li><li><span><a href="#Get-relevant-Nielsen-products" data-toc-modified-id="Get-relevant-Nielsen-products-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Get relevant Nielsen products</a></span><ul class="toc-item"><li><span><a href="#Import-purchased-products" data-toc-modified-id="Import-purchased-products-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Import purchased products</a></span></li><li><span><a href="#Import-product-data" data-toc-modified-id="Import-product-data-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Import product data</a></span></li><li><span><a href="#Drop-nonfood-products" data-toc-modified-id="Drop-nonfood-products-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Drop nonfood products</a></span></li></ul></li><li><span><a href="#Get-nutrition-data" data-toc-modified-id="Get-nutrition-data-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Get nutrition data</a></span></li><li><span><a href="#Impute-data-for-missings" data-toc-modified-id="Impute-data-for-missings-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Impute data for missings</a></span><ul class="toc-item"><li><span><a href="#Products-extra" data-toc-modified-id="Products-extra-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Products extra</a></span></li><li><span><a href="#Prepare-frames" data-toc-modified-id="Prepare-frames-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>Prepare frames</a></span></li><li><span><a href="#Impute" 
data-toc-modified-id="Impute-5.3"><span class="toc-item-num">5.3&nbsp;&nbsp;</span>Impute</a></span></li></ul></li><li><span><a href="#Nutrients-per-trip" data-toc-modified-id="Nutrients-per-trip-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Nutrients per trip</a></span></li><li><span><a href="#Nutrients-per-week" data-toc-modified-id="Nutrients-per-week-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Nutrients per week</a></span><ul class="toc-item"><li><span><a href="#Collapse" data-toc-modified-id="Collapse-7.1"><span class="toc-item-num">7.1&nbsp;&nbsp;</span>Collapse</a></span></li><li><span><a href="#Merge-treatment-info" data-toc-modified-id="Merge-treatment-info-7.2"><span class="toc-item-num">7.2&nbsp;&nbsp;</span>Merge treatment info</a></span></li></ul></li><li><span><a href="#Summary-table" data-toc-modified-id="Summary-table-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Summary table</a></span><ul class="toc-item"><li><span><a href="#How-many-imputed-items" data-toc-modified-id="How-many-imputed-items-8.1"><span class="toc-item-num">8.1&nbsp;&nbsp;</span>How many imputed items</a></span></li></ul></li><li><span><a href="#Sun-and-Abraham" data-toc-modified-id="Sun-and-Abraham-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Sun and Abraham</a></span></li></ul></div>

# assemble_data

By: Zachary A. Goodman

Updated: 2021-05

This script outputs all data ready to be analyzed in `esa_analysis`. 

## Init

In [None]:
clear all

* set working directory: move up one level so the relative paths below resolve
qui cd ../

* set global directories
global raw_data ./data/raw_data
global temp_data ./data/temp_data
global gen_data ./data/gen_data
global figures ./tex/figures
global tables ./tex/tables

* check dependencies: stop here if any user-written command used below is missing
which gtools    // provides gunique and gcollapse
which unique    // used when counting distinct ids
which ereplace  // used when overwriting variables with egen functions

### Helper functions

In [None]:
* helper: (re)create a frame and make it the working frame

capture program drop create_frame
program define create_frame
    /*
    Usage: create_frame name
    Drops any existing frame called `name', creates an empty frame with
    that name, and leaves it as the current frame.
    */
    args name

    * step out to the default frame first so `name' can be dropped even
    * when it is the frame we are currently in
    frame change default
    capture frame drop `name'
    frame create `name'
    frame change `name'
end

In [None]:
* program: load the cleaned survey into the current frame and tidy it

capture program drop get_survey
program define get_survey
    * start from empty data, then read the cleaned survey export
    clear
    import delimited $gen_data/survey_cleaned.csv, varnames(1)

    * report the number of distinct respondents, then rename the id to
    * household_code, the key used for linking and merging further down
    unique hhid
    rename hhid household_code

    * parse the string dates into Stata daily dates and show them as dates
    gen date_survey = date(date_final_survey, "YMD")  
    gen date_rebate = date(date_receipt, "YMD")
    format %td date_survey date_rebate

    * discard columns that are not used again, including the raw date strings
    drop ans_* date_p* date_final_survey date_receipt paper

    * payment amount (q8a) and delivery method (q8), the latter labelled
    gen esp_amount = q8a
    gen esp_method = q8
    label define method 1 "Direct deposit" 2 "Paper check" 3 "Don't Know"
    label values esp_method method
end

## Examine survey

- q6: Has your hh received a rebate? 1 = Yes, 2 = No but should, 3 = No and unsure, 4 = Def no, 5 = unsure

In [None]:
* load the survey into the current frame and tabulate the delivery method (q8)
get_survey
tab esp_method

## Get relevant Nielsen products
Strategy:
1. import purchased products in 2007–2009 and drop duplicates
2. left merge product data
3. drop nonfood products

### Import purchased products

In [None]:
* import household codes by panelists who complete survey

create_frame panelists

qui get_survey
* counts is 1 for every respondent; the trips cell pulls it through
* frlink/frget and uses "non-missing counts" as the in-survey flag
gen counts = 1

* plot the number of rebates by receipt date for q8 == 1 and q8 == 2;
* preserve/restore keeps the respondent-level rows in the panelists frame
preserve
collapse (sum) counts, by(date_rebate q8)
twoway (line counts date_rebate if q8 == 1) (line counts date_rebate if q8 == 2) 

restore

* leave the panelists frame populated and step back to default
cwf default

In [None]:
* get necessary trips, i.e. those of surveyed households

create_frame trips

* accumulate the years in the (initially empty) trips frame: each pass
* preserves what has been collected so far, builds one year's subset in its
* place, parks that subset in a tempfile, restores, and appends it
forvalues year = 2007/2009 {
    capture restore
    preserve
    clear
    di "getting `year'..."
    
    * load one year's worth of trips
    * colrange(1:2) reads only the first two columns of the file; the code
    * below relies on them being trip_code_uc and household_code
    import delimited "$raw_data/hms/`year'/trips_`year'.tsv", varnames(1) colrange(1:2)

    * keep only those by surveyed panelists
    * (counts is non-missing only for households found in the panelists frame)
    frlink m:1 household_code, frame(panelists)
    frget counts, from(panelists)
    keep if !mi(counts)
    drop counts
    
    * report distinct household-trip pairs for this year
    gunique household_code trip_code_uc
    
    tempfile temp
    qui save `temp', replace
    restore
    append using `temp'
}

* same report for the stacked years
gunique household_code trip_code_uc

save $temp_data/unique_trips_0709, replace

In [None]:
* import all purchased upcs, keep one copy of each
* keep only purchases for necessary trips

create_frame purchases

clear
tempfile temp

* same preserve / build / restore / append pattern as the trips cell
forvalues year = 2007/2009 {
    capture restore
    preserve
    clear
    di "getting `year'..."
    
    * import one year's purchases
    * colrange(1:2) reads only the first two columns; the code below relies
    * on them being trip_code_uc and upc
    import delimited "$raw_data/hms/`year'/purchases_`year'.tsv", varnames(1) colrange(1:2)
    
    * link trips 
    frlink m:1 trip_code_uc, frame(trips)
    frget household_code, from(trips)
    
    * keep only relevant trips
    keep if !mi(household_code)
    
    * keep one copy of each upc
    bys upc: keep if _n == 1
    
    * save only upc code
    * NOTE(review): the link variable created by frlink above is not dropped
    * here, so it is saved alongside upc
    drop household_code trip_code_uc
    qui save `temp', replace
    restore
    append using `temp'
}

bys upc: keep if _n == 1 // why? duplicates across years. Dropping inside loop for speed

save $temp_data/unique_purchased_upcs_0709.dta, replace

In [None]:
* report the number of distinct purchased upcs
unique upc

### Import product data

In [None]:
create_frame products

* bindquote(nobind) makes import delimited treat double quotes as ordinary
* characters rather than field delimiters
import delimited "$raw_data/hms/master/products.tsv", varnames(1) bindquote(nobind)

* keep only necessary variables
* (names such as upc_desc rely on Stata's variable-name abbreviation)
keep upc upc_ver_uc product_module_code product_group_code department_code ///
    brand_code_uc multi size1_code_uc size1_amount size1_units ///
    upc_desc brand_desc product_module_desc product_group_desc department_desc
* TODO: drop desc vars

* keep only one copy of each product
* (version 1 of each upc; the bys prefix also leaves the data sorted by upc)
bys upc: keep if upc_ver_uc == 1
drop upc_ver_uc


/* left merge product data on purchases. 
Note that all purchases have a matching product, so keeping matching
observations only is a left merge. */

merge 1:1 upc using $temp_data/unique_purchased_upcs_0709.dta, keep(3) nogen
gunique upc
save $temp_data/unique_purchased_products_0709.dta, replace

### Drop nonfood products

In [None]:
* drop health and beauty, non food grocery, alcohol, general merchandise,
*  and magnet data
local nonfood_departments 0, 7, 8, 9, 99
drop if inlist(department_code, `nonfood_departments')

* product modules to exclude, one list per category
local baby_food 1274, 1272, 1276, 1282
local pet_food 1313, 1303, 1311, 1309, 1306, 1300, 1310, 1304, 1299, 1301
* ice, cooking wine, fruit protectors
local ice_wine_protectors 2610, 1189, 1448
* unclassified non-food
local unclassified_nonfood 902, 914

foreach category in baby_food pet_food ice_wine_protectors unclassified_nonfood {
    drop if inlist(product_module_code, ``category'')
}

* sanity checks: products without a module code, and upc uniqueness
count if mi(product_module_code)
gunique upc

save $temp_data/relevant_food_products_0709.dta, replace

## Get nutrition data

In [None]:
create_frame nutrient

* load nutrition data
use $temp_data/nutrition.dta, clear 

* upc is a string ending in a check digit: strip the last character, then
* convert to numeric so it can be matched to the purchased upcs
replace upc = substr(upc, 1, length(upc) - 1)
destring upc, replace

/* after thoroughly checking the data, negative signs in front of nutrient
quantities are unintentional, so keep the magnitudes. */
foreach v of varlist quantity pct {
    replace `v' = abs(`v') if `v' < 0
}

* rows without a quantity carry no usable information
drop if mi(quantity)

* match to the purchased upcs; keep(2 3) retains matches plus purchased
* upcs that have no nutrition record
merge m:1 upc using $temp_data/unique_purchased_upcs_0709.dta, keep(2 3)

* set aside the purchased upcs without nutrition for later imputation
preserve
keep if _merge == 2
keep upc
save $temp_data/unique_upcs_tobeimputed_0709.dta, replace
restore

* continue with the matched rows only
keep if _merge == 3
drop _merge

In [None]:
* normalise nutrient labels: strip spaces and lowercase, then shorten the
* long names to the stubs used for the wide variables later on
replace nutrient = lower(subinstr(nutrient, " ", "", .))

local long_names dietaryfiber totalcarbohydrate saturatedfat totalfat
local short_names fiber carbs satfat fat
forvalues i = 1/4 {
    local from : word `i' of `long_names'
    local to : word `i' of `short_names'
    replace nutrient = "`to'" if nutrient == "`from'"
}

// tab nutrient

In [None]:
* fix uom errors - all corrections were manually checked after looking up the products online
// tab uom
* the list is split across two inlist() calls; quantities are left untouched,
* only the unit label is overwritten
* NOTE(review): "oz" is relabelled as grams without converting the quantity -
* presumably covered by the manual check above; confirm
replace uom = "g" if inlist(uom, "0", "4", "G", "Null", "g ", "g0", "gf", "m") | ///
    inlist(uom, "g    ", "gr", "gm", "h", "g8", "oz")

* put all nutrition units in grams
* the quantity is rescaled first, while uom still identifies the rows
* NOTE(review): "mEq" is rescaled with the same factor as mg - confirm
replace quantity = quantity/1000 if inlist(uom, "mg", "mg ", "mEq", "Mg")
replace uom = "g" if inlist(uom, "mg", "mg ", "mEq", "Mg")

* manual fixes
* each pair sets the quantity first and then normalises the unit, because the
* unit string is what identifies the affected rows
replace quantity = 13 if uom == "4g"
replace uom = "g" if uom == "4g"
replace quantity = 0.4 if uom == "9"
replace uom = "g" if uom == "9"
replace quantity = 0 if uom == "nil"
replace uom = "g" if uom == "nil"

* fix absurdly high sodium
* (a quantity above 2 paired with a pct below 60 is divided by 1000)
// li itemname quantity pct if quantity > 2 & pct < 60 & nutrient == "sodium"
replace quantity = quantity/1000 if quantity > 2 & pct < 60 & nutrient == "sodium"

tab uom

count if mi(uom)
* those missing uom are calories, calories from fat, 0, or grams without label

In [None]:
* cross-check reported quantities against the reported percent daily value
* (divisors follow the FDA's DRV for macros)
capture program drop gen_pctcheck
program define gen_pctcheck
    /*
    (Re)creates pctcheck = quantity divided by a nutrient-specific divisor,
    for the nutrients listed below; all other rows are left missing.
    */
    capture drop pctcheck
    gen pctcheck = .

    * nutrient / divisor pairs
    tokenize carbs 2.75 cholesterol .03 fat .78 fiber .28 protein .50 ///
        satfat .20 sodium .023 sugars .50
    while "`1'" != "" {
        replace pctcheck = quantity/`2' if nutrient == "`1'"
        macro shift 2
    }
end

gen_pctcheck



* drop nutrition info for products whose pct are wildly off, i.e. the implied
* percentage is more than ten times the reported (positive) one

drop if !mi(pctcheck) & !mi(pct) & pct > 0 & pctcheck/10 > pct

In [None]:
* delete duplicate entries
* This can happen if products have nutrition facts for multiple serving sizes,
*  e.g. regular serving size and 100g
* We only need one of the two to get total grams.

* N = number of rows sharing the same upc-nutrient pair
bys upc nutrient: gen N = _N
* export the duplicated rows for manual review, without disturbing the data
preserve
keep if N > 1
drop N
sort upc description itemsize itemmeasure productweight servingsizetext servingsizeuom nutrient quantity
export excel $temp_data/uom_duplicates_202105.xlsx, firstrow(var) replace
restore

* remove every duplicated row, then bring back the hand-corrected versions
* from uom_duplicates_corrected.xlsx (a separate file from the export above)
drop if N > 1
preserve
import excel $temp_data/uom_duplicates_corrected.xlsx, firstrow clear
tempfile temp
qui save `temp'
restore
//drop N
* force lets the append proceed when string/numeric types differ between files
append using `temp', force

* check that upc-nutrient now identifies rows
gunique upc nutrient

In [None]:
* Clean servings per container variable

* load the lookup from raw servingspercontainer values to the cleaned spc
preserve
import excel $gen_data/spc_unique_corrected.xlsx, firstrow clear
tempfile temp
qui save `temp'
restore

* attach spc; keep(1 3) retains every nutrition row whether or not it matched
capture drop spc
merge m:1 servingspercontainer using `temp', nogen keep(1 3)
// drop servingspercontainer
* a value of zero is treated as missing
replace spc = . if spc == 0
* show the raw values that ended up without a cleaned spc
tab servingspercontainer if mi(spc)
drop servingspercontainer

In [None]:
* reshape wide

* first ensure spc constant within upc
* n ends up as the number of distinct spc values within each upc
qui bys upc spc: gen n = 1 if _n == 1
qui bys upc: ereplace n = sum(n)
count if n > 1
* NOTE(review): hard-coded resolution - every upc with exactly two distinct
* spc values is set to 16; re-check whenever the input data change
replace spc = 16 if n == 2

* check UoA
gunique upc nutrient

* keep only necessary vars
keep upc nutrient quantity spc

* rename vars
rename quantity grams

* reshape
* one row per upc; the wide variables are named <nutrient>grams
reshape wide @grams, i(upc) j(nutrient, string)

* rename
* calories are not measured in grams, so drop the suffix; sugars -> sugar
rename (caloriesgrams caloriesfromfatgrams sugarsgrams) (calories caloriesfromfat sugargrams)

In [None]:
* save data (one row per upc, nutrients in wide format)
save $temp_data/unique_upc_nutrients.dta, replace

## Impute data for missings

The imputation process is as follows (similar to Dubois, Griffith, and Nevo (2014) AER):
1. Direct UPC match
2. Impute within same product module, size type, product, brand, flavor, variety, type, formula, and style
3. Loosen brand restriction
4. Loosen flavor, variety, type, formula, style restrictions
5. Loosen product restriction
6. Manually impute 

### Products extra

We need this for flavors, etc.

In [None]:
* Stack the yearly products_extra files (version 1 of each upc only) and keep,
* for every upc, the row from the latest panel year

create_frame products_extra
tempfile stacked

forvalues year = 2007/2009 {
    import delimited "$raw_data/hms/`year'/products_extra_`year'.tsv", varnames(1) clear
    keep if upc_ver_uc == 1
    keep upc panel_year product* flavor* variety* type* formula* style* form*

    * from the second year on, add the years already stacked
    if `year' > 2007 append using `stacked'
    qui save `stacked', replace
}

* keep most recent year
bysort upc (panel_year): keep if _n == _N
unique upc
drop panel_year

save $temp_data/products_extra_0709.dta, replace

### Prepare frames

In [None]:
* prep products
create_frame products
use $temp_data/relevant_food_products_0709.dta

* merge products extra
* (relies on the products_extra frame still being in memory from the cell above)
frlink 1:1 upc, frame(products_extra)
frget *, from(products_extra)

* merge nutrition
* the nutrient frame is rebuilt from the saved wide file, then linked by upc;
* the link variable (named after the frame) is missing for unmatched products
create_frame nutrient
use $temp_data/unique_upc_nutrients.dta
cwf products
frlink 1:1 upc, frame(nutrient)
frget *, from(nutrient)

* rename frequently used vars
rename (product_module_code size1_units product_code brand_code_uc flavor_code variety_code ///
        type_code formula_code style_code) ///
    (pm size product brand flavor variety type formula style)

* replace missings as same code
* (codes whose description says "not stated / not applicable" become missing)
foreach v in product flavor variety type formula style {
    replace `v' = . if inlist(`v'_descr, "NOT STATED", ///
        "NOT APPLICABLE", "NA", "N/A")
}

* hotfix - TODO: remove this
* (these two departments are also in the drop list of the nonfood cell)
drop if inlist(department_code, 8, 99)

### Impute

In [None]:
* nutrient here is the link variable created by frlink to the nutrient frame:
* non-missing means the upc was found there
gen has_nutrition = !mi(nutrient)

* Impute = 1 -> direct match
* (0 = no nutrition yet; the rounds below set 2-5)
gen imputed = has_nutrition

In [None]:
* impute function

capture program drop impute_byid
program define impute_byid
    args id round
    /*
    Fills missing nutrient values with the mean over observations that share
    the same value of `id'.
        id     grouping variable
        round  value written to imputed for products first filled in this call
    */

    * calculate mean serving ratio within group
    * (servings per container relative to package size)
    capture drop servingratio meanservingratio
    gen servingratio = spc / size1_amount
    bys `id': egen meanservingratio = mean(servingratio)

    * impute each nutrient within group
    * the short names below resolve to the wide nutrient variables through
    * variable-name abbreviation
    local nutrients calories caloriesfromfat carbs fat fiber protein ///
        satfat sodium sugar transfat
    foreach nt of varlist `nutrients' {
        di "Imputing `nt'..."
        qui bysort `id': egen meannt = mean(`nt')
        * flag only products that had no nutrition at all so far (imputed == 0);
        * the value itself is filled for any row with a gap, so directly
        * matched or earlier-round products keep their original imputed code
        qui replace imputed = `round' if mi(`nt') & !mi(meannt) & imputed == 0
        replace `nt' = meannt if mi(`nt') & !mi(meannt)
        drop meannt
    }

    * impute servings per container
    * only for products flagged in this round, scaling the group's mean ratio
    * by the product's own size
    replace spc = meanservingratio * size1_amount if imputed == `round'

    end

In [None]:
* Impute = 2: within pm, size type, brand, product, flavor, etc.
* the missing option lets missing codes form groups of their own

capture drop groupid
egen groupid = group(pm size product brand flavor variety type formula style), missing

impute_byid groupid 2

In [None]:
* Impute = 3: relax brand

capture drop groupid
egen groupid = group(pm size product flavor variety type formula style), missing

impute_byid groupid 3

In [None]:
* Impute = 4: relax flavor, variety, ...
* (groups on product module, size unit and product only)

capture drop groupid
egen groupid = group(pm size product), missing

impute_byid groupid 4

In [None]:
* Impute = 5: relax product
* (groups on product module and size unit only)

capture drop groupid
egen groupid = group(pm size), missing

impute_byid groupid 5

In [None]:
* drop 'unclassified' product modules without any matches
* (module codes below 1000 that are still unimputed after all rounds)
drop if imputed == 0 & pm < 1000

In [None]:
* which PMs don't have matches
tab product_module_desc if imputed == 0

In [None]:
* distribution of imputation rounds (0 = none, 1 = direct match, 2-5 = rounds)
tab imputed

In [None]:
* save imputed nutrition data
save $temp_data/imputed_nutrition_0709.dta, replace

In [None]:
* export products missing any nutrition to csv for manual impute
* (the file written is an Excel workbook)

capture restore
preserve

* temp = 1 when at least one of the listed nutrients is missing
capture drop temp
gen temp = mi(calories, caloriesfromfat, carbs, fat, fiber, protein, ///
        satfat, sodium, sugar, transfat)
tab temp

keep if temp
export excel $temp_data/nutrient_manual_impute.xlsx, firstrow(var) replace

restore

In [None]:
/* bring back manually imputed data
Had to remove:
- Nonfood
- Generic food kits without descriptors
- Gift packages that have candy in them
*/

* drop the incomplete products, then append their hand-completed versions
drop if mi(calories, caloriesfromfat, carbs, fat, fiber, protein, ///
        satfat, sodium, sugar, transfat)
capture restore
preserve
tempfile temp
import excel $temp_data/nutrient_manual_impute_corrected.xlsx, firstrow clear
qui save `temp'
restore
append using `temp'

* save updated entries
* NOTE(review): the save below is commented out, so the file on disk is still
* the version written before the manual imputations were appended, and that
* file is what the nutrition frame loads later - confirm this is intended
// save $temp_data/imputed_nutrition_0709.dta, replace

##  Nutrients per trip

In this section, we do the following:
1. Divide the panel into batches
2. Load trips for a given batch, merging on household_cd (9.4M lines/year)
3. Merge purchases on trip_code_uc (64M lines/year)
4. Merge nutrition on UPC
5. Keep relevant nutrition variables
6. Collapse to trip level, taking sums over nutrition
7. Save batch and repeat

In [None]:
* nutrition frame: one row per upc with the variables needed at purchase level

create_frame nutrition
use $temp_data/imputed_nutrition_0709.dta

* store brand = "CTL BR" appears in either the upc or the brand description
gen storebrand = regexm(upc_descr, "CTL BR") | regexm(brand_descr, "CTL BR")

* keep only necessary vars
keep upc multi calories-transfatgrams imputed storebrand

In [None]:
* beverages frame: fluid ounces of alcohol and regular soda per upc
* todo - add other beverage volumes?

create_frame beverages
import delimited $raw_data/hms/master/products.tsv, varnames(1) bindquote(nobind) clear

* keep only necessary obs: selected departments, version 1 of each upc
keep if inlist(department_code, 1, 2, 3, 8) & upc_ver_uc == 1

* keep only necessary cols
keep upc multi department_desc product_group_desc product_module_descr size1_amount size1_units

* alcohol volume in fluid ounces, converting from the package's size unit
local alcohol department_desc == "ALCOHOLIC BEVERAGES"
local units OZ LI ML
local to_oz 1 33.814 0.033814
gen oz_alcohol = .
forvalues i = 1/3 {
    local unit : word `i' of `units'
    local factor : word `i' of `to_oz'
    replace oz_alcohol = size1_amount * `factor' if `alcohol' & size1_units == "`unit'"
}

* carbonated soft drinks, only when the size is already in ounces
gen oz_soda_regular = size1_amount if size1_units == "OZ" & product_module_desc == "SOFT DRINKS - CARBONATED"

* keep the identifiers and volumes, and only products with some volume
keep upc multi oz_alcohol oz_soda_regular
drop if mi(oz_alcohol) & mi(oz_soda_regular)

* rename multi for merge
rename multi multi2 // why? because of products omitted in nutrition data (e.g. alcohol)

In [None]:
* function to iteratively merge nutrition on purchases

capture program drop agg_batch
program agg_batch
    args batch_name
    /*
    Aggregates the purchases frame to one row per household-trip, working
    through the trips frame in batches.

    Argument
        batch_name  stem of the output files: each batch is saved as
                    <batch_name>_<b>.dta in the temp_data directory and the
                    stacked result as <batch_name>.dta in gen_data.

    Requires these frames: trips (trip_code_uc, household_code), purchases,
    nutrition and beverages (both keyed on upc). On exit the current frame
    is purchases_sub, holding the stacked trip-level data.
    */

    local batches = 5

    * get batchsize
    cwf trips
    qui sum trip_code_uc
    local batchsize = ceil(r(N) / `batches')
    local topmax = r(N)

    * loop over batches
    forvalues b = 1/`batches' {

        di "Batch #`b'..."
        * rows bottom..top of the trips frame make up this batch
        local bottom = 1 + (`b' - 1) * `batchsize'
        local top = min(`b' * `batchsize', `topmax')

        * keep subset of trips
        cwf default
        capture frame drop trips_sub
        frame copy trips trips_sub
        cwf trips_sub
        keep in `bottom'/`top'

        * copy purchases, keep those belonging to this batch's trips
        capture frame drop purchases_sub
        frame copy purchases purchases_sub
        cwf purchases_sub
        di "Getting purchases..."
        frlink m:1 trip_code_uc, frame(trips_sub)
        qui frget household_code, from(trips_sub)
        keep if !mi(household_code)

        * merge nutrition
        di "Getting nutrition..."
        frlink m:1 upc, frame(nutrition)
        qui frget *, from(nutrition)

        * merge beverages
        di "Getting beverages..."
        frlink m:1 upc, frame(beverages)
        qui frget *, from(beverages)
        * products absent from the nutrition frame (e.g. alcohol) take their
        * pack multiplier from the beverages frame
        replace multi = multi2 if mi(multi)
        drop multi2

        * generate additional vars
        di "Prepping vars..."
        qui gen items_scanned = 1 * quantity
        * imputed is non-missing only for upcs found in the nutrition frame
        qui gen items_food = !mi(imputed) * quantity
        * parentheses are required: * binds tighter than >, so the unbracketed
        * form compares coupon_value with 0*quantity and yields a 0/1 flag
        qui gen items_coupons = (coupon_value > 0 & !mi(coupon_value)) * quantity
        qui gen final_price_paid = total_price_paid - coupon_value
        qui gen final_price_paid_food = final_price_paid * (items_food > 0)
        qui rename coupon_value coupons_amount_saved
        qui rename deal_flag items_deals
        qui rename storebrand items_storebrand
        qui replace items_deals = items_deals * quantity
        qui replace items_storebrand = items_storebrand * quantity

        * TODO: sugar by product type (beverages, candy, cookies, store brand)

        * Prepare vars for collapsing
        * multiply relevant vars by quantity
        * NOTE(review): nutrient values are scaled by multi and quantity only,
        * not by servings per container - confirm the nutrient units
        foreach var of varlist calories-transfatgrams ///
            oz_alcohol-oz_soda_regular {
            qui replace `var' = `var' * multi * quantity
        }

        * imputed: items purchased by imputation round (0 = none ... 5)
        forvalues i = 0/5 {
            gen imputed`i' = (imputed == `i') * quantity
        }

        * keep relevant vars
        local collapsevars items_* final_price_paid* ///
            coupons_amount_saved imputed* ///
            calories-transfatgrams oz_*
        keep household_code trip_code_uc `collapsevars'
        * drop the raw round indicator so imputed* expands to the counts only
        drop imputed

        * Next, collapse to trip-level taking sums
        di "Collapsing..."
        gcollapse (sum) `collapsevars', by(household_code trip_code_uc)

        * Save data
        save $temp_data/`batch_name'_`b'.dta, replace

    }

    * append across batches
    * the last batch is still in memory, so add batches 1 to (batches - 1)
    di "Appending all batches..."
    local batch_less_one = `batches'-1
    forvalues b = 1/`batch_less_one' {
        append using $temp_data/`batch_name'_`b'.dta
    }

    save $gen_data/`batch_name'.dta, replace

    end

In [None]:
* loop over years

* unique_trips holds the sampled trips; insample is the flag pulled via frget
create_frame unique_trips
use $temp_data/unique_trips_0709.dta
gen insample = 1

forvalues year = 2007/2009 {
    
    di "Getting data for year = `year'..."
    
    * trips frame
    * (full import this time, all columns)
    create_frame trips
    import delimited "$raw_data/hms/`year'/trips_`year'.tsv", varnames(1)
    
    * keep relevant trips
    frlink 1:1 trip_code_uc, frame(unique_trips)
    frget insample, from(unique_trips)
    keep if !mi(insample)
    drop insample    
    
    * purchases frame
    create_frame purchases
    import delimited "$raw_data/hms/`year'/purchases_`year'.tsv", varnames(1) clear
    drop upc_ver_uc
    
    * run agg function
    * (reads the trips and purchases frames built above and writes
    * nutrition_trips_<year>.dta to the gen_data directory)
    agg_batch nutrition_trips_`year'

}

In [None]:
* report distinct households and trips in the data left in memory by the
* last agg_batch call
gunique household_code
gunique trip_code_uc

## Nutrients per week

### Collapse

In [None]:
* function to collapse trip-level data to the household-week level

capture program drop collapse_trip
program collapse_trip
    args year
    /*
    Reads nutrition_trips_<year>.dta from gen_data, attaches each trip's
    purchase date and total spent, sums everything by household, year and
    week, and saves nutrition_week_<year>.dta in temp_data.
    Works in the default frame and recreates the trips frame.
    */

    di "Collapsing `year' data to household-week level."
    
    * import trips-nutrition
    di "Getting trips-nutrition data..."
    cwf default
    use $gen_data/nutrition_trips_`year'.dta, clear

    * get trips data
    di "Merging trips data..."
    create_frame trips
    import delimited "$raw_data/hms/`year'/trips_`year'.tsv", varnames(1)
    keep trip_code_uc store_code_uc purchase_date total_spent

    * merge trips data
    cwf default
    frlink m:1 trip_code_uc, frame(trips)
    frget *, from(trips)

    * TODO anything with method of payment? Perhaps SNAP indicator

    * gen week indicators
    * week-of-date (week) includes year
    gen date = date(purchase_date, "YMD")
    format %td date
    gen week = wofd(date)
    gen year = year(date)

    * drop vars that do not remain constant within week
    * (trips here is the frlink link variable; the name is reused just below)
    drop date purchase_date trip_code_uc trips 

    * gen count var for shopping trips
    gen trips = 1

    * collapse taking sums
    * the range items_deals-oz_soda_regular relies on the variable order of
    * the saved trip-level file
    di "Collapsing..."
    gcollapse (sum) items_deals-oz_soda_regular trips total_spent, by(household_code year week)

    * save output
    save $temp_data/nutrition_week_`year'.dta, replace

    end

In [None]:
* collapse all years
* Build each year's household-week file unless it is already on disk, so the
* notebook runs from a clean checkout. Delete the file in $temp_data to
* force a rebuild after upstream changes.
forvalues year = 2007/2009 {
    capture confirm file "$temp_data/nutrition_week_`year'.dta"
    if _rc {
        collapse_trip `year'
    }
}

* append all years, current using data is 2009
di "Appending all years..."
use $temp_data/nutrition_week_2009.dta, clear
forvalues year = 2007/2008 {
    append using $temp_data/nutrition_week_`year'.dta
}

* collapse again (because late december purchases)
* the same household-week can appear in two yearly files
di "One last collapse..."
gcollapse (sum) items_deals-total_spent, by(household_code year week)

* save all
di "Saving data..."
save $gen_data/hh_year_week.dta, replace

### Merge treatment info

And relevant household info

In [None]:
* get surveyed panelists
create_frame panelists
qui get_survey

* gen week-of-year treatment var
* (wofd gives the Stata weekly date of the rebate receipt date)
gen week_treat = wofd(date_rebate)

* gen identifier for paper check
* (a missing q8 yields 0 on both indicators)
gen hh_method_paper = q8 == 2
gen hh_method_dd = q8 == 1

* ESP amount
gen hh_esp_amount = q8a

* liquidity indicator
gen hh_savings_two_months = q4 == 1

* financial plan
gen hh_financial_plan = q5 == 1

* preference for saving
gen hh_saving_preference = q3 == 2


* get panelists demographics for 2008 (year of ESP)
preserve
import delimited "$raw_data/hms/2008/panelists_2008.tsv", varnames(1) clear
rename household_cd household_code

* save and merge
tempfile temp
qui save `temp'
restore
merge 1:1 household_code using `temp'
* inspect the match rates, then keep respondents found in the 2008 panel
tab _m
keep if _m == 3
drop _m

In [None]:
* clean demographic vars

* 1 - 6+ HH members
gen hh_size = household_size
replace hh_size = 6 if hh_size > 6
label var hh_size "Household Size, top-coded at 6"

* Income in four brackets; everything not reassigned below stays in bracket 1
gen hh_income = 1 
replace hh_income = 2 if inrange(household_income, 17, 21)
replace hh_income = 3 if inrange(household_income, 23, 26)
* codes above 27 subdivide the top bracket, so they belong in bracket 4
* rather than falling through to bracket 1
replace hh_income = 4 if household_income >= 27 & !mi(household_income)
label var hh_income "Household Income"
* \$ keeps Stata from reading $35K etc. as a global macro reference
label define income 1 "<\$35K" 2 "\$35K - \$59,999" ///
    3 "\$60K - \$99,999" 4 ">\$100K"
label values hh_income income

* (Female) Age < 35, 35 - 49, 50 - 64, 65+
* use the male head's age when there is no female head (code 0)
gen hh_age = female_head_age
replace hh_age = male_head_age if hh_age == 0
* work on negated codes so that freshly assigned brackets (positive) can
* never be matched by the later, wider conditions
replace hh_age = -1*hh_age
replace hh_age = 4 if hh_age == -9
replace hh_age = 3 if hh_age <= -7
replace hh_age = 2 if hh_age <= -4
replace hh_age = 1 if hh_age < 0
label var hh_age "Age of the (female) head of household"
label define age 1 "<35" 2 "35 - 49" ///
    3 "50-64" 4 "65+"
label values hh_age age

* <HS, HS grad, some college, BA+
* use the male head's education when there is no female head (code 0)
gen hh_educ = female_head_educ
replace hh_educ = male_head_educ if hh_educ == 0 
* shift codes down by one (merging codes 1 and 2), then merge the top two
replace hh_educ = hh_educ - 1 if hh_educ != 1
replace hh_educ = 4 if hh_educ == 5
label var hh_educ "Education of the (female) head of household"
label define educ 1 "< HS" 2 "HS Grad" ///
    3 "Some College" 4 "BA+" 
label values hh_educ educ

* White, Black, Asian, Other
rename race hh_race 
label var hh_race "Racial identity of the household"
label define race 1 "White" 2 "Black" 3 "Asian" 4 "Other"
label values hh_race race

* Kids under 18 y/n
* (code 9 of the age-and-presence-of-children variable means no children)
gen hh_child = 0
replace hh_child = 1 if age_and != 9
label var hh_child "Indicates if any children < 18 in HH"
* the indicator is coded 0/1, so the label must be defined on 0 and 1
label define child 0 "No" 1 "Yes"
label values hh_child child

* Female <30 hrs, 30 - 34 hrs, >= 35 hrs, not employed (includes retired)
rename female_head_emp hh_emp_female
label var hh_emp_fem "Hours employment/week of female head of HH"
label define emp 1 "< 30" 2 "30 - 34" 3 ">= 35" 9 "Not employed" 0 "No head of this gender"
label values hh_emp_fem emp

* Male <30 hrs, 30 - 34 hrs, >= 35 hrs, not employed (includes retired)
rename male_head_emp hh_emp_male
label var hh_emp_male "Hours employment/week of male head of HH"
label values hh_emp_male emp

* Presence of Female head - captured by hh_emp_female (0 = no such head)
* Presence of Male head - captured by hh_emp_male (0 = no such head)


* keep only necessary vars
keep household_code projection_factor hh_* household_income wic_* week_treat

* save panelist info
save $gen_data/panelist_info.dta, replace

In [None]:
* list the variables kept for each panelist
desc

In [None]:
* add treatment status to hh_week data
* the range week_treat-hh_method_dd relies on the current variable order
keep household_code projection_factor week_treat-hh_method_dd
* keep(3): only household-weeks of surveyed panelists remain
merge 1:m household_code using $gen_data/hh_year_week.dta, keep(3) nogen
desc

In [None]:
* use tsset to balance panel

* drop year = 2006
drop if year == 2006

* set panel
tsset household_code week, weekly

* add zeros between min and max weeks of each person, otherwise missing
* tsfill inserts the absent weeks as rows; their outcomes are set to 0
tsfill
foreach v of varlist items_deals-total_spent {
    qui replace `v' = 0 if mi(`v')
}
* household-level variables are missing on the inserted rows: copy the
* household's value onto every row (min ignores missings)
sort household_code
foreach v of varlist week_treat-projection_factor {
    qui by household_code: ereplace `v' = min(`v')
}
drop year

* gen week diff
* (weeks relative to the week the rebate was received)
gen week_diff = week - week_treat

In [None]:
* keep only households observed on both sides of their treatment week

* first and last observed week relative to treatment, per household
capture drop min_period max_period
bysort household_code: egen min_period = min(week_diff)
bysort household_code: egen max_period = max(week_diff) if !mi(week_diff)

* households whose weeks all fall after, or all fall before, treatment
local only_after (min_period > 0 & !mi(min_period))
local only_before (max_period < 0 & !mi(max_period))
drop if `only_after' | `only_before'

drop min_period max_period

In [None]:
* save the household-week panel
save $gen_data/hh_year_week_panel.dta, replace

In [None]:
* check data
desc

In [None]:
* number of households, and whether household x relative week identifies rows
gunique household_code
gunique household_code week_diff

In [None]:
* distribution of weeks relative to treatment
tab week_diff

In [None]:
* summarise each household's first and last relative week: the household
* min/max is kept only on the row where it is attained, so sum sees one
* value per household
capture drop min_period max_period
bys household_code: egen min_period = min(week_diff)
bys household_code: egen max_period = max(week_diff) if !mi(week_diff)
replace min_period = . if min_period != week_diff
replace max_period = . if max_period != week_diff

sum min_period max_period
drop min_period max_period

In [None]:
* export to CSV for R

export delimited $gen_data/hh_year_week_panel.csv, replace