forked from ajdamico/asdfree
-
Notifications
You must be signed in to change notification settings - Fork 0
/
recode and replicate.R
313 lines (229 loc) · 11 KB
/
recode and replicate.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
# analyze survey data for free (http://asdfree.com) with the r language
# survey of business owners
# 2007
# # # # # # # # # # # # # # # # #
# # block of code to run this # #
# # # # # # # # # # # # # # # # #
# library(downloader)
# setwd( "C:/My Directory/SBO/" )
# source_url( "https://raw.githubusercontent.com/ajdamico/asdfree/master/Survey%20of%20Business%20Owners/recode%20and%20replicate.R" , prompt = FALSE , echo = TRUE )
# # # # # # # # # # # # # # #
# # end of auto-run block # #
# # # # # # # # # # # # # # #
# contact me directly for free help or for paid consulting work
# anthony joseph damico
# ajdamico@gmail.com
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
###########################################################################################################
# prior to running this analysis script, the sbo 2007 file must be loaded as a database (.db) on the #
# local machine. running the 2007 download all microdata script will create this database file. #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# https://raw.githubusercontent.com/ajdamico/asdfree/master/Survey%20of%20Business%20Owners/download%20and%20import.R #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# that script will create a file "sbo07.db" with 'y' in C:/My Directory/SBO or wherever you put it. #
###########################################################################################################
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#############################################################################################################################################
# this script matches the results of the SAS code sent to me by the superstar Annie Leung at the united states census bureau. thanx a zil. #
# email: https://github.com/ajdamico/asdfree/blob/master/Survey%20of%20Business%20Owners/census%20emails%20regarding%20SBO%20PUMS.pdf #
# csv: https://github.com/ajdamico/asdfree/blob/master/Survey%20of%20Business%20Owners/PUMS_MIN_FINAL.CSV #
# code: https://github.com/ajdamico/asdfree/blob/master/Survey%20of%20Business%20Owners/pums%20code.sas #
#############################################################################################################################################
# uncomment this line by removing the `#` at the front..
# setwd( "C:/My Directory/SBO/" )
# ..in order to set your current working directory
# name the database folder that should have been saved in the working directory
SBO.dbname <- "sbo"
library(MonetDBLite)
library(DBI) # load the DBI package (implements the R-database coding)
library(mitools) # load mitools package (analyzes multiply-imputed data)
library(survey) # load survey package (analyzes complex design surveys)
library(downloader) # downloads and then runs the source() function on scripts from github
# load sbo-specific functions (a specially-designed series of multiply-imputed, hybrid-survey-object setup to match the census bureau's tech docs)
source_url( "https://raw.githubusercontent.com/ajdamico/asdfree/master/Survey%20of%20Business%20Owners/sbosvy%20functions.R" , prompt = FALSE )
# set R to produce conservative standard errors instead of crashing
# http://r-survey.r-forge.r-project.org/survey/exmample-lonely.html
options( survey.lonely.psu = "adjust" )
# this setting matches the MISSUNIT option in SUDAAN
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# note regarding sql commands and alternatives: the recodes below use basic commands from sql #
# this is necessary for computers with limited resources, since none of the data requires ram #
# if you'd prefer to read the entire dataset into ram instead, use the command #
# x <- dbReadTable( db , 'y' ) #
# or, to not overload RAM and get a subset of the columns in y, with survey variables #
# x <- dbGetQuery( db , 'select race1 , eth1 , pct1 , ... from y' ) #
# at which point, the `?transform` function can be used to make recodes on the `x` data.frame #
# but really, why bother? i've done all the work for you as a db-backed object. run w it. #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
##############################################################
# step 1: connect to the sbo data table you'd like to recode #
# then make a copy so you don't lose the pristine original. #
# name the database files in the "SIPP08" folder of the current working directory
dbfolder <- paste0( getwd() , "/" , SBO.dbname )
# connect to the MonetDBLite database (.db)
db <- dbConnect( MonetDBLite::MonetDBLite() , dbfolder )
# now simply copy you'd like to recode into a new table
dbSendQuery( db , "CREATE TABLE x AS SELECT * FROM y" )
# this action protects the original 'y' table from any accidental errors.
# at any point, we can delete this recoded copy of the data table using the command..
# dbRemoveTable( db , "x" )
# ..and start fresh by re-copying the pristine file from y
# whether or not you are recoding,
# it's a grrrrrrrrreat idea to keep a pristine table `y`
# and do everything on a separate `x` table that you don't
# haveta worryabout screwin' up. nice.
############################################
# step 2: make all of your recodes at once #
# from this point forward, all commands will only touch the
# 'x' table. the 'y' is now off-limits.
# # # # # # # # # # #
# start of recoding #
# # # # # # # # # # #
# add a new numeric column to the data table `x` called `pct_minority`
dbSendQuery( db , 'ALTER TABLE x ADD COLUMN pct_minority DOUBLE PRECISION' )
# fill it full of zeroes
dbSendQuery( db , 'UPDATE x SET pct_minority = 0' )
# loop through the numbers 1 - 4
for ( i in 1:4 ){
# build a character string, storing it in an object called `where.clause`
where.clause <-
# paste together..
paste0(
# paste together..
paste0(
# CHARINDEX with B thru S.
"CHARINDEX( " ,
c( "'B'" , "'A'" , "'I'" , "'P'" , "'S'" ) ,
" , race" ,
i ,
" )" ,
# collapsed by OR statements
collapse = " OR "
) ,
# appended with a separate OR CHARINDEX command..
" OR CHARINDEX( 'H' , eth" ,
i ,
" )"
)
# look at what you've done!
print( where.clause )
# send a sql command to your database that..
dbSendQuery(
db ,
paste0(
# ..updates all rows where the `where.clause` is true,
# adding pct_minority to the current `pct#` column
"UPDATE x SET pct_minority = pct_minority + pct" ,
i ,
" WHERE " ,
where.clause
)
)
# end uh dah loop.
}
# add a new character column called `tab` to the `x` data table
dbSendQuery( db , 'ALTER TABLE x ADD COLUMN tab STRING' )
# any business owned by less than half minorities is an `N`
dbSendQuery( db , "UPDATE x SET tab = 'N' WHERE pct_minority < 50" )
# any business exactly half-owned by minorities is an `E`
dbSendQuery( db , "UPDATE x SET tab = 'E' WHERE pct_minority = 50" )
# majority-minority-owned businesses are coded as `M` in the `tab` column.
dbSendQuery( db , "UPDATE x SET tab = 'M' WHERE pct_minority > 50" )
# boom. done recoding.
# # # # # # # # # #
# end of recoding #
# # # # # # # # # #
##############################################################################
# step 3: create the random groups tables, a kinda sorta multiple imputation #
# the survey of business owners uses this weird 'random groups' variable `rg`
# that's sorta kinda basically multiply-imputation, but not really.
# if you're ultra-curious and ultra-bored and ultra-smart, you can read more here
# http://www2.census.gov/econ/sbo/07/pums/2007_sbo_pums_users_guide.pdf#page=7
# loop through each of the ten random groups..
for ( i in 1:10 ){
# send another sql command that..
dbSendQuery(
db ,
# ..creates a table `x1` `x2` `x3` .. up to .. `x10`
paste0(
'CREATE TABLE x' ,
i ,
' AS SELECT * FROM x WHERE rg = ' ,
i
)
# that only contains records where the column `rg` equals the current iteration
)
}
#############################################################################
# step 4: create a new survey design object connecting to the recoded table #
#####################################################
# survey design for a hybrid database-backed object #
#####################################################
# create a survey design object with the SBO design
# to use for the coefficients: means, medians, totals, etc.
sbo.coef <-
svydesign(
id = ~1 ,
weight = ~tabwgt ,
data = 'x' ,
dbtype = "MonetDBLite" ,
dbname = dbfolder
)
# this one just uses the original table `x`
# create a survey design object with the SBO design
# to use for the variance and standard error
sbo.var <-
svydesign(
id = ~1 ,
weight = ~newwgt ,
data = imputationList( datasets = as.list( paste0( 'x' , 1:10 ) ) ) ,
dbtype = "MonetDBLite" ,
dbname = dbfolder
)
# this one uses the ten `x1` thru `x10` tables you just made.
# slap 'em together into a single list object..
sbo.svy <- list( coef = sbo.coef , var = sbo.var )
# ..and name that list object a `sbosvyimputationList`
# so the correct survey methods (in sbosvy functions.R) get used.
class( sbo.svy ) <- 'sbosvyimputationList'
########################################################
# end of hybrid database-backed survey object creation #
########################################################
##############################################
# step 5: time for fun. here's the analysis #
# run a single MIcombine() function
# outside a single with() function
# outside a single svyby() function
b <- MIcombine( with( sbo.svy , svyby( ~tab , ~fipst , svytotal ) ) )
# that counts the number of minority-owned businesses, by state
# print the result of the call..
b
# extract the statistics..
coef( b )
# ..or the standard errors..
SE( b )
# ..or even the relative standard errors
cv( b )
# ..or hey, merge 'em all together into a single data.frame
out <-
data.frame(
TABWGT = coef( b ) ,
UNADJ_VAR = diag( vcov( b ) ) / 1.992065 ,
ADJ_VAR = diag( vcov( b ) ) ,
RSE = round( cv( b ) * 100 )
)
# extract the state code, and instantly convert it to a number
out$fipst <- as.numeric( lapply( strsplit( rownames( out ) , ':' ) , "[[" , 1 ) )
# extract the `tab` variable
out$tab <- lapply( strsplit( rownames( out ) , ':' ) , "[[" , 2 )
# delete the rownames, since they're annoying and no longer helpful
rownames( out ) <- NULL
# sort the tabs by state
out <- out[ order( out$fipst ) , ]
# that matches the `PUMS_MIN_FINAL.CSV` file precisely.
# and that is a beautiful thing.
# note! big note!
# you can delete these eleven tables you just created easily,
# with this easy, easy, easy loop:
dbRemoveTable( db , 'x' )
for ( i in 1:10 ) dbRemoveTable( db , paste0( 'x' , i ) )