#!/usr/bin/Rscript

### Analysis of 2012/2013 real-estate assessments for Mt. Lebanon, Pennsylvania
### Tom Moertel <tom@moertel.org>
###
### Sources:
###
### [1] Allegheny County 2012 "base-year" assessments
###     http://www2.county.allegheny.pa.us/RealEstate/Default.aspx
###
### [2] Allegheny County 2013 "reval" assessments
###     http://www2.alleghenycounty.us/reval/
###
### [3] Allegheny County residential real-estate sales data set from
###     the Pittsburgh Neighborhood and Community Information System
###     at the University Center for Social and Urban Research
###     http://www.ucsur.pitt.edu/thepub.php?pl=370


library(ggplot2)
library(scales)
library(plyr)
library(reshape2)


## Load the data sets

## First: assessments
baseval <- read.csv("data/mtlebo_baseval.csv", sep = "|")    # 2012
reval   <- read.csv("data/mtlebo_reval.csv", sep = "|")      # 2013 reassessment
bothval <- merge(baseval, reval, by = "Parcel_ID", all = T)  # combined
bothval <- mutate(bothval,
                  PIN = gsub("-", "", Parcel_ID),
                  State_Code.y = reorder(State_Code.y,
                                         0.0 - Total_Value_2013_Reval,
                                         sum, na.rm = T))

## Second: residential real-estate sales
re_sales <- read.csv("data/re_sales_2010_2011.csv", comment.char = "#")
re_sales <- rename(re_sales, c(MAPBLOLOT = "PIN"))

## Merge the data sets
re_sales_bothval <- merge(re_sales, bothval, by = "PIN")

## Narrow to columns useful for comparison of old/new assessments
re_comp <- with(re_sales_bothval,
                data.frame(PIN = PIN,
                           Sold_For = SALEPRICE,
                           Was_Assessed_At = Total_Value_2012_Mkt,
                           Reassessed_At = Total_Value_2013_Reval))


## Establish some cutoffs to exclude properties w/ low predictive value
LOW_CUTOFF <- 50000      # excludes ~ 1.8% of properties
HIGH_CUTOFF <- 1000000   # excludes ~ 0.2% of properties
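
## (Quick check: one way to verify those rough percentages, e.g.,
## against the sale prices, is:
##   > mean(re_comp$Sold_For < LOW_CUTOFF)   # expect roughly 0.018
##   > mean(re_comp$Sold_For > HIGH_CUTOFF)  # expect roughly 0.002
## )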


## Create a training set for relating assessed values to actual sales
## prices for the same properties; we exclude fringe properties where
## the sales data aren't dense enough to yield reliable predictions.
## The resulting set comprises 863 properties.
training_set <-
  subset(re_comp,
         # drop extreme properties, which have little predictive value
         Reassessed_At > LOW_CUTOFF &
         Was_Assessed_At > LOW_CUTOFF &
         Sold_For > LOW_CUTOFF & Sold_For < HIGH_CUTOFF &
         Reassessed_At < 2 * Was_Assessed_At  # suggests unusual circumstances
         )
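
## (Sanity check: per the note above, this subset should contain 863
## rows:
##   > nrow(training_set)
##   [1] 863
## )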


## Now we create some models to "capture" how the assessed values
## related to actual market prices for our training set. We use
## non-parametric additive models because we have no interest in
## understanding the underlying assessment models and their
## parameters; rather, we want to record their predictions, warts and
## all, so that we can "replay" them in reverse, applying them to
## reassessed properties outside of our training set to estimate their
## fair-market values. For example, if the houses in our training
## set that sold for about $100,000 were reassessed for about $120,000
## on average, our models will predict that, on average, homes
## reassessed at $120,000 will generally sell for about $100,000
## on the market.
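##
## To make that concrete, a toy illustration (hypothetical inputs and
## output, not taken from the real data): once price_gam3 is fitted
## below, a call like
##
##   > predict(price_gam3, type = "response",
##             newdata = data.frame(Was_Assessed_At = 110000,
##                                  Reassessed_At   = 120000))
##
## should return a market-price estimate near 100000 if homes with
## those assessments sold for about that much in the training set.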


require(mgcv)  # load library for generalized additive modeling


## two smooths
(price_gam <- gam(Sold_For ~
                    s(Was_Assessed_At, bs = "ad") +
                    s(Reassessed_At, bs = "ad"),
                  data = training_set,
                  family = Gamma(link = log),
                  method = "ML"))

## one 2-var smooth
(price_gam2 <- gam(Sold_For ~
                     s(Was_Assessed_At, Reassessed_At, bs = "ad"),
                   data = training_set,
                   family = Gamma(link = log),
                   method = "ML"))

## one 2-var smooth, with a larger basis dimension (k = 20)
(price_gam3 <- gam(Sold_For ~
                     s(Was_Assessed_At, Reassessed_At, bs = "ad", k = 20),
                   data = training_set,
                   family = Gamma(link = log),
                   method = "ML"))


## All 3 models fit about as well as could be hoped, but the 3rd model
## fits a teensy bit better, so we prefer it, even though the results
## we get later would be about the same with any of the models.
anova(price_gam, price_gam2, price_gam3, test = "F")
AIC(price_gam, price_gam2, price_gam3)

summary(price_gam3)


## Plotting the smoothed surfaces of the first model is instructive.
## It's easy to see that both old and new assessments are regressive
## (i.e., overvaluing lower-valued properties and undervaluing
## higher-valued properties) and that the new assessments are even
## more so than the old.
par(mfrow = c(1, 2))
plot(price_gam, residuals = T, ylim = c(-1, 2))



## Additionally, we fit a nonparametric kernel-smoothing model that is
## less constrained than the GAMs. This is pretty close to "just
## letting the data speak." The downside is that we may overfit to
## weirdness in the data set that is not predictive of general market
## pricing. (That's why we also use our price_gam3 model for
## comparison.)

library(np)

price_np <- npreg(log(Sold_For) ~ Was_Assessed_At + Reassessed_At,
                  data = training_set)

summary(price_np)
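
## (As a rough agreement check between the kernel-smoothing fit and
## price_gam3, illustrative rather than part of the analysis proper,
## we can compare their fitted values on the training set:
##   > cor(as.vector(predict(price_gam3, type = "response")),
##         exp(fitted(price_np)))
## A correlation near 1 suggests the two models tell the same story.)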



## Now we're going to predict actual market prices for Mt. Lebanon
## residences. We're going to exclude about 2% of the most- and
## least-expensive residences, for which our model didn't have
## sufficient training data to yield reliable predictions. This
## exclusion shouldn't affect our conclusions much because the bulk of
## the residential tax base is still accounted for. Further,
## residences account for most of Mt. Lebanon's property value and
## the vast majority of Mt. Lebanon taxpayers, so we're only going to
## consider unfairness among owners of residential property.
##
## > ddply(bothval, .(State_Code.y), summarize,
##         count = length(State_Code.y),
##         assess_val = sum(0.0 + Total_Value_2013_Reval, na.rm = T))
##
##   State_Code.y count assess_val
## 1  Residential 11399 2435232405
## 2   Commercial   359  362086964
## 3        Other    12   19775671
## 4   Industrial     5    2562600
## 5 Agricultural     1     192400
## 6   Government     2      70800
## 7                253          0

## Grab residential properties
mtlebo_props <-
  with(subset(bothval, State_Code.y == "Residential"),
       data.frame(PIN = PIN,
                  Was_Assessed_At = Total_Value_2012_Mkt,
                  Reassessed_At = Total_Value_2013_Reval))
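
## (The exclusion rates noted below can be rechecked before
## subsetting, e.g.:
##   > mean(mtlebo_props$Reassessed_At <= 66000, na.rm = TRUE)  # ~ 2%
## )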
## Exclude fringe properties
mtlebo_props <- subset(mtlebo_props,
                       Was_Assessed_At > 50000 &
                       Was_Assessed_At < 485000 &  # ~ 1%
                       Reassessed_At > 66000 &     # ~ 2%
                       Reassessed_At < 686000)     # ~ 0%

## Helper fn to estimate property values given a model and property set
estimate_property_values <-
  function(model = price_gam3, ds = mtlebo_props, yxform = identity) {
    est_mkt_values <- yxform(predict(model, newdata = ds, type = "response"))
    with(ds,
         data.frame(PIN = PIN,
                    Old_Asm = 0.0 + Was_Assessed_At,
                    New_Asm = 0.0 + Reassessed_At,
                    Est_Mkt = 0.0 + est_mkt_values))
  }


## Estimate market prices for all Lebo homes using both GAM and KS models
mtlebo_fair <- estimate_property_values(price_gam3, mtlebo_props)
mtlebo_fair_np <- estimate_property_values(price_np, mtlebo_props, exp)

## Also, for comparison, treat the sales data set as if it represented
## its own, isolated community, and prepare to calculate the tax fairness
## in that community.
sales_fair <- with(training_set,
                   data.frame(PIN = PIN,
                              Old_Asm = 0.0 + Was_Assessed_At,
                              New_Asm = 0.0 + Reassessed_At,
                              Est_Mkt = 0.0 + Sold_For))


## The following function takes one of the above "_fair" data sets,
## representing a community's properties, slices it into bands, and
## determines how much each band pays in property taxes relative to
## what it ought to pay if all homes were assessed ideally at their
## fair-market prices.
unfairness_table <- function(ds, bands = 20, steps = bands * 10) {
  stopifnot(steps >= bands)
  bandsize <- 1 / bands
  stepsize <- 1 / steps
  bands <- seq(bandsize, 1, by = stepsize)
  ds <- ds[order(ds$Est_Mkt), ]         # sort properties by market price
  ds_c <- numcolwise(cumsum)(ds)        # compute running totals
  taxbases <- as.vector(tail(ds_c, 1))  # last = community-wide total

  with(ds_c, {
    ## For each band, we compute its fair share of taxes under the
    ## assumption that each band is assessed at fair-market value.
    fair_market_share <- ((quantile(Est_Mkt, bands) -
                           quantile(Est_Mkt, bands - bandsize)) /
                          taxbases$Est_Mkt)

    ## Now we compute each band's share under the old and new
    ## assessments.
    old_asm_share <- ((quantile(Old_Asm, bands) -
                       quantile(Old_Asm, bands - bandsize)) /
                      taxbases$Old_Asm)

    new_asm_share <- ((quantile(New_Asm, bands) -
                       quantile(New_Asm, bands - bandsize)) /
                      taxbases$New_Asm)

    ## Finally, we compare the shares under old and new assessments
    ## to the ideal fair-market shares.
    data.frame(Est_Mkt = quantile(ds$Est_Mkt, bands - bandsize/2),
               Old_Unf = old_asm_share / fair_market_share - 1,
               New_Unf = new_asm_share / fair_market_share - 1)
  })
}
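
## (Worked toy example, with made-up values not drawn from the data:
## in a four-home community where old assessments equal market values
## exactly, Old_Unf should be 0 in every band, while mildly regressive
## new assessments give a positive New_Unf in the low band and a
## negative New_Unf in the high band:
##   > toy <- data.frame(Old_Asm = c(100, 200, 300, 400) * 1000,
##                       New_Asm = c(120, 210, 290, 380) * 1000,
##                       Est_Mkt = c(100, 200, 300, 400) * 1000)
##   > unfairness_table(toy, bands = 2, steps = 2)
## )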

## Now we use the function above to package tax-fairness estimates for
## all three communities into one composite data table for easy comparison.
unfairness_preds <-
  rbind(data.frame(model = "gam", subject = "Mt. Lebanon",
                   unfairness_table(mtlebo_fair, 10)),
        data.frame(model = "np", subject = "Mt. Lebanon",
                   unfairness_table(mtlebo_fair_np, 10)),
        data.frame(model = "identity", subject = "Recent sales only",
                   unfairness_table(sales_fair, 10)))

unfairness_preds_m <- melt(unfairness_preds, c("model", "subject", "Est_Mkt"))

levels(unfairness_preds_m$variable) <- c("Old assessment", "New assessment")
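
## (melt() yields a `variable` factor whose levels are Old_Unf and
## New_Unf, in that column order, which is why the positional
## relabeling above is safe.)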


## Now we plot the unfairness trend for the imaginary isolated
## community composed solely of recently sold Mt. Lebanon residences
p <-
  qplot(Est_Mkt, value,
        data = subset(unfairness_preds_m, subject == "Recent sales only"),
        main = paste(sep = "\n",
                     "If all of Lebo were exactly like recently sold homes,",
                     "low-value properties would be massively overtaxed"),
        ylab = "Estimated overpaid taxes for properties of similar value",
        xlab = "Fair-market property value",
        geom = "line",
        color = variable) +
  scale_color_discrete(labels = c("Old", "New")) +
  scale_x_continuous(label = dollar_format()) +
  scale_y_continuous(label = percent_format()) +
  labs(colour = "Assessment")

ggsave(p, file = "/tmp/mtlebo-assessments-vs-isolated-sales-overtaxing.pdf",
       useDingbats = F, width = 11, height = 8.5)


## Here's the plot we've been waiting for: estimated tax unfairness
## for all Mt. Lebanon residences, under both old and new assessments.
## We used two separate models to arrive at these estimates, so we
## plot one model's trends with a solid line (gam) and the other's
## with a dashed line (np = kernel smoothing).
p <-
  qplot(Est_Mkt / 1000, value,
        data = subset(unfairness_preds_m, subject == "Mt. Lebanon"),
        main = "For Mt. Lebanon, New Assessments are More Unfair than Old",
        ylab = "Estimated overpaid taxes for properties of similar value",
        xlab = "Fair-market property value ($thousands)",
        geom = "line",
        color = variable,
        linetype = model) +
  scale_x_continuous(label = dollar_format()) +
  scale_y_continuous(label = percent_format())

ggsave(p, file = "out/mtlebo-reval-unfairness.pdf", width = 11, height = 7)

ggsave(p, file = "out/mtlebo-reval-unfairness.png",
       width = 11 * (2/3), height = 7 * (2/3))