Add fairness analysis

1 parent 68a5ec2 commit 9f90fe62b385f584174a3fa3294bf78385e4dfe2 @tmoertel committed Mar 5, 2012
Showing with 303 additions and 0 deletions.
+303 −0 fairness.R
@@ -0,0 +1,303 @@
+#!/usr/bin/Rscript
+
+### Analysis of 2012/2013 real-estate assessments for Mt. Lebanon, Pennsylvania
+### Tom Moertel <tom@moertel.org>
+###
+### Sources:
+###
+### [1] Allegheny County 2012 "base-year" assessments
+### http://www2.county.allegheny.pa.us/RealEstate/Default.aspx
+###
+### [2] Allegheny County 2013 "reval" assessments
+### http://www2.alleghenycounty.us/reval/
+###
+### [3] Allegheny County residential real-estate sales data set from
+### Pittsburgh Neighborhood and Community Information System at
+### the University Center for Social and Urban Research
+### http://www.ucsur.pitt.edu/thepub.php?pl=370
+
+
+library(ggplot2)
+library(scales)
+library(plyr)
+library(reshape2)
+
+
+## Load the data sets
+
+## First: assessments
+baseval <- read.csv("data/mtlebo_baseval.csv", sep = "|") # 2012
+reval <- read.csv("data/mtlebo_reval.csv", sep = "|") # 2013 reassessment
+bothval <- merge(baseval, reval, by = "Parcel_ID", all = T) # combined
+bothval <- mutate(bothval, PIN = gsub("-", "", Parcel_ID),
+ State_Code.y = reorder(State_Code.y,
+ 0.0 - Total_Value_2013_Reval, sum, na.rm = T))
+
+## Second: residential real-estate sales
+re_sales <- read.csv("data/re_sales_2010_2011.csv", comment.char = "#")
+re_sales <- rename(re_sales, c(MAPBLOLOT = "PIN"))
+
+## Merge the data sets
+re_sales_bothval <- merge(re_sales, bothval, by = "PIN")
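+
+## Quick sanity check (an illustrative sketch; the exact counts depend
+## on the source files): how many sales records matched a parcel in the
+## combined assessment data?
+cat(sprintf("matched %d of %d sales records\n",
+ nrow(re_sales_bothval), nrow(re_sales)))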
+
+## Narrow to columns useful for comparison of old/new assessments
+re_comp <- with(re_sales_bothval,
+ data.frame(PIN = PIN,
+ Sold_For = SALEPRICE,
+ Was_Assessed_At = Total_Value_2012_Mkt,
+ Reassessed_At = Total_Value_2013_Reval))
+
+
+## Establish some cutoffs to exclude properties with low predictive value
+LOW_CUTOFF <- 50000 # excludes ~ 1.8% of properties
+HIGH_CUTOFF <- 1000000 # excludes ~ 0.2% of properties
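+
+## Rough check of the exclusion rates quoted above (illustrative; the
+## quoted figures may refer to the full assessment roll rather than
+## this sales sample):
+mean(re_comp$Sold_For <= LOW_CUTOFF, na.rm = T)
+mean(re_comp$Sold_For >= HIGH_CUTOFF, na.rm = T)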
+
+
+## Create a training set for relating assessed values to actual sales
+## prices for the same properties; we exclude fringe properties where
+## the sales data aren't dense enough to yield reliable predictions.
+## The resulting set comprises 863 properties.
+training_set <-
+ subset(re_comp,
+ # drop extreme properties, which have little predictive value
+ Reassessed_At > LOW_CUTOFF &
+ Was_Assessed_At > LOW_CUTOFF &
+ Sold_For > LOW_CUTOFF & Sold_For < HIGH_CUTOFF &
+ Reassessed_At < 2 * Was_Assessed_At # suggests unusual circumstances
+ )
+
+
+## Now we create some models to "capture" how the assessed values
+## related to actual market prices for our training set. We use
+## non-parametric additive models because we have no interest in
+## understanding the underlying assessment models and their
+## parameters; rather we want to record their predictions, warts and
+## all, so that we can "replay" them in reverse, applying them to
+## reassessed properties outside of our training set to estimate their
+## fair-market values. For example, if the houses in our training
+## set that sold for about $100,000 were reassessed for about $120,000
+## on average, our models will predict that homes reassessed at
+## $120,000 will generally sell for about $100,000 on the market.
+
+
+require(mgcv) # load library for generalized additive modeling
+
+
+## two smooths
+(price_gam <- gam(Sold_For ~
+ s(Was_Assessed_At, bs = "ad") +
+ s(Reassessed_At, bs = "ad"),
+ data = training_set,
+ family = Gamma(link = log),
+ method = "ML"))
+
+## one 2-var smooth
+(price_gam2 <- gam(Sold_For ~
+ s(Was_Assessed_At, Reassessed_At, bs = "ad"),
+ data = training_set,
+ family = Gamma(link = log),
+ method = "ML"))
+
+## one 2-var smooth with a larger basis dimension (k = 20)
+(price_gam3 <- gam(Sold_For ~
+ s(Was_Assessed_At, Reassessed_At, bs = "ad", k = 20),
+ data = training_set,
+ family = Gamma(link = log),
+ method = "ML"))
+
+
+## All 3 models fit about as well as could be hoped, but the 3rd model
+## fits a teensy bit better, so we prefer it, even though the results
+## we get later would be about the same with any of the models.
+anova(price_gam, price_gam2, price_gam3, test = "F")
+AIC(price_gam, price_gam2, price_gam3)
+
+summary(price_gam3)
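+
+## To illustrate the "replay in reverse" idea with made-up numbers (not
+## from the data set): ask the chosen model what a home assessed at
+## $100,000 under the old scheme and $120,000 under the new one would
+## be predicted to sell for.
+predict(price_gam3,
+ newdata = data.frame(Was_Assessed_At = 100000,
+ Reassessed_At = 120000),
+ type = "response")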
+
+
+## Plotting the smoothed surfaces of the first model is instructive.
+## It's easy to see that both old and new assessments are regressive
+## (i.e., they overvalue lower-valued properties and undervalue
+## higher-valued properties) and that the new assessments are even
+## more so than the old.
+par(mfrow = c(1,2))
+plot(price_gam, residuals = T, ylim = c(-1,2))
+
+
+
+## Additionally, we fit a nonparametric kernel-smoothing model that is
+## less constrained than the GAMs. This is pretty close to "just
+## letting the data speak." The downside is that we may overfit to
+## weirdness in the data set that is not predictive of general market
+## pricing. (That's why we also use our price_gam3 model for
+## comparison.)
+
+library(np)
+
+price_np <- npreg(log(Sold_For) ~ Was_Assessed_At + Reassessed_At,
+ data = training_set)
+
+summary(price_np)
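+
+## As a rough agreement check (illustrative), compare the two models'
+## fitted values on the training set; the np model fits log(Sold_For),
+## so we exponentiate to get back to dollars.
+cor(as.vector(predict(price_gam3, type = "response")),
+ exp(fitted(price_np)))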
+
+
+
+
+## Now we're going to predict actual market prices for Mt. Lebanon
+## residences. We're going to exclude about 2% of the most- and
+## least-expensive residences, for which our model didn't have
+## sufficient training data to yield reliable predictions. This
+## exclusion shouldn't affect our conclusions much because the bulk of
+## the residential tax base is still accounted for. Further,
+## residences account for most of Mt. Lebanon's property value and
+## the vast majority of Mt. Lebanon taxpayers, so we're only going to
+## consider unfairness among owners of residential property.
+##
+## > ddply(bothval, .(State_Code.y), summarize,
+## count = length(State_Code.y),
+## assess_val = sum(0.0 + Total_Value_2013_Reval, na.rm = T))
+##
+## State_Code.y count assess_val
+## 1 Residential 11399 2435232405
+## 2 Commercial 359 362086964
+## 3 Other 12 19775671
+## 4 Industrial 5 2562600
+## 5 Agricultural 1 192400
+## 6 Government 2 70800
+## 7 253 0
+
+## Grab residential properties
+mtlebo_props <-
+ with(subset(bothval, State_Code.y == "Residential"),
+ data.frame(PIN = PIN,
+ Was_Assessed_At = Total_Value_2012_Mkt,
+ Reassessed_At = Total_Value_2013_Reval))
+## Exclude fringe properties
+mtlebo_props <- subset(mtlebo_props,
+ Was_Assessed_At > 50000 &
+ Was_Assessed_At < 485000 & # ~ 1%
+ Reassessed_At > 66000 & # ~ 2%
+ Reassessed_At < 686000) # ~ 0%
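+
+## Fraction of residential parcels retained after trimming (roughly
+## 97%, per the percentages noted above):
+nrow(mtlebo_props) / sum(bothval$State_Code.y == "Residential", na.rm = T)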
+
+## Helper fn to estimate property values given a model and property set
+estimate_property_values <-
+ function(model = price_gam3, ds = mtlebo_props, yxform = identity) {
+ est_mkt_values <- yxform(predict(model, newdata = ds, type = "response"))
+ with(ds,
+ data.frame(PIN = PIN,
+ Old_Asm = 0.0 + Was_Assessed_At,
+ New_Asm = 0.0 + Reassessed_At,
+ Est_Mkt = 0.0 + est_mkt_values))
+ }
+
+
+## Estimate market prices for all Lebo homes using both GAM and KS models
+mtlebo_fair <- estimate_property_values(price_gam3, mtlebo_props)
+mtlebo_fair_np <- estimate_property_values(price_np, mtlebo_props, exp)
+
+## Also, for comparison, treat the sales data set as if it represented
+## its own, isolated community and prepare to calculate the tax fairness
+## in that community.
+sales_fair <- with(training_set,
+ data.frame(PIN = PIN,
+ Old_Asm = 0.0 + Was_Assessed_At,
+ New_Asm = 0.0 + Reassessed_At,
+ Est_Mkt = 0.0 + Sold_For))
+
+
+## The following function takes one of the above "_fair" data sets,
+## representing a community's properties, slices it into bands, and
+## determines how much each band pays in property taxes relative to
+## what it ought to pay if all homes were assessed ideally at their
+## fair-market prices.
+unfairness_table <- function(ds, bands = 20, steps = bands * 10) {
+ stopifnot(steps >= bands)
+ bandsize = 1 / bands
+ stepsize = 1 / steps
+ bands <- seq(bandsize, 1, by = stepsize)
+ ds <- ds[order(ds$Est_Mkt), ] # sort properties by market price
+ ds_c <- numcolwise(cumsum)(ds) # compute running totals
+ taxbases <- as.vector(tail(ds_c, 1)) # last = community-wide total
+
+ with(ds_c, {
+ ## For each band, we compute its fair share of taxes under the
+ ## assumption that each band is assessed at fair-market value.
+ fair_market_share <- ((quantile(Est_Mkt, bands) -
+ quantile(Est_Mkt, bands - bandsize)) /
+ taxbases$Est_Mkt)
+
+ ## Now we compute each band's share under the old and new
+ ## assessments.
+ old_asm_share <- ((quantile(Old_Asm, bands) -
+ quantile(Old_Asm, bands - bandsize)) /
+ taxbases$Old_Asm)
+
+ new_asm_share <- ((quantile(New_Asm, bands) -
+ quantile(New_Asm, bands - bandsize)) /
+ taxbases$New_Asm)
+
+ ## Finally, we compare the shares under old and new assessments
+ ## to the ideal fair-market shares.
+ data.frame(Est_Mkt = quantile(ds$Est_Mkt, bands - bandsize/2),
+ Old_Unf = old_asm_share / fair_market_share - 1,
+ New_Unf = new_asm_share / fair_market_share - 1)
+ })
+}
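+
+## Sanity check (illustrative): if assessments exactly equaled the
+## market estimates, every band's unfairness should be zero.
+ideal <- transform(mtlebo_fair, Old_Asm = Est_Mkt, New_Asm = Est_Mkt)
+summary(unfairness_table(ideal, 10))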
+
+## Now we use the function above to package tax-fairness estimates for
+## all three communities into one composite data table for easy comparison.
+unfairness_preds <-
+ rbind(data.frame(model = "gam", subject = "Mt. Lebanon",
+ unfairness_table(mtlebo_fair, 10)),
+ data.frame(model = "np", subject = "Mt. Lebanon",
+ unfairness_table(mtlebo_fair_np, 10)),
+ data.frame(model = "identity", subject = "Recent sales only",
+ unfairness_table(sales_fair, 10)))
+
+unfairness_preds_m <- melt(unfairness_preds, c("model", "subject", "Est_Mkt"))
+
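+## Relabel the melted variable levels for plotting; this relies on
+## melt() keeping the column order Old_Unf, New_Unf from
+## unfairness_table's output.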
+levels(unfairness_preds_m$variable) <- c("Old assessment", "New assessment")
+
+
+## Now we plot the unfairness trend for the imaginary isolated
+## community composed solely of recently sold Mt. Lebanon residences.
+p <-
+qplot(Est_Mkt, value,
+ data = subset(unfairness_preds_m, subject == "Recent sales only"),
+ main = paste(sep = "\n",
+ "If all of Lebo were exactly like recently sold homes,",
+ "low-value properties would be massively overtaxed"),
+ ylab = "Estimated overpaid taxes for properties of similar value",
+ xlab = "Fair-market property value",
+ geom = "line",
+ color = variable) +
+ scale_color_discrete(labels = c("Old", "New")) +
+ scale_x_continuous(labels = dollar_format()) +
+ scale_y_continuous(labels = percent_format()) +
+ labs(colour = "Assessment")
+
+ggsave(p, file = "/tmp/mtlebo-assessments-vs-isolated-sales-overtaxing.pdf",
+ useDingbats = F, width = 11, height = 8.5)
+
+
+## Here's the plot we've been waiting for: Estimated tax unfairness
+## for all Mt. Lebanon residences, under both old and new assessments.
+## We used two separate models to arrive at these estimates, so we
+## plot one model's trends with a solid line (gam), the other's with
+## a dashed (np = kernel smoothing).
+p <-
+qplot(Est_Mkt / 1000, value,
+ data = subset(unfairness_preds_m, subject == "Mt. Lebanon"),
+ main = "For Mt. Lebanon, New Assessments are More Unfair than Old",
+ ylab = "Estimated overpaid taxes for properties of similar value",
+ xlab = "Fair-market property value ($thousands)",
+ geom = "line",
+ color = variable,
+ linetype = model) +
+ scale_x_continuous(labels = dollar_format()) +
+ scale_y_continuous(labels = percent_format())
+
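+## (ggsave will stop with an error if the output directory is missing,
+## so create it up front; showWarnings = F keeps this quiet when "out/"
+## already exists.)
+dir.create("out", showWarnings = F)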
+ggsave(p, file = "out/mtlebo-reval-unfairness.pdf", width = 11, height = 7)
+
+ggsave(p, file = "out/mtlebo-reval-unfairness.png",
+ width = 11 * (2/3), height = 7 * (2/3))
