Permalink
Browse files

Add new txhousing dataset

  • Loading branch information...
1 parent df46b76 commit 72ee97434c9b71f343232d29e26fccf294d9402b @hadley hadley committed Aug 20, 2015
Showing with 8,715 additions and 0 deletions.
  1. +4 −0 NEWS
  2. +18 −0 R/ggplot2.r
  3. +64 −0 data-raw/tx-housing.R
  4. +8,603 −0 data-raw/tx-housing.csv
  5. BIN data/txhousing.rda
  6. +26 −0 man/txhousing.Rd
View
4 NEWS
@@ -1,6 +1,10 @@
ggplot2 1.0.1.9xxx
----------------------------------------------------------------
+* New `txhousing` dataset containing information about the Texas housing
+ market. Useful for examples that need multiple time series, and for
+ demonstrating model+vis methods.
+
* The default font size in `geom_text()` has been decreased from 5mm (14 pts)
to 3.8 mm (11 pts) to match the new default theme sizes.
View
@@ -183,3 +183,21 @@ NULL
#' \item{col}{Colour name}
#' }
"luv_colours"
+
+#' Housing sales in TX.
+#'
+#' Information about the housing market in Texas provided by the TAMU
+#' real estate center, \url{http://recenter.tamu.edu/}.
+#'
+#' @format A data frame with 8602 observations and 9 variables:
+#' \itemize{
+#' \item{city}{Name of MLS area}
+#' \item{year,month,date}{Date; \code{date} is \code{year + (month - 1) / 12}}
+#' \item{sales}{Number of sales}
+#' \item{volume}{Total value of sales}
+#' \item{median}{Median sale price}
+#' \item{listings}{Total active listings}
+#' \item{inventory}{"Months inventory": amount of time it would take to sell
+#' all current listings at current pace of sales.}
+#' }
+"txhousing"
View
@@ -0,0 +1,64 @@
+library(rvest)
+library(tidyr)
+library(readr)
+library(dplyr)
+
+# Find list of all pages -------------------------------------------------------
+# Fetch the index page and collect every anchor inside a `.threecol` element.
+root <- read_html("http://recenter.tamu.edu/Data/hs/")
+links <- html_nodes(root, ".threecol a")
+
+# One absolute URL per anchor, named by the anchor's text; hrefs are resolved
+# against the URL that `root` was read from.
+pages <- setNames(
+  url_absolute(html_attr(links, "href"), xml_url(root)),
+  html_text(links)
+)
+
+# Extract table from each page -------------------------------------------------
+# Convert every column of `df` to character and return the result.
+to_char <- function(df) {
+  as_strings <- lapply(df, as.character)
+  # Assigning into `df[]` replaces the column contents while `df` keeps its
+  # own class and attributes.
+  df[] <- as_strings
+  df
+}
+
+# Functional sequence: given a page URL, download and parse the page, pick its
+# first `.dataTable` node, and return that node as a data frame.
+# Uses read_html() (as the index scrape above does) rather than the deprecated
+# rvest::html().
+tamu_table <- . %>%
+  read_html() %>%
+  html_node(".dataTable") %>%
+  html_table()
+
+# Scrape each page; the result keeps the names of `pages`.
+tables <- lapply(pages, tamu_table)
+
+# For each table: drop its first row, turn every column into character, and
+# store the table's list name in a `city` column. Then stack all the tables
+# into a single data frame.
+data <- lapply(tables, function(tbl) to_char(tbl[-1, ])) %>%
+  Map(function(tbl, city) {
+    tbl$city <- city
+    tbl
+  }, ., names(.)) %>%
+  bind_rows() %>%
+  as_data_frame()
+
+# Cells that contain only "-" are treated as missing
+data[data == "-"] <- NA
+
+# Month abbreviations in calendar order, so position in the vector is the
+# month number.
+months <- c(
+  "Jan", "Feb", "Mar", "Apr", "May", "Jun",
+  "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
+)
+
+txhousing <- data %>%
+  # Every column is character at this point, so parse the measures to numbers
+  mutate(
+    Sales = parse_numeric(Sales),
+    DollarVolume = parse_numeric(DollarVolume),
+    AveragePrice = parse_numeric(AveragePrice),
+    MedianPrice = parse_numeric(MedianPrice),
+    TotalListings = parse_numeric(TotalListings),
+    MonthsInventory = parse_numeric(MonthsInventory)
+  ) %>%
+  # Split Date into an optional run of digits and a run of letters
+  extract(Date, c("Year", "Month"), "(\\d*)-?([a-zA-Z]*)", convert = TRUE) %>%
+  mutate(
+    # Rows without a year take the last year seen above them
+    Year = zoo::na.locf(ifelse(Year == "", NA, Year)),
+    Month = match(Month, months)
+  ) %>%
+  # AveragePrice is parsed above but deliberately not kept
+  select(
+    city, year = Year, month = Month, sales = Sales, volume = DollarVolume,
+    median = MedianPrice, listings = TotalListings, inventory = MonthsInventory
+  ) %>%
+  # Decimal year: month 1 maps to `year`, month 2 to `year + 1/12`, ...
+  mutate(date = year + (month - 1) / 12) %>%
+  filter(
+    # Don't need totals & Palestine is v. low quality
+    !(city %in% c("Texas Totals", "Palestine")),
+    # Reduce file size
+    year >= 2000
+  )
+
+# Save a plain-text copy next to this script, then the packaged .rda
+write.csv(
+  txhousing,
+  file = "data-raw/tx-housing.csv",
+  quote = FALSE,
+  row.names = FALSE
+)
+devtools::use_data(txhousing, overwrite = TRUE)
Oops, something went wrong.

0 comments on commit 72ee974

Please sign in to comment.