tidymodels · topepo · Jun 29, 2023 · Jun 27, 2023 · Jun 27, 2023 · Jun 28, 2023
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,7 @@
-# modeldatatoo (development version)
+# modeldatatoo 0.1.0.9000
 
-* Added a `NEWS.md` file to track changes to the package.
+- Added a data set for predicting hotel rates. 
+
+# modeldatatoo 0.1.0
+
+First CRAN version
diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -13,11 +13,8 @@ template:
 reference:
   - title: Data Sets
     contents:
-    - data_ames
-    - data_animals
-    - data_chimiometrie_2019
-    - data_elevators
-    - attach_small_fine_foods
+    - starts_with("data_")
+    - starts_with("attach_")
   - title: Helper Functions
     contents:
     - internal_board

diff --git a/data-raw/hotel_rates.R b/data-raw/hotel_rates.R
@@ -0,0 +1,161 @@
+library(tidymodels)
+library(readr)
+library(janitor)
+library(textrecipes)
+library(lubridate)
+library(randomNames)
+library(pins)
+library(here)
+
+# ------------------------------------------------------------------------------
+
+tidymodels_prefer()
+options(pillar.advice = FALSE, pillar.min_title_chars = Inf)
+
+# ------------------------------------------------------------------------------
+
+# See "Hotel booking demand datasets"
+# https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=%22Hotel+booking+demand+datasets%22
+hotel_raw <-
+  readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-11/hotels.csv") %>%
+  as_tibble() %>%
+  mutate(
+    arrival_date = paste(arrival_date_year, arrival_date_month, arrival_date_day_of_month, sep = "_"),
+    arrival_date = ymd(arrival_date),
+    arrival_date_num = decimal_date(arrival_date),
+
+    market_segment = gsub("TA/TO", "_travel_agent", market_segment, fixed = TRUE),
+    market_segment = gsub("TA", "_travel_agent", market_segment),
+    market_segment = gsub("[[:space:]]", "", market_segment),
+
+    meal = case_when(
+      meal == "BB" ~ "Bed and Breakfast",
+      meal == "HB" ~ "breakfast and one other meal",
+      meal == "FB" ~ "breakfast lunch and dinner",
+      TRUE ~ "no meal package"
+    ),
+
+    near_christmas =
+      arrival_date_month == "December" &
+      arrival_date_day_of_month <= 26 &
+      arrival_date_day_of_month >= 24,
+    near_christmas = as.numeric(near_christmas),
+    near_new_years =
+      (arrival_date_month == "December" & arrival_date_day_of_month >= 30) |
+      (arrival_date_month == "January" & arrival_date_day_of_month <= 2),
+    near_new_years = as.numeric(near_new_years)
+  )
+
+# ------------------------------------------------------------------------------
+# instead of codes, use random names for agents and companies. Stratify by
+# ethnicity to avoid overlap
+
+
+agents <- tibble(agent = unique(hotel_raw$agent))
+
+set.seed(1)
+agents$fake_name <-
+  randomNames(
+    nrow(agents),
+    name.order = "first.last",
+    name.sep = "_",
+    ethnicity = c(1:2, 4:6), # reserve 3 for company names
+    sample.with.replacement = FALSE
+  )
+agents$fake_name <- gsub("[[:punct:]]", "_", tolower(agents$fake_name))
+agents$fake_name <- gsub("[[:space:]]", "_", tolower(agents$fake_name))
+agents$fake_name[agents$agent == "NULL"] <- "not_applicable"
+
+hotel_raw <-
+  left_join(hotel_raw, agents, by = "agent") %>%
+  mutate(agent = fake_name) %>%
+  select(-fake_name)
+
+###
+
+companies <- tibble(company = unique(hotel_raw$company))
+
+set.seed(2)
+companies$fake_name <-
+  randomNames(
+    nrow(companies),
+    ethnicity = 3,
+    which.names = "last",
+    sample.with.replacement = FALSE
+  )
+companies$fake_name <- gsub("[[:punct:]]", "_", tolower(companies$fake_name))
+companies$fake_name <- gsub("[[:space:]]", "_", tolower(companies$fake_name))
+types <- c("_llc", "_inc", "_and_company", "_pbc")
+types <- sample(types, nrow(companies), replace = TRUE)
+companies$fake_name <- paste0(companies$fake_name, types)
+companies$fake_name[companies$company == "NULL"] <- "not_applicable"
+
+hotel_raw <-
+  left_join(hotel_raw, companies, by = "company") %>%
+  mutate(company = fake_name) %>%
+  select(-fake_name)
+
+# ------------------------------------------------------------------------------
+# version for regression analysis
+
+hotel_rates_all <-
+  hotel_raw %>%
+  filter(
+    is_canceled == 0 &
+      adr > 15 &
+      adr < 2000 &
+      hotel == "Resort Hotel" &
+      reservation_status == "Check-Out" &
+      deposit_type == "No Deposit" &
+      !(market_segment %in% c("Complementary", "Undefined"))
+  ) %>%
+  select(-reservation_status, -is_canceled, avg_price_per_room = adr,
+         -reservation_status_date, -hotel, -arrival_date_month,
+         -deposit_type) %>%
+  mutate(year_day = yday(arrival_date)) %>%
+  relocate(avg_price_per_room) %>%
+  recipe() %>%
+  step_clean_levels(all_nominal()) %>%
+  prep() %>%
+  bake(new_data = NULL)
+
+# ------------------------------------------------------------------------------
+# pull off first year of data to compute historical ADR by day
+
+year_2016_data <-
+  hotel_rates_all %>%
+  filter(arrival_date <= min(arrival_date) + years(1))
+
+year_2016_stats <-
+  year_2016_data %>%
+  summarize(
+    hist_adr_raw = mean(avg_price_per_room),
+    hist_bookings = n(),
+    .by = year_day
+  ) %>%
+  arrange(year_day)
+
+year_2016_stats$historical_adr <-
+  loess(hist_adr_raw ~ year_day, data = year_2016_stats, span = .1, degree = 2)$fitted
+
+# Add a value for the leap year
+year_2016_stats_leap <-
+  tibble(year_day = 366, historical_adr = year_2016_stats$hist_adr_raw[nrow(year_2016_stats)])
+
+year_2016_stats <-
+  bind_rows(year_2016_stats, year_2016_stats_leap) %>%
+  select(year_day, historical_adr)
+
+hotel_rates <-
+  hotel_rates_all %>%
+  filter(arrival_date > min(arrival_date) + years(1)) %>%
+  left_join(year_2016_stats, by = "year_day") %>%
+  arrange(arrival_date) %>%
+  select(-arrival_date_year, -arrival_date_week_number)
+
+# ------------------------------------------------------------------------------
+
+board <- board_folder(here("pkgdown/assets/pins-board"), versioned = FALSE)
+board |> pin_write(hotel_rates, "hotel_rates", type = "rds")
+board |> write_board_manifest()
+
diff --git a/pkgdown/assets/pins-board/_pins.yaml b/pkgdown/assets/pins-board/_pins.yaml
@@ -6,5 +6,7 @@ chimiometrie_2019:
 - chimiometrie_2019/20230428T165546Z-e388a/
 elevators:
 - elevators/20230426T215431Z-8e16a/
+hotel_rates:
+- hotel_rates/20230627T201835Z-e8f9a/
 small_fine_foods:
 - small_fine_foods/20230427T000852Z-3bb6e/
diff --git a/pkgdown/assets/pins-board/hotel_rates/20230627T201835Z-e8f9a/data.txt b/pkgdown/assets/pins-board/hotel_rates/20230627T201835Z-e8f9a/data.txt
@@ -0,0 +1,9 @@
+file: hotel_rates.rds
+file_size: 166586
+pin_hash: e8f9a6097e497248
+type: rds
+title: 'hotel_rates: a pinned 15402 x 30 data frame'
+description: ~
+tags: ~
+created: 20230627T201835Z
+api_version: 1.0
diff --git a/pkgdown/assets/pins-board/hotel_rates/20230627T201835Z-e8f9a/hotel_rates.rds b/pkgdown/assets/pins-board/hotel_rates/20230627T201835Z-e8f9a/hotel_rates.rds