Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# modeldatatoo (development version)
# modeldatatoo 0.1.0.9000

* Added a `NEWS.md` file to track changes to the package.
- Added a data set for predicting hotel rates.

# modeldatatoo 0.1.0

First CRAN version
7 changes: 2 additions & 5 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,8 @@ template:
reference:
- title: Data Sets
contents:
- data_ames
- data_animals
- data_chimiometrie_2019
- data_elevators
- attach_small_fine_foods
- starts_with("data_")
- starts_with("attach_")
- title: Helper Functions
contents:
- internal_board
Expand Down
161 changes: 161 additions & 0 deletions data-raw/hotel_rates.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
library(tidymodels)
library(readr)
library(janitor)
library(textrecipes)
library(lubridate)
library(randomNames)
library(pins)
library(here)

# ------------------------------------------------------------------------------

tidymodels_prefer()
options(pillar.advice = FALSE, pillar.min_title_chars = Inf)

# ------------------------------------------------------------------------------

# See "Hotel booking demand datasets"
# https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=%22Hotel+booking+demand+datasets%22
hotel_raw <-
readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-11/hotels.csv") %>%
as_tibble() %>%
mutate(
arrival_date = paste(arrival_date_year, arrival_date_month, arrival_date_day_of_month, sep = "_"),
arrival_date = ymd(arrival_date),
arrival_date_num = decimal_date(arrival_date),

market_segment = gsub("TA/TO", "_travel_agent", market_segment, fixed = TRUE),
market_segment = gsub("TA", "_travel_agent", market_segment),
market_segment = gsub("[[:space:]]", "", market_segment),

meal = case_when(
meal == "BB" ~ "Bed and Breakfast",
meal == "HB" ~ "breakfast and one other meal",
meal == "FB" ~ "breakfast lunch and dinner",
TRUE ~ "no meal package"
),

near_christmas =
arrival_date_month == "December" &
arrival_date_day_of_month <= 26 &
arrival_date_day_of_month >= 24,
near_christmas = as.numeric(near_christmas),
near_new_years =
(arrival_date_month == "December" & arrival_date_day_of_month >= 30) |
(arrival_date_month == "January" & arrival_date_day_of_month <= 2),
near_new_years = as.numeric(near_new_years)
)

# ------------------------------------------------------------------------------
# instead of codes, use random names for agents and companies. Stratify by
# ethnicity to avoid overlap


agents <- tibble(agent = unique(hotel_raw$agent))

set.seed(1)
agents$fake_name <-
randomNames(
nrow(agents),
name.order = "first.last",
name.sep = "_",
ethnicity = c(1:2, 4:6), # reserve 3 for company names
sample.with.replacement = FALSE
)
agents$fake_name <- gsub("[[:punct:]]", "_", tolower(agents$fake_name))
agents$fake_name <- gsub("[[:space:]]", "_", tolower(agents$fake_name))
agents$fake_name[agents$agent == "NULL"] <- "not_applicable"

hotel_raw <-
left_join(hotel_raw, agents, by = "agent") %>%
mutate(agent = fake_name) %>%
select(-fake_name)

###

companies <- tibble(company = unique(hotel_raw$company))

set.seed(2)
companies$fake_name <-
randomNames(
nrow(companies),
ethnicity = 3,
which.names = "last",
sample.with.replacement = FALSE
)
companies$fake_name <- gsub("[[:punct:]]", "_", tolower(companies$fake_name))
companies$fake_name <- gsub("[[:space:]]", "_", tolower(companies$fake_name))
types <- c("_llc", "_inc", "_and_company", "_pbc")
types <- sample(types, nrow(companies), replace = TRUE)
companies$fake_name <- paste0(companies$fake_name, types)
companies$fake_name[companies$company == "NULL"] <- "not_applicable"

hotel_raw <-
left_join(hotel_raw, companies, by = "company") %>%
mutate(company = fake_name) %>%
select(-fake_name)

# ------------------------------------------------------------------------------
# version for regression analysis

hotel_rates_all <-
hotel_raw %>%
filter(
is_canceled == 0 &
adr > 15 &
adr < 2000 &
hotel == "Resort Hotel" &
reservation_status == "Check-Out" &
deposit_type == "No Deposit" &
!(market_segment %in% c("Complementary", "Undefined"))
) %>%
select(-reservation_status, -is_canceled, avg_price_per_room = adr,
-reservation_status_date, -hotel, -arrival_date_month,
-deposit_type) %>%
mutate(year_day = yday(arrival_date)) %>%
relocate(avg_price_per_room) %>%
recipe() %>%
step_clean_levels(all_nominal()) %>%
prep() %>%
bake(new_data = NULL)

# ------------------------------------------------------------------------------
# pull off first year of data to compute historical ADR by day

year_2016_data <-
hotel_rates_all %>%
filter(arrival_date <= min(arrival_date) + years(1))

year_2016_stats <-
year_2016_data %>%
summarize(
hist_adr_raw = mean(avg_price_per_room),
hist_bookings = n(),
.by = year_day
) %>%
arrange(year_day)

year_2016_stats$historical_adr <-
loess(hist_adr_raw ~ year_day, data = year_2016_stats, span = .1, degree = 2)$fitted

# Add a value for the leap year
year_2016_stats_leap <-
tibble(year_day = 366, historical_adr = year_2016_stats$hist_adr_raw[nrow(year_2016_stats)])

year_2016_stats <-
bind_rows(year_2016_stats, year_2016_stats_leap) %>%
select(year_day, historical_adr)

hotel_rates <-
hotel_rates_all %>%
filter(arrival_date > min(arrival_date) + years(1)) %>%
left_join(year_2016_stats, by = "year_day") %>%
arrange(arrival_date) %>%
select(-arrival_date_year, -arrival_date_week_number)

# ------------------------------------------------------------------------------

board <- board_folder(here("pkgdown/assets/pins-board"), versioned = FALSE)
board |> pin_write(hotel_rates, "hotel_rates", type = "rds")
board |> write_board_manifest()

2 changes: 2 additions & 0 deletions pkgdown/assets/pins-board/_pins.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,7 @@ chimiometrie_2019:
- chimiometrie_2019/20230428T165546Z-e388a/
elevators:
- elevators/20230426T215431Z-8e16a/
hotel_rates:
- hotel_rates/20230627T201835Z-e8f9a/
small_fine_foods:
- small_fine_foods/20230427T000852Z-3bb6e/
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
file: hotel_rates.rds
file_size: 166586
pin_hash: e8f9a6097e497248
type: rds
title: 'hotel_rates: a pinned 15402 x 30 data frame'
description: ~
tags: ~
created: 20230627T201835Z
api_version: 1.0
Binary file not shown.