Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update storms data to 2020 #6000

Merged
merged 5 commits into from
Sep 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ Suggests:
covr,
DBI,
dbplyr (>= 1.4.3),
ggplot2,
knitr,
Lahman,
lobstr,
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# dplyr (development version)

* `storms` data updated to 2020 (@steveharoz, #5899).

* `coalesce()` accepts 1-D arrays (#5557).

* `filter()` forbids matrix results (#5973) and warns about data frame
Expand Down
19 changes: 15 additions & 4 deletions R/data-storms.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
#'
#' This data is a subset of the NOAA Atlantic hurricane database best track
#' data, \url{https://www.nhc.noaa.gov/data/#hurdat}. The data includes the
#' positions and attributes of 198 tropical storms, measured every six hours
#' positions and attributes of storms from 1975-2020, measured every six hours
#' during the lifetime of a storm.
#'
#' @seealso The script to create the storms data set: \url{https://github.com/tidyverse/dplyr/blob/master/data-raw/storms.R}
#'
#' @format A tibble with 10,010 observations and 13 variables:
#' @format A tibble with 11,859 observations and 13 variables:
#' \describe{
#' \item{name}{Storm Name}
#' \item{year,month,day}{Date of report}
Expand All @@ -19,9 +19,20 @@
#' -1 = Tropical Depression, 0 = Tropical Storm)}
#' \item{wind}{Storm's maximum sustained wind speed (in knots)}
#' \item{pressure}{Air pressure at the storm's center (in millibars)}
#' \item{ts_diameter}{Diameter of the area experiencing tropical storm strength winds (34 knots or above)}
#' \item{hu_diameter}{Diameter of the area experiencing hurricane strength winds (64 knots or above)}
#' \item{tropicalstorm_force_diameter}{Diameter (in nautical miles) of the area experiencing tropical storm strength winds (34 knots or above)}
#' \item{hurricane_force_diameter}{Diameter (in nautical miles) of the area experiencing hurricane strength winds (64 knots or above)}
#' }
#' @examples
#'
#' # show a plot of the storm paths
#' if (requireNamespace("ggplot2", quietly = TRUE)) {
#' library(ggplot2)
#' ggplot(storms) +
#' aes(x=long, y=lat, color=paste(year, name)) +
#' geom_path() +
#' guides(color='none') +
#' facet_wrap(~year)
#' }
#'
#' storms
"storms"
92 changes: 54 additions & 38 deletions data-raw/storms.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@ library(tidyverse)
# in an unorthodox format: a csv that alternates between header/identifier rows
# and data rows.

# TO UPDATE: get the latest URL from https://www.nhc.noaa.gov/data/#hurdat, and rerun this code

# Read in data set so each line is a character string
storm_strings <- read_lines("http://www.nhc.noaa.gov/data/hurdat/hurdat2-1851-2015-070616.txt")
storm_file_complete <- read_file("https://www.nhc.noaa.gov/data/hurdat/hurdat2-1851-2020-052921.txt")
storm_strings <- read_lines(storm_file_complete)

# Identify the header lines that have three commas
library(stringr)
# which() returns the positions of the matching lines directly, avoiding the
# 1:length() index vector (which is c(1, 0) when the input is empty).
header_locations <- which(str_count(storm_strings, "\\,") == 3)

# Extract length of each sub-dataset
Expand All @@ -20,47 +22,60 @@ headers_df <- headers %>%
mutate(name = recode(name, "UNNAMED" = id), skip = header_locations) %>%
select(name, skip, n_obs)

# Read in the sub-datasets as data frames
df_names <- c(
"date", "time", "record_type", "status", "lat", "long", "wind", "pressure",
"extent_34_NE", "extent_34_SE", "extent_34_SW", "extent_34_NW",
"extent_50_NE", "extent_50_SE", "extent_50_SW", "extent_50_NW",
"extent_64_NE", "extent_64_SE", "extent_64_SW", "extent_64_NW", "nas"
# Column layout of a data row, in file order: six columns are read as
# character, every remaining column as integer.
character_columns <- c("date", "time", "record_type", "status", "lat", "long")

# extent_<34|50|64>_<NE|SE|SW|NW>: twelve columns, quadrants cycling fastest.
extent_columns <- paste0(
  "extent_",
  rep(c("34", "50", "64"), each = 4),
  "_",
  c("NE", "SE", "SW", "NW")
)
integer_columns <- c("wind", "pressure", extent_columns, "nas")

# Named list of readr collectors, one entry per column.
column_types <- c(
  lapply(character_columns, function(column) col_character()),
  lapply(integer_columns, function(column) col_integer())
)
names(column_types) <- c(character_columns, integer_columns)
column_names <- names(column_types)

storm_dfs <- vector("list", nrow(headers_df))
names(storm_dfs) <- headers_df$name

for (i in seq_along(headers_df$name)) {
storm_dfs[[i]] <- read_csv("data-raw/hurdat2.txt",
skip = headers_df$skip[i],
n_max = headers_df$n_obs[i],
col_names = df_names,
na = c("", "-99", "-999"),
col_types = list(
time = col_character(),
pressure = col_integer(),
extent_34_NE = col_integer(),
extent_34_SE = col_integer(),
extent_34_SW = col_integer(),
extent_34_NW = col_integer(),
extent_50_NE = col_integer(),
extent_50_SE = col_integer(),
extent_50_SW = col_integer(),
extent_50_NW = col_integer(),
extent_64_NE = col_integer(),
extent_64_SE = col_integer(),
extent_64_SW = col_integer(),
extent_64_NW = col_integer()
)
#### Parse each storm as its own sub-dataframe
# `headers_df$skip` is the position of a storm's header line in
# `storm_strings`; `headers_df$n_obs` is presumably the number of data lines
# that follow that header (it is computed outside this block -- confirm there).
# The result is a list with one data frame per storm, in header order.
storm_dataframes <- vector("list", nrow(headers_df))
# seq_len() instead of 1:nrow(): with zero headers, 1:0 would iterate over
# c(1, 0) and index rows that do not exist.
for (i in seq_len(nrow(headers_df))) {
  # get this storm's metadata
  row_start <- headers_df$skip[i] + 1
  row_end <- headers_df$n_obs[i] + row_start - 1
  # subset of rows belonging to this storm, joined into a single string;
  # the trailing newline makes sure read_csv() sees a newline even for a
  # one-row storm and so treats the string as literal data, not a file path
  data_subset <- storm_strings[row_start:row_end] %>%
    paste(collapse = "\n") %>%
    paste0("\n")
  data_subset <- read_csv(
    data_subset,
    col_names = column_names,
    col_types = column_types,
    na = c("", "-99", "-999")
  )
  # name at the front
  data_subset$name <- headers_df$name[i]
  data_subset <- data_subset %>% relocate(name)
  # add to list of storms
  storm_dataframes[[i]] <- data_subset
}

# Combine and clean the data sets
library(lubridate)

storms <- storm_dfs %>%
bind_rows(.id = "name") %>%
storms <- storm_dataframes %>%
bind_rows() %>%
mutate(
date = ymd(date),
year = year(date),
Expand All @@ -81,13 +96,13 @@ storms <- storm_dfs %>%
# wind = wind * 1.15078, # transforms knots to mph,
TSradius1 = extent_34_NE + extent_34_SW,
TSradius2 = extent_34_NW + extent_34_SE,
ts_diameter = pmax(TSradius1, TSradius2) * 1.15078, # to convert from nautical miles to miles
tropicalstorm_force_diameter = pmax(TSradius1, TSradius2),
HUradius1 = extent_64_NE + extent_64_SW,
HUradius2 = extent_64_NW + extent_64_SE,
hu_diameter = pmax(HUradius1, HUradius2) * 1.15078, # to convert from nautical miles to miles
hurricane_force_diameter = pmax(HUradius1, HUradius2),
status = recode(status, "HU" = "hurricane", "TS" = "tropical storm", "TD" = "tropical depression")
) %>%
select(name, year, month, day, hour, lat, long, status, category, wind, pressure, ts_diameter, hu_diameter)
select(name, year, month, day, hour, lat, long, status, category, wind, pressure, tropicalstorm_force_diameter, hurricane_force_diameter)

# Narrow to storms that have complete pressure record
completeish <- storms %>%
Expand All @@ -103,4 +118,5 @@ storms <- storms %>%
) %>%
mutate(name = if_else(str_sub(name, 1, 3) %in% c("AL0", "AL1"), name, str_to_title(name)))

devtools::use_data(storms)
# output for the package
usethis::use_data(storms, overwrite = TRUE)
Binary file modified data/storms.rda
Binary file not shown.
19 changes: 15 additions & 4 deletions man/storms.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.