# Shark attack log

In [33]:
# Libraries
library(tidyverse)
library(readxl)

In [34]:
# Helpers
quiet <- function(x) {
    # Suppress both messages and warnings
    suppressMessages(suppressWarnings(x))
}

clean_str <- function(s) {
    # Convert to lower case and trim whitespaces
    return(str_to_lower(str_squish(s)))
}

In [35]:
# Data
file_url <- r"(https://www.sharkattackfile.net/spreadsheets/GSAF5.xls)"

    # Load up the spreadsheet into a temporary file 
temp_file <- tempfile(fileext = ".xls")
download.file(file_url, temp_file, mode = "wb")

In [36]:
df <- quiet(
        read_excel(
            temp_file,
            col_names = TRUE, 
            col_types = NULL
        )
    )
# df |> head(n=10)

In [37]:
# df structure
df |> str()

tibble [7,058 × 23] (S3: tbl_df/tbl/data.frame)
 $ Date            : chr [1:7058] "27th November" "27th November" "10th November" "9th November" ...
 $ Year            : chr [1:7058] "2025" "2025" "2025" "2025" ...
 $ Type            : chr [1:7058] "Unprovoked" "Unprovoked" "Unprovoked" "Unprovoked" ...
 $ Country         : chr [1:7058] "Australia" "Australia" "Australia" "French Polynesia" ...
 $ State           : chr [1:7058] "NSW" "NSW" "Western Australia" "Marquesas Islands" ...
 $ Location        : chr [1:7058] "Crowdy Bay" "Crowdy Bay" "Prevelly Beach Magaret River" "Hakahau Bay" ...
 $ Activity        : chr [1:7058] "Swimming" "Swimming" "Foil Boarding" "Swimming" ...
 $ Name            : chr [1:7058] "Lukas Schindler" "Livia Mulheim" "Andy McDonald" "Not stated (Dentist)" ...
 $ Sex             : chr [1:7058] "M" "F" "M" "M" ...
 $ Age             : chr [1:7058] "26" "25" "61" "40" ...
 $ Injury          : chr [1:7058] "Serious leg injuries" "Not stated" "No Injury to self" "De

In [38]:
# df dimensions
df |> dim()

## Data cleanup

In [39]:
# Clean up the data frame
df <- df |>
    # Clean col names
    janitor::clean_names() |>
    mutate(
        # Year as integer
        year = as.integer(year),
        # Type of attack as factor
        type = factor(
            clean_str(type), 
            levels = c("unprovoked", "provoked", "questionable", "watercraft"),
            ordered = FALSE
        ),
        # Age as integer
        age = as.integer(age),
        # Sex as factor
        sex = factor(
            clean_str(sex), 
            levels = c("m", "f"),
            ordered = FALSE
        ),
        # Binary fatality status
        fatal = factor(
                case_match(
                    fatal_y_n,
                    "Y" ~ 1,
                    "N" ~ 0,
                    .default = NA_integer_
                ),
            levels = c(0, 1),
            ordered = FALSE
        )
    ) |>
    mutate(
        across(
            # Convert to lower case and purge whitespaces
            .cols = c("date", "time", "activity", "injury", "species"),
            .fns = ~ str_to_lower(str_squish(.x))
        )
    ) |>
    mutate(
        across(
            # Purge whitespaces
            .cols = c("country", "state", "location", "name"),
            .fns = ~ str_to_title(str_squish(.x))
        )
    ) |>
    select(-fatal_y_n)

[1m[22m[36mℹ[39m In argument: `age = as.integer(age)`.
[33m![39m NAs introduced by coercion"


###  Clean the time column

- recode `time` as time
- construct a `time_of_day` column

In [40]:
# Extract the raw column
raw_time <- df$time

# Clean up the values
hrs_mins <- str_match(raw_time, "(\\d{1,2})[a-z]*?(\\d{2})")
clean_time <- ifelse(!is.na(hrs_mins)[,1], paste(hrs_mins[,2], ":", hrs_mins[,3]), NA)

# Override the original column with clean time values
df["time"] <- hm(clean_time)

# Classify phases of day
day_phases <- r"(daybreak: 04:00–06:59
morning: 07:00–11:59
afternoon: 12:00–16:59
nightfall: 17:00–20:59
night: 21:00–03:59)"

# Match the string components
day_phases_components <- str_match_all(day_phases, "(\\w+):\\s*(\\d{2}:\\d{2})[–-](\\d{2}:\\d{2})")[[1]]

# Construct a key
day_phases_key <- tibble(
    phase = day_phases_components[,2],
    start = hm(day_phases_components[,3]),
    end = hm(day_phases_components[,4])
)

In [41]:
# Recode day phases
day_phases <- sapply(
    seq(nrow(df)),
    \(i) {
        t <- df$time[i]   # Or df[["time"]][i]
        raw_t <- raw_time[[i]]
        raw_s <- str_extract(raw_t, "[a-z ]+")

        if (!is.na(t)) {
            s_mask <- day_phases_key$start >= t
            e_mask <- day_phases_key$end <= t
            idx <- which(s_mask & e_mask)
            
            if (length(idx) == 1L) {
                phase <- day_phases_key[idx, "phase"]
                return(phase)
            } else {
                return("night")
            }
        } else if (!is.na(raw_s)) {
            a_match <- setdiff(
                unlist(str_split(raw_s, " ")),
                day_phases_key$phase
            )
            if (length(a_match) > 0) {
                return(raw_s)
            } else {
                return(NA_character_)
            }
        } else {
            return(NA_character_)
        }
    }
)

df <- df |>
    relocate(
        time,
        .after = "year"
    ) |>
    mutate(
        time_of_day = factor(day_phases, levels = unique(unlist(day_phases)))
    ) |>
    relocate(
        time_of_day,
        .after = "time"
    )

### Clean the date column

- recode `date` as date
- construct `date_notes` column

In [42]:
# Construct date_notes
df <- df |>
    mutate(
        date_notes = date
    )

In [43]:
# Construct a helper
clean_historical_date <- function(date_str, year_col = NA_integer_) {
      d <- date_str |>
        # Take reported for actual
        str_remove("^reported\\s+") |>
        # Remove suffixes
        str_replace_all("(?<=\\d)(st|nd|rd|th)", "")
  
    # Extract the year from the separate column
    year_present <- str_detect(d, "\\b\\d{4}\\b")
  
    d <- ifelse(
        !year_present & !is.na(year_col),
        paste(d, year_col),
        d
    )
  
    # Let lubridate parse the muddle
    parse_date_time(
        d,
        orders = c("d B Y", "B Y", "d-b-Y", "b-d-Y", "d B"),
        quiet = TRUE
    ) |> as.Date()
}

# Clean the column
df <- df |>
    mutate(
        date = clean_historical_date(date, year)
    )

### Clean species

- normalise `species` column
- construct `specimen_size` column

In [44]:
# Construct size columns
df <- df |>
    mutate(
        size_m = str_extract(species, "\\d*\\.?\\d+(?=m)"),
        size_ft = str_extract(species, "\\d*\\.?\\d+(?=ft)")
    )

In [45]:
# Clean the species column
df <- df |>
    mutate(
        species = str_replace_all(
            species,
            c("wfite" = "white",
              "shart" = "shark",
              "broze" = "bronze",
              "carribean" = "caribbean",
              "galapogas" = "galapagos",
              "shall" = "small",
              "rreef" = "reef",
              "black-tipped" = "blacktip")
        ),
        species =  case_when(
                str_detect(species, "bull") & str_detect(species, "tiger") ~ "bull or tiger shark",

                str_detect(species, "oceanic whitetip") ~ "oceanic whitetip shark",
                str_detect(species, "blacktip reef") ~ "blacktip reef shark",
                str_detect(species, "caribbean reef") ~ "caribbean reef shark",
                str_detect(species, "sand tiger|raggedtooth")~ "sand tiger shark",
                str_detect(species, "sandbar") ~ "sandbar shark",
                str_detect(species, "bronze whaler") ~ "bronze whaler",
                str_detect(species, "wobbegong") ~ "wobbegong shark",
                str_detect(species, "sevengill") ~ "sevengill shark",
                str_detect(species, "cookiecutter") ~ "cookiecutter shark",
                str_detect(species, "grey reef") ~ "grey reef shark",
                str_detect(species, "reef shark") ~ "reef shark",

                str_detect(species, "shovelnose guitarfish") ~ "shovelnose guitarfish",
                str_detect(species, "whale shark") ~ "whale shark",
                str_detect(species, "horn shark|horn") ~ "horn shark",
                str_detect(species, "hammerhead") ~ "hammerhead shark",

                str_detect(species, "lemon") ~ "lemon shark",
                str_detect(species, "nurse") ~ "nurse shark",
                str_detect(species, "blacktip") ~ "blacktip shark",
                str_detect(species, "mako") ~ "mako shark",
                str_detect(species, "blue pointer") ~ "blue pointer",
                str_detect(species, "blue") ~ "blue shark",
                str_detect(species, "dusky") ~ "dusky shark",
                str_detect(species, "galapagos") ~ "galapagos shark",
                str_detect(species, "reef") ~ "reef shark",

                str_detect(species, "porbeagle") ~ "porbeagle shark",
                str_detect(species, "basking") ~ "basking shark",

                str_detect(species, "bull") ~ "bull shark",
                str_detect(species, "tiger") ~ "tiger shark",
                str_detect(species, "great white|white shark|white shark") ~ "white shark",
                TRUE ~ NA_character_
            )


    )


### Final touches

- select cols
- filter out NA years

In [46]:
clean_df <- df |>
    select(date, year, time, time_of_day, type, country, state, location, activity, name, sex, injury, fatal, species, size_m, size_ft) |>
    filter(!is.na(year)) |>
    filter(year > 1900)

In [47]:
clean_df |> head(n=10)

date,year,time,time_of_day,type,country,state,location,activity,name,sex,injury,fatal,species,size_m,size_ft
<date>,<int>,<Period>,<fct>,<fct>,<chr>,<chr>,<chr>,<chr>,<chr>,<fct>,<chr>,<fct>,<chr>,<chr>,<chr>
2025-11-27,2025,6H 30M 0S,night,unprovoked,Australia,Nsw,Crowdy Bay,swimming,Lukas Schindler,m,serious leg injuries,0,bull shark,3.0,
2025-11-27,2025,6H 30M 0S,night,unprovoked,Australia,Nsw,Crowdy Bay,swimming,Livia Mulheim,f,not stated,1,bull shark,3.0,
2025-11-10,2025,17H 45M 0S,night,unprovoked,Australia,Western Australia,Prevelly Beach Magaret River,foil boarding,Andy Mcdonald,m,no injury to self,0,white shark,,
2025-11-09,2025,,not stated,unprovoked,French Polynesia,Marquesas Islands,Hakahau Bay,swimming,Not Stated (Dentist),m,deep gash to bicep,0,,3.0,
2025-11-05,2025,,mid afternoon,unprovoked,Usa,Hawaii,Pine Trees Hanalei Bay Kaui,swimming,Chance Swanson,m,injuries to legs,0,,,
2025-11-05,2025,,not stated,unprovoked,Usa,Texas,Matagorda Beach Matagorda,fishing,Chuck Bledsoe,m,laceration on top and undermeath right foot,0,,,
2025-11-04,2025,18H 0M 0S,night,unprovoked,Samoa,,Aga Reef Resort Lalomanu,surfing,Evan Campbell,m,lacerations to right leg,0,tiger shark,,
2025-10-14,2025,,not stated,unprovoked,Columbia,"Bolivar, Del Isolate",Catagena Province,swimming with sharks,Male Child,m,severe hand injury,0,nurse shark,,
2025-10-11,2025,18H 23M 0S,night,unprovoked,Australia,Queensland,Cook Esplanade Thursday Island,fishing/swimming,Samuel Nai,m,serious abdonminal injuries,0,bull or tiger shark,,
2025-10-07,2025,13H 30M 0S,night,unprovoked,Australia,South Australia,Kangaroo Island,surfing,Lee Berryman,m,lacerations to calf,0,bronze whaler,,


## EDA

In [48]:
#