# Shark attack log

In [186]:
# Libraries
import numpy as np
import pandas as pd
import re

from skimpy import clean_columns
from datetime import datetime

In [187]:
# Data
file_url = r"https://www.sharkattackfile.net/spreadsheets/GSAF5.xls"

In [188]:
df = pd.read_excel(file_url, engine="xlrd")
# df.head(n=10)

In [189]:
# df structure
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7058 entries, 0 to 7057
Data columns (total 23 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Date            7058 non-null   object 
 1   Year            7056 non-null   float64
 2   Type            7040 non-null   object 
 3   Country         7008 non-null   object 
 4   State           6571 non-null   object 
 5   Location        6491 non-null   object 
 6   Activity        6473 non-null   object 
 7   Name            6839 non-null   object 
 8   Sex             6479 non-null   object 
 9   Age             4064 non-null   object 
 10  Injury          7023 non-null   object 
 11  Fatal Y/N       6497 non-null   object 
 12  Time            3532 non-null   object 
 13  Species         3927 non-null   object 
 14  Source          7038 non-null   object 
 15  pdf             6799 non-null   object 
 16  href formula    6794 non-null   object 
 17  href            6796 non-null   o

In [190]:
# df dimensions
df.shape

(7058, 23)

## Data cleanup

In [191]:
# Clean col names
df = clean_columns(df)

# Year as integer
df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")

# Type of attack as factor
df["type"] = df["type"].str.lower().str.strip().str.replace(" +", " ", regex=True)
df["type"] = pd.Categorical(
    df["type"],
    categories=["unprovoked", "provoked", "questionable", "watercraft"],
    ordered=False
)

# Age as integer
df["age"] = pd.to_numeric(df["age"], errors="coerce").astype("Int64")

# Sex as factor
df["sex"] = df["sex"].str.lower().str.strip().str.replace(" +", " ", regex=True)
df["sex"] = pd.Categorical(
    df["sex"],
    categories=["m", "f"],
    ordered=False
)

# Binary fatality status
fatality_map = {"Y": 1, "N": 0}

df["fatal"] = df.pop("fatal_y_n")
df["fatal"] = df["fatal"].map(fatality_map)
df["fatal"] = pd.to_numeric(df["fatal"], errors="coerce").astype("Int16")
df["fatal"] = pd.Categorical(
    df["fatal"],
    categories=[0,1],
    ordered=False
)

# Convert to lower case and purge whitespaces
df[["date", "time", "activity", "injury", "species"]] = (
    df[["date", "time", "activity", "injury", "species"]]
    .apply(lambda col: col.str.lower().str.strip().str.replace(" +", " ", regex=True))
)

# Purge whitespaces
df[["country", "state", "location", "name"]] = (
    df[["country", "state", "location", "name"]]
    .apply(lambda col: col.str.title().str.strip().str.replace(" +", " ", regex=True))
)

### Clean the time column

- recode `time` as time
- construct a `time_of_day` column



In [192]:
# Extract the raw column
raw_time = df.time

# Clean up the values
hrs_mins = raw_time.str.extract("(\\d{1,2})[a-z]*?(\\d{2})", expand=False)
clean_time = hrs_mins[0].str.zfill(2) + ":" + hrs_mins[1].str.zfill(2) # Pad the numbers

# Override the original column with clean time values
df["time"] = pd.to_datetime(clean_time, format="%H:%M", errors="coerce").dt.time

# Classify phases of day
day_phases = r"""daybreak: 04:00–06:59
morning: 07:00–11:59
afternoon: 12:00–16:59
nightfall: 17:00–20:59
night: 21:00–03:59)"""

# Match the string components
day_phases_components = re.findall("([a-z]+): (\\d{2}:\\d{2})[-–](\\d{2}:\\d{2})", day_phases)

# Construct a key
day_phases_key = pd.DataFrame(
    day_phases_components,
    columns=["phase", "start", "end"]
)

day_phases_key["start"] = pd.to_datetime(day_phases_key["start"], format="%H:%M").dt.time
day_phases_key["end"] = pd.to_datetime(day_phases_key["end"], format="%H:%M").dt.time


In [193]:
# Recode day phases
day_phases = []

for i in range(len(raw_time)):
    raw_t = raw_time[i]
    raw_m = re.search("[a-z ]+", str(raw_t))
    if raw_m:
        raw_s = raw_m.group()
    else:
        raw_s = None
    clean_t = df.loc[i, "time"]

    mask_s = day_phases_key.start > clean_t
    mask_e = day_phases_key.end < clean_t
    phase_row = day_phases_key[mask_s & mask_e]

    # Categorise time vals
    if pd.notna(clean_t):
        if len(phase_row) > 0:
            day_phases.append(phase_row["phase"].iat[0])
        else:
            day_phases.append("night")
    # Sift out existing categories
    elif raw_s is not None:
        raw_split = raw_s.split(" ")
        a_match = set(day_phases_key["phase"]).intersection(set(raw_split))
        if len(a_match) > 0:
            day_phases.append(raw_s)
        else:
            day_phases.append(None)
    # Discard the rest
    else:
        day_phases.append(None)

# Relocate time and time_of_day
time_col_idx = df.columns.get_loc("year") + 1
time_col = df.pop("time")
df.insert(time_col_idx, "time", time_col)

df.insert(time_col_idx + 1, "time_of_day", day_phases)

# Recode time_of_day as factor
df["time_of_day"] = pd.Categorical(
    df["time_of_day"], 
    categories=df["time_of_day"].dropna().unique(),
    ordered=False
)


### Clean the date column

- recode `date` as date
- construct `date_notes` column



In [194]:
# Construct date_notes
# df["date_notes"] = df.pop("date")
df["date_notes"] = df["date"]

In [195]:
# Construct a helper

# Clean the column


In [196]:


df[["date", "year", "time", "time_of_day", "type", "country", "state", "location", "activity", "name", "sex", "injury", "fatal", "species"]]

Unnamed: 0,date,year,time,time_of_day,type,country,state,location,activity,name,sex,injury,fatal,species
0,27th november,2025,06:30:00,night,unprovoked,Australia,Nsw,Crowdy Bay,swimming,Lukas Schindler,m,serious leg injuries,0,3m bull shark
1,27th november,2025,06:30:00,night,unprovoked,Australia,Nsw,Crowdy Bay,swimming,Livia Mulheim,f,not stated,1,3m bull shark
2,10th november,2025,17:45:00,night,unprovoked,Australia,Western Australia,Prevelly Beach Magaret River,foil boarding,Andy Mcdonald,m,no injury to self,0,great white shark
3,9th november,2025,NaT,,unprovoked,French Polynesia,Marquesas Islands,Hakahau Bay,swimming,Not Stated (Dentist),m,deep gash to bicep,0,3m shark
4,5th november,2025,NaT,mid afternoon,unprovoked,Usa,Hawaii,Pine Trees Hanalei Bay Kaui,swimming,Chance Swanson,m,injuries to legs,0,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7053,before 1903,0,NaT,,unprovoked,Australia,Western Australia,Roebuck Bay,diving,Male,m,fatal,1,
7054,before 1903,0,NaT,,unprovoked,Australia,Western Australia,,pearl diving,Ahmun,m,fatal,1,
7055,1900-1905,0,NaT,,unprovoked,Usa,North Carolina,Ocracoke Inlet,swimming,Coast Guard Personnel,m,fatal,1,
7056,1883-1889,0,NaT,,unprovoked,Panama,,"Panama Bay 8ºn, 79ºw",,Jules Patterson,m,fatal,1,
