# Shark attack log

In [241]:
# Libraries
import numpy as np
import pandas as pd
import re

from skimpy import clean_columns
from datetime import datetime

In [242]:
# Data
file_url = r"https://www.sharkattackfile.net/spreadsheets/GSAF5.xls"

In [243]:
df = pd.read_excel(file_url, engine="xlrd")
# df.head(n=10)

In [244]:
# df structure
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7058 entries, 0 to 7057
Data columns (total 23 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Date            7058 non-null   object 
 1   Year            7056 non-null   float64
 2   Type            7040 non-null   object 
 3   Country         7008 non-null   object 
 4   State           6571 non-null   object 
 5   Location        6491 non-null   object 
 6   Activity        6473 non-null   object 
 7   Name            6839 non-null   object 
 8   Sex             6479 non-null   object 
 9   Age             4064 non-null   object 
 10  Injury          7023 non-null   object 
 11  Fatal Y/N       6497 non-null   object 
 12  Time            3532 non-null   object 
 13  Species         3927 non-null   object 
 14  Source          7038 non-null   object 
 15  pdf             6799 non-null   object 
 16  href formula    6794 non-null   object 
 17  href            6796 non-null   o

In [245]:
# df dimensions
df.shape

(7058, 23)

## Data cleanup

In [246]:
# Clean col names
df = clean_columns(df)

# Year as integer
df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")

# Type of attack as factor
df["type"] = df["type"].str.lower().str.strip().str.replace(" +", " ", regex=True)
df["type"] = pd.Categorical(
    df["type"],
    categories=["unprovoked", "provoked", "questionable", "watercraft"],
    ordered=False
)

# Age as integer
df["age"] = pd.to_numeric(df["age"], errors="coerce").astype("Int64")

# Sex as factor
df["sex"] = df["sex"].str.lower().str.strip().str.replace(" +", " ", regex=True)
df["sex"] = pd.Categorical(
    df["sex"],
    categories=["m", "f"],
    ordered=False
)

# Binary fatality status
fatality_map = {"Y": 1, "N": 0}

df["fatal"] = df.pop("fatal_y_n")
df["fatal"] = df["fatal"].map(fatality_map)
df["fatal"] = pd.to_numeric(df["fatal"], errors="coerce").astype("Int16")
df["fatal"] = pd.Categorical(
    df["fatal"],
    categories=[0,1],
    ordered=False
)

# Convert to lower case and purge whitespaces
df[["date", "time", "activity", "injury", "species"]] = (
    df[["date", "time", "activity", "injury", "species"]]
    .apply(lambda col: col.str.lower().str.strip().str.replace(" +", " ", regex=True))
)

# Purge whitespaces
df[["country", "state", "location", "name"]] = (
    df[["country", "state", "location", "name"]]
    .apply(lambda col: col.str.title().str.strip().str.replace(" +", " ", regex=True))
)

### Clean the time column

- recode `time` as time
- construct a `time_of_day` column



In [247]:
# Extract the raw column
raw_time = df.time

# Clean up the values
hrs_mins = raw_time.str.extract("(\\d{1,2})[a-z]*?(\\d{2})", expand=False)
clean_time = hrs_mins[0].str.zfill(2) + ":" + hrs_mins[1].str.zfill(2) # Pad the numbers

# Override the original column with clean time values
df["time"] = pd.to_datetime(clean_time, format="%H:%M", errors="coerce").dt.time

# Classify phases of day
day_phases = r"""daybreak: 04:00–06:59
morning: 07:00–11:59
afternoon: 12:00–16:59
nightfall: 17:00–20:59
night: 21:00–03:59)"""

# Match the string components
day_phases_components = re.findall("([a-z]+): (\\d{2}:\\d{2})[-–](\\d{2}:\\d{2})", day_phases)

# Construct a key
day_phases_key = pd.DataFrame(
    day_phases_components,
    columns=["phase", "start", "end"]
)

day_phases_key["start"] = pd.to_datetime(day_phases_key["start"], format="%H:%M").dt.time
day_phases_key["end"] = pd.to_datetime(day_phases_key["end"], format="%H:%M").dt.time


In [248]:
# Recode day phases
day_phases = []

for i in range(len(raw_time)):
    raw_t = raw_time[i]
    raw_m = re.search("[a-z ]+", str(raw_t))
    if raw_m:
        raw_s = raw_m.group()
    else:
        raw_s = None
    clean_t = df.loc[i, "time"]

    mask_s = day_phases_key.start > clean_t
    mask_e = day_phases_key.end < clean_t
    phase_row = day_phases_key[mask_s & mask_e]

    # Categorise time vals
    if pd.notna(clean_t):
        if len(phase_row) > 0:
            day_phases.append(phase_row["phase"].iat[0])
        else:
            day_phases.append("night")
    # Sift out existing categories
    elif raw_s is not None:
        raw_split = raw_s.split(" ")
        a_match = set(day_phases_key["phase"]).intersection(set(raw_split))
        if len(a_match) > 0:
            day_phases.append(raw_s)
        else:
            day_phases.append(None)
    # Discard the rest
    else:
        day_phases.append(None)

# Relocate time and time_of_day
time_col_idx = df.columns.get_loc("year") + 1
time_col = df.pop("time")
df.insert(time_col_idx, "time", time_col)

df.insert(time_col_idx + 1, "time_of_day", day_phases)

# Recode time_of_day as factor
df["time_of_day"] = pd.Categorical(
    df["time_of_day"], 
    categories=df["time_of_day"].dropna().unique(),
    ordered=False
)


### Clean the date column

- recode `date` as date
- construct `date_notes` column



In [249]:
# Construct date_notes
# df["date_notes"] = df.pop("date")
df["date_notes"] = df["date"]

In [250]:
# Clean the column
    # Take the reported for actual and remove suffixes
date_raw = (
    df["date_notes"].astype("str")
    .str.replace("reported ", "", regex=True)
    .str.replace("(?<=\\d)(st|nd|rd|th)", "", regex=True)
)

    # Parse the parsable and construct a mask for the rest
date_parsed = pd.to_datetime(date_raw, errors="coerce", format="mixed")

    # The case of D:M 
date_unparsed = date_parsed.isna()
day_month = date_raw[date_unparsed].str.extract("(\\d{1,2}).*?([a-z]+)")
date_construct = df.loc[date_unparsed, "year"].astype(str) + "-" + day_month[1].str[:3] + "-" + day_month[0].str.zfill(2)

date_parsed.loc[date_unparsed] = pd.to_datetime(date_construct, errors="coerce", format="%Y-%b-%d")

    # The case of M:Y
left_unparsed = date_parsed.isna()
month_year = date_raw[left_unparsed].str.extract("([a-z]+).*?(\\d{4})")
month_construct = month_year[1] + "-" + month_year[0].str[:3] + "-" + "01"

date_parsed.loc[left_unparsed] = pd.to_datetime(month_construct, errors="coerce", format="%Y-%b-%d")

df["date"] = date_parsed

### Clean species

- normalise `species` column
- construct `specimen_size` column

In [251]:
# Construct size columns
df["size_m"] = df["species"].str.extract("(\\d*\\.?\\d+)(?=m)")
df["size_ft"] = df["species"].str.extract("(\\d*\\.?\\d+)(?=ft)")

In [262]:
# Clean the species column
    # Correct typos
typos_dict = {
    "wfite": "white",
    "shart": "shark",
    "broze": "bronze",
    "carribean": "caribbean",
    "galapogas": "galapagos",
    "shall": "small",
    "rreef": "reef",
    "black-tipped": "blacktip"
}

df["species"] = df["species"].replace(typos_dict, regex=True)

    # Clean the species
detect_substrings = [
    df["species"].str.contains(r"bull.*tiger|tiger.*bull", na=False),

    df["species"].str.contains(r"oceanic whitetip", na=False),
    df["species"].str.contains(r"blacktip reef", na=False),
    df["species"].str.contains(r"caribbean reef", na=False),
    df["species"].str.contains(r"sand tiger|raggedtooth", na=False),
    df["species"].str.contains(r"sandbar", na=False),
    df["species"].str.contains(r"bronze whaler", na=False),
    df["species"].str.contains(r"wobbegong", na=False),
    df["species"].str.contains(r"sevengill", na=False),
    df["species"].str.contains(r"cookiecutter", na=False),
    df["species"].str.contains(r"grey reef", na=False),
    df["species"].str.contains(r"reef shark", na=False),

    df["species"].str.contains(r"shovelnose guitarfish", na=False),
    df["species"].str.contains(r"whale shark", na=False),
    df["species"].str.contains(r"horn shark|horn", na=False),
    df["species"].str.contains(r"hammerhead", na=False),

    df["species"].str.contains(r"lemon", na=False),
    df["species"].str.contains(r"nurse", na=False),
    df["species"].str.contains(r"blacktip", na=False),
    df["species"].str.contains(r"mako", na=False),
    df["species"].str.contains(r"blue pointer", na=False),
    df["species"].str.contains(r"blue", na=False),
    df["species"].str.contains(r"dusky", na=False),
    df["species"].str.contains(r"galapagos", na=False),
    df["species"].str.contains(r"reef", na=False),

    df["species"].str.contains(r"porbeagle", na=False),
    df["species"].str.contains(r"basking", na=False),

    df["species"].str.contains(r"bull", na=False),
    df["species"].str.contains(r"tiger", na=False),
    df["species"].str.contains(r"great white|white shark", na=False),
]

substitute_strings = [
    "bull or tiger shark",

    "oceanic whitetip shark",
    "blacktip reef shark",
    "caribbean reef shark",
    "sand tiger shark",
    "sandbar shark",
    "bronze whaler",
    "wobbegong shark",
    "sevengill shark",
    "cookiecutter shark",
    "grey reef shark",
    "reef shark",

    "shovelnose guitarfish",
    "whale shark",
    "horn shark",
    "hammerhead shark",

    "lemon shark",
    "nurse shark",
    "blacktip shark",
    "mako shark",
    "blue pointer",
    "blue shark",
    "dusky shark",
    "galapagos shark",
    "reef shark",

    "porbeagle shark",
    "basking shark",

    "bull shark",
    "tiger shark",
    "white shark",
]

df["species"] = np.select(detect_substrings, substitute_strings, default="")
df["species"] = df["species"].replace("", np.nan)


### Final touches

- select cols
- filter out NA years

In [264]:
clean_cols = [
    "date", "year", "time", "time_of_day", 
    "type", 
    "country", "state", "location", 
    "activity", "name", "sex", 
    "injury", "fatal", 
    "species", "size_m", "size_ft"
    ]
clean_df = df[clean_cols]
clean_df = clean_df[clean_df["year"].notna()]
clean_df = clean_df[clean_df["year"] > 1900]

In [265]:
clean_df.head(n=10)

Unnamed: 0,date,year,time,time_of_day,type,country,state,location,activity,name,sex,injury,fatal,species,size_m,size_ft
0,2025-11-27,2025,06:30:00,night,unprovoked,Australia,Nsw,Crowdy Bay,swimming,Lukas Schindler,m,serious leg injuries,0,bull shark,3.0,
1,2025-11-27,2025,06:30:00,night,unprovoked,Australia,Nsw,Crowdy Bay,swimming,Livia Mulheim,f,not stated,1,bull shark,3.0,
2,2025-11-10,2025,17:45:00,night,unprovoked,Australia,Western Australia,Prevelly Beach Magaret River,foil boarding,Andy Mcdonald,m,no injury to self,0,white shark,,
3,2025-11-09,2025,NaT,,unprovoked,French Polynesia,Marquesas Islands,Hakahau Bay,swimming,Not Stated (Dentist),m,deep gash to bicep,0,,3.0,
4,2025-11-05,2025,NaT,mid afternoon,unprovoked,Usa,Hawaii,Pine Trees Hanalei Bay Kaui,swimming,Chance Swanson,m,injuries to legs,0,,,
5,2025-11-05,2025,NaT,,unprovoked,Usa,Texas,Matagorda Beach Matagorda,fishing,Chuck Bledsoe,m,laceration on top and undermeath right foot,0,,,
6,2025-11-04,2025,18:00:00,night,unprovoked,Samoa,,Aga Reef Resort Lalomanu,surfing,Evan Campbell,m,lacerations to right leg,0,tiger shark,,
7,2025-10-14,2025,NaT,,unprovoked,Columbia,"Bolivar, Del Isolate",Catagena Province,swimming with sharks,Male Child,m,severe hand injury,0,nurse shark,,
8,2025-10-11,2025,18:23:00,night,unprovoked,Australia,Queensland,Cook Esplanade Thursday Island,fishing/swimming,Samuel Nai,m,serious abdonminal injuries,0,bull or tiger shark,,
9,2025-10-07,2025,13:30:00,night,unprovoked,Australia,South Australia,Kangaroo Island,surfing,Lee Berryman,m,lacerations to calf,0,bronze whaler,,


## EDA

In [None]:
#