# Shark attack log

In [21]:
# Libraries
import numpy as np
import pandas as pd
import re

from skimpy import clean_columns

In [22]:
# Data source: URL of the GSAF5.xls spreadsheet hosted on sharkattackfile.net.
# The file is fetched over the network when the workbook is read below.
file_url = r"https://www.sharkattackfile.net/spreadsheets/GSAF5.xls"

In [23]:
# Download the workbook from file_url and parse it into a DataFrame.
# engine="xlrd" explicitly selects the reader pandas uses for legacy .xls files.
df = pd.read_excel(file_url, engine="xlrd")
# Optional preview of the first rows (disabled): df.head(n=10)

In [24]:
# df structure: prints the index range plus every column's non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7058 entries, 0 to 7057
Data columns (total 23 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Date            7058 non-null   object 
 1   Year            7056 non-null   float64
 2   Type            7040 non-null   object 
 3   Country         7008 non-null   object 
 4   State           6571 non-null   object 
 5   Location        6491 non-null   object 
 6   Activity        6473 non-null   object 
 7   Name            6839 non-null   object 
 8   Sex             6479 non-null   object 
 9   Age             4064 non-null   object 
 10  Injury          7023 non-null   object 
 11  Fatal Y/N       6497 non-null   object 
 12  Time            3532 non-null   object 
 13  Species         3927 non-null   object 
 14  Source          7038 non-null   object 
 15  pdf             6799 non-null   object 
 16  href formula    6794 non-null   object 
 17  href            6796 non-null   o

In [25]:
# df dimensions, displayed as a (rows, columns) tuple
df.shape

(7058, 23)

## Data cleanup

In [None]:
# Clean col names: clean_columns rewrites the column labels only
# (e.g. "Year" -> "year", "Fatal Y/N" -> "fatal_y_n"); cell values are not changed.
df = clean_columns(df)

# Year as integer.
# Missing or unparseable years are stored as the sentinel -1, so exclude -1
# before doing arithmetic or plotting on this column.
df["year"] = (
    pd.to_numeric(df["year"], errors="coerce")
    .fillna(-1)
    .astype("int32")
)

# Type of attack as factor.
# The raw labels are not lower-cased by clean_columns, so trim and lower-case
# them here; otherwise a label such as "Unprovoked" would not match the
# lower-case categories below and would silently turn into NaN.
# Labels outside these four categories still become NaN.
df["type"] = pd.Categorical(
    df["type"].str.strip().str.lower(),
    categories=["unprovoked", "provoked", "questionable", "watercraft"],
    ordered=False
)

# Age as integer.
# Non-numeric entries are coerced to NaN and then stored as the sentinel -1;
# fractional ages are truncated by the int32 cast.
df["age"] = (
    pd.to_numeric(df["age"], errors="coerce")
    .fillna(-1)
    .astype("int32")
)

# Sex as factor.
# Same normalisation as for "type": without it an upper-case "M"/"F" would not
# match the lower-case categories and the whole column would end up as NaN.
df["sex"] = pd.Categorical(
    df["sex"].str.strip().str.lower(),
    categories=["m", "f"],
    ordered=False
)

# Binary fatality status: 1 = fatal ("Y"), 0 = not fatal ("N"), NaN = anything else.
fatality_map = {"Y": 1, "N": 0}

# Only convert while the raw column is still present. This keeps the cell
# re-runnable: a second run would otherwise raise KeyError on the pop.
if "fatal_y_n" in df.columns:
    # Trim and upper-case so variants such as " N" or "y" are mapped as well.
    fatal_flag = df.pop("fatal_y_n").str.strip().str.upper()
    # -1 is a temporary placeholder for unmapped values; it is not one of the
    # categories below, so it becomes NaN in the final categorical.
    fatal_code = fatal_flag.map(fatality_map).fillna(-1).astype("int16")
    df["fatality"] = pd.Categorical(
        fatal_code,
        categories=[0, 1],
        ordered=False
    )



0       0
1       1
2       0
3       0
4       0
       ..
7053    1
7054    1
7055    1
7056    1
7057    1
Name: fatality, Length: 7058, dtype: int16