# Pandas Project - cleaning shark attacks csv

## Workflow determination
1. Determine steps to be taken
2. Set up csv in pandas and create copy to work with
3. Declare helper functions 
3. Gather information about shape and characteristics of dataframe
6. Check data types
4. Inspect data
9. Check null values
5. Change column names
8. Remove duplicates
10. Remove irrelevant columns
11. Convert dates
12. Clean data in column
12. And then correct data type
14. Set new index according to unique tracker
15. Export to new csv


## Set up 


First we import the necessary libraries

In [439]:
import re

import numpy as np
import pandas as pd

from string import punctuation
from datetime import datetime

Now we declare helper functions

In [440]:
def clean_col_names(df):
    """Take a dataframe and return sanitized column names."""

    table = str.maketrans(dict.fromkeys(punctuation))
    return [c.lower()
            .strip()
            .translate(table)
            .replace(" ", "_") for c in df.columns]


def cols_missing_value(df):
    """Return fraction of column missing value."""
    
    return ((df.isna().sum())[df.isna().sum() > 0] / len(df)) * 100

def invalid_year(date):
    """Return NaT if not valid year."""
    
    # Doesn't work because of 20 non-cooperating rows
    if date == date:
        if date > datetime.strptime('2020', '%y'):
            return pd.NaT
    return date

def search_and_set(df, col, word, regex=False, lst=[]):
    """Search for term and set correspending rows to value."""
    
    # If lst is set, we use every term therein, otherwise just 
    # set word as first item
        
    if not lst:
        lst = [word]
    
    if regex:
        for item in lst:
            df.loc[df[col].str.contains(
                item, case=False, na=False, regex=True), col] = word
    else:
        for item in lst:
            df.loc[df[col].str.contains(
                item, case=False, na=False), col] = word

def regex_test(txt, pat, needle=False):
    """Return NaN if string contains no space."""
    
    check = re.search(pat, str(txt))
    if check:
        return check.group() if needle else txt
    return np.NaN

Next we import the csv into pandas and inspect the head

In [441]:
df_original = pd.read_csv("attacks.csv", encoding="cp1252")
df = df_original.copy()
df.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,


## Inspect

In [442]:
# rows and columns
df.shape

(25723, 24)

In [443]:
# Types of columns
df.dtypes

Case Number                object
Date                       object
Year                      float64
Type                       object
Country                    object
Area                       object
Location                   object
Activity                   object
Name                       object
Sex                        object
Age                        object
Injury                     object
Fatal (Y/N)                object
Time                       object
Species                    object
Investigator or Source     object
pdf                        object
href formula               object
href                       object
Case Number.1              object
Case Number.2              object
original order            float64
Unnamed: 22                object
Unnamed: 23                object
dtype: object

In [444]:
# Random sample of data
df.sample(20)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
2430,1994.07.08.b,08-Jul-1994,1994.0,Unprovoked,BRAZIL,Pernambuco,"Boa Viagem, Recife",Surfing,Sandro Paulo dos Santos,M,...,,JCOnline,1994.07.08.b-SandroPaulosDosSantos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1994.07.08.b,1994.07.08.b,3873.0,,
24367,,,,,,,,,,,...,,,,,,,,,,
18596,,,,,,,,,,,...,,,,,,,,,,
18859,,,,,,,,,,,...,,,,,,,,,,
16876,,,,,,,,,,,...,,,,,,,,,,
19948,,,,,,,,,,,...,,,,,,,,,,
14535,,,,,,,,,,,...,,,,,,,,,,
15547,,,,,,,,,,,...,,,,,,,,,,
15167,,,,,,,,,,,...,,,,,,,,,,
7516,0,,,,,,,,,,...,,,,,,,,,,


In [445]:
# Columns with NA values
cols_missing_value(df)

Case Number               66.170353
Date                      75.500525
Year                      75.508300
Type                      75.516075
Country                   75.694903
Area                      77.269370
Location                  77.599813
Activity                  77.615364
Name                      76.316915
Sex                       77.697003
Age                       86.506240
Injury                    75.609377
Fatal (Y/N)               77.595926
Time                      88.539439
Species                   86.533453
Investigator or Source    75.566614
pdf                       75.500525
href formula              75.504412
href                      75.500525
Case Number.1             75.500525
Case Number.2             75.500525
original order            75.473312
Unnamed: 22               99.996112
Unnamed: 23               99.992225
dtype: float64

### A few observations
   - column names contain punctuation, spaces and capitals
   - case number seems to be same as Date, and Year is a pary of previous two
   - many rows have no data at all
   - pdf is part of href formula, which in turn is equal to href
   - last 2 columns have no name and out of 25723 only 1 or 2 rows have a value
   - age and species values are mainly missing
   

## General cleanup

Now we know a little about our data set, we are going to start cleaning. First we tackle the big stuff. 

In [446]:
# Clean up column names
df.columns = clean_col_names(df)
df.columns

Index(['case_number', 'date', 'year', 'type', 'country', 'area', 'location',
       'activity', 'name', 'sex', 'age', 'injury', 'fatal_yn', 'time',
       'species', 'investigator_or_source', 'pdf', 'href_formula', 'href',
       'case_number1', 'case_number2', 'original_order', 'unnamed_22',
       'unnamed_23'],
      dtype='object')

In [447]:
# Remove duplicate rows
df.drop_duplicates(inplace=True)


In [448]:
cols_missing_value(df)

case_number                0.031686
date                       0.158428
year                       0.190114
type                       0.221800
country                    0.950570
area                       7.366920
location                   8.713561
activity                   8.776933
name                       3.485425
sex                        9.109632
age                       45.009506
injury                     0.602028
fatal_yn                   8.697719
time                      53.295311
species                   45.120406
investigator_or_source     0.427757
pdf                        0.158428
href_formula               0.174271
href                       0.158428
case_number1               0.158428
case_number2               0.158428
original_order             0.047529
unnamed_22                99.984157
unnamed_23                99.968314
dtype: float64

## Type

We start cleaning more in depth by looking at the type column. 

In [449]:
# We check the values for 'type'
df["type"].value_counts(dropna=False)

Unprovoked      4595
Provoked         574
Invalid          547
Sea Disaster     239
Boating          203
Boat             137
NaN               14
Questionable       2
Boatomg            1
Name: type, dtype: int64

In [450]:
# Change invalid and questionable to NaN
df["type"].replace(["Invalid", "Questionable"], np.nan, inplace=True)

# And merge boating
df["type"].replace(["Boat", "Boatomg"], "Boating", inplace=True)

df["type"].value_counts(dropna=False)

Unprovoked      4595
Provoked         574
NaN              563
Boating          341
Sea Disaster     239
Name: type, dtype: int64

## Clearing redundant columns

Since case_number is same as date, and original_order is a unique index, we will drop all 3 case_number columns. Also, the href_formula and pdf column are duplicate to href. Finally we remove the redundant unnamed columns

In [451]:
to_drop = ["case_number", "year", "pdf", 
           "href_formula", "case_number1", 
           "case_number2", "unnamed_22", "unnamed_23"]
df = df.drop(axis = 1, columns = to_drop)
df.head()

Unnamed: 0,date,type,country,area,location,activity,name,sex,age,injury,fatal_yn,time,species,investigator_or_source,href,original_order
0,25-Jun-2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...,6303.0
1,18-Jun-2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",http://sharkattackfile.net/spreadsheets/pdf_di...,6302.0
2,09-Jun-2018,,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",http://sharkattackfile.net/spreadsheets/pdf_di...,6301.0
3,08-Jun-2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...,6300.0
4,04-Jun-2018,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,http://sharkattackfile.net/spreadsheets/pdf_di...,6299.0


## Removing bogus rows

When rows hold no real data, we remove them

In [452]:
# If there is no location we don't want to keep the row
cols_missing_value(df)

date                       0.158428
type                       8.919518
country                    0.950570
area                       7.366920
location                   8.713561
activity                   8.776933
name                       3.485425
sex                        9.109632
age                       45.009506
injury                     0.602028
fatal_yn                   8.697719
time                      53.295311
species                   45.120406
investigator_or_source     0.427757
href                       0.158428
original_order             0.047529
dtype: float64

In [453]:
# There are still many rows with very little data. We will drop all that have less than 2 columns of data
df.dropna(thresh=2, inplace=True)
df.dropna(subset=["country"], inplace=True)

In [454]:
df.sample(30)

Unnamed: 0,date,type,country,area,location,activity,name,sex,age,injury,fatal_yn,time,species,investigator_or_source,href,original_order
4798,24-Mar-1939,Unprovoked,PAPUA NEW GUINEA,Central Province,Port Moresby,Dived for a coin,Raho-Heni,M,,"FATAL, leg severed just below hip",Y,,"""a large shark""","The Papuan Villager, March 1939; G.P. Whitley,...",http://sharkattackfile.net/spreadsheets/pdf_di...,1505.0
761,24-Jul-2012,,USA,North Carolina,"Ocean Isle, Brunswick County",,male,M,12.0,Shark involvement unconfirmed,,11h45,Shark involvement not confirmed,"C. Creswell, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...,5542.0
6153,1755,Unprovoked,SWEDEN,Skagerrak arm of the North Sea,Bohuslän,,Fishermen,M,,,UNKNOWN,,,"C. Moore, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...,150.0
5136,03-Jan-1927,Unprovoked,AUSTRALIA,New South Wales,"Grey’s Point, Port Hacking",Swimming,Mervwyn Allum,M,15.0,"FATAL, leg bitten from thigh to ankle",Y,11h30,3.7 m [12'] shark,V.M. Coppleson.N.21.(1933); V.M. Coppleson (19...,http://sharkattackfile.net/spreadsheets/pdf_di...,1167.0
2177,25-Jul-1998,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Michael Rinto,M,13.0,Calf bitten,N,P.M.,,"S. Petersohn, GSAF; Orlando Sentinel, 7/24/199...",http://sharkattackfile.net/spreadsheets/pdf_di...,4126.0
2388,18-May-1995,Unprovoked,AUSTRALIA,Western Australia,"Bernier Island, Shark Bay",,Hutchins,,,No details,UNKNOWN,,,"T. Peake, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...,3915.0
4311,1955,Provoked,CUBA,Havana Province,Cojimar,Fishing,Romilio,M,,Forearm slashed wrist to elbow by hooked shark...,N,,"""a little shark""","F. Poli, p.13",http://sharkattackfile.net/spreadsheets/pdf_di...,1992.0
4843,16-Aug-1937,,TURKEY,,Istanbul,Swimming,male,M,,"No injury, no attack",,,Invalid,C. Moore. GSAF,http://sharkattackfile.net/spreadsheets/pdf_di...,1460.0
2799,Dec-1986,Unprovoked,NEW CALEDONIA,,I'le Ouen,Spearfishing,Maurice Lilloux,,,Right leg bitten,N,,Tiger shark,W. Leander,http://sharkattackfile.net/spreadsheets/pdf_di...,3504.0
2604,03-Apr-1991,Unprovoked,USA,Hawaii,"One'ula Beach Park, 'Ewa Beach, O'ahu",Sitting on surfboard,Todd R. Wenke,M,,Deep lacerations to calf & ankle,N,17h30,"""Shark had a very large girth""","J. Borg, p.79; L. Taylor (1993), pp.110-111",http://sharkattackfile.net/spreadsheets/pdf_di...,3699.0


## Date
Next we look at the date. 

In [455]:
# Change date to datetime column
df.date = pd.to_datetime(df["date"], errors="coerce", )
df.head()

Unnamed: 0,date,type,country,area,location,activity,name,sex,age,injury,fatal_yn,time,species,investigator_or_source,href,original_order
0,2018-06-25,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...,6303.0
1,2018-06-18,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",http://sharkattackfile.net/spreadsheets/pdf_di...,6302.0
2,2018-06-09,,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",http://sharkattackfile.net/spreadsheets/pdf_di...,6301.0
3,2018-06-08,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...,6300.0
4,2018-06-04,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,http://sharkattackfile.net/spreadsheets/pdf_di...,6299.0


In [456]:
# Get rid of future dates. Doesn't work because of 20 rows that aren't yet in date format it seems.
#df["date"] = df["date"].apply(invalid_year)
df.date.isna().values.sum()

821

## Activity column
This column takes a lot more work. There are many different values, more than 1500. We will try to use combine them to bring it down to 120. 

In [457]:
df.activity.value_counts()

Surfing                                                                                    970
Swimming                                                                                   862
Fishing                                                                                    431
Spearfishing                                                                               332
Bathing                                                                                    160
                                                                                          ... 
Fell from yardarm of British ship Rover                                                      1
Shark Fishing                                                                                1
Attempting to free the shark                                                                 1
Plane forced down, 3 men on rubber life raft. Put hand over side to feel drift of boat       1
Goggle-diving for seaweeds, but standing in water 

In [458]:
# Let's check what values we have in this column
df["activity"].unique()

array(['Paddling', 'Standing', 'Surfing', ...,
       'Crew swimming alongside their anchored ship',
       '4 men were bathing', 'Wreck of  large double sailing canoe'],
      dtype=object)

In [459]:
# Search and equalize common values in activity column

# Key is category, list in value are search patterns
regex_dict = {"shipwreck": ["wreck", "sea disaster", "battle", 
                            "sunk", "sank", "capsiz", "swamped",
                            "burning", "Fell", "adrift", "washed",
                            "overboard", "thrown", "sink", "swept",
                            "murder", "explosion", "freighter", "tanker",
                            "founder", "steam", "submarine", "torped",
                            "destroy", "collided", "drown", "desert",
                            "accident", "disappear", "dinghy", "raft"],
              "beach activity": [r"sta.*ding", "wading", "walking", 
                                 "gathering", "lying", "splash", "playing",
                                 "wading", "inflat", "sitting"],
              "shark handling": ["shark"],
              "surfing": ["surfboard", r"surf[a-z]?ng", "kite", r"sup\b", 
                          "boardi", "paddling"],
              "aircrash": ["airc", "air disaster", "air/sea disaster", 
                           "airlin", r"air.*force", "plane", "boeing"], 
              "canoeing": [r".*skiing", "canoe", "kayak"],
              "fishing": [r"fish[a-z]*", "hunting", "spear", "netting", "prawn",
                          "harpoo", "lobster", "crab", "collecting", "shrimp",
                          "oyster", "sardine", "hauling", "casting", "catching"],
              "washing": [r"wash[^e]"],
              "swimming": ["swimming", "treading", "dived", r"swim", 
                           "bathing", "floating", "jump", "kneeling"],
              "boating": ["boating", "sailing", "rowing", "cruising", "yacht"],
              "diving": ["diving", "snorkeling", "diver"],
              "research": ["photo", "film", "research", "scient"],
              np.NaN: ["unknown"]}

for k, v in regex_dict.items():
    search_and_set(df, "activity", k, regex=True, lst=v)

# Also we want to set values like unknown to NaN , 
#search_and_set(df, "activity", pd.NA, lst=["unknown"])

In [460]:
df["activity"].nunique()

122

## Sex column
This column is fairly straight forward. We can combine into M or F

In [461]:
# Clean up the column for sex
df["sex"].unique()

array(['F', 'M', nan, 'M ', 'lli', 'N', '.'], dtype=object)

In [462]:
# Anything that's not F or M will be set to NaN
search_and_set(df, "sex", np.NaN, lst=[r"[^FM]"])

## Name column
We follow a similar process, but need to account for the fact that some names feature a '.' or ','.

In [463]:
df["name"].value_counts()

male              542
female             97
boy                23
boat               14
2 males            14
                 ... 
Naval aviator       1
Sabrina Garcia      1
John Brothers       1
Gordon Johnson      1
Carl Nakuina        1
Name: name, Length: 5196, dtype: int64

In [464]:
# First change how Jr is written, so we can check for punctuation later
regex_pat = re.compile(r"[,]* jr[.]*")
df["name"] = (df["name"]
              .str.lower()
              .str.strip()
              .str.replace(regex_pat, " jr", regex=True))

# Now remove everything with punctuation
search_and_set(df, "name", np.NaN, regex=True, 
               lst=[r"\d", r"\ba\s[a-z]*", r"male", r"[,:;]"])

# And remove all single 'names'
df["name"] = df["name"].str.title().apply(regex_test, args=(r"[ ]", ))


## Age
For age we decide only to keep values that are 1 or 2 digits. As well we change the dtype to int.

In [465]:
# Next we clean up the age column
df.age.value_counts()

17               154
18               150
19               142
20               139
15               139
                ... 
 30                1
13 or 14           1
2 to 3 months      1
17 & 35            1
74                 1
Name: age, Length: 157, dtype: int64

In [466]:
# First correct for babies
search_and_set(df, "age", "1", lst=["month"])
# Now if we find 1 or 2 digits we keep that value. 
df["age"] = df["age"].str.strip().apply(regex_test, args=(r"\d{1,2}", True))

In [467]:
# Now we've cleared bogus values, we can convert to integer
df["age"] = df["age"].fillna(0).astype(int)

## Cleaning fatal_yn
This column is fairly straight forward, and we can just use the method we used earlier as well.

In [468]:
df.fatal_yn.unique()

array(['N', 'Y', nan, 'M', 'UNKNOWN', '2017', ' N', 'N ', 'y'],
      dtype=object)

In [469]:
regex_dict = {np.NaN: ["unknown", "2017"],
         "N": ["n", "m"],
         "Y": ["y"]
        }
for k,v in regex_dict.items():
    search_and_set(df, "fatal_yn", k, regex=True, lst=v)

## Setting a new index

If original_order is unique then we can use it as an index

In [470]:
df["original_order"].value_counts(dropna=False)

569.0     2
3854.0    1
725.0     1
708.0     1
590.0     1
         ..
3106.0    1
3080.0    1
2812.0    1
2712.0    1
6272.0    1
Name: original_order, Length: 6251, dtype: int64

In [471]:
# There are 2 rows with number 569. The highest value is 6303, so we change one of the 2 to 6304.

df.loc[(df["original_order"] == 569) & (df["type"] == "Unprovoked"), "original_order"] = 6304

In [472]:
# Getting rid of empty rows
df["original_order"].dropna(inplace=True)

In [473]:
# We change the column dtype to int
df["original_order"] = df["original_order"].astype(int)

In [474]:
# Now the original_order is unique, we can use it as an index
df.set_index("original_order", inplace=True)
df.head()

Unnamed: 0_level_0,date,type,country,area,location,activity,name,sex,age,injury,fatal_yn,time,species,investigator_or_source,href
original_order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
6303,2018-06-25,Boating,USA,California,"Oceanside, San Diego County",surfing,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
6302,2018-06-18,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",beach activity,,F,11,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",http://sharkattackfile.net/spreadsheets/pdf_di...
6301,2018-06-09,,USA,Hawaii,"Habush, Oahu",surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",http://sharkattackfile.net/spreadsheets/pdf_di...
6300,2018-06-08,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,surfing,,M,0,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
6299,2018-06-04,Provoked,MEXICO,Colima,La Ticla,diving,Gustavo Ramos,M,0,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,http://sharkattackfile.net/spreadsheets/pdf_di...


In [475]:
df.head()

Unnamed: 0_level_0,date,type,country,area,location,activity,name,sex,age,injury,fatal_yn,time,species,investigator_or_source,href
original_order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
6303,2018-06-25,Boating,USA,California,"Oceanside, San Diego County",surfing,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
6302,2018-06-18,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",beach activity,,F,11,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",http://sharkattackfile.net/spreadsheets/pdf_di...
6301,2018-06-09,,USA,Hawaii,"Habush, Oahu",surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",http://sharkattackfile.net/spreadsheets/pdf_di...
6300,2018-06-08,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,surfing,,M,0,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
6299,2018-06-04,Provoked,MEXICO,Colima,La Ticla,diving,Gustavo Ramos,M,0,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,http://sharkattackfile.net/spreadsheets/pdf_di...
