# Pandas Project - cleaning shark attacks csv

## Workflow determination
1. Determine steps to be taken
2. Set up csv in pandas and create copy to work with
3. Declare helper functions 
3. Gather information about shape and characteristics of dataframe
6. Check data types
4. Inspect data
9. Check null values
5. Change column names
8. Remove duplicates
10. Remove irrelevant columns
11. Convert dates
12. Clean data for various columns
12. Correct data types
14. Set new index according to unique tracker
15. Export to new csv


## Set up 


First we import the necessary libraries

In [1]:
import re

import numpy as np
import pandas as pd

from string import punctuation
from datetime import datetime

Now we declare helper functions

In [2]:
def clean_col_names(df):
    """Take a dataframe and return sanitized column names."""

    table = str.maketrans(dict.fromkeys(punctuation))
    return [c.lower()
             .strip()
             .translate(table)
             .replace(" ", "_") for c in df.columns]


def cols_missing_value(df):
    """Return percentage of column missing value."""
    
    return ((df.isna().sum())[df.isna().sum() > 0] / len(df)) * 100

def invalid_year(date):
    """Return NaT if not valid year."""
    
    # Doesn't work because of 20 non-cooperating rows
    if date == date:
        if date > datetime.strptime('2020', '%y'):
            return pd.NaT
    return date

def search_and_set(df, col, word, regex=False, lst=[]):
    """Search for term and set correspending rows to value."""
    
    # If lst is set, we use every term therein, otherwise just 
    # set word as first item
        
    if not lst:
        lst = [word]
    
    if regex:
        for item in lst:
            df.loc[df[col].str.contains(
                item, case=False, na=False, regex=True), col] = word
    else:
        for item in lst:
            df.loc[df[col].str.contains(
                item, case=False, na=False), col] = word

def regex_test(txt, pat, needle=False):
    """Return NaN if string does not contain pattern."""
    
    check = re.search(pat, str(txt))
    if check:
        # With needle set to True we return the part of matched string
        return check.group() if needle else txt
    return np.NaN

Next we import the csv into pandas and inspect the head

In [3]:
# Because of unicode characters we need to specify the encoding
df_original = pd.read_csv("attacks.csv", encoding="cp1252")

# We save a copy of the dataframe, just in case.
df = df_original.copy()
df.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,


## Inspect

In [4]:
# rows and columns
df.shape

(25723, 24)

In [5]:
# Datatypes of columns
df.dtypes

Case Number                object
Date                       object
Year                      float64
Type                       object
Country                    object
Area                       object
Location                   object
Activity                   object
Name                       object
Sex                        object
Age                        object
Injury                     object
Fatal (Y/N)                object
Time                       object
Species                    object
Investigator or Source     object
pdf                        object
href formula               object
href                       object
Case Number.1              object
Case Number.2              object
original order            float64
Unnamed: 22                object
Unnamed: 23                object
dtype: object

In [6]:
# Random sample of data
df.sample(20)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
13858,,,,,,,,,,,...,,,,,,,,,,
11883,,,,,,,,,,,...,,,,,,,,,,
3311,1973.00.00.a,1973,1973.0,Unprovoked,SOUTH AFRICA,Eastern Cape Province,Queensberry Bay,Surfing,Gordon Harmer,M,...,,G. Harmer,1973.00.00.a-Harmer.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1973.00.00.a,1973.00.00.a,2992.0,,
24178,,,,,,,,,,,...,,,,,,,,,,
12811,,,,,,,,,,,...,,,,,,,,,,
16416,,,,,,,,,,,...,,,,,,,,,,
20781,,,,,,,,,,,...,,,,,,,,,,
6438,0,,,,,,,,,,...,,,,,,,,,,
23994,,,,,,,,,,,...,,,,,,,,,,
13085,,,,,,,,,,,...,,,,,,,,,,


In [7]:
# Columns with NA values (percentage)
cols_missing_value(df)

Case Number               66.170353
Date                      75.500525
Year                      75.508300
Type                      75.516075
Country                   75.694903
Area                      77.269370
Location                  77.599813
Activity                  77.615364
Name                      76.316915
Sex                       77.697003
Age                       86.506240
Injury                    75.609377
Fatal (Y/N)               77.595926
Time                      88.539439
Species                   86.533453
Investigator or Source    75.566614
pdf                       75.500525
href formula              75.504412
href                      75.500525
Case Number.1             75.500525
Case Number.2             75.500525
original order            75.473312
Unnamed: 22               99.996112
Unnamed: 23               99.992225
dtype: float64

### A few observations
   - column names contain punctuation, spaces and capitals
   - case number seems to be same as Date, and Year is a pary of previous two
   - many rows have no data at all
   - pdf is part of href formula, which in turn is equal to href
   - last 2 columns have no name and out of 25723 only 1 or 2 rows have a value
   - age and species values are mainly missing
   

## General cleanup

Now we know a little about our data set, we are going to start cleaning. First we tackle the big stuff. 

In [8]:
# We clean up th column names, so it's easier to work with.
df.columns = clean_col_names(df)
df.columns

Index(['case_number', 'date', 'year', 'type', 'country', 'area', 'location',
       'activity', 'name', 'sex', 'age', 'injury', 'fatal_yn', 'time',
       'species', 'investigator_or_source', 'pdf', 'href_formula', 'href',
       'case_number1', 'case_number2', 'original_order', 'unnamed_22',
       'unnamed_23'],
      dtype='object')

In [9]:
# Remove duplicate rows
df.drop_duplicates(inplace=True)


In [10]:
cols_missing_value(df)

case_number                0.031686
date                       0.158428
year                       0.190114
type                       0.221800
country                    0.950570
area                       7.366920
location                   8.713561
activity                   8.776933
name                       3.485425
sex                        9.109632
age                       45.009506
injury                     0.602028
fatal_yn                   8.697719
time                      53.295311
species                   45.120406
investigator_or_source     0.427757
pdf                        0.158428
href_formula               0.174271
href                       0.158428
case_number1               0.158428
case_number2               0.158428
original_order             0.047529
unnamed_22                99.984157
unnamed_23                99.968314
dtype: float64

## Clearing redundant columns

Since case_number is same as date, and original_order is a unique index, we will drop all 3 case_number columns. Also, the href_formula and pdf column are duplicate to href. Finally we remove the redundant unnamed columns

In [11]:
to_drop = ["case_number", "year", "pdf", 
           "href_formula", "case_number1", 
           "case_number2", "unnamed_22", "unnamed_23"]
df = df.drop(axis = 1, columns = to_drop)
df.head()

Unnamed: 0,date,type,country,area,location,activity,name,sex,age,injury,fatal_yn,time,species,investigator_or_source,href,original_order
0,25-Jun-2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...,6303.0
1,18-Jun-2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",http://sharkattackfile.net/spreadsheets/pdf_di...,6302.0
2,09-Jun-2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",http://sharkattackfile.net/spreadsheets/pdf_di...,6301.0
3,08-Jun-2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...,6300.0
4,04-Jun-2018,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,http://sharkattackfile.net/spreadsheets/pdf_di...,6299.0


## Removing bogus rows

When rows hold no real data, we remove them

In [12]:
# If there is no location we don't want to keep the row
cols_missing_value(df)

date                       0.158428
type                       0.221800
country                    0.950570
area                       7.366920
location                   8.713561
activity                   8.776933
name                       3.485425
sex                        9.109632
age                       45.009506
injury                     0.602028
fatal_yn                   8.697719
time                      53.295311
species                   45.120406
investigator_or_source     0.427757
href                       0.158428
original_order             0.047529
dtype: float64

In [13]:
# There are still many rows with very little data. 
# We will drop all that have less than 2 columns of data
df.dropna(thresh=2, inplace=True)
df.dropna(subset=["country"], inplace=True)

In [14]:
df.sample(30)

Unnamed: 0,date,type,country,area,location,activity,name,sex,age,injury,fatal_yn,time,species,investigator_or_source,href,original_order
6048,08-Dec-1846,Sea Disaster,MEXICO,Veracruz,Vera Cruz,Wreck of the USS Somers,,M,,"FATAL, some were taken by sharks",Y,,,"Report of the loss of the Somers, J.H.W.",http://sharkattackfile.net/spreadsheets/pdf_di...,255.0
4057,30-Sep-1959,Unprovoked,PHILIPPINES,Leyte Island,Luang Dulag,Swimming,Francisco Daguinot,M,28.0,"No details, survived",N,,,"Manila Times, 10/2/1959",http://sharkattackfile.net/spreadsheets/pdf_di...,2246.0
191,24-Dec-2016,Unprovoked,AUSTRALIA,Western Australia,"Bundegi Sanctuary Zone, Ningaloo",Snorkeling,female,F,,Buttock bitten,N,Morning,a small shark,"Perth Now, 12/28/2016",http://sharkattackfile.net/spreadsheets/pdf_di...,6112.0
5731,Reported 18-Jun-1888,Invalid,AUSTRALIA,Victoria,"Point Cook, Port Phillip Bay",The cutter yacht Cutty Sark sank,"Claude Hadley, William Grundy, Albert Faulkner...",M,,Probable drowning & scavenging,,,Shark involvement prior to death unconfirmed,"New York Times, 6/18/1888",http://sharkattackfile.net/spreadsheets/pdf_di...,572.0
1129,06-Mar-2009,Provoked,BAHAMAS,Exuma Islands,,Spearfishing,Luis Hernandez,M,48.0,Lacerations to right forearm after he poked th...,N,,7' shark,"Sun Sentinel, 5/8/2009",http://sharkattackfile.net/spreadsheets/pdf_di...,5174.0
4522,1949-1950,Boating,ITALY,Tyrrhenian Sea,San Vincenzo,"Fishing, on a boat",male,M,,No injury to occupant,N,,White shark,A. De Maddalena; V. Biagi (pers. Comm.),http://sharkattackfile.net/spreadsheets/pdf_di...,1781.0
5189,08-Feb-1924,Invalid,AUSTRALIA,Queensland,Currumbin,Bathing,Frederick Dullroy,M,,Probable drowning & scavenging,,16h00,Shark involvement prior to death unconfirmed,11/02/1924,http://sharkattackfile.net/spreadsheets/pdf_di...,1114.0
2472,16-Sep-1993,Unprovoked,EL SALVADOR,La Libertad,near El Cocal Beach,Surfing,Jose Diter Roque,,15.0,Left leg gashed,N,,3.7 m [12'] white shark,"Tampa Tribune, 9/18/1993",http://sharkattackfile.net/spreadsheets/pdf_di...,3831.0
4590,Reported 24-Dec-1946,Boating,AUSTRALIA,Queensland,"Auckland Creek, Gladstone",,Moored fishing launch of Harry Lone,,,Shark jumped into cockpit,N,,7' shark,"V.M. Coppleson (1958), p.181",http://sharkattackfile.net/spreadsheets/pdf_di...,1713.0
1186,06-Dec-2006,Boating,AUSTRALIA,New South Wales,Mowarry Point,Fishing,"6 m Seaduce - Occupants: Allen Roberts, Jason ...",,,Shark bit boats sea anchor,N,Morning,"White shark, 4.5 to 5 m",S. Chenhall,http://sharkattackfile.net/spreadsheets/pdf_di...,5117.0


## Type

We start cleaning more in depth by looking at the type column. 

In [15]:
# We check the values for 'type'
df["type"].value_counts(dropna=False)

Unprovoked      4562
Provoked         571
Invalid          542
Sea Disaster     231
Boating          202
Boat             137
NaN                4
Questionable       2
Boatomg            1
Name: type, dtype: int64

In [16]:
# Change invalid and questionable to NaN
df["type"].replace(["Invalid", "Questionable"], np.nan, inplace=True)

# And merge boating
df["type"].replace(["Boat", "Boatomg"], "Boating", inplace=True)

df["type"].value_counts(dropna=False)

Unprovoked      4562
Provoked         571
NaN              548
Boating          340
Sea Disaster     231
Name: type, dtype: int64

## Date
Next we look at the date. 

In [17]:
# Change date to datetime column
df.date = pd.to_datetime(df["date"], errors="coerce", )
df.head()

Unnamed: 0,date,type,country,area,location,activity,name,sex,age,injury,fatal_yn,time,species,investigator_or_source,href,original_order
0,2018-06-25,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...,6303.0
1,2018-06-18,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",http://sharkattackfile.net/spreadsheets/pdf_di...,6302.0
2,2018-06-09,,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",http://sharkattackfile.net/spreadsheets/pdf_di...,6301.0
3,2018-06-08,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...,6300.0
4,2018-06-04,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,http://sharkattackfile.net/spreadsheets/pdf_di...,6299.0


In [18]:
# Get rid of future dates. Doesn't work because of 20 rows that aren't yet in date format it seems.
#df["date"] = df["date"].apply(invalid_year)
df.date.isna().values.sum()

821

## Activity column
This column takes a lot more work. There are many different values, more than 1500. We will try to use combine them to bring it down to 120. 

In [19]:
df["activity"].value_counts()

Surfing                                                                                                                                                                                      970
Swimming                                                                                                                                                                                     862
Fishing                                                                                                                                                                                      431
Spearfishing                                                                                                                                                                                 332
Bathing                                                                                                                                                                                      160
                                   

In [20]:
# Let's check what values we have in this column
df["activity"].unique()

array(['Paddling', 'Standing', 'Surfing', ...,
       'Crew swimming alongside their anchored ship',
       '4 men were bathing', 'Wreck of  large double sailing canoe'],
      dtype=object)

In [21]:
# Key is category, list in dict-value are search patterns
regex_dict = {"shipwreck": ["wreck", "sea disaster", "battle", 
                            "sunk", "sank", "capsiz", "swamped",
                            "burning", "Fell", "adrift", "washed",
                            "overboard", "thrown", "sink", "swept",
                            "murder", "explosion", "freighter", "tanker",
                            "founder", "steam", "submarine", "torped",
                            "destroy", "collided", "drown", "desert",
                            "accident", "disappear", "dinghy", "raft"],
              "beach activity": [r"sta.*ding", "wading", "walking", 
                                 "gathering", "lying", "splash", "playing",
                                 "wading", "inflat", "sitting"],
              "shark handling": ["shark"],
              "surfing": ["surfboard", r"surf[a-z]?ng", "kite", r"sup\b", 
                          "boardi", "paddling"],
              "aircrash": ["airc", "air disaster", "air/sea disaster", 
                           "airlin", r"air.*force", "plane", "boeing"], 
              "canoeing": [r".*skiing", "canoe", "kayak"],
              "fishing": [r"fish[a-z]*", "hunting", "spear", "netting", "prawn",
                          "harpoo", "lobster", "crab", "collecting", "shrimp",
                          "oyster", "sardine", "hauling", "casting", "catching"],
              "washing": [r"wash[^e]"],
              "swimming": ["swimming", "treading", "dived", r"swim", 
                           "bathing", "floating", "jump", "kneeling"],
              "boating": ["boating", "sailing", "rowing", "cruising", "yacht"],
              "diving": ["diving", "snorkeling", "diver"],
              "research": ["photo", "film", "research", "scient"],
              np.NaN: ["unknown"]}

for k, v in regex_dict.items():
    search_and_set(df, "activity", k, regex=True, lst=v)

In [22]:
# Check the result
df["activity"].nunique()

122

## Sex column
This column is fairly straight forward. We can combine into M or F

In [23]:
# What unique values we have at first
df["sex"].unique()

array(['F', 'M', nan, 'M ', 'lli', 'N', '.'], dtype=object)

In [24]:
# Anything that's not F or M will be set to NaN
search_and_set(df, "sex", np.NaN, lst=[r"[^FM]"])
df["sex"].unique()

array(['F', 'M', nan], dtype=object)

## Name column
We follow a similar process, but need to account for the fact that some names feature a '.' or ','.

In [25]:
df["name"].value_counts()

male                 542
female                97
boy                   23
boat                  14
2 males               14
                    ... 
Geeteh Toussaint       1
Damien Johnson         1
Zebulon Critchlow      1
Ryan Eckstrum          1
Kenneth Hall           1
Name: name, Length: 5196, dtype: int64

In [26]:
# First change how Jr is written, so we can check for punctuation later
regex_pat = re.compile(r"[,]* jr[.]*")
df["name"] = (df["name"]
              .str.lower()
              .str.strip()
              .str.replace(regex_pat, " jr", regex=True))

# Now remove everything with punctuation
search_and_set(df, "name", np.NaN, regex=True, 
               lst=[r"\d", r"\ba\s[a-z]*", "male", r"[,:;]"])

# And remove all single 'names'
df["name"] = df["name"].str.title().apply(regex_test, args=(r"[ ]", ))


## Age
For age we decide only to keep values that are 1 or 2 digits. As well we change the dtype to int.

In [27]:
# Next we clean up the age column
df.age.value_counts()

17             154
18             150
19             142
15             139
20             139
              ... 
81               1
 28              1
?    &   14      1
36 & 26          1
17 & 16          1
Name: age, Length: 157, dtype: int64

In [28]:
# First correct for babies
search_and_set(df, "age", "1", lst=["month"])
# Now if we find 1 or 2 digits we keep that value. 
df["age"] = df["age"].str.strip().apply(regex_test, args=(r"\d{1,2}", True))

In [29]:
# Now we've cleared bogus values, we can convert to integer
df["age"] = df["age"].fillna(0).astype(int)

## Fatal_yn
This column is fairly straight forward, and we can just use the method we used earlier as well.

In [30]:
df.fatal_yn.unique()

array(['N', 'Y', nan, 'M', 'UNKNOWN', '2017', ' N', 'N ', 'y'],
      dtype=object)

In [31]:
regex_dict = {np.NaN: ["unknown", "2017"],
         "N": ["n", "m"],
         "Y": ["y"]
        }
for k,v in regex_dict.items():
    search_and_set(df, "fatal_yn", k, regex=True, lst=v)

## Setting a new index

If original_order is unique then we can use it as an index

In [32]:
df["original_order"].value_counts(dropna=False)

569.0     2
3854.0    1
725.0     1
708.0     1
590.0     1
         ..
3106.0    1
3080.0    1
2812.0    1
2712.0    1
6272.0    1
Name: original_order, Length: 6251, dtype: int64

In [33]:
# There are 2 rows with number 569. The highest value is 6303, so we change one of the 2 to 6304.

df.loc[(df["original_order"] == 569) & (df["type"] == "Unprovoked"), "original_order"] = 6304

In [34]:
# Getting rid of empty rows
df["original_order"].dropna(inplace=True)

In [35]:
# We change the column dtype to int
df["original_order"] = df["original_order"].astype(int)

In [36]:
# Now the original_order is unique, we can use it as an index
df.set_index("original_order", inplace=True)
df.head()

Unnamed: 0_level_0,date,type,country,area,location,activity,name,sex,age,injury,fatal_yn,time,species,investigator_or_source,href
original_order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
6303,2018-06-25,Boating,USA,California,"Oceanside, San Diego County",surfing,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
6302,2018-06-18,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",beach activity,,F,11,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",http://sharkattackfile.net/spreadsheets/pdf_di...
6301,2018-06-09,,USA,Hawaii,"Habush, Oahu",surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",http://sharkattackfile.net/spreadsheets/pdf_di...
6300,2018-06-08,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,surfing,,M,0,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
6299,2018-06-04,Provoked,MEXICO,Colima,La Ticla,diving,Gustavo Ramos,M,0,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,http://sharkattackfile.net/spreadsheets/pdf_di...


## Export 
Now that the dataset is a lot cleaner, we will save it as a new csv-file. 

In [37]:
df.to_csv("cleaned_sharks.csv")