In [9]:
import pandas as pd 
import numpy as np
import re
import os
import Cleaning as cl

In [10]:
pd.options.display.max_rows = 50

# Importing the data file
sharksDf = pd.read_csv("../INPUT/attacks.csv",encoding = "ISO-8859-1") 

#El uso del encoding viene a raiz de un error que tenía si no lo ponía: 
# https://stackoverflow.com/questions/18171739/unicodedecodeerror-when-reading-csv-file-in-pandas-with-python

In [11]:
sharksDf.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,


In [12]:
sharksDf.shape

(25723, 24)

## We remove the rows and columns that we don't need for a statistical analysis

In [13]:
sharksDf = sharksDf.drop(columns=["Case Number","Name", "Injury", "Investigator or Source", "pdf", "href formula","href","original order","Case Number.1","Case Number.2"])
sharksDf = sharksDf.drop('Unnamed: 22', 1)
sharksDf = sharksDf.drop('Unnamed: 23', 1)

In [14]:
#Removing null rows and duplicates
sharksDf = sharksDf.dropna(how='all', axis=0)
sharksDf = sharksDf.drop_duplicates()




In [15]:
null_cols = sharksDf.isnull().sum()
null_cols[null_cols > 0]

Year              2
Type              4
Country          49
Area            452
Location        536
Activity        542
Sex             565
Age            2815
Fatal (Y/N)     539
Time           3339
Species        2827
dtype: int64

## Now I proceed to cleaning the different columns

   ### Cleaning "Type" column

In [16]:
df = sharksDf["Type"].value_counts()
df.head(30)

# Type should be either "provoked" or "unprovoked"

Unprovoked      4581
Provoked         572
Invalid          547
Sea Disaster     238
Boating          203
Boat             137
Questionable       2
Boatomg            1
Name: Type, dtype: int64

It seems like some mistake was made when inputing 'Boating' or  'Sea Disaster'; those inputs correspond more to the 
"Activity" column. To have a cleaner dataframe, I've decided to only keep 3 categories: Provoked, Unprovoked, and Uncertain

In [17]:
# Replacing all incorrect entries with "Uncertain"
sharksDf = sharksDf.replace({'Type': {"Invalid": "Uncertain", "Sea Disaster": "Uncertain","Boating": "Uncertain", "Boat": "Uncertain", "Questionable": "Uncertain", "Boatomg": "Uncertain"}})

# Replacing null values with the type "Uncertain"
sharksDf[sharksDf['Type'].isnull()]=sharksDf[sharksDf['Type'].isnull()].fillna('Uncertain')
sharksDf.Type.isnull().sum()

0

In [18]:
sharksDf["Type"].value_counts()

Unprovoked    4581
Uncertain     1132
Provoked       572
Name: Type, dtype: int64

In [19]:
null_cols = sharksDf.isnull().sum()
null_cols[null_cols >= 0]


Date              0
Year              2
Type              0
Country          49
Area            450
Location        535
Activity        542
Sex             565
Age            2812
Fatal (Y/N)     538
Time           3336
Species        2823
dtype: int64

Now, the 'Type' column is nice and clean

In [20]:
sharksDf["Year"].value_counts()

2015.0    143
2017.0    136
2016.0    130
2011.0    128
2014.0    127
         ... 
1801.0      1
1638.0      1
1834.0      1
1723.0      1
1786.0      1
Name: Year, Length: 249, dtype: int64

  ### Cleaning "Year" column

In [21]:
sharksDf['Year']=sharksDf['Date'].apply(cl.checkDate) # Imported function

In [22]:
null_cols = sharksDf.isnull().sum()
null_cols[null_cols >= 0]

Date              0
Year              0
Type              0
Country          49
Area            450
Location        535
Activity        542
Sex             565
Age            2812
Fatal (Y/N)     538
Time           3336
Species        2823
dtype: int64

  ### Cleaning "Country" column

In [23]:
sharksDf['Country'].value_counts()
sharksDf['Country'].isnull().sum()
sharksDf=sharksDf.fillna('Unknown')
sharksDf

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Fatal (Y/N),Time,Species
0,25-Jun-2018,2018,Uncertain,USA,California,"Oceanside, San Diego County",Paddling,F,57,N,18h00,White shark
1,18-Jun-2018,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,11,N,14h00 -15h00,Unknown
2,09-Jun-2018,2018,Uncertain,USA,Hawaii,"Habush, Oahu",Surfing,M,48,N,07h45,Unknown
3,08-Jun-2018,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,M,Unknown,N,Unknown,2 m shark
4,04-Jun-2018,2018,Provoked,MEXICO,Colima,La Ticla,Free diving,M,Unknown,N,Unknown,"Tiger shark, 3m"
...,...,...,...,...,...,...,...,...,...,...,...,...
6297,Before 1903,1903,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,M,Unknown,Y,Unknown,Unknown
6298,Before 1903,1903,Unprovoked,AUSTRALIA,Western Australia,Unknown,Pearl diving,M,Unknown,Y,Unknown,Unknown
6299,1900-1905,1900,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,M,Unknown,Y,Unknown,Unknown
6300,1883-1889,1883,Unprovoked,PANAMA,Unknown,"Panama Bay 8ºN, 79ºW",Unknown,M,Unknown,Y,Unknown,Unknown


In [24]:
null_cols = sharksDf.isnull().sum()
null_cols[null_cols >= 0]


Date           0
Year           0
Type           0
Country        0
Area           0
Location       0
Activity       0
Sex            0
Age            0
Fatal (Y/N)    0
Time           0
Species        0
dtype: int64

  ### Cleaning "Species" column

In [25]:
sharksDf["Species "].value_counts()

Unknown                                                2824
White shark                                             163
Shark involvement prior to death was not confirmed      105
Invalid                                                 102
Shark involvement not confirmed                          88
                                                       ... 
Bull shark, 3 m                                           1
5'7" shark                                                1
Tiger shark, 4.5 to 5.5 m [14'9" to 18'], 2000-lb         1
4.5 m to 5.5m white shark                                 1
2.3 m [7.5'] white shark, identified by M. Smale          1
Name: Species , Length: 1550, dtype: int64

In [26]:
sharksDf["Species "] = sharksDf["Species "].apply(cl.sharkType)

sharksDf['Species ']=sharksDf['Species '].replace('M SHARK','Unknown')


In [27]:
sharksDf

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Fatal (Y/N),Time,Species
0,25-Jun-2018,2018,Uncertain,USA,California,"Oceanside, San Diego County",Paddling,F,57,N,18h00,WHITE SHARK
1,18-Jun-2018,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,11,N,14h00 -15h00,Unknown
2,09-Jun-2018,2018,Uncertain,USA,Hawaii,"Habush, Oahu",Surfing,M,48,N,07h45,Unknown
3,08-Jun-2018,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,M,Unknown,N,Unknown,Unknown
4,04-Jun-2018,2018,Provoked,MEXICO,Colima,La Ticla,Free diving,M,Unknown,N,Unknown,TIGER SHARK
...,...,...,...,...,...,...,...,...,...,...,...,...
6297,Before 1903,1903,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,M,Unknown,Y,Unknown,Unknown
6298,Before 1903,1903,Unprovoked,AUSTRALIA,Western Australia,Unknown,Pearl diving,M,Unknown,Y,Unknown,Unknown
6299,1900-1905,1900,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,M,Unknown,Y,Unknown,Unknown
6300,1883-1889,1883,Unprovoked,PANAMA,Unknown,"Panama Bay 8ºN, 79ºW",Unknown,M,Unknown,Y,Unknown,Unknown


In [28]:
sharksDf["Species "].value_counts()

Unknown                        4233
WHITE SHARK                     622
TIGER SHARK                     258
BULL SHARK                      170
NURSE SHARK                      95
                               ... 
CARCHARINID SHARKMAKO SHARK       1
UNIDENTIFIED SHARK                1
WHITETIP SHARKWHITE SHARK         1
MAKO SHARKBONITO SHARK            1
BONNETHED SHARK                   1
Name: Species , Length: 132, dtype: int64

  ### Cleaning "Activity" column

In [29]:
sharksDf["Activity"].value_counts()

Surfing                                                                           971
Swimming                                                                          865
Unknown                                                                           545
Fishing                                                                           430
Spearfishing                                                                      331
                                                                                 ... 
Hauling dead shark aboard, when another shark leapt out of the water & bit him      1
Testing anti-shark cage                                                             1
Fell from the jetty                                                                 1
Surfing / Swimming                                                                  1
Spearfishing, but walking carrying fish on end of speargun                          1
Name: Activity, Length: 1532, dtype: int64

In [30]:
sharksDf["Activity"] = sharksDf["Activity"].apply(cl.activity)

In [31]:
sharksDf["Activity"].value_counts()

SWIMMING/DIVING                                2100
SURFING/WATERSPORTS                            1561
FISHING                                        1095
UNKNOWN                                         545
STANDING                                         99
                                               ... 
GRABBING SHARK FOR A SELFIE                       1
CATCHING A TURTLE                                 1
SINKING OF THE FERRYBOAT BONGBONG 1               1
FLOATING NEAR BOAT & OBSERVING BIOLUMINESCE       1
WASHING HANDS                                     1
Name: Activity, Length: 616, dtype: int64

  ### Cleaning "Fatal (Y/N)" column

In [32]:
sharksDf["Fatal (Y/N)"].value_counts()

N            4283
Y            1382
Unknown       538
UNKNOWN        70
 N              7
Uncertain       1
M               1
y               1
2017            1
N               1
Name: Fatal (Y/N), dtype: int64

In [33]:
sharksDf["Fatal (Y/N)"] = sharksDf["Fatal (Y/N)"].apply(cl.cleanFatal)

In [34]:
sharksDf["Fatal (Y/N)"].value_counts()

N          4293
Y          1383
Unknown     609
Name: Fatal (Y/N), dtype: int64

In [35]:
os.getcwd()
sharksDf.to_csv("../OUTPUT/sharks_clean.csv")