# Project Pandas Shark Attack

- NaN/Null cleanse
- Index by Original Order
- Date in datetime format (DD/MM/YY)
    - Using Date column to complete related columns
    - Circa YY
- Categorization / One Hot Encoding
- Grouping by

In [26]:
import pandas as pd

import re

import numpy as np

import chardet

In [27]:
# Load the raw attack records into a DataFrame, decoding the file as ISO-8859-1.
# NOTE(review): this is an absolute local path; the notebook only runs on a
# machine where this exact file location exists.
shark_attack = pd.read_csv(
    filepath_or_buffer=r'C:/Users/Admin/01_ Module_01_IronHack/project-pandas/attacks.csv',
    encoding='ISO-8859-1',
)

In [28]:
# Detect the file encoding from a sample of its first 1000 bytes, so the CSV
# can be opened with a matching `encoding=` argument.

with open('C:/Users/Admin/01_ Module_01_IronHack/project-pandas/attacks.csv', 'rb') as raw_file:
    sample_bytes = raw_file.read(1000)
    result = chardet.detect(sample_bytes)
print(result)

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}


In [29]:
# Cleaning rows that do not comply with basic needs:
# keep a row only when both 'Case Number' and 'Date' are present
# (thresh=2 non-null values over a two-column subset).
# `how` is deliberately not passed: pandas does not allow combining `how`
# with `thresh`, and `thresh` alone already expresses the requirement.
# Reference: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html

clean_df = shark_attack.dropna(axis=0, thresh=2, subset=['Case Number', 'Date'])

In [30]:
# Remove the two trailing "Unnamed" columns, which carry no relevant data.
# Reference: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html

irrelevant_columns = ['Unnamed: 22', 'Unnamed: 23']
final_df = clean_df.drop(labels=irrelevant_columns, axis='columns')

In [31]:
# Format cleaning for column Date:
# keep only the first date-like token found in each raw 'Date' value.
#
# The pattern is written as a raw string so that regex escapes such as \d and
# \w reach the regex engine untouched instead of being parsed as (invalid)
# Python string escapes. The alternatives and their order are unchanged.
date_pattern = (
    r'\d+\-\w+\-\d+|\w+\-\d+|\d{4}|\d+\-\w+\-\-\d+|\d+\-\w+\s\d+'
    r'|\d+\W\d+|\d+\W+\w\.\w\.|\w{5}\s\w{3}\s\w{2}'
)

# .str.findall applies re.findall to every element (missing values stay
# missing); .str[0] then takes the first match, giving NaN when nothing matched.
final_df['Date'] = final_df['Date'].str.findall(date_pattern).str[0]

In [32]:
# Format cleaning column Case Number:
# keep only the first case-number-like token found in each raw value.
#
# Raw string so the regex escapes are not parsed as Python string escapes.
# The original pattern ended with a duplicated `|\w+` alternative that could
# never match anything the first `\w+` had not already matched; it is dropped.
case_number_pattern = r'\d+\.\d+\.\d+|\d\.\d+|\w+\-\d+|\w+'

# .str.findall applies re.findall per element; .str[0] takes the first match
# and yields NaN when there is none.
final_df['Case Number'] = final_df['Case Number'].str.findall(case_number_pattern).str[0]

In [33]:
# Format cleaning Year:
# replace missing years with the placeholder value 0.
year_filled = final_df['Year'].fillna(value=0)
final_df['Year'] = year_filled

In [34]:
# One-hot encode the Location column: one indicator column per distinct location.
location_cat = final_df['Location'].pipe(pd.get_dummies)
location_cat.head()

Unnamed: 0,Unnamed: 1,A pearl farm in Roebuck Bay,"Ambatolaoka, Nosy Be Island",Bellingen,"Black Head, south of Taree","Boa Viagem Beach, Recife","Boca de la Leña, La Unión",Botany Bay,Bunker Bay,Chennai (formerly Madras),...,off Neuvitas,"off Paoay, Ilocos Norte Province",off yacht Serenade,the pearling beds,"uShaka Aquarium, Durban",Southern Wharf,½ mile offshore & 9 miles north of Fort Pierce,Île Saint-Paul,Île de Casey,Île de Sable
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
# Per activity, count the records that have a non-null 'Fatal (Y/N)' entry.
# Note: this counts every recorded value, not only the fatal ones.
count_activity = (
    final_df
    .groupby(['Activity'])['Fatal (Y/N)']
    .count()
    .reset_index()
)
count_activity.head()

Unnamed: 0,Activity,Fatal (Y/N)
0,,1
1,,1
2,a canoe was pursuing a schooner that had forc...,1
3,"""Boat accident""",1
4,"""Climbing up to ship after repairing the stern...",1


In [36]:
# For each (Location, Activity) pair, count the non-null 'Fatal (Y/N)' entries.

grouping_keys = ['Location', 'Activity']
group_l_f = final_df.groupby(grouping_keys)[['Fatal (Y/N)']].count()
group_l_f.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Fatal (Y/N)
Location,Activity,Unnamed: 2_level_1
,Swimming,1
A pearl farm in Roebuck Bay,Hookah diving,1
"Ambatolaoka, Nosy Be Island",Scuba diving,1
Bellingen,Fishing,1
"Black Head, south of Taree",Fishing,1


In [37]:
# Cast the two numeric bookkeeping columns to 32-bit integers.
integer_columns = dict.fromkeys(['original order', 'Year'], 'int32')
final_df = final_df.astype(integer_columns)

In [38]:
# Use the 'original order' column as the row index (the column itself is
# removed from the data columns, which is the set_index default).
final_df = final_df.set_index(keys='original order', drop=True)

In [39]:
# Preview the first five rows of the re-indexed frame.
final_df.head(n=5)

Unnamed: 0_level_0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2
original order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6303,2018.06.25,25-Jun-2018,2018,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25
6302,2018.06.18,18-Jun-2018,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18
6301,2018.06.09,09-Jun-2018,2018,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09
6300,2018.06.08,08-Jun-2018,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08
6299,2018.06.04,04-Jun-2018,2018,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04


In [40]:
# Print a summary of the frame: index range, column dtypes and non-null counts.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6301 entries, 6303 to 2
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Case Number             6301 non-null   object
 1   Date                    6294 non-null   object
 2   Year                    6301 non-null   int32 
 3   Type                    6297 non-null   object
 4   Country                 6251 non-null   object
 5   Area                    5846 non-null   object
 6   Location                5761 non-null   object
 7   Activity                5757 non-null   object
 8   Name                    6091 non-null   object
 9   Sex                     5736 non-null   object
 10  Age                     3471 non-null   object
 11  Injury                  6273 non-null   object
 12  Fatal (Y/N)             5762 non-null   object
 13  Time                    2948 non-null   object
 14  Species                 3464 non-null   object
 15  Inve

In [41]:
# Keep only the columns wanted in the exported dataset.
export_columns = [
    'Date',
    'Type',
    'Country',
    'Area',
    'Location',
    'Activity',
    'Age',
    'Fatal (Y/N)',
]
final_df = final_df.loc[:, export_columns]

In [42]:
# Write the cleaned frame (including its index) to a UTF-8 CSV in the working directory.
final_df.to_csv(path_or_buf='shark_attack.csv', encoding='utf-8', index=True)