# Importing libraries

In [237]:
import pandas as pd
import numpy as np
import pandas_profiling
import re

In [238]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Reading the file

In [239]:
# Load the raw GSAF5 CSV, decoding it as ISO-8859-1.
data = pd.read_csv('GSAF5.csv', encoding = 'iso-8859-1')

In [240]:
# Raise pandas' display limits (rows, columns, line width) in a single call;
# set_option accepts several pattern/value pairs at once.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
)

In [241]:
# Preview the first three rows of the raw frame.
data.head(3)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,16,Minor injury to thigh,N,13h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993,,
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,36,Lacerations to hands,N,11h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b,5992,,
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,43,Lacerations to lower leg,N,10h43,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a,5991,,


# Having a look at the columns

### This might give us some clues about duplicated columns, useless data, etc.

In [242]:
# List the column labels exactly as read; the output shows that 'Sex ' and
# 'Species ' carry a trailing space, which later cells must reproduce.
data.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location', 'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time', 'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href', 'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22', 'Unnamed: 23'], dtype='object')

# Defining the goal: Last 100 years

### We will only analyse the last 100 years, so we create a new df.
### We assume the database is reliable enough over that period.
### We want to check whether shark attacks have significantly increased over the last 100 years, per country.

In [243]:
# Keep the incidents from the 100 years leading up to the most recent year on
# record (the comparison is inclusive, so the cut-off year itself is kept).
# .copy() makes last_100 an independent frame, so the column assignments in
# later cells never operate on a slice of `data`.
last_100 = data.loc[data['Year'] >= data['Year'].max() - 100].copy()

In [244]:
# Preview the first five rows of the filtered frame.
last_100.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,16.0,Minor injury to thigh,N,13h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993,,
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,36.0,Lacerations to hands,N,11h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b,5992,,
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,43.0,Lacerations to lower leg,N,10h43,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a,5991,,
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,,Struck by fin on chest & leg,N,,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.17,2016.09.17,5990,,
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,,No injury: Knocked off board by shark,N,,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15,5989,,


# Columns - Definitely not analyzing that data

In [245]:
# Columns that will not be analysed at all; note the trailing space in "Sex ".
duplicate_columns = ['Name', "Sex ", "Age"]
last_100 = last_100.drop(labels=duplicate_columns, axis='columns')

# Columns - Having a look to make a decision

### Now, let's check what's in the rest of the columns. Although we could decide to drop some columns just by looking at their names, we'll have a look in case we find something interesting.

### First, we'll have a look at the columns we think won't be useful.

## Case Number

In [246]:
# Row-by-row equality of 'Case Number.1' against 'Case Number' (boolean Series).
last_100['Case Number.1'].eq(last_100['Case Number'])

0        True
1        True
2        True
3        True
4       False
5        True
6        True
7        True
8        True
9        True
10       True
11       True
12       True
13       True
14       True
15       True
16       True
17       True
18       True
19       True
20       True
21       True
22       True
23       True
24       True
25       True
26       True
27       True
28       True
29       True
30       True
31       True
32       True
33      False
34       True
35       True
36       True
37       True
38       True
39       True
40       True
41       True
42       True
43       True
44       True
45       True
46       True
47       True
48       True
49       True
50       True
51       True
52       True
53       True
54       True
55       True
56       True
57       True
58       True
59       True
60       True
61       True
62       True
63       True
64       True
65       True
66       True
67       True
68       True
69       True
70       True
71    

In [247]:
# Row-by-row equality of 'Case Number.2' against 'Case Number' (boolean Series).
last_100['Case Number.2'].eq(last_100['Case Number'])

0        True
1        True
2        True
3        True
4        True
5        True
6        True
7        True
8        True
9        True
10       True
11       True
12       True
13       True
14       True
15       True
16       True
17       True
18       True
19       True
20       True
21       True
22       True
23       True
24       True
25       True
26       True
27       True
28       True
29       True
30       True
31       True
32       True
33       True
34       True
35       True
36       True
37       True
38       True
39       True
40       True
41       True
42       True
43       True
44       True
45       True
46       True
47       True
48       True
49       True
50       True
51       True
52       True
53       True
54       True
55       True
56       True
57       True
58       True
59       True
60       True
61       True
62       True
63       True
64       True
65       True
66       True
67       True
68       True
69       True
70       True
71    

### They're not exactly the same, but nearly, so we will drop Case Number.1 and Case Number.2

## Href | Href formula -> URL Reference

In [248]:
# Row-by-row equality of the two URL columns (boolean Series).
last_100['href formula'].eq(last_100['href'])

0        True
1        True
2        True
3        True
4        True
5        True
6        True
7        True
8        True
9        True
10       True
11       True
12       True
13       True
14       True
15       True
16       True
17       True
18       True
19       True
20      False
21       True
22       True
23       True
24       True
25       True
26       True
27      False
28       True
29       True
30       True
31       True
32       True
33       True
34       True
35       True
36       True
37       True
38       True
39       True
40       True
41       True
42       True
43       True
44       True
45       True
46       True
47       True
48       True
49       True
50       True
51       True
52       True
53       True
54       True
55       True
56       True
57       True
58       True
59       True
60       True
61      False
62       True
63       True
64       True
65       True
66       True
67       True
68       True
69       True
70       True
71    

### Again, not exactly the same, but pretty much.

### We checked which column had better-structured data and picked "href formula". We drop "href", remove the spaces from "href formula" and rename it to "URL Reference".

In [249]:
# Frequency of each URL in 'href formula' (value_counts skips NaN by default).
last_100['href formula'].value_counts()

http://sharkattackfile.net/spreadsheets/pdf_directory/1931.09.21.a-b-Holaday-Barrows.pdf        2
http://sharkattackfile.net/spreadsheets/pdf_directory/1929.03.04.a-b.Roads-Aldridge.pdf         2
http://sharkattackfile.net/spreadsheets/pdf_directory/1935.06.05.R-SolomonIslands.pdf           2
http://sharkattackfile.net/spreadsheets/pdf_directory/1916.07.12.a-b-Stillwell-Fisher.pdf       2
http://sharkattackfile.net/spreadsheets/pdf_directory/1916.12.08.a-b-German.pdf                 2
http://sharkattackfile.net/spreadsheets/pdf_directory/1921.11.27.a-b-Jack.pdf                   2
http://sharkattackfile.net/spreadsheets/pdf_directory/1923.00.00.a-NJ fisherman.pdf             2
http://sharkattackfile.net/spreadsheets/pdf_directory/1934.12.23.a-b-Inman.pdf                  2
http://sharkattackfile.net/spreadsheets/pdf_directory/1982.08.29.a-Keet.pdf                     1
http://sharkattackfile.net/spreadsheets/pdf_directory/2005.02.13-FtLauderdale.pdf               1
http://sharkattackfi

In [250]:
# Remove every space character from the URLs (literal match, not a regex).
# NOTE(review): some file names contain a space (e.g. "NJ fisherman.pdf");
# confirm the space-free URL still resolves.
last_100['href formula'] = last_100['href formula'].str.replace(pat=' ', repl='', regex=False)

In [251]:
# Rename the URL column. Only the column mapping is passed: adding index=str
# here would also cast every row label to a string, so integer label look-ups
# such as last_100.loc[0] would stop working.
last_100 = last_100.rename(columns={"href formula": "URL Reference"})

## Unnamed

In [252]:
# Count every value, NaN included, to see whether the column holds real data.
last_100['Unnamed: 22'].value_counts(dropna = False)

NaN             5084
stopped here       1
Name: Unnamed: 22, dtype: int64

In [253]:
# Count every value, NaN included, to see whether the column holds real data.
last_100['Unnamed: 23'].value_counts(dropna = False)

NaN       5084
Teramo       1
Name: Unnamed: 23, dtype: int64

### We just checked that these columns have no real value, so we'll drop them.

## Original Order

### We already have the year and the month, so we'll get rid of this column.

In [254]:
# Frequency of each 'original order' value, NaN included.
last_100['original order'].value_counts(dropna = False)

5661    2
5739    2
3847    2
2047    1
4943    1
4931    1
2884    1
4935    1
2888    1
4939    1
2892    1
2896    1
4927    1
4947    1
2900    1
4951    1
2904    1
4955    1
2908    1
4959    1
2880    1
4923    1
2876    1
2856    1
4891    1
2844    1
4895    1
2848    1
4899    1
2852    1
4903    1
4907    1
4963    1
2860    1
4911    1
2864    1
4915    1
2868    1
4919    1
2872    1
2912    1
4967    1
2916    1
2968    1
5007    1
2960    1
913     1
5011    1
2964    1
917     1
5015    1
921     1
2956    1
5019    1
2972    1
925     1
5023    1
2976    1
929     1
5027    1
909     1
5003    1
4887    1
4983    1
2920    1
4971    1
2924    1
4975    1
2928    1
4979    1
2932    1
2936    1
2952    1
4987    1
2940    1
4991    1
2944    1
4995    1
2948    1
4999    1
2840    1
2836    1
933     1
4787    1
2724    1
4775    1
2728    1
4779    1
2732    1
4783    1
2736    1
2740    1
2720    1
4791    1
2744    1
4795    1
2748    1
4799    1
2752    1
4803    1


## Bye bye columns!

In [255]:
# Redundant or empty columns identified in the checks above.
dropcols = ['Case Number.1', 'Case Number.2', 'href', 'Unnamed: 22', 'Unnamed: 23', 'original order']
last_100 = last_100.drop(labels=dropcols, axis='columns')

# Columns - Having a look to make a decision II

## Type

In [256]:
# Distribution of incident types, NaN included.
last_100['Type'].value_counts(dropna = False)

Unprovoked      3717
Provoked         506
Invalid          414
Boat             194
Sea Disaster     169
Boating           85
Name: Type, dtype: int64

## Location

In [257]:
# Trim surrounding whitespace so identical locations are counted together,
# then show the distribution (NaN included).
last_100 = last_100.assign(Location=last_100['Location'].str.strip())
last_100['Location'].value_counts(dropna=False)

NaN                                                                                                              343
New Smyrna Beach, Volusia County                                                                                 159
Daytona Beach, Volusia County                                                                                     31
Myrtle Beach, Horry County                                                                                        17
Ponce Inlet, Volusia County                                                                                       17
Melbourne Beach, Brevard County                                                                                   16
Isle of Palms, Charleston County                                                                                  15
Boa Viagem, Recife                                                                                                13
Cocoa Beach, Brevard County                                     

## Activity

In [258]:
# Trim surrounding whitespace so identical activities are counted together,
# then show the distribution (NaN included).
last_100 = last_100.assign(Activity=last_100['Activity'].str.strip())
last_100['Activity'].value_counts(dropna=False)

Surfing                                                                                                                                                                     904
Swimming                                                                                                                                                                    704
NaN                                                                                                                                                                         397
Fishing                                                                                                                                                                     353
Spearfishing                                                                                                                                                                324
Wading                                                                                                                  

## Injury

In [259]:
# Trim surrounding whitespace from the injury descriptions, then show the
# distribution (NaN included).
last_100 = last_100.assign(Injury=last_100['Injury'].str.strip())
last_100['Injury'].value_counts(dropna=False)

FATAL                                                                                                                                                                                                         442
Survived                                                                                                                                                                                                       86
No injury                                                                                                                                                                                                      73
Foot bitten                                                                                                                                                                                                    72
Leg bitten                                                                                                                                                      

## Fatal (Y/N)

In [260]:
# Normalise the fatality flag: every "F" becomes "Y", padding is trimmed, and
# anything that is not exactly 'N' or 'Y' afterwards (NaN included) is
# labelled 'UNKNOWN'.
last_100['Fatal (Y/N)'] = (
    last_100['Fatal (Y/N)']
    .str.replace("F", "Y")
    .str.strip()
    .pipe(lambda flag: flag.where(flag.isin(['N', 'Y']), 'UNKNOWN'))
)
last_100["Fatal (Y/N)"].value_counts(dropna=False)

N          3935
Y          1074
UNKNOWN      76
Name: Fatal (Y/N), dtype: int64

## Time

In [261]:
# Raw distribution of the Time values, NaN included; the column is only
# inspected here, not cleaned.
last_100['Time'].value_counts(dropna = False)

NaN                                                                      2425
Afternoon                                                                 150
11h00                                                                     121
12h00                                                                      99
Morning                                                                    97
15h00                                                                      96
16h00                                                                      92
14h00                                                                      92
14h30                                                                      70
16h30                                                                      70
17h30                                                                      69
13h00                                                                      68
17h00                                                           

## Species

In [262]:
# Trim surrounding whitespace from the species text, then show the
# distribution (NaN included). The column label itself ends with a space, so
# it is passed to assign through a dict.
last_100 = last_100.assign(**{'Species ': last_100['Species '].str.strip()})
last_100['Species '].value_counts(dropna=False)

NaN                                                                                                                                                                                        2196
White shark                                                                                                                                                                                 153
Shark involvement not confirmed                                                                                                                                                              77
Tiger shark                                                                                                                                                                                  64
Bull shark                                                                                                                                                                                   45
4' shark                                

## Investigator or Source

In [263]:
# Trim surrounding whitespace from the source names, then show the
# distribution (NaN included).
last_100 = last_100.assign(
    **{'Investigator or Source': last_100['Investigator or Source'].str.strip()}
)
last_100['Investigator or Source'].value_counts(dropna=False)

S. Petersohn, GSAF                                                                                                                                                 88
C. Creswell, GSAF                                                                                                                                                  79
R. Collier                                                                                                                                                         54
T. Peake, GSAF                                                                                                                                                     49
C. Moore, GSAF                                                                                                                                                     47
M. Levine, GSAF                                                                                                                                                    44
R. C

## pdf

### We'll assume that we actually have access to these files, so we'll keep this column in case any of the URL References does not work or does not exist.

In [264]:
# Trim surrounding whitespace from the pdf file names, then show the
# distribution (NaN included).
last_100 = last_100.assign(pdf=last_100['pdf'].str.strip())
last_100['pdf'].value_counts(dropna=False)

1916.07.12.a-b-Stillwell-Fisher.pdf      2
1923.00.00.a-NJ fisherman.pdf            2
1934.12.23.a-b-Inman.pdf                 2
1929.03.04.a-b.Roads-Aldridge.pdf        2
1916.12.08.a-b-German.pdf                2
1921.11.27.a-b-Jack.pdf                  2
1935.06.05.R-SolomonIslands.pdf          2
1931.09.21.a-b-Holaday-Barrows.pdf       2
1987.11.00-StLucia-scavengin.pdf         1
1956.07.20-Smedley.pdf                   1
2014.02.20-FrenchPolynesia.pdf           1
1994.07.24-Carlos_Martins.pdf            1
2008.07.00-UK-dinghy.pdf                 1
2007.08.25-Smith.pdf                     1
2015.04.13-Rackley.pdf                   1
1989.10.14-Mehl.pdf                      1
2000.03.14-Ruth.pdf                      1
1942.01.04-Steadman.pdf                  1
2005.01.14-raceboat.pdf                  1
2007.01.19-Houghton.pdf                  1
2005.09.02.b-Grause.pdf                  1
1988.05.04-Rhoades.pdf                   1
1967.08.27-Beatty.pdf                    1
2007.09.17-

# Columns - Definitely keeping that data

## Date


In [265]:
# Trim surrounding whitespace from the raw date strings, then list the
# formats present (NaN included).
last_100 = last_100.assign(Date=last_100['Date'].str.strip())
last_100['Date'].value_counts(dropna=False)

1957                                11
1942                                 9
1956                                 8
1958                                 7
1941                                 7
1950                                 7
1949                                 6
1940                                 5
28-Jul-95                            5
12-Apr-01                            5
05-Oct-03                            5
Oct-60                               5
1970s                                5
Aug-56                               5
1955                                 5
1959                                 5
1954                                 5
1960                                 4
1938                                 4
09-Jul-94                            4
27-Jul-52                            4
14-Jun-12                            4
1952                                 4
09-Jan-10                            4
1995                                 4
28-Dec-14                

### Let's reduce this column to the month (it is renamed to "Month" below), because we don't need the specific day.

In [266]:
# Reduce each date string to its three-letter month abbreviation.
# regex=True is mandatory for the digit pattern: pandas >= 2.0 treats `pat` as
# a literal string by default, which would leave every digit in place and push
# all rows into 'Undetermined'.
last_100['Date'] = (
    last_100['Date']
    .str.replace(r'[0-9]', "", regex=True)      # drop day and year digits
    .str.replace("-", "", regex=False)          # drop the separators
    .str.replace("Reported", "", regex=False)   # drop the word "Reported"
    .str.strip()
)
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Whatever is not a clean month abbreviation at this point (e.g. year-only
# dates such as '1957', or NaN) is bucketed together.
last_100.loc[~last_100['Date'].isin(months), 'Date'] = 'Undetermined'

In [267]:
# Check the result: each row should now hold a month abbreviation or
# 'Undetermined'.
last_100['Date'].value_counts(dropna = False)

Jul             539
Aug             509
Sep             453
Jan             431
Jun             414
Oct             373
Dec             363
Apr             362
Mar             347
Nov             346
May             322
Feb             319
Undetermined    307
Name: Date, dtype: int64

In [268]:
# The column now holds months only, so name it accordingly. Only the column
# mapping is passed: adding index=str here would also cast every row label to
# a string, so integer label look-ups would stop working.
last_100 = last_100.rename(columns={"Date": "Month"})

## Year

### This looks ok.

In [269]:
# Number of rows per year, NaN included.
last_100['Year'].value_counts(dropna = False)

2015    139
2011    128
2014    125
2013    122
2008    121
2009    120
2012    117
2007    112
2005    103
2006    103
2016    103
2010    101
2000     97
1960     93
1959     93
2004     92
2003     92
2001     92
2002     88
1962     86
1961     78
1995     76
1964     66
1998     65
1999     65
1963     61
1996     61
1966     58
1997     57
1993     56
1992     56
1994     56
1988     55
1958     54
1989     53
1956     51
1965     51
1983     50
1975     49
1981     49
1967     48
1968     46
1955     43
1950     43
1970     42
1954     42
1942     41
1984     41
1957     41
1982     40
1986     39
1976     39
1974     38
1991     38
1990     38
1929     37
1985     37
1953     36
1980     35
1987     35
1972     35
1935     32
1951     31
1944     31
1949     31
1936     30
1969     30
1947     30
1937     30
1948     29
1952     29
1931     29
1943     28
1971     28
1973     27
1932     27
1934     27
1941     27
1946     26
1977     26
1930     26
1928     26
1916     25
1979

## Country

In [270]:
# Trim surrounding whitespace so identical countries are counted together,
# then show the distribution (NaN included).
last_100 = last_100.assign(Country=last_100['Country'].str.strip())
last_100['Country'].value_counts(dropna=False)

USA                               1923
AUSTRALIA                         1063
SOUTH AFRICA                       519
PAPUA NEW GUINEA                   128
BRAZIL                             100
BAHAMAS                             90
NEW ZEALAND                         85
MEXICO                              67
ITALY                               54
REUNION                             53
FIJI                                49
PHILIPPINES                         48
NEW CALEDONIA                       46
MOZAMBIQUE                          39
JAPAN                               29
EGYPT                               27
PANAMA                              26
CUBA                                25
IRAN                                25
HONG KONG                           24
SOLOMON ISLANDS                     24
SPAIN                               24
NaN                                 21
FRENCH POLYNESIA                    19
JAMAICA                             17
CROATIA                  

## Area

In [271]:
# Some entries are directions (e.g. "2 miles west from ...") and some are
# coordinates; only surrounding whitespace is trimmed here before counting
# (NaN included).
last_100 = last_100.assign(Area=last_100['Area'].str.strip())
last_100['Area'].value_counts(dropna=False)

Florida                                                   947
New South Wales                                           381
NaN                                                       285
California                                                268
Queensland                                                253
Hawaii                                                    246
KwaZulu-Natal                                             189
Western Cape Province                                     173
Western Australia                                         154
Eastern Cape Province                                     147
South Carolina                                            126
South Australia                                            91
North Carolina                                             90
Pernambuco                                                 70
Victoria                                                   63
Texas                                                      62
Torres S

## What's left?

In [273]:
# Preview the first five rows of the cleaned frame.
last_100.head()

Unnamed: 0,Case Number,Month,Year,Type,Country,Area,Location,Activity,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,URL Reference
0,2016.09.18.c,Sep,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Minor injury to thigh,N,13h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
1,2016.09.18.b,Sep,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Lacerations to hands,N,11h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
2,2016.09.18.a,Sep,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Lacerations to lower leg,N,10h43,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
3,2016.09.17,Sep,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Struck by fin on chest & leg,N,,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
4,2016.09.15,Sep,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,No injury: Knocked off board by shark,N,,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...


In [274]:
# Write the cleaned frame to CSV; index=False keeps the row labels out of the file.
last_100.to_csv('Shark behaviour research.csv', index = False)

# Pending improvements

### - Implement bins.
### - Visualization.
### - Deeper cleaning.