### Import libraries

In [1]:
import numpy as np
import pandas as pd
import pandas_profiling
import datetime

### Load dataset

In [2]:
data = pd.read_csv('shark_attack_messy.csv', encoding= 'ISO-8859-1')

### Set display options:

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
pd.set_option('display.width', 2000)

## Step 1. Explore the dataframe:

### General information and overview:

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5992 entries, 0 to 5991
Data columns (total 24 columns):
Case Number               5992 non-null object
Date                      5992 non-null object
Year                      5992 non-null int64
Type                      5992 non-null object
Country                   5949 non-null object
Area                      5590 non-null object
Location                  5496 non-null object
Activity                  5465 non-null object
Name                      5792 non-null object
Sex                       5425 non-null object
Age                       3311 non-null object
Injury                    5965 non-null object
Fatal (Y/N)               5973 non-null object
Time                      2779 non-null object
Species                   3058 non-null object
Investigator or Source    5977 non-null object
pdf                       5992 non-null object
href formula              5991 non-null object
href                      5989 non-null object
C

In [6]:
data.shape

(5992, 24)

In [7]:
data.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,16.0,Minor injury to thigh,N,13h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993,,
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,36.0,Lacerations to hands,N,11h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b,5992,,
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,43.0,Lacerations to lower leg,N,10h43,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a,5991,,
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,,Struck by fin on chest & leg,N,,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.17,2016.09.17,5990,,
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,,No injury: Knocked off board by shark,N,,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15,5989,,


Percentage of null values:

In [8]:
data.isnull().sum()/5992*100

Case Number                0.000000
Date                       0.000000
Year                       0.000000
Type                       0.000000
Country                    0.717623
Area                       6.708945
Location                   8.277704
Activity                   8.795060
Name                       3.337784
Sex                        9.462617
Age                       44.742991
Injury                     0.450601
Fatal (Y/N)                0.317089
Time                      53.621495
Species                   48.965287
Investigator or Source     0.250334
pdf                        0.000000
href formula               0.016689
href                       0.050067
Case Number.1              0.000000
Case Number.2              0.000000
original order             0.000000
Unnamed: 22               99.983311
Unnamed: 23               99.966622
dtype: float64

In [9]:
#data.profile_report()

In [10]:
data.describe()

Unnamed: 0,Year,original order
count,5992.0,5992.0
mean,1925.204606,2997.548899
std,286.473712,1729.86021
min,0.0,2.0
25%,1942.0,1499.75
50%,1975.0,2997.5
75%,2003.0,4495.25
max,2016.0,5993.0


### Check for similar content inside similar named columns:

#### href and href formula:

In [11]:
href_diff = data[(data['href formula'] == data['href']) == False]
href_diff[['href formula','href']].head()

Unnamed: 0,href formula,href
20,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
27,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
61,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
107,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...
114,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...


In [12]:
href_diff[['href formula','href']].index

Int64Index([20, 27, 61, 107, 114, 134, 180, 193, 232, 262, 263, 264, 271, 272, 293, 305, 323, 347, 361, 362, 363, 364, 365, 367, 377, 378, 379, 380, 381, 382, 383, 384, 448, 449, 823, 1217, 1218, 2274, 2477, 3019, 3549, 3603, 3917, 3928, 4394, 4642, 4668, 4719, 5317, 5458, 5686, 5694, 5819, 5857], dtype='int64')

Print full field to check for differences

In [13]:
for i in list(href_diff.index):
    print(i)
    print(href_diff['href formula'][i])
    print(href_diff['href'][i])
    print('\n')

20
http://sharkattackfile.net/spreadsheets/pdf_directory/2016.07.29-Spain.pdf
http://sharkattackfile.net/spreadsheets/pdf_directory/http://sharkattackfile.net/spreadsheets/pdf_directory/2016.07.29-Spain.pdf


27
http://sharkattackfile.net/spreadsheets/pdf_directory/2016.07.23.a-Cutbirth.pdf
http://sharkattackfile.net/spreadsheets/pdf_directory/2016.07.23-Cutbirth.pdf


61
http://sharkattackfile.net/spreadsheets/pdf_directory/2016.05.21.a-Girl.pdf
http://sharkattackfile.net/spreadsheets/pdf_directory/http://sharkattackfile.net/spreadsheets/pdf_directory/2016.05.21.a-Girl.pdf


107
http://sharkattackfile.net/spreadsheets/pdf_directory/2015.12.21.a-Brazil.pdf
http://sharkattackfile.net/spreadsheets/pdf_directory/http://sharkattackfile.net/spreadsheets/pdf_directory/2015.12.21.a-Brazil.pdf


114
http://sharkattackfile.net/spreadsheets/pdf_directory/2015.11.15.a-Engelman.pdf
http://sharkattackfile.net/spreadsheets/pdf_directory/http://sharkattackfile.net/spreadsheets/pdf_directory/2015.11.1

Overall href seems way cleaner and useful, except for some of the rows, which I'll try to change later.

#### Case Number, Case Number.1 and Case Number.2:

In [14]:
case_diff = data[((data['Case Number.1'] == data['Case Number.2']) == False) | ((data['Case Number'] == data['Case Number.1']) == False) | ((data['Case Number'] == data['Case Number.2']) == False)]
case_diff[['Case Number','Case Number.1','Case Number.2']]

Unnamed: 0,Case Number,Case Number.1,Case Number.2
4,2016.09.15,2016.09.16,2016.09.15
33,2016.07.14.4,2016.07.14.R,2016.07.14.4
97,2016.01.24.b,2015.01.24.b,2016.01.24.b
116,2015.12.23,2015.11.07,2015.12.23
121,2015.10.28.a,2015.10.28,2015.10.28.a
169,2015.07-10,2015.07.10,2015.07.10
3296,1967.07.05,1967/07.05,1967.07.05
3569,"1962,08.30.b",1962.08.30.b,"1962,08.30.b"
3654,1961.09.02.R,"1961.09,06.R",1961.09.02.R
4177,1952.08.05,1952.08.04,1952.08.05


Case Number and Case Number 2 are identical, so Case Number 2 column can be dropped.

I look for differences in the rest of fields of the non-coincidental rows.

In [15]:
case_diff

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,,No injury: Knocked off board by shark,N,,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15,5989,,
33,2016.07.14.4,Reported 14-Jul-2016,2016,Unprovoked,BAHAMAS,,Tiger Beach,Scuba Diving,Michael Dornellas,M,,Face bruised when partly blind shark collided ...,N,,"Lemon shark, 9'","GrindTV, 7/14/2016",2016.07.14.R-TigerBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.07.14.R,2016.07.14.4,5960,,
97,2016.01.24.b,24-Jan-16,2016,Unprovoked,USA,Texas,Off Surfside,Spearfishing,Keith Love,M,,"Bruised ribs & tail bone, speargun broken and ...",N,09h30 / 10h00,Bull sharks x 2,K. Love,2016.01.24.b-Love.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2015.01.24.b,2016.01.24.b,5896,,
116,2015.12.23,07-Nov-15,2015,Invalid,USA,Florida,"Paradise Beach, Melbourne, Brevard County",Surfing,Ryla Underwood,F,9,Lower left leg injured,N,11h00,Shark involvement not confirmed,"Fox25Orlando, 11/7/2015",2015.11.07-Underwood.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2015.11.07,2015.12.23,5876,,
121,2015.10.28.a,28-Oct-15,2015,Unprovoked,USA,Hawaii,"Malaka, Oahu",Body boarding,Raymond Senensi,M,10,"Lacerations & puncture wounds to right thigh, ...",N,14h50,,"Star Advertiser, 10/28/2015",2015.10.28-Senensi.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2015.10.28,2015.10.28.a,5871,,
169,2015.07-10,10-Jul-15,2015,Unprovoked,USA,California,"Huntington Beach, Orange County",Surfing,Danny Miskin,M,38,"No injury, shark bumped & damaged board",N,08h45,"White shark, 7'","KTLA, 7/10/2015",2015.07.10-Miskin.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2015.07.10,2015.07.10,5823,,
3296,1967.07.05,05-Jul-67,1967,Unprovoked,TURKEY,Mugla Province,Kucukada Island,Spearfishing,Gungor Guven,M,36,FATAL,Y,13h40,,"C. Moore, GSAF",1967.07.05-Guven.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1967/07.05,1967.07.05,2697,,
3569,"1962,08.30.b",30-Aug-62,1962,Boat,TURKEY,Antalya Province,Ucagiz,,Occupant: Hasan Olta,M,,No injury,N,,,"C.Moore, GSAF",1962.08.30.b-pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1962.08.30.b,"1962,08.30.b",2424,,
3654,1961.09.02.R,Reported 06-Sep-1961,1961,Provoked,ITALY,Venice Province,Chioggia,Fishing,Pollione Perrini & Fioravante Perini,M,33 & 37,Left foot & right hand bitten by netted shark ...,N,,1 m shark,"C. Moore, GSAF",1961.09.06.R-Chioggia.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,"1961.09,06.R",1961.09.02.R,2339,,
4177,1952.08.05,05-Aug-52,1952,Provoked,ITALY,Teramo,Giulianova,Fishing,Vittorio Speca,,19,Multiple injuries PROVOKED INCIDENT,Y,02h00,2m shark,"C. Moore, GSAF",1952.08.05-Speca.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1952.08.04,1952.08.05,1816,,


Not many differences. Most of the differences seem to be typographical errors, which will be fixed later, in order to be able to drop the duplicated columns.

### Explore unique values of activities, countries, areas, injuries and types:

#### Type:

In [16]:
data.Type.unique()

array(['Unprovoked', 'Boat', 'Provoked', 'Invalid', 'Sea Disaster',
       'Boating'], dtype=object)

In [17]:
unique_type = data.pivot_table(index=['Type'],aggfunc='size')
unique_type

Type
Boat             200
Boating          110
Invalid          519
Provoked         557
Sea Disaster     220
Unprovoked      4386
dtype: int64

#### Country:

In [18]:
unique_country = data.pivot_table(index=['Country'],aggfunc='size')
unique_country = unique_country.sort_values(ascending=False)
unique_country[:10]

Country
USA                 2116
AUSTRALIA           1279
SOUTH AFRICA         565
PAPUA NEW GUINEA     133
NEW ZEALAND          125
BRAZIL               102
BAHAMAS               98
MEXICO                81
ITALY                 71
FIJI                  62
dtype: int64

#### Area:

In [19]:
data.Area.unique()

array(['Florida', 'Victoria', 'Western Australia', 'Hawaii',
       'North Province', 'South Carolina', 'California', nan,
       'New Providence Island', 'Alicante Province', 'Hong Kong',
       'New South Wales', 'Kochi Prefecture', 'Tasmania', 'Abaco Islands',
       'Queensland', 'Alabama', 'Canary Islands', 'Massachusetts',
       'North Carolina', 'Isla Provedencia', 'Western Cape Province',
       'Texas', 'Suez', 'North Island', 'Bali', 'Tuamotos', 'Grand Terre',
       'Boa Vista Island', 'Vanua Levu', 'South Australia',
       'Santa Catarina State', 'South Province', 'Altagracia Province',
       'Grand Cayman', 'KwaZulu-Natal', 'Fujairah Emirate',
       'Grand Canary Island', 'Pernambuco', 'Eastern Cape Province',
       'Inhambane Province', 'Phuket', 'Alicante', 'Bora Bora',
       'Grand Bahama Island', 'Le Port', 'Rangiroa',
       'Saint-Gilles-les-Bains', 'Sardinia', 'Sinaloa', 'Central Tuamotu',
       'd\x92Étang-Salé', 'Granada',
       'Cargados Carajos Shoals (S

Very heterogeneous data

In [20]:
unique_area = data.pivot_table(index=['Area'],aggfunc='size')
unique_area = unique_area.sort_values(ascending=False)
unique_area[:10]

Area
Florida                  990
New South Wales          468
Queensland               300
Hawaii                   282
California               276
KwaZulu-Natal            210
Western Cape Province    188
Western Australia        172
Eastern Cape Province    156
South Carolina           145
dtype: int64

#### Activity:

In [21]:
print(len(list(data.Activity.unique())))

1493


Again, very heterogeneous data for this field

In [22]:
unique_activity = data.pivot_table(index=['Activity'],aggfunc='size')
unique_activity  = unique_activity.sort_values(ascending=False)
unique_activity[:10]

Activity
Surfing         904
Swimming        819
Fishing         414
Spearfishing    321
Bathing         153
Wading          144
Diving          123
Standing         97
Snorkeling       77
Scuba diving     74
dtype: int64

#### Injury:

In [23]:
print(len(list(data.Injury.unique())))

3596


Even more heterogeneous

In [24]:
unique_injury = data.pivot_table(index=['Injury'],aggfunc='size')
unique_injury = unique_injury.sort_values(ascending=False)
unique_injury[:10]

Injury
FATAL                729
Survived              97
Foot bitten           82
No injury             77
Leg bitten            71
Left foot bitten      48
No details            43
Right foot bitten     39
Hand bitten           28
Thigh bitten          27
dtype: int64

### Impressions:

- Two empty columns, which seem useless (unnamed)
- 2 int variables - Rest of variables are mainly text
- names of variables are messy
- href_formula and href seem pretty similar
- Case_Number, Case_Number.1 and Case_Number.2 seem pretty similar
    - Check the cases where there are differences between the values of those columns.
- Fatal (Y/N) change to 0 and 1 ??
- Age is type object
- Dates and hours are also strings
- Very heterogeneous values for injuries and data, mainly for the former

## Step 2. Dataframe cleaning:

### Delete unnamed_22 and unnamed_23:

In [25]:
data.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location', 'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time', 'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href', 'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22', 'Unnamed: 23'], dtype='object')

In [26]:
data = data.drop(['Unnamed: 22', 'Unnamed: 23'], axis =1)

In [27]:
data.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,16.0,Minor injury to thigh,N,13h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,36.0,Lacerations to hands,N,11h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b,5992
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,43.0,Lacerations to lower leg,N,10h43,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a,5991
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,,Struck by fin on chest & leg,N,,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.17,2016.09.17,5990
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,,No injury: Knocked off board by shark,N,,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15,5989


### Transform Case Number and Delete Case Number.1 and Case Number.2

In [28]:
data['Case Number'] = data['Case Number'].str.replace('-','.')

In [29]:
data['Case Number'] = data['Case Number'].str.replace(',','.')

In [30]:
for i in list(case_diff.index):
    print(data['Case Number'][i])

2016.09.15
2016.07.14.4
2016.01.24.b
2015.12.23
2015.10.28.a
2015.07.10
1967.07.05
1962.08.30.b
1961.09.02.R
1952.08.05
1934.01.08.R
1900.00.00.R
1911.07.31.R


In [31]:
data = data.drop(['Case Number.1'], axis =1)

In [32]:
data = data.drop(['Case Number.2'], axis =1)

In [33]:
data.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,original order
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,16.0,Minor injury to thigh,N,13h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5993
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,36.0,Lacerations to hands,N,11h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5992
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,43.0,Lacerations to lower leg,N,10h43,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5991
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,,Struck by fin on chest & leg,N,,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5990
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,,No injury: Knocked off board by shark,N,,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5989


### Transform href formula and drop href:

#### a) Transform href formula

List of indexes with incorrections and list of correct links

In [34]:
a = [193, 264, 271, 272, 293, 448, 823, 2477, 3019, 3917, 4394, 5819]

In [35]:
b = [
    'http://sharkattackfile.net/spreadsheets/pdf_directory/Court',
    'http://sharkattackfile.net/spreadsheets/pdf_directory/2014.10.14-Bandy.pdf',
    'http://sharkattackfile.net/spreadsheets/pdf_directory/2014.10.02.b-Vandenberg.pdf',
    'http://sharkattackfile.net/spreadsheets/pdf_directory/2014.10.02.b-Vandenberg.pdf',
    'http://sharkattackfile.net/spreadsheets/pdf_directory/2014.08.12-Hayes.pdf',
    'http://sharkattackfile.net/spreadsheets/pdf_directory/2013.05.27.b-Ena.pdf',
    'http://sharkattackfile.net/spreadsheets/pdf_directory/2010.02.01-Ward.pdf',
    'http://sharkattackfile.net/spreadsheets/pdf_directory/1989.00.00-Virginia-spearfisherman.pdf',
    'http://sharkattackfile.net/spreadsheets/pdf_directory/1975.01.19-Barrowman.pdf',
    'http://sharkattackfile.net/spreadsheets/pdf_directory/1958.08.31.a-NV-Ensenada.pdf',
    'http://sharkattackfile.net/spreadsheets/pdf_directory/1945.07.30-USSIndianapolis.pdf',
    'http://sharkattackfile.net/spreadsheets/pdf_directory/1819.07.08-Cadiz.pdf'
]

In [36]:
data['href formula'][193] = 'http://sharkattackfile.net/spreadsheets/pdf_directory/Court'
data['href formula'][264] = 'http://sharkattackfile.net/spreadsheets/pdf_directory/2014.10.14-Bandy.pdf'
data['href formula'][271] = 'http://sharkattackfile.net/spreadsheets/pdf_directory/2014.10.02.b-Vandenberg.pdf'
data['href formula'][272] = 'http://sharkattackfile.net/spreadsheets/pdf_directory/2014.10.02.b-Vandenberg.pdf',
data['href formula'][293] = 'http://sharkattackfile.net/spreadsheets/pdf_directory/2014.08.12-Hayes.pdf',
data['href formula'][448] = 'http://sharkattackfile.net/spreadsheets/pdf_directory/2013.05.27.b-Ena.pdf'
data['href formula'][823] = 'http://sharkattackfile.net/spreadsheets/pdf_directory/2010.02.01-Ward.pdf'
data['href formula'][2477] = 'http://sharkattackfile.net/spreadsheets/pdf_directory/1989.00.00-Virginia-spearfisherman.pdf'
data['href formula'][3019] = 'http://sharkattackfile.net/spreadsheets/pdf_directory/1975.01.19-Barrowman.pdf'
data['href formula'][3917] = 'http://sharkattackfile.net/spreadsheets/pdf_directory/1958.08.31.a-NV-Ensenada.pdf'
data['href formula'][4394] = 'http://sharkattackfile.net/spreadsheets/pdf_directory/1945.07.30-USSIndianapolis.pdf'
data['href formula'][5819] = 'http://sharkattackfile.net/spreadsheets/pdf_directory/1819.07.08-Cadiz.pdf'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

Se

<span style="color: #ff6600">**I've done it step by step because I couldn't figure out how to put it in a  loop...**</span>

In [37]:
#href_dict = dict(zip(a,b))

In [38]:
#href_copy = data.copy(deep=True)

In [39]:
#href_copy['href formula'][3917]

In [40]:
#for k,v in href_dict:
        #href_copy['href formula'][int(k)] = v        

In [41]:
#lambda a,b:href_copy['href formula'][a] = b

#### a) Drop href

In [42]:
data = data.drop(['href'], axis =1)

## Step 3. Dataframe manipulation:

### Rename columns:

In [43]:
data.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location', 'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time', 'Species ', 'Investigator or Source', 'pdf', 'href formula', 'original order'], dtype='object')

In [44]:
data.columns = ['case_number', 'date', 'year', 'type', 'country', 'area', 'location',
                'activity', 'name', 'sex', 'age', 'injury', 'fatal', 'time', 
                'species', 'investigator_or_source', 'pdf', 'href', 'original_order']

In [45]:
data.head()

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,age,injury,fatal,time,species,investigator_or_source,pdf,href,original_order
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,16.0,Minor injury to thigh,N,13h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5993
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,36.0,Lacerations to hands,N,11h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5992
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,43.0,Lacerations to lower leg,N,10h43,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5991
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,,Struck by fin on chest & leg,N,,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5990
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,,No injury: Knocked off board by shark,N,,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5989


### Change age type to numeric:

In [47]:
pd.to_numeric(data['age'],downcast='signed')

ValueError: Unable to parse string "60s" at position 6

All non-numeric values will be transformed to NaN, using arg errors='coerce':

In [None]:
pd.to_numeric(data['age'],downcast='signed',errors='coerce')
data.head()

### Change year type to numeric:

All non-numeric values will be transformed to NaN, using arg errors='coerce':

In [None]:
pd.to_numeric(data['year'],downcast='signed',errors='coerce')
data.head()

### Change original_order type to numeric:

All non-numeric values will be transformed to NaN, using arg errors='coerce':

In [48]:
pd.to_numeric(data['original_order'],downcast='signed',errors='coerce')
data.head()

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,age,injury,fatal,time,species,investigator_or_source,pdf,href,original_order
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,16.0,Minor injury to thigh,N,13h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5993
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,36.0,Lacerations to hands,N,11h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5992
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,43.0,Lacerations to lower leg,N,10h43,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5991
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,,Struck by fin on chest & leg,N,,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5990
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,,No injury: Knocked off board by shark,N,,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5989


### New column for those entries which have date and time:

In [49]:
data['date_time'] = data['date'] + ' ' + data['time']

In [50]:
data['date_time'][0:5]

0    18-Sep-16 13h00
1    18-Sep-16 11h00
2    18-Sep-16 10h43
3                NaN
4                NaN
Name: date_time, dtype: object

In [51]:
data['date_time'] = pd.to_datetime(data['date_time'],errors='coerce',dayfirst=True)

In [52]:
data.head(2)

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,age,injury,fatal,time,species,investigator_or_source,pdf,href,original_order,date_time
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,16,Minor injury to thigh,N,13h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5993,2016-09-18 13:00:00
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,36,Lacerations to hands,N,11h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5992,2016-09-18 11:00:00


In [53]:
data['date_time'][0:5]

0   2016-09-18 13:00:00
1   2016-09-18 11:00:00
2   2016-09-18 10:43:00
3                   NaT
4                   NaT
Name: date_time, dtype: datetime64[ns]

Drop time column and reorder columns:

In [54]:
data = data.drop(['time'], axis =1)

In [55]:
column_order = ['case_number', 'date', 'date_time', 'year', 'type', 'country', 'area',
                'location', 'activity', 'name', 'sex', 'age', 'injury', 
                'fatal', 'species', 'investigator_or_source', 'pdf', 
                'href', 'original_order']

In [56]:
data = data[column_order]

In [57]:
data.head()

Unnamed: 0,case_number,date,date_time,year,type,country,area,location,activity,name,sex,age,injury,fatal,species,investigator_or_source,pdf,href,original_order
0,2016.09.18.c,18-Sep-16,2016-09-18 13:00:00,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,16.0,Minor injury to thigh,N,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5993
1,2016.09.18.b,18-Sep-16,2016-09-18 11:00:00,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,36.0,Lacerations to hands,N,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5992
2,2016.09.18.a,18-Sep-16,2016-09-18 10:43:00,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,43.0,Lacerations to lower leg,N,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5991
3,2016.09.17,17-Sep-16,NaT,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,,Struck by fin on chest & leg,N,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5990
4,2016.09.15,16-Sep-16,NaT,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,,No injury: Knocked off board by shark,N,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5989


### Change date to date type:

In [58]:
data['date'] = pd.to_datetime(data['date'],errors='coerce',dayfirst=True)

In [59]:
data.head()

Unnamed: 0,case_number,date,date_time,year,type,country,area,location,activity,name,sex,age,injury,fatal,species,investigator_or_source,pdf,href,original_order
0,2016.09.18.c,2016-09-18,2016-09-18 13:00:00,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,16.0,Minor injury to thigh,N,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5993
1,2016.09.18.b,2016-09-18,2016-09-18 11:00:00,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,36.0,Lacerations to hands,N,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5992
2,2016.09.18.a,2016-09-18,2016-09-18 10:43:00,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,43.0,Lacerations to lower leg,N,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5991
3,2016.09.17,2016-09-17,NaT,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,,Struck by fin on chest & leg,N,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5990
4,2016.09.15,2016-09-16,NaT,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,,No injury: Knocked off board by shark,N,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5989


### Change time to time type:

In [60]:
#data['time'].isnull().sum()

In [61]:
#dt = datetime.datetime.strptime(try1,"%H:%M").time()
#print(type(dt))

In [62]:
#data['time'] = data['time'].str.replace('h',':')

In [63]:
#data['time'].head(10)

In [64]:
#for items in data['time'].iteritems():
 #   try:
  #      datetime.datetime.strptime(items,"%H:%M").time()
   # except:
    #    data['time'][items] = np.nan

In [65]:
#data.head()

## Step 4. Explore the new dataframe:

### General information and overview:

In [66]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5992 entries, 0 to 5991
Data columns (total 19 columns):
case_number               5992 non-null object
date                      5135 non-null datetime64[ns]
date_time                 2175 non-null datetime64[ns]
year                      5992 non-null int64
type                      5992 non-null object
country                   5949 non-null object
area                      5590 non-null object
location                  5496 non-null object
activity                  5465 non-null object
name                      5792 non-null object
sex                       5425 non-null object
age                       3311 non-null object
injury                    5965 non-null object
fatal                     5973 non-null object
species                   3058 non-null object
investigator_or_source    5977 non-null object
pdf                       5992 non-null object
href                      5992 non-null object
original_order            5992 n

In [67]:
data.shape

(5992, 19)

In [68]:
data.head()

Unnamed: 0,case_number,date,date_time,year,type,country,area,location,activity,name,sex,age,injury,fatal,species,investigator_or_source,pdf,href,original_order
0,2016.09.18.c,2016-09-18,2016-09-18 13:00:00,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,16.0,Minor injury to thigh,N,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5993
1,2016.09.18.b,2016-09-18,2016-09-18 11:00:00,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,36.0,Lacerations to hands,N,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5992
2,2016.09.18.a,2016-09-18,2016-09-18 10:43:00,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,43.0,Lacerations to lower leg,N,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5991
3,2016.09.17,2016-09-17,NaT,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,,Struck by fin on chest & leg,N,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5990
4,2016.09.15,2016-09-16,NaT,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,,No injury: Knocked off board by shark,N,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5989


Percentage of null values:

In [69]:
data.isnull().sum()/5992*100

case_number                0.000000
date                      14.302403
date_time                 63.701602
year                       0.000000
type                       0.000000
country                    0.717623
area                       6.708945
location                   8.277704
activity                   8.795060
name                       3.337784
sex                        9.462617
age                       44.742991
injury                     0.450601
fatal                      0.317089
species                   48.965287
investigator_or_source     0.250334
pdf                        0.000000
href                       0.000000
original_order             0.000000
dtype: float64

In [70]:
#data.profile_report()

In [71]:
data.describe()

Unnamed: 0,year,original_order
count,5992.0,5992.0
mean,1925.204606,2997.548899
std,286.473712,1729.86021
min,0.0,2.0
25%,1942.0,1499.75
50%,1975.0,2997.5
75%,2003.0,4495.25
max,2016.0,5993.0


##   

## Conclusions:

Started with a __(shape)__ dataframe and ended up with a __(shape)__ dataframe. Still significantly big to extract conclusions but way cleaner and easier to analyze

## Improvements:

- **'investigator_or_source'** could use some cleaning. I think maybe trying to separate source and date, maybe using regex.
- Bearing in mind the content of the **'species'** column, I don't think it is really useful. Would like to find a way to sort it out.
- Try to do some text mining to the **'injury'** column to see if it can be categorized.
- Investigate seasonality of attacks with timeseries.