In [167]:
# import libraries
import pandas as pd
import numpy as np
from string import digits
from datetime import datetime


In [168]:
# Load the raw data set into a pandas DataFrame.
# For a quick sample while iterating, read only the first 1000 rows instead:
# data = pd.read_csv('attacks.csv', nrows=1000)
data = pd.read_csv('attacks.csv')


In [169]:
# Review the column names. Some labels (e.g. 'Sex ' and 'Species ') carry
# trailing whitespace, which clean_headers strips later in the pipeline.
data.columns


Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [123]:
# Review the data structure (column dtypes and non-null counts)
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25723 entries, 0 to 25722
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Case Number             8702 non-null   object 
 1   Date                    6302 non-null   object 
 2   Year                    6300 non-null   float64
 3   Type                    6298 non-null   object 
 4   Country                 6252 non-null   object 
 5   Area                    5847 non-null   object 
 6   Location                5762 non-null   object 
 7   Activity                5758 non-null   object 
 8   Name                    6092 non-null   object 
 9   Sex                     5737 non-null   object 
 10  Age                     3471 non-null   object 
 11  Injury                  6274 non-null   object 
 12  Fatal (Y/N)             5763 non-null   object 
 13  Time                    2948 non-null   object 
 14  Species                 3464 non-null 

In [None]:
# Inspect the first 20 rows of the raw data
data.head(20)

In [170]:
def clean_headers(df):
    """Return a copy of ``df`` whose column labels have surrounding whitespace stripped.

    The input frame is not modified.
    """
    cleaned = df.copy()
    stripped_labels = cleaned.columns.str.strip()
    cleaned.columns = stripped_labels
    return cleaned


In [171]:
# Keep only the relevant columns from the raw data
def select_useful_columns_1(df):
    """Return a copy of ``df`` restricted to the columns used by the analysis.

    The columns are returned in the order listed below. A ``KeyError`` is
    raised by pandas if any of them is missing from ``df``.
    """
    wanted = [
        'Case Number', 'Date', 'Type', 'Country', 'Area', 'Activity',
        'Sex', 'Age', 'Injury', 'Fatal (Y/N)', 'Species',
    ]
    snapshot = df.copy()
    return snapshot[wanted]


In [172]:
# Rename the column titles
def rename_columns(df):
    """Return a copy of ``df`` with its columns relabelled positionally.

    The new labels are assigned by position, so ``df`` must have exactly
    eleven columns in the expected order; the only label that differs from
    the selection step is ``'Fatal (Y/N)'``, which becomes ``'Fatal'``.
    """
    new_labels = [
        'Case Number', 'Date', 'Type', 'Country', 'Area', 'Activity',
        'Sex', 'Age', 'Injury', 'Fatal', 'Species',
    ]
    renamed = df.copy()
    renamed.columns = new_labels
    return renamed


In [173]:
# Remove rows that have no usable case number
def remove_empty_rows(df):
    """Return a copy of ``df`` keeping only rows with a usable ``'Case Number'``.

    A case number is considered usable when it is a string longer than two
    characters. Missing values and non-string values are treated as empty and
    their rows are dropped. The original index labels of the kept rows are
    preserved, and the input frame is not modified.

    Unlike a naive ``.str.len()`` filter, this works when the column is not of
    object dtype (for example when every value is NaN), and it does not leave
    a helper ``length`` column behind in the result.
    """
    # Measure only real strings; anything else counts as length 0.
    lengths = df['Case Number'].map(
        lambda value: len(value) if isinstance(value, str) else 0
    )
    has_case_number = lengths > 2
    return df.loc[has_case_number].copy()


In [174]:
# Extract a date from a case-number-like value
def extract_date(date_obj):
    """Build a first-of-month date from the leading characters of ``date_obj``.

    The value is converted to a string; characters 0-3 are read as the year
    and characters 5-6 as the month (the character at position 4 is skipped
    as a separator). The day is never read from the input and is always the
    1st.

    Returns a ``datetime.date``. When the year/month cannot be parsed as a
    valid date, the sentinel date 1900-01-01 is returned instead.
    """
    text = str(date_obj)
    year, month = text[:4], text[5:7]
    candidate = '-'.join([year, month, '01'])
    try:
        return datetime.strptime(candidate, '%Y-%m-%d').date()
    except ValueError:
        # Only parsing failures fall back to the sentinel; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return datetime(1900, 1, 1).date()


In [175]:
# Derive the Date column from the case number
def get_date(df):
    """Return a copy of ``df`` whose ``'Date'`` column is rebuilt from ``'Case Number'``.

    Each case number is passed through ``extract_date``; any existing
    ``'Date'`` values are overwritten. The input frame is not modified.
    """
    dated = df.copy()
    parsed_dates = dated['Case Number'].apply(extract_date)
    dated['Date'] = parsed_dates
    return dated


In [214]:
# Null treatment
def nulls_treatment(df):
    """Return a copy of ``df`` with empty rows/columns removed and nulls labelled.

    Steps:
      1. Drop columns that are entirely null, except the main categorical
         columns listed below, which are always kept.
      2. Drop rows that are entirely null.
      3. Replace nulls in the main columns with ``'Unidentified'``.

    Keeping the main columns in step 1 matters: if one of them were entirely
    null it would otherwise be dropped and step 3 would fail with a
    ``KeyError``. A ``KeyError`` is still raised when a main column is absent
    from ``df`` altogether. The input frame is not modified.
    """
    main_cols = ['Type', 'Country', 'Area', 'Activity', 'Sex', 'Injury', 'Fatal', 'Species']

    all_null = df.isna().all(axis=0)
    droppable = [
        col for col, is_empty in zip(df.columns, all_null)
        if is_empty and col not in main_cols
    ]

    # An all-null main column never makes a row non-null, so the row filter
    # is unaffected by keeping those columns.
    df_ = df.drop(columns=droppable).dropna(axis=0, how='all').copy()

    for col in main_cols:
        df_[col] = df_[col].fillna('Unidentified')
    return df_


In [176]:
# Map a shark-attack type label to its numeric category id
def categorized_type(type_event):
    """Return the integer id for ``type_event``, or ``None`` if it is not recognised.

    Ids follow the position of the label in the tuple below (0 for
    ``'Unidentified'`` through 8 for ``'Unprovoked'``). Matching is exact and
    case-sensitive.
    """
    labels_in_id_order = (
        'Unidentified',
        'Boat',
        'Boating',
        'Boatomg',
        'Invalid',
        'Provoked',
        'Questionable',
        'Sea Disaster',
        'Unprovoked',
    )
    for type_id, label in enumerate(labels_in_id_order):
        if type_event == label:
            return type_id
    return None


In [177]:
def categorize_type(df):
    """Return a copy of ``df`` with a ``'Type_Id'`` column derived from ``'Type'``.

    Each ``'Type'`` value is mapped through ``categorized_type``; values it
    does not recognise yield ``None`` and therefore show up as missing in
    ``'Type_Id'``. The input frame is not modified.
    """
    df_ = df.copy()
    df_['Type_Id'] = df_['Type'].apply(categorized_type)
    return df_


In [178]:
# Numeric column treatment
def to_numeric(df, col):
    """Return a copy of ``df`` with column ``col`` converted to numbers.

    Values that cannot be parsed as numbers, as well as missing values,
    become 0. The input frame is not modified.
    """
    converted = df.copy()
    as_numbers = pd.to_numeric(converted[col], errors='coerce')
    converted[col] = as_numbers.fillna(0)
    return converted


In [179]:
# Keep only the digits found in the first two characters of a value
def get_nums(nums):
    """Return the ASCII digits contained in the first two characters of ``str(nums)``.

    The result is a string of zero, one or two digits; characters beyond the
    second are ignored.
    """
    leading = str(nums)[:2]
    return ''.join(filter(lambda ch: ch in digits, leading))


In [180]:
# Reduce a column to the digits extracted by get_nums
def get_numeric(df, col):
    """Return a copy of ``df`` where every value in ``col`` is replaced by ``get_nums(value)``.

    The resulting column therefore holds strings. The input frame is not
    modified.
    """
    result = df.copy()
    digits_only = result[col].apply(get_nums)
    result[col] = digits_only
    return result


In [206]:
# Replace specific values in one column
def replace_specific_value(df, col, val_or, val_to):
    """Return a copy of ``df`` with ``val_or`` replaced by ``val_to`` in column ``col``.

    ``val_or`` and ``val_to`` are forwarded unchanged to ``Series.replace``,
    so they may be scalars or equal-length lists of originals and
    replacements. The input frame is not modified.
    """
    result = df.copy()
    swapped = result[col].replace(val_or, val_to)
    result[col] = swapped
    return result


In [207]:
# Select the final columns
def select_useful_columns_2(df):
    """Return a copy of ``df`` restricted to the ten columns kept in the clean data set.

    Helper columns created by earlier steps are discarded here. A
    ``KeyError`` is raised by pandas if any listed column is missing.
    """
    final_cols = [
        'Type', 'Country', 'Activity', 'Area', 'Sex',
        'Age', 'Injury', 'Fatal', 'Species', 'Date',
    ]
    snapshot = df.copy()
    return snapshot[final_cols]


In [208]:
# Arrange the columns in their final order
def re_order_columns(df):
    """Return a copy of ``df`` with its columns in the final presentation order.

    ``'Date'`` comes first, followed by the descriptive columns. A
    ``KeyError`` is raised by pandas if any listed column is missing.
    """
    ordered_labels = [
        'Date', 'Type', 'Country', 'Area', 'Activity',
        'Sex', 'Age', 'Injury', 'Fatal', 'Species',
    ]
    snapshot = df.copy()
    return snapshot[ordered_labels]


In [209]:
# Main pipeline: every step takes a DataFrame and returns a new one, so the
# stages are chained with .pipe and only the final result (df13) is kept.
# NOTE(review): 'Age' goes through to_numeric before get_numeric, so any value
# that is not a plain number is already 0 by the time digits are extracted, and
# the final 'Age' column holds strings — confirm this order is intended.
df13 = (
    data
    .pipe(clean_headers)
    .pipe(select_useful_columns_1)
    .pipe(rename_columns)
    .pipe(remove_empty_rows)
    .pipe(get_date)
    .pipe(nulls_treatment)
    .pipe(categorize_type)
    .pipe(to_numeric, 'Age')
    .pipe(get_numeric, 'Age')
    .pipe(
        replace_specific_value,
        'Fatal',
        ['M', 'UNKNOWN', 'y', '2017'],
        ['N', 'Unidentified', 'Y', 'Unidentified'],
    )
    .pipe(
        replace_specific_value,
        'Sex',
        ['.', 'N', 'lli'],
        ['Unidentified', 'M', 'Unidentified'],
    )
    .pipe(select_useful_columns_2)
    .pipe(re_order_columns)
)


In [210]:
# Save the cleaned data set to a CSV file.
# NOTE(review): index=False is not passed, so pandas' default also writes the
# DataFrame index as the first column — confirm that is wanted.
df13.to_csv('shark_attack_clean_dataset.csv')

In [213]:
# Sanity check: count the remaining null values in each column of the clean data
df13.isnull().sum()

Date        0
Type        0
Country     0
Area        0
Activity    0
Sex         0
Age         0
Injury      0
Fatal       0
Species     0
dtype: int64