# What is the correlation between the time of the year and the human activity? 
    - By Year, indexing the month of the year
    - By Activities

In [1]:
#import pymysql
#from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import re

In [2]:
year=2016
#year=int(input('Enter the year: '))

In [3]:
def acquisition():
    df=pd.read_csv('/Users/jidekickpush/Documents/GitHub/0323_2020DATAPAR/Labs/module_1/Pipelines-Project/Data/GSAF5.csv', encoding ='cp1252')
    return df

In [4]:
def data_cleaning(df):
    #From the table overview, we can see the following statements:
    #* columns 'unnamed: 22' and 'unnamed: 23' are not referenced in the description of the dataset and doesn't contain any (relevant) information.
    #* columns 'Case Number.1' and 'Case Number.2'are duplicates of 'Case Number'
    #* columns 'date' cannot be nomalised cause of the differents syntaxes but the information can be extract from column 'Case Number'
    #=> Proceed to drop those columns
    null_cols = df.isnull().sum()
    null_cols
    null_cols[null_cols > 0]
    df=df.drop(['Unnamed: 22','Unnamed: 23','Case Number.1','Case Number.2','Date'], axis=1)
    
    # Some names of the columns aren't clean or clear enough. Below the list of columns renamed
    #* Sex: remove a blank space at the end.
    df.rename(columns={'Sex ':'Sex', 'Country':'Place'}, inplace = True)
    
    
    #Among the total 5900 events registered, only 137 happened before 1700.
    #To evaluate only statistically relevant data, events registered before 1700 will not be considered
    df=df[df['Year']>1700]
    
    #Let's fix 'Sex' column: Typo found on 2 entrances.
    #For 'Place': We've reduced the list of countries from the original set of 196 categories, to 174.
    #=>For that purpose we have used both regular expressions and manual replacement.
    df.replace({'Sex':{'M ':'M'}}, inplace=True)
    
    #remove end ?
    #remove start/end blank spaces
    #remove 2nd country after /
    #df.columnname.str.replace('word','newword')
    df.replace(regex={
    r'\?':'', 
    r'\s\/\s[A-Z\s]+': '', 
    r'\s$':'', r'^\s':''
    }, inplace=True)
    
    #On 'Place' column, manually fixed some duplicates
    df.replace({'Place': { 'UNITED ARAB EMIRATES (UAE)':'UNITED ARAB EMIRATES', 
    'Fiji':'FIJI', 'ST. MAARTIN':'ST. MARTIN', 
    'Seychelles':'SEYCHELLES', 
    'Sierra Leone':'SIERRA LEONE', 
    'St Helena': 'ST HELENA', 
    'ENGLAND': 'UNITED KINGDOM', 
    'SCOTLAND': 'UNITED KINGDOM'}
    }, inplace=True)
    
    #Normalizing column Activity
    #Reduce from the original 1418 unique values on Activity to 5: 'Surfing', 'Swimming', 'Fishing', 'Diving' & 'Others'.
    df.rename(columns={'Activity':'unActivity'}, inplace=True)
    df_activity = df['unActivity']
    activity = []
    for a in df_activity:
        if re.search(r'Surf[\w\s\,]+|surf[\w\s\,]+|[\w\s\,]+surf[\w\s\,]+', str(a)):
            a = 'Surfing'
        elif re.search(r'Fish[\w\s\,]+|fish[\w\s\,]+|[\w\s\,]+fish[\w\s\,]+', str(a)):
            a = 'Fishing'
        elif re.search(r'Spear[\w\s\,]+|spear[\w\s\,]+|[\w\s\,]+spear[\w\s\,]+', str(a)):
            a = 'Fishing'
        elif re.search(r'Swim[\w\s\,]+|swim[\w\s\,]+|[\w\s\,]+swim[\w\s\,]+', str(a)):
            a = 'Swimming'
        elif re.search(r'Div[\w\s\,]+|div[\w\s\,]+|[\w\s\,]+div[\w\s\,]+', str(a)):
            a = 'Diving'
        else: a = 'Others'
        activity.append(a)
    df['Activity'] = activity
    df = df.drop(['unActivity'], axis=1)
    
    
    #Create a new column for dates, getting the information from the column 'Case Number'
    df['Date']=df['Case Number']
    df['Date'].replace(regex = {r'.[A-Za-z]$':''}, inplace = True)
    
    #Create a new column for the month, extracting it from the 'Case Number' column
    #* check if percentage of unrelevant dates : month missing in the data
    #=> drop the rows without specified month
    df['Month']=[m[5:7] for m in df['Case Number']]
    
    #Percentage of month not specified in the df is less than 10%, we decided to do not keep them:
    # Get 'Months' of indexes for which column month has value 00
    indexNames = df[ df['Month'] == '00' ].index
    # Delete these row indexes from dataFrame
    df.drop(indexNames , inplace=True)
    
    #Normalizing the hour, keeping only the values that correspond to a 24h value
    ###df['Time'] = df['Time'].replace(regex = {r'\s[\w\-\d\/\()]+|\-[\w\-\d\/]+|j$|^\>|^\<':'', r'h':':'})
    ###hour = []
    #time = df['Time']
    #for h in time:
     #   if re.search(r'\d{2}\:\d{2}', str(h)) == None:
     #       h = 'Unknown'
    #    hour.append(h)
    #df['Hour'] = hour
    
    #Change column types
    #Change the column Fatal (Y/N) to a boolean, normalizing all the entries to True or False.
    #The few unknown values have been trated as non fatal.
    df.rename(columns={ 'Fatal (Y/N)' : 'Fatal'}, inplace=True)
    df = df.replace({'Fatal': { 'N' : '0', 'Y' : '1', 'n' : '0', 'y' : '1', 'UNKNOWN' : '0', 'F' : '0', '#VALUE!' : '0'}})
    df['Fatal'].astype(bool)
    
    
    return df

In [5]:
def filter_by_year(df):
    global year
    filtered=df[df.Year==year]
    return filtered

In [None]:
def display_seasonality_attacks(df)
    #Binning the data by season on a new column
    season_labels=['Winter','Spring','Summer','Fall']
    cutoffs= [1,4,7,10,12]
    bins = pd.cut(df['Month'], cutoffs, labels=season_labels)
    seasonality = seasonality.round({'Ratio' : 2})
    df['Season']=bins
    
    #Ratio of attacks per person
    seasonality = df.pivot_table(index=['Season'], values=['Date'], aggfunc= len,fill_value=0)
    seasonality = seasonality.rename(columns= {'Date':'Count'})
    seasonality['Ratio'] = seasonality['Count'] * 100 / seasonality['Count'].sum()
    seasonality = seasonality.round({'Ratio':2})
    #display(seasonality)
    return seasonality

In [None]:
def attack_seasonality_per_activity(df):
    df.groupby('Activity').Season.value_counts(normalize=True)
    return

In [6]:
def save_cleaned_table(df):
    #Filtered Table¶
    #10 columns with categorized data selected to further analysis.
    #Excluded from this final table the columns with unique values.
    filtered_table = df[['Date', 'Year', 'Month', 'Hour', 'Place', 'Area','Location', 'Activity', 'Sex', 'Fatal']]
    filtered_table.to_csv("/Users/jidekickpush/Documents/GitHub/0323_2020DATAPAR/Labs/module_1/Pipelines-Project/output/cleaned_df_GSAF5.csv")

In [7]:
if __name__=='__main__':
    data_raw=acquisition()
    data_cleaned=data_cleaning(data_raw)
    data_filtered=filter_by_year(data_cleaned)
    data_seasonality = display_seasonality_attacks(data_filtered)
    display(data_seasonality)

AttributeError: 'DataFrame' object has no attribute 'pd'