## Import packages

In [1]:
import pandas as pd
import os
import nltk
import string
import math
from nltk import word_tokenize
from nltk.tag.stanford import StanfordPOSTagger, StanfordNERTagger
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from itertools import groupby
from dateutil.parser import parse

## Global functions

In [2]:
def get_output_file(filename):
    """Return the path of ``filename`` inside the ``output_data`` folder.

    The folder is resolved against the current working directory.
    """
    output_dir = os.path.join(os.getcwd(), 'output_data')
    return os.path.join(output_dir, filename)

def import_data():
    """Load the extraction-stage CSV from the output folder.

    The first CSV column is used as the DataFrame index.
    """
    source_path = get_output_file('01_data_extraction_final.csv')
    return pd.read_csv(source_path, index_col=0)

def import_ner_data():
    """Load the CSV written by the NER preprocessing step.

    The first CSV column is used as the DataFrame index.
    """
    source_path = get_output_file('02_data_preprocessing_ner.csv')
    return pd.read_csv(source_path, index_col=0)

def import_dictionary_data():
    """Load the CSV written by the dictionary preprocessing step.

    The first CSV column is used as the DataFrame index.
    """
    source_path = get_output_file('02_data_preprocessing_dictionary.csv')
    return pd.read_csv(source_path, index_col=0)

def export_ner_data(df):
    """Write ``df`` to the NER-stage CSV in the output folder."""
    target_path = get_output_file('02_data_preprocessing_ner.csv')
    df.to_csv(target_path)
    
def export_dictionary_data(df):
    """Write ``df`` to the dictionary-stage CSV in the output folder."""
    target_path = get_output_file('02_data_preprocessing_dictionary.csv')
    df.to_csv(target_path)

def export_combined_data(df):
    """Write ``df`` to the final combined CSV in the output folder."""
    target_path = get_output_file('02_data_preprocessing_final.csv')
    df.to_csv(target_path)


## Data preprocessing using Named Entity Recognition (NER)
<span style="color:red"><b>Important</b>: The NER process takes a long time (a few hours). It only needs to be run once; the result is saved to a local file and is not required for every run.</span>    

Use the Stanford NER tagger to extract the following information:
* Organization
* Location
* Date
* Time

Input file: <span style="color:blue; font-weight:bold">01_data_extraction_final.csv</span>  
Output file: <span style="color:blue; font-weight:bold">02_data_preprocessing_ner.csv</span>

In [None]:
############################################
# Setup environment
############################################
# Machine-specific JRE location. JAVAHOME is only overridden when this file
# really exists, so an already-valid Java configuration on another machine is
# not replaced by a path that points nowhere.
java_path = 'C:\\Program Files\\Java\\jre1.8.0_121\\bin\\java.exe'
if os.path.isfile(java_path):
    os.environ['JAVAHOME'] = java_path

############################################
# Define taggers
############################################
# Build the tool paths with os.path.join so the separators are right on every
# platform (the result is unchanged on Windows).
ner_tools_dir = os.path.join(os.getcwd(), 'tools', 'stanford-ner-2017-06-09')
ner7_model_path = os.path.join(ner_tools_dir, 'english.muc.7class.distsim.crf.ser.gz')
ner_jar_path = os.path.join(ner_tools_dir, 'stanford-ner.jar')
st_ner7 = StanfordNERTagger(ner7_model_path, ner_jar_path)

############################################
# Import data
############################################
df = import_data()

#############################################################################
# Loop through all rows and extract possible named entities from description
#############################################################################
# Progress counter and total used by the processing loop's status message.
count = 1
total = len(df.index)
# One entry per row is appended to each list; they become new columns later.
date_list = []
organization_list = []
time_list = []
location_list = []

def list_name_entities(tagging_result):
    """Collect the named entities of interest from a tagged token sequence.

    Consecutive tokens that share the same tag are merged into one entity
    (words joined by a single space). Only the DATE, ORGANIZATION, LOCATION
    and TIME tags are kept; every other tag is ignored.

    Args:
        tagging_result: iterable of ``(word, tag)`` pairs.

    Returns:
        dict mapping each of the four tag names to a ``', '``-joined string of
        its distinct entities in order of first appearance ('' when none).
    """
    # Plain dicts serve as insertion-ordered sets. A real ``set`` would make
    # the order of the joined entities vary from run to run, which matters
    # because later processing picks the *first* date in the string.
    entities = {'DATE': {},
                'ORGANIZATION': {},
                'LOCATION': {},
                'TIME': {}}

    for tag, chunk in groupby(tagging_result, key=lambda pair: pair[1]):
        if tag in entities:
            entity = ' '.join(word.strip() for word, _ in chunk)
            entities[tag].setdefault(entity, None)

    return {tag: ', '.join(found) for tag, found in entities.items()}

for count, (index, row) in enumerate(df.iterrows(), start=1):

    # A description that is not a string (e.g. NaN for a missing value) cannot
    # be tokenized; treat it as empty text instead of failing partway through
    # this long-running loop.
    sent = row['description']
    tokens = word_tokenize(sent) if isinstance(sent, str) else []
    # Nothing to tag for an empty token list, so skip the tagger call.
    sent_ner7 = st_ner7.tag(tokens) if tokens else []
    entities = list_name_entities(sent_ner7)

    date_list.append(entities['DATE'])
    organization_list.append(entities['ORGANIZATION'])
    time_list.append(entities['TIME'])
    location_list.append(entities['LOCATION'])

    print(f"Processing row {count} out of {total} with index {index}.")

############################################
# Add new columns into original dataframe
############################################
df['date'] = date_list
df['organization'] = organization_list
df['time'] = time_list
df['location'] = location_list

############################################
# Export data
############################################
export_ner_data(df)

## Data preprocessing using dictionary
Extract the following information using dictionary matching:
* Occupation (not yet implemented; the column is left empty)
* Injured body parts
* Is fatal?
* Activity (currently a copy of the case title)

Input file: <span style="color:blue; font-weight:bold">01_data_extraction_final.csv</span>  
Output file: <span style="color:blue; font-weight:bold">02_data_preprocessing_dictionary.csv</span>

In [3]:
################################
# Import data
################################
# Start again from the extraction-stage output; the is_fatal, activity and
# body_parts columns created below are added to this frame.
df = import_data()

################################
# Create is_fatal column
################################
# Seed words that directly indicate a fatality.
_FATALITY_SEED_WORDS = ('death', 'killed', 'dead', 'fatal', 'fatally', 'dies', 'died')
# WordNet hyponyms that are too generic to signal a fatality on their own.
_FATALITY_EXCLUDED_WORDS = frozenset(['fall', 'going', 'passing', 'expiration', 'loss',
                                      'exit', 'remove', 'off', 'waste'])
# Filled on first use so the WordNet expansion runs once, not once per row.
_fatality_keyword_cache = None


def _generate_fatality_keywords():
    """Build the keyword list: the seed words plus their WordNet hyponym lemmas."""
    keywords = list(_FATALITY_SEED_WORDS)
    seen = set(keywords)
    for seed in _FATALITY_SEED_WORDS:
        synsets = wn.synsets(seed)
        if not synsets:
            # No WordNet entry for this seed; the seed itself is still kept.
            continue
        # Only the first sense is expanded.
        for synset in synsets[0].closure(lambda s: s.hyponyms()):
            for lemma in synset.lemma_names():
                # Normalise BEFORE the duplicate/exclusion checks so that e.g.
                # 'Fall' or 'drop_dead' are compared in their final form.
                keyword = lemma.replace('_', ' ').lower()
                if keyword in seen or keyword in _FATALITY_EXCLUDED_WORDS:
                    continue
                seen.add(keyword)
                keywords.append(keyword)
    return keywords


def _get_fatality_keywords():
    """Return the cached keyword set, building it on the first call."""
    global _fatality_keyword_cache
    if _fatality_keyword_cache is None:
        _fatality_keyword_cache = frozenset(_generate_fatality_keywords())
    return _fatality_keyword_cache


def detect_fatality(case_title):
    """Return True when the case title contains a fatality keyword.

    Args:
        case_title: title text; converted with ``str()`` and lower-cased
            before tokenizing, so non-string values are accepted.

    Returns:
        bool: True if any single token of the title is a fatality keyword.
    """
    fatality_keywords = _get_fatality_keywords()
    case_tokens = word_tokenize(str(case_title).lower())
    # Matching is per token, so multi-word keywords can never match here.
    return any(token.strip() in fatality_keywords for token in case_tokens)

# Flag each case as fatal or not based on its title only.
df['is_fatal'] = df['title'].apply(detect_fatality)

################################
# Create activity column
################################
# The activity is currently just a copy of the case title.
df['activity'] = df['title']

################################
# Create body_parts column
################################
# Module-level resources read by detect_body_parts.
stop = stopwords.words('english')
wnl = nltk.WordNetLemmatizer()
# Singular body-part names matched against lemmatized tokens.
# NOTE(review): the list runs alphabetically from 'ankle' to 'mouth' plus
# 'head'; parts such as hand, foot, eye, back or abdomen are absent, so they
# are never detected -- confirm whether the list was truncated.
dict_body_parts = ['ankle', 'arch', 'arm', 'armpit', 'beard', 'breast', 'calf', 'cheek', 'chest', 'chin', 'earlobe', 
                   'elbow', 'eyebrow', 'eyelash', 'eyelid', 'face', 'finger', 'forearm', 'forehead', 'gum', 'heel', 
                   'hip', 'jaw', 'knee', 'knuckle', 'leg', 'lip', 'mouth', 'head']

def detect_body_parts(keywords):
    """Return the dictionary body parts mentioned in a piece of text.

    Each token is lower-cased and lemmatized before being matched against
    ``dict_body_parts``; punctuation tokens and stop words are skipped.

    Args:
        keywords: text to scan. Anything that is not a string (e.g. NaN for
            a missing value) is treated as empty text.

    Returns:
        str: the distinct body parts found, joined by ``', '`` in order of
        first appearance, or '' when none are found.
    """
    if not isinstance(keywords, str):
        return ''

    # A list (not a set) keeps first-appearance order, so the output string
    # is identical on every run.
    body_parts = []
    for token in word_tokenize(keywords):
        if token in string.punctuation:
            continue
        token = token.lower()
        if token in stop:
            continue
        lemma = wnl.lemmatize(token)
        if lemma in dict_body_parts and lemma not in body_parts:
            body_parts.append(lemma)
    return ', '.join(body_parts)

# Replace missing values with '' before concatenating: in pandas, str + NaN
# is NaN, so one missing field would otherwise blank out the whole combined
# text for that row.
text_columns = df[['title', 'description', 'keywords']].fillna('').astype(str)
df['body_parts'] = (text_columns['title'] + ' ' + text_columns['description'] + ' '
                    + text_columns['keywords']).apply(detect_body_parts)

################################
# Create occupation column
################################
# TODO: not implemented yet -- no occupation column is created here.


################################
# Export data
################################
export_dictionary_data(df)

## Combine all features
Combine columns from the separate files and export them to the final CSV. Additional handling:
* The date column keeps the first date that can be parsed from the comma-separated list
* Organisation, Person and Location are not included because they have too many missing values

Input files: <span style="color:blue; font-weight:bold">02_data_preprocessing_ner.csv, 02_data_preprocessing_dictionary.csv</span>  
Output file: <span style="color:blue; font-weight:bold">02_data_preprocessing_final.csv</span> 

In [6]:
################################
# Import data
################################
# Both inputs are the CSV files written by the earlier cells of this notebook.
df_ner = import_ner_data()
df_dict = import_dictionary_data()

################################
# Combine data
################################
# Columns are assigned one at a time onto an initially empty frame, so each
# Series is aligned on the frame's index rather than copied positionally.
df = pd.DataFrame()
# NOTE(review): 'acitivty' looks like a misspelling of 'activity'. It is the
# column name written to the exported CSV, so renaming it must be coordinated
# with whatever reads that file.
df['acitivty'] = df_dict['activity']
df['date'] = df_ner['date']
df['body_parts'] = df_dict['body_parts']
df['is_fatal'] = df_dict['is_fatal']
# Placeholder columns: created empty here and not filled in by this notebook.
df['occupation'] = None
df['topics'] = None

################################
# Transformation
################################

def parse_date(dates):
    """Return the first parseable date from a comma-separated string.

    Args:
        dates: value from the ``date`` column; converted with ``str()``, so a
            missing value (NaN) becomes the unparseable text ``'nan'``.

    Returns:
        The ``datetime`` produced by ``dateutil.parser.parse`` for the first
        candidate it accepts, or ``None`` when no candidate can be parsed.
    """
    for candidate in str(dates).split(','):
        candidate = candidate.strip()
        if not candidate:
            # An empty segment cannot be a date; never hand it to the parser.
            continue
        try:
            return parse(candidate)
        except (ValueError, OverflowError):
            # ValueError: the text is not a recognisable date.
            # OverflowError: the parsed value is too large to represent.
            # Either way, move on to the next candidate.
            continue
    return None

# Reduce each comma-separated date string to a single parsed date
# (None when nothing in it can be parsed).
df['date'] = df['date'].apply(parse_date)

################################
# Export data
################################
export_combined_data(df)

################################
# Inspect data
################################
# Show the first rows of the combined frame as a sanity check.
df.head(10)

Unnamed: 0_level_0,acitivty,date,body_parts,is_fatal,occupation,topics
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
202561825,Employee Falls From Flatbed Trailer And Later...,2013-08-30 00:00:00,,True,,
200361855,Two Workers Are Struck By Motor Vehicle And O...,2013-08-27 00:00:00,,True,,
200361863,Employee Is Struck By Bales Of Wire And Killed,2013-08-26 00:00:00,"face, leg, head",True,,
201079324,Employee Is Splashed With Hot Water And Is Bu...,2013-07-14 00:00:00,leg,False,,
202658258,Employee Suffers Burns While Moving Soup,2013-06-30 00:00:00,"arm, chest",False,,
202685947,Employee Injures Self With Knife,,,False,,
202673471,Foreman Is Fatally Crushed When Forklift Tips...,2013-05-13 00:00:00,,True,,
202369575,Employee Fractures Abdomen When Run Over By T...,2013-04-23 00:00:00,,False,,
202509832,Employee Suffers Abdominal Fracture In Fall F...,2013-04-09 00:00:00,,False,,
201129681,Carpenter Injured In Abdomen When Saw Kicks B...,2013-04-01 00:00:00,,False,,


In [9]:
# Summary statistics for every column, including the non-numeric ones.
df.describe(include='all')

Unnamed: 0,acitivty,date,body_parts,is_fatal,occupation,topics
count,15963,12880,8069,15963,0.0,0.0
unique,14393,4816,388,2,0.0,0.0
top,Electric Shock,2011-06-22 00:00:00,finger,False,,
freq,539,17,1569,10433,,
