## Import packages

In [None]:
import pandas as pd
import os
import nltk
import string
import math
from nltk import word_tokenize
from nltk.tag.stanford import StanfordPOSTagger, StanfordNERTagger
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from itertools import groupby
from dateutil.parser import parse

## Global functions

In [None]:
def get_output_file(filename):
    """Return the path of *filename* inside ``output_data`` under the current working directory."""
    output_dir = os.path.join(os.getcwd(), 'output_data')
    return os.path.join(output_dir, filename)

def import_data():
    """Load ``01_data_extraction_final.csv`` from the output folder, using its first column as index."""
    source_path = get_output_file('01_data_extraction_final.csv')
    frame = pd.read_csv(source_path, index_col=0)
    return frame

def import_ner_data():
    """Load ``02_data_preprocessing_ner.csv`` from the output folder, using its first column as index."""
    source_path = get_output_file('02_data_preprocessing_ner.csv')
    frame = pd.read_csv(source_path, index_col=0)
    return frame

def import_dictionary_data():
    """Load ``02_data_preprocessing_dictionary.csv`` from the output folder, using its first column as index."""
    source_path = get_output_file('02_data_preprocessing_dictionary.csv')
    frame = pd.read_csv(source_path, index_col=0)
    return frame

def import_combined_data():
    """Load ``02_data_preprocessing_final.csv`` from the output folder, using its first column as index."""
    source_path = get_output_file('02_data_preprocessing_final.csv')
    frame = pd.read_csv(source_path, index_col=0)
    return frame

def export_ner_data(df):
    """Write *df* to ``02_data_preprocessing_ner.csv`` in the output folder."""
    target_path = get_output_file('02_data_preprocessing_ner.csv')
    df.to_csv(target_path)
    
def export_dictionary_data(df):
    """Write *df* to ``02_data_preprocessing_dictionary.csv`` in the output folder."""
    target_path = get_output_file('02_data_preprocessing_dictionary.csv')
    df.to_csv(target_path)

def export_combined_data(df):
    """Write *df* to ``02_data_preprocessing_final.csv`` in the output folder."""
    target_path = get_output_file('02_data_preprocessing_final.csv')
    df.to_csv(target_path)

def export_combined_data_ml(df):
    """Write *df* to ``02_data_preprocessing_final_ml.csv`` in the output folder."""
    target_path = get_output_file('02_data_preprocessing_final_ml.csv')
    df.to_csv(target_path)

## Data preprocessing using Named Entity Recognition (NER)
<span style="color:red"><b>Important</b>: The NER process takes a long time (a few hours). It only needs to be run once: the result is saved to a local file, so this cell does not have to be re-run every time.</span>    

Use the Stanford NER tagger to extract the following information:
* Organization
* Location
* Date
* Time

Input file: <span style="color:blue; font-weight:bold">01_data_extraction_final.csv</span>  
Output file: <span style="color:blue; font-weight:bold">02_data_preprocessing_ner.csv</span>

In [None]:
############################################
# Setup environment
############################################
# Point the JAVAHOME environment variable at the Java executable that the
# Stanford tagger will be run with (machine-specific path).
java_path = 'C:\\Program Files\\Java\\jre1.8.0_121\\bin\\java.exe'
os.environ['JAVAHOME'] = java_path

############################################
# Define taggers
############################################
# Build the paths with os.path.join instead of hard-coded backslashes so the
# model and jar are found regardless of the platform's path separator.
ner_tools_dir = os.path.join(os.getcwd(), 'tools', 'stanford-ner-2017-06-09')
ner7_model_path = os.path.join(ner_tools_dir, 'english.muc.7class.distsim.crf.ser.gz')
ner_jar_path = os.path.join(ner_tools_dir, 'stanford-ner.jar')
st_ner7 = StanfordNERTagger(ner7_model_path, ner_jar_path)

############################################
# Import data
############################################
df = import_data()

#############################################################################
# Loop through all rows and extract possible named entities from description
#############################################################################
# Progress counters and one result list per extracted entity type; the lists
# are filled row by row and later attached to df as new columns.
count = 1
total = len(df.index)
date_list = []
organization_list = []
time_list = []
location_list = []

def list_name_entities(tagging_result):
    """Collect the DATE, ORGANIZATION, LOCATION and TIME entities from tagger output.

    Consecutive tokens that carry the same tag are merged into one entity
    (tokens joined with a single space). Each distinct entity is reported
    once per tag, in order of first appearance, so the output is reproducible
    between runs and the first listed date is the first one found in the text.

    Args:
        tagging_result: iterable of ``(token, tag)`` pairs.

    Returns:
        dict mapping each of the four tag names to a ``', '``-joined string
        of its entities (an empty string when none was found).
    """
    # Plain dicts keep insertion order, so they serve as ordered sets here;
    # a real set would make the join order arbitrary.
    entities = {'DATE': {}, 'ORGANIZATION': {}, 'LOCATION': {}, 'TIME': {}}

    for tag, chunk in groupby(tagging_result, key=lambda pair: pair[1]):
        if tag in entities:
            entity = ' '.join(token.strip() for token, _ in chunk)
            entities[tag][entity] = None

    return {tag: ', '.join(found) for tag, found in entities.items()}

for position, (index, row) in enumerate(df.iterrows(), start=1):

    # A missing description is not a string (e.g. NaN) and would make
    # word_tokenize fail, so treat it as "no tokens".
    sent = row['description']
    tokens = word_tokenize(sent) if isinstance(sent, str) else []
    # Only call the external tagger when there is something to tag.
    # NOTE(review): tagging row by row is what makes this cell slow; batching
    # with the tagger's tag_sents() may help - verify before switching.
    sent_ner7 = st_ner7.tag(tokens) if tokens else []
    entities = list_name_entities(sent_ner7)

    date_list.append(entities['DATE'])
    organization_list.append(entities['ORGANIZATION'])
    time_list.append(entities['TIME'])
    location_list.append(entities['LOCATION'])

    print(f"Processing row {position} out of {total} with index {index}.")

############################################
# Add new columns into original dataframe
############################################
# The lists were filled in row order, so they line up with df's rows.
df['date'] = date_list
df['organization'] = organization_list
df['time'] = time_list
df['location'] = location_list

############################################
# Export data
############################################
export_ner_data(df)

############################################
# Inspect data
############################################
df.head(10)

## Data preprocessing using dictionary
Extract the following information using dictionary matching:
* Occupation
* Injured body parts
* Is fatal?
* Activity

Input file: <span style="color:blue; font-weight:bold">01_data_extraction_final.csv</span>  
Output file: <span style="color:blue; font-weight:bold">02_data_preprocessing_dictionary.csv</span>

In [None]:
################################
# Import data
################################
# Start again from the extraction output; the dictionary-based columns
# created below are added to this dataframe.
df = import_data()

################################
# Create is_fatal column
################################
# Cache for the expanded keyword set: building it walks the WordNet hyponym
# tree, which is too slow to repeat for every row passed to detect_fatality.
_FATALITY_KEYWORDS = None


def _get_fatality_keywords():
    """Return the fatality keyword set, building it on first use.

    The set contains the seed words plus every lemma found in the hyponym
    closure of each seed word's first WordNet synset, normalised to lower
    case with underscores replaced by spaces. Ambiguous words listed in
    ``stopword_fatality`` are left out.
    """
    global _FATALITY_KEYWORDS
    if _FATALITY_KEYWORDS is not None:
        return _FATALITY_KEYWORDS

    fatality_list = ['death', 'killed', 'dead', 'fatal', 'fatally', 'dies', 'died']
    stopword_fatality = {'fall', 'going', 'passing', 'expiration', 'loss', 'exit', 'remove', 'off', 'waste'}

    keywords = set(fatality_list)
    for fatal_kw in fatality_list:
        synsets = wn.synsets(fatal_kw)
        if not synsets:
            # No WordNet entry for this seed word: keep the seed only.
            continue
        for synset in synsets[0].closure(lambda s: s.hyponyms()):
            for lemma in synset.lemma_names():
                # Normalise first so the stop-word filter sees the same
                # spelling that ends up in the keyword set.
                keyword = lemma.replace('_', ' ').lower()
                if keyword not in stopword_fatality:
                    keywords.add(keyword)

    _FATALITY_KEYWORDS = frozenset(keywords)
    return _FATALITY_KEYWORDS


def detect_fatality(case_title):
    """Return True when any token of *case_title* is a fatality keyword.

    Args:
        case_title: the case title; it is converted with ``str()`` and
            lower-cased before tokenisation, so non-string values are accepted.

    Returns:
        bool: True if at least one token matches the keyword set.
    """
    fatality_keywords = _get_fatality_keywords()
    case_tokens = word_tokenize(str(case_title).lower())
    return any(case_t.strip() in fatality_keywords for case_t in case_tokens)

df['is_fatal'] = df['title'].apply(detect_fatality)

################################
# Create activity column
################################
# The activity is a plain copy of the case title.
df['activity'] = df['title']

################################
# Create body_parts column
################################
stop = stopwords.words('english')
wnl = nltk.WordNetLemmatizer()
# Body-part terms that detect_body_parts looks for in the text
# (each term listed once).
dict_body_parts = ['abdomen','ankle','arch','arm','armpit','back','beard','breast','buttock','calf','cheek',
                   'chest','chin','collarbone','ear laceration','earlobe','elbow','eyebrow','eyelash','eyelid',
                   'face','femur','finger','forearm','forehead','groin','gum','hand','head','heel',
                   'hip','jaw','knee','knuckle','leg','lip','lungs','mouth','neck','pelvis','ribs','right temple',
                   'shoulder','spleen','thigh','throat','thumb','torso','wrist']
# Phrases that imply a body part without naming it: phrase -> reported body part.
body_parts_mapping = {
    'abdominal fracture': 'abdomen',
    'hypertrophic heart disease': 'heart',
    'respiratory': 'lungs',
    'brain death': 'brain'
}

def detect_body_parts(text):
    """Return the body parts mentioned in *text* as a ``', '``-joined string.

    A single-word entry of ``dict_body_parts`` matches when it, or its
    lemma, equals a token of the text or that token's lemma, so plural
    entries such as ``'lungs'`` match both "lung" and "lungs". Multi-word
    entries are matched as a phrase against the token sequence. Phrases from
    ``body_parts_mapping`` found anywhere in the lower-cased text add their
    mapped body part.

    Args:
        text: the text to scan. Non-string values (e.g. NaN) yield ``''``.

    Returns:
        str: matched body parts without duplicates, dictionary entries first
        (in dictionary order) followed by mapped values.
    """
    if not isinstance(text, str):
        return ''

    text_lower = text.lower()
    tokens_nop = [t for t in word_tokenize(text) if t not in string.punctuation]
    tokens_lower = [t.lower() for t in tokens_nop]
    tokens_nostop = [t for t in tokens_lower if t not in stop]

    # Compare against both the raw tokens and their lemmas.
    candidates = set(tokens_nostop)
    candidates.update(wnl.lemmatize(t) for t in tokens_nostop)

    # Space-padded token text lets multi-word entries match on whole words
    # only ("near laceration" must not match "ear laceration").
    padded_tokens = ' ' + ' '.join(tokens_lower) + ' '

    body_parts = []
    for part in dict_body_parts:
        if ' ' in part:
            matched = f' {part} ' in padded_tokens
        else:
            matched = part in candidates or wnl.lemmatize(part) in candidates
        if matched:
            body_parts.append(part)

    body_parts.extend(value for key, value in body_parts_mapping.items() if key in text_lower)

    # dict.fromkeys drops duplicates while keeping the first-seen order.
    return ', '.join(dict.fromkeys(body_parts))

# A missing value in any of the three columns would turn the whole
# concatenation into NaN, so blank missing cells before joining the text.
df['body_parts'] = (
    df['title'].fillna('').astype(str) + ' '
    + df['description'].fillna('').astype(str) + ' '
    + df['keywords'].fillna('').astype(str)
).apply(detect_body_parts)

################################
# Create occupation column
################################
stop = stopwords.words('english')
wnl = nltk.WordNetLemmatizer()
# Occupation label -> words whose presence in the text indicates it.
occupations = {"construction": ["construction"],
               "cleaner": ["cleaner", "housekeeping", "cleaning"],
               "electrician": ["electrician"],
               "welder": ["welder", "welding"],
               "farmer": ["agriculture", "farm", "pruner"],
               "firefighter": ["firefighter"],
               "operator": ["operator"],
               "plumber": ["plumber", "plumbing"],
               "painter": ["painter"],
               "Smelter workers": ["molten"],
               "driver": ["driver"],
               "logger": ["logging"],
               "roofer": ["roofer"],
               "machinist": ["machinist"],
               }

def detect_occupation(text):
    """Return the occupations indicated by *text* as a ``', '``-joined string.

    An occupation from ``occupations`` is reported when any of its indicator
    words equals a token of the text, the token's noun lemma, or the verb
    lemma of that noun lemma. Keeping the raw token and noun lemma matters
    because verb lemmatisation reduces gerunds (e.g. "welding" to "weld"),
    which would otherwise stop indicator words such as ``'welding'`` from
    ever matching.

    Args:
        text: the text to scan. Non-string values (e.g. NaN) yield ``''``.

    Returns:
        str: matched occupation labels in the order they are declared in
        ``occupations``.
    """
    if not isinstance(text, str):
        return ''

    tokens_nop = [t for t in word_tokenize(text) if t not in string.punctuation]
    tokens_lower = [t.lower() for t in tokens_nop]
    tokens_nostop = [t for t in tokens_lower if t not in stop]

    candidates = set(tokens_nostop)
    for token in tokens_nostop:
        noun_lemma = wnl.lemmatize(token)
        candidates.add(noun_lemma)
        candidates.add(wnl.lemmatize(noun_lemma, pos='v'))

    # Walk the dictionary (not a set) so the output order is stable.
    return ', '.join(
        label
        for label, indicator_words in occupations.items()
        if any(word in candidates for word in indicator_words)
    )

# A missing value in any of the three columns would turn the whole
# concatenation into NaN, so blank missing cells before joining the text.
df['occupation'] = (
    df['title'].fillna('').astype(str) + ' '
    + df['description'].fillna('').astype(str) + ' '
    + df['keywords'].fillna('').astype(str)
).apply(detect_occupation)

################################
# Export data
################################
export_dictionary_data(df)

################################
# Inspect data
################################
df.head(10)

## Combine all features
Combine columns from separated files and export to final csv. Additional handling:
* The date column keeps the first date in the list that can be parsed
* Organisation, Person and Location are not included because they have too many missing values

Input files: <span style="color:blue; font-weight:bold">01_data_extraction_final.csv, 02_data_preprocessing_ner.csv, 02_data_preprocessing_dictionary.csv</span>  
Output files: <span style="color:blue; font-weight:bold">02_data_preprocessing_final.csv, 02_data_preprocessing_final_ml.csv</span> 

In [None]:
################################
# Import data
################################
df = import_data()
df_ner = import_ner_data()
df_dict = import_dictionary_data()

################################
# Combine data
################################
# (target column in df, source dataframe, source column), copied in this order.
# NOTE(review): 'acitivty' looks like a misspelling of 'activity'; it is kept
# as-is because files written from df use this column name - confirm with
# downstream consumers before renaming.
column_sources = [
    ('acitivty', df_dict, 'activity'),
    ('date', df_ner, 'date'),
    ('body_parts', df_dict, 'body_parts'),
    ('occupation', df_dict, 'occupation'),
    ('is_fatal', df_dict, 'is_fatal'),
]
for target_col, source_df, source_col in column_sources:
    df[target_col] = source_df[source_col]

# Placeholder column; no value is computed for it in this cell.
df['topics'] = None

################################
# Transformation
################################

def parse_date(dates):
    """Return the first parseable date from a comma-separated string.

    Args:
        dates: comma-separated date expressions; the value is converted with
            ``str()`` first, so non-string input (e.g. NaN) is accepted.

    Returns:
        The result of ``dateutil.parser.parse`` for the first fragment that
        parses, or ``None`` when no fragment does.
    """
    # NOTE(review): splitting on ',' also breaks up a single expression that
    # itself contains a comma (e.g. "January 5 , 2017") - confirm that this
    # is acceptable for the NER output format.
    for fragment in str(dates).split(','):
        fragment = fragment.strip()
        if not fragment:
            continue
        try:
            return parse(fragment)
        except (ValueError, OverflowError):
            # ValueError: not a recognisable date; OverflowError: a numeric
            # component too large to build a date from. Try the next one.
            continue
    return None

# Replace the comma-separated date strings with the value parse_date returns
# for each row (None when nothing could be parsed).
df['date'] = df['date'].apply(parse_date)

################################
# Export data
################################
export_combined_data(df)

################################
# Inspect data
################################
df.head(10)

In [None]:
# Summary statistics for every column of the combined dataframe.
df.describe(include='all')

In [None]:
################################
# Import data
################################
# Reload the combined file written by export_combined_data.
df = import_combined_data()

############################################################
# Convert column with array values to separated indicators
############################################################

def array_to_indicators(df, col, prefix):
    """Replace a comma-separated label column by one 0/1 column per label.

    For every distinct label found in ``df[col]`` a column named
    ``f'{prefix}_{label}'`` is added, holding 1 for the rows whose cell lists
    that label and 0 otherwise. Missing cells and empty fragments contribute
    no label, so no ``<prefix>_nan`` or ``<prefix>_`` column is created.
    Indicator columns are added in sorted label order and filled by row
    position, so rows that share an index label are handled independently.

    Args:
        df: dataframe to transform; it is modified in place.
        col: name of the column holding the comma-separated labels.
        prefix: prefix for the names of the generated indicator columns.

    Returns:
        The same dataframe, with ``col`` removed and the indicators added.
    """
    def split_labels(cell):
        # Missing values (None / NaN) carry no labels at all.
        if not isinstance(cell, str):
            if pd.isna(cell):
                return set()
            cell = str(cell)
        return {part.strip() for part in cell.split(',') if part.strip()}

    row_labels = [split_labels(cell) for cell in df[col]]
    all_labels = sorted(set().union(*row_labels))

    for label in all_labels:
        # A plain list is assigned by position, independent of the index.
        df[f'{prefix}_{label}'] = [int(label in labels) for labels in row_labels]

    del df[col]
    return df
    
# Expand the comma-separated body_parts column into one 'body_<part>'
# indicator column per body part.
df = array_to_indicators(df, 'body_parts', 'body')

################################
# Export data
################################
export_combined_data_ml(df)

################################
# Inspect data
################################
df.head(10)

In [None]:
# Summary statistics for every column of the indicator-expanded dataframe.
df.describe(include='all')