## Import packages

In [2]:
import pandas as pd
import os
import nltk
import string
import math
from nltk import word_tokenize
from nltk.tag.stanford import StanfordPOSTagger, StanfordNERTagger
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from itertools import groupby
from dateutil.parser import parse

## Global functions

In [9]:
def get_output_file(filename):
    return os.path.join(os.getcwd(), 'output_data', filename)

def import_data():
    return pd.read_csv(get_output_file('01_data_extraction_final.csv'), index_col=0)

def import_ner_data():
    return pd.read_csv(get_output_file('02_data_preprocessing_ner.csv'), index_col=0)

def import_dictionary_data():
    return pd.read_csv(get_output_file('02_data_preprocessing_dictionary.csv'), index_col=0)

def import_combined_data():
    return pd.read_csv(get_output_file('02_data_preprocessing_final.csv'), index_col=0)

def export_ner_data(df):
    df.to_csv(get_output_file('02_data_preprocessing_ner.csv'))
    
def export_dictionary_data(df):
    df.to_csv(get_output_file('02_data_preprocessing_dictionary.csv'))
    
def export_combined_data(df):
    df.to_csv(get_output_file('02_data_preprocessing_final.csv'))

def export_combined_data_ml(df):
    df.to_csv(get_output_file('02_data_preprocessing_final_ml.csv'))

## Data preprocessing using Name Entity Recognition (NER)
<span style="color:red"><b>Important</b>: The NER process takes long processing time (few hours), this is just a one time call and save to a local file, not required for every run.</span>    

Use standford NER tagger to extract following information:
* Organization
* Location
* Date
* Time

Input file: <span style="color:blue; font-weight:bold">01_data_extraction.csv</span>  
Output file: <span style="color:blue; font-weight:bold">02_data_preprocessing_ner.csv</span>

In [None]:
############################################
# Setup environment
############################################
java_path = 'C:\\Program Files\\Java\\jre1.8.0_121\\bin\\java.exe'
os.environ['JAVAHOME'] = java_path

############################################
# Define taggers
############################################
ner7_model_path = os.getcwd() + "\\tools\\stanford-ner-2017-06-09\\english.muc.7class.distsim.crf.ser.gz"
ner_jar_path = os.getcwd() + "\\tools\\stanford-ner-2017-06-09\\stanford-ner.jar"
st_ner7 = StanfordNERTagger(ner7_model_path, ner_jar_path)

############################################
# Import data
############################################
df = import_data()

#############################################################################
# # Loop through all rows and extract possible name entities from description
#############################################################################
count = 1
total = len(df.index)
date_list = []
organization_list = []
time_list = []
location_list = []

def list_name_entities(tagging_result):
    
    entities = {'DATE': set(), 
                'ORGANIZATION': set(), 
                'LOCATION': set(), 
                'TIME': set()}
    
    for tag, chunk in groupby(tagging_result, lambda x:x[1]):
        if tag in entities.keys():
            entity = ' '.join(w.strip() for w, t in chunk)
            entities[tag].add(entity)
            
    for key, value in entities.items():
        entities[key] = ', '.join(value)
        
    return entities

for index, row in df.iterrows():
    
    sent = row['description']
    sent_ner7 = st_ner7.tag(word_tokenize(sent))
    entities = list_name_entities(sent_ner7)
    
    date_list.append(entities['DATE'])
    organization_list.append(entities['ORGANIZATION'])
    time_list.append(entities['TIME'])
    location_list.append(entities['LOCATION'])
    
    print(f"Processing row {count} out of {total} with index {index}.")
    count = count + 1

############################################
# Add new colmumns into original dataframe
############################################
df['date'] = date_list
df['organization'] = organization_list
df['time'] = time_list
df['location'] = location_list

############################################
# Export data
############################################
export_ner_data(df)

############################################
# Inspect data
############################################
df.head(10)

## Data preprocessing using dictionary
Extract following information using dictionary matching
* Occupation
* Injured body parts
* Is fatal?
* Activity

Input file: <span style="color:blue; font-weight:bold">01_data_extraction.csv</span>  
Output file: <span style="color:blue; font-weight:bold">02_data_preprocessing_dictionary.csv</span>

In [6]:
################################
# Import data
################################
df = import_data()

################################
# Create is_fatal column
################################
def detect_fatality(case_title):
    
    def generate_fatality_keywords():
        fatality_list = ['death', 'killed', 'dead', 'fatal', 'fatally', 'dies', 'died']
        stopword_fatality = ['fall', 'going', 'passing', 'expiration', 'loss', 'exit', 'remove', 'off', 'waste']
        final_list = []
        final_list = final_list + fatality_list
        for fatal_kw in fatality_list:

            dead_keywords = wn.synsets(fatal_kw)
            dead = wn.synsets(fatal_kw)[0]
            keywords = list(set([w for s in dead.closure(lambda s: s.hyponyms()) for w in s.lemma_names()]))
            for kw in keywords:
                if kw not in final_list and kw not in stopword_fatality:
                    final_list.append(kw.replace('_', ' ').lower())
        return final_list
    
    fatality_keyword_list = generate_fatality_keywords()
    case_tokens = word_tokenize(str(case_title).lower())
    is_fatal = False
    
    for case_t in case_tokens:
        if case_t.strip() in fatality_keyword_list:
            is_fatal = True
    return is_fatal

df['is_fatal'] = df['title'].apply(detect_fatality)

################################
# Create activity column
################################
df['activity'] = df['title']

################################
# Create body_parts column
################################
stop = stopwords.words('english')
wnl = nltk.WordNetLemmatizer()
dict_body_parts = ['abdomen','ankle','arch','arm','armpit','back','beard','breast','buttock','calf','cheek',
                   'chest','chin','collarbone','ear laceration','earlobe','elbow','eyebrow','eyelash','eyelid',
                   'face','femur','finger','forearm','forehead','forehead','groin','gum','hand','head','heel',
                   'hip','jaw','knee','knuckle','leg','lip','lungs','mouth','neck','pelvis','ribs','right temple',
                   'shoulder','spleen','thigh','throat','thumb','torso','wrist']
body_parts_mapping = {
    'abdominal fracture': 'abdomen',
    'hypertrophic heart disease': 'heart',
    'respiratory': 'lungs',
    'brain death': 'brain'
}

def detect_body_parts(text):
    tokens = word_tokenize(text)
    tokens_nop = [t for t in tokens if t not in string.punctuation]
    tokens_lower = [t.lower() for t in tokens_nop]
    tokens_nostop = [t for t in tokens_lower if t not in stop]
    tokens_lem = [wnl.lemmatize(t) for t in tokens_nostop]
    body_parts = [t for t in set(tokens_lem) if t in dict_body_parts]
    
    text_lower = text.lower()
    body_parts = body_parts + [value for key, value in body_parts_mapping.items() if key in text_lower]
    return ', '.join(body_parts)

df['body_parts'] = (df['title'] + ' ' + df['description'] + ' ' + df['keywords']).apply(detect_body_parts)

################################
# Create occupation column
################################
stop = stopwords.words('english')
wnl = nltk.WordNetLemmatizer()
occupations = {"construction_worker": ["construction", "forklift"],
               "cleaner": ["cleaner", "housekeeping", "cleaning"],
               "electrician": ["electrician"],
               "welder": ["welder", "welding"],
               "farmer": ["agriculture", "farm", "pruner"],
               "firefighter": ["firefighter"],
               "operator": ["operator"],
               "plumber": ["plumber", "plumbing"],
               "painter": ["painter"],
               "smelter_workers": ["molten"],
               "driver": ["driver"],
               "logger": ["logging"],
               "roofer": ["roofer"],
               "machinist": ["machinist"],
               "transformer": ["transformer"],
               "carpenter": ["carpenter"],
               "laborer": ["laborer"],
               "rigger": ["rigger"],
               "engineer": ["engineer"],
               "carpenter": ["carpentry"],
               "aquatic_worker": ["aquatic"]
               }
occupation_mapping = {
    "timber faller": "timber_faller",
    "avalanche forecaster": "avalanche_forecaster",
    "tile setter": "tile_setter",
    "abrasive blaster": "abrasive_blaster",
    "car body workman": "workman",
    "flatbed trailer": "construction_worker",
    "chicken house": "poultry_farmer",
    "vinyl sidings": "siding_worker"
}

def detect_occupation(text):
    tokens = word_tokenize(text)
    tokens_nop = [t for t in tokens if t not in string.punctuation]
    tokens_lower = [t.lower() for t in tokens_nop]
    tokens_nostop = [t for t in tokens_lower if t not in stop]
    tokens_lem_noun = [wnl.lemmatize(t) for t in tokens_nostop]
    tokens_lem_verb = [wnl.lemmatize(t, pos='v') for t in tokens_lem_noun]

    occupation = set()
    for t in tokens_lem_verb:
        for key in occupations.keys():
            if (t in occupations[key]):
                occupation.add(key)
                
    text_lower = text.lower()
    for key, value in occupation_mapping.items():
        if key in text_lower:
            occupation.add(value)

    return ', '.join(occupation)

df['occupation'] = (df['title'] + ' ' + df['description'] + ' ' + df['keywords']).apply(detect_occupation)

################################
# Export data
################################
export_dictionary_data(df)

################################
# Inspect data
################################
df.head(10)

Unnamed: 0_level_0,title,description,keywords,victims,is_fatal,activity,body_parts,occupation
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
201079928,Employee Is Burned By Forklift Radiator Fluid,At approximately 11:30 a.m. on November 13 2...,burn industrial truck waste proc fac pa...,,False,Employee Is Burned By Forklift Radiator Fluid,"groin, abdomen, back, leg","construction_worker, driver"
202561825,Employee Falls From Flatbed Trailer And Later...,On August 30 2013 Employee #1 was working f...,truck flatbed truck trailer fall abdomen,,True,Employee Falls From Flatbed Trailer And Later...,abdomen,construction_worker
200361855,Two Workers Are Struck By Motor Vehicle And O...,On August 27 2013 Employees #1 and #2 of T...,construction undrgrd power line highway ...,1 317290559 Fatality Other Occupation not re...,True,Two Workers Are Struck By Motor Vehicle And O...,abdomen,construction_worker
200361863,Employee Is Struck By Bales Of Wire And Killed,On August 26 2013 Employee #1 with Lee Iro...,waste proc fac industrial truck struck b...,,True,Employee Is Struck By Bales Of Wire And Killed,"face, torso, abdomen, head, back, leg",construction_worker
201079324,Employee Is Splashed With Hot Water And Is Bu...,On July 14 2013 Employee #1 vacuum pump tr...,truck driver pump tank hot water struc...,,False,Employee Is Splashed With Hot Water And Is Bu...,"shoulder, abdomen, leg","operator, driver"
202658258,Employee Suffers Burns While Moving Soup,On June 30 2013 Employee #1 was working in ...,burn spill arm chest abdomen,,False,Employee Suffers Burns While Moving Soup,"arm, chest, abdomen",
202685947,Employee Injures Self With Knife,An incident occurred as Employee #1 was attem...,knife puncture abdomen struck by slip ...,,False,Employee Injures Self With Knife,abdomen,
202673471,Foreman Is Fatally Crushed When Forklift Tips...,At approximately 6:30 a.m. on May 13 2013 E...,construction equipment operator industri...,1 316211614 Fatality Fracture Supervisors m...,True,Foreman Is Fatally Crushed When Forklift Tips...,"back, abdomen","construction_worker, operator"
202369575,Employee Fractures Abdomen When Run Over By T...,On April 23 2013 Employee #1 a baggage tug...,cart struck by run over fracture abdomen,,False,Employee Fractures Abdomen When Run Over By T...,"abdomen, abdomen",operator
202509832,Employee Suffers Abdominal Fracture In Fall F...,On April 9 2013 Employee #1 was installing ...,installing ladder scaffold structure mo...,1 316817097 Hospitalized injury Fracture Car...,False,Employee Suffers Abdominal Fracture In Fall F...,"abdomen, abdomen",siding_worker


In [8]:
df.describe(include='all')

Unnamed: 0,title,description,keywords,victims,is_fatal,activity,body_parts,occupation
count,15964,15964,15964,15964.0,15964,15964,15964.0,15964.0
unique,14394,15959,15900,4548.0,2,14394,1466.0,153.0
top,Electric Shock,On October 3 2009 Employee #1 and several c...,heart attack,,False,Electric Shock,,
freq,539,2,5,11417.0,10434,539,5477.0,8726.0


## Combine all features
Combine columns from separated files and export to final csv. Additional handling:
* Date column will pick the first date in array
* Not picking Orgnisation, Person, Location as too many missing values

Input files: <span style="color:blue; font-weight:bold">02_data_preprocessing_ner.csv, 02_data_preprocessing_dictionary.csv</span>  
Output file: <span style="color:blue; font-weight:bold">02_data_preprocessing_final.csv</span> 

In [10]:
################################
# Import data
################################
df = import_data()
df_ner = import_ner_data()
df_dict = import_dictionary_data()

################################
# Combine data
################################
df['activity'] = df_dict['activity']
df['date'] = df_ner['date']
df['body_parts'] = df_dict['body_parts']
df['occupation'] = df_dict['occupation']
df['is_fatal'] = df_dict['is_fatal']
df['topics'] = None

################################
# Transformation
################################

def parse_date(dates):
    dates = str(dates)
    dates = [d.strip() for d in dates.split(',')]
    for d in dates:
        date = None
        try:
            date = parse(d)
            return date
        except ValueError:
            continue       
    return None

df['date'] = df['date'].apply(parse_date)

################################
# Export data
################################
export_combined_data(df)

################################
# Inspect data
################################
df.head(10)

Unnamed: 0_level_0,title,description,keywords,victims,activity,date,body_parts,occupation,is_fatal,topics
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
201079928,Employee Is Burned By Forklift Radiator Fluid,At approximately 11:30 a.m. on November 13 2...,burn industrial truck waste proc fac pa...,,Employee Is Burned By Forklift Radiator Fluid,2013-11-13 00:00:00,"groin, abdomen, back, leg","construction_worker, driver",False,
202561825,Employee Falls From Flatbed Trailer And Later...,On August 30 2013 Employee #1 was working f...,truck flatbed truck trailer fall abdomen,,Employee Falls From Flatbed Trailer And Later...,2013-08-30 00:00:00,abdomen,construction_worker,True,
200361855,Two Workers Are Struck By Motor Vehicle And O...,On August 27 2013 Employees #1 and #2 of T...,construction undrgrd power line highway ...,1 317290559 Fatality Other Occupation not re...,Two Workers Are Struck By Motor Vehicle And O...,2013-08-27 00:00:00,abdomen,construction_worker,True,
200361863,Employee Is Struck By Bales Of Wire And Killed,On August 26 2013 Employee #1 with Lee Iro...,waste proc fac industrial truck struck b...,,Employee Is Struck By Bales Of Wire And Killed,2013-08-26 00:00:00,"face, torso, abdomen, head, back, leg",construction_worker,True,
201079324,Employee Is Splashed With Hot Water And Is Bu...,On July 14 2013 Employee #1 vacuum pump tr...,truck driver pump tank hot water struc...,,Employee Is Splashed With Hot Water And Is Bu...,2013-07-14 00:00:00,"shoulder, abdomen, leg","operator, driver",False,
202658258,Employee Suffers Burns While Moving Soup,On June 30 2013 Employee #1 was working in ...,burn spill arm chest abdomen,,Employee Suffers Burns While Moving Soup,2013-06-30 00:00:00,"arm, chest, abdomen",,False,
202685947,Employee Injures Self With Knife,An incident occurred as Employee #1 was attem...,knife puncture abdomen struck by slip ...,,Employee Injures Self With Knife,,abdomen,,False,
202673471,Foreman Is Fatally Crushed When Forklift Tips...,At approximately 6:30 a.m. on May 13 2013 E...,construction equipment operator industri...,1 316211614 Fatality Fracture Supervisors m...,Foreman Is Fatally Crushed When Forklift Tips...,2013-05-13 00:00:00,"back, abdomen","construction_worker, operator",True,
202369575,Employee Fractures Abdomen When Run Over By T...,On April 23 2013 Employee #1 a baggage tug...,cart struck by run over fracture abdomen,,Employee Fractures Abdomen When Run Over By T...,2013-04-23 00:00:00,"abdomen, abdomen",operator,False,
202509832,Employee Suffers Abdominal Fracture In Fall F...,On April 9 2013 Employee #1 was installing ...,installing ladder scaffold structure mo...,1 316817097 Hospitalized injury Fracture Car...,Employee Suffers Abdominal Fracture In Fall F...,2013-04-09 00:00:00,"abdomen, abdomen",siding_worker,False,


In [12]:
df.describe(include='all')

Unnamed: 0,title,description,keywords,victims,activity,date,body_parts,occupation,is_fatal,topics
count,15964,15964,15964,15964.0,15964,12881,10487,7238,15964,0.0
unique,14394,15959,15900,4548.0,14394,4819,1465,152,2,0.0
top,Electric Shock,On October 3 2009 Employee #1 and several c...,heart attack,,Electric Shock,2011-06-22 00:00:00,head,construction_worker,False,
freq,539,2,5,11417.0,539,17,988,2136,10434,
