## Import packages

In [1]:
import pandas as pd
import os
import nltk
from nltk import word_tokenize
from nltk.tag.stanford import StanfordPOSTagger, StanfordNERTagger
from itertools import groupby

## User defined functions

In [2]:
def import_data():
    """Load the data produced by the extraction step.

    Reads ``output_data/01_data_extraction.csv`` located under the current
    working directory and uses the first CSV column as the DataFrame index.

    Returns:
        pandas.DataFrame: the contents of the CSV file.
    """
    # os.path.join inserts the separator of the host OS, so the notebook
    # resolves the file on Windows, Linux and macOS alike.
    filename = os.path.join(os.getcwd(), 'output_data', '01_data_extraction.csv')
    return pd.read_csv(filename, index_col=0)

def export_ner_data(df):
    """Write the NER result to ``output_data/02_data_preprocessing_ner.csv``.

    The file is placed under the current working directory.

    Args:
        df (pandas.DataFrame): frame to save with ``DataFrame.to_csv``.
    """
    out_dir = os.path.join(os.getcwd(), 'output_data')
    # Create the folder on demand so the export also works when it is missing
    os.makedirs(out_dir, exist_ok=True)
    df.to_csv(os.path.join(out_dir, '02_data_preprocessing_ner.csv'))
    
def export_grammar_data(df):
    """Write the grammar-parser result to ``output_data/02_data_preprocessing_grammar.csv``.

    The file is placed under the current working directory.

    Args:
        df (pandas.DataFrame): frame to save with ``DataFrame.to_csv``.
    """
    out_dir = os.path.join(os.getcwd(), 'output_data')
    # Create the folder on demand so the export also works when it is missing
    os.makedirs(out_dir, exist_ok=True)
    df.to_csv(os.path.join(out_dir, '02_data_preprocessing_grammar.csv'))
    
def export_keywords_data(df):
    """Write the taxonomy-matching result to ``output_data/02_data_preprocessing_keywords.csv``.

    The file is placed under the current working directory.

    Args:
        df (pandas.DataFrame): frame to save with ``DataFrame.to_csv``.
    """
    out_dir = os.path.join(os.getcwd(), 'output_data')
    # Create the folder on demand so the export also works when it is missing
    os.makedirs(out_dir, exist_ok=True)
    df.to_csv(os.path.join(out_dir, '02_data_preprocessing_keywords.csv'))

def export_combined_data(df):
    """Write the combined feature set to ``output_data/02_data_preprocessing.csv``.

    The file is placed under the current working directory.

    Args:
        df (pandas.DataFrame): frame to save with ``DataFrame.to_csv``.
    """
    out_dir = os.path.join(os.getcwd(), 'output_data')
    # Create the folder on demand so the export also works when it is missing
    os.makedirs(out_dir, exist_ok=True)
    df.to_csv(os.path.join(out_dir, '02_data_preprocessing.csv'))
    
def list_name_entities(tagging_result):
    """Collect the named entities contained in a tagged token sequence.

    Consecutive tokens that carry the same tag are merged into one entity,
    e.g. ``('New', 'LOCATION'), ('York', 'LOCATION')`` becomes ``'New York'``.
    Only the tags DATE, ORGANIZATION, LOCATION and TIME are kept.

    Args:
        tagging_result: iterable of ``(word, tag)`` pairs.

    Returns:
        dict: one entry per kept tag; each value is a ``', '``-separated
        string of the unique entities in alphabetical order, or ``''`` when
        no entity with that tag was found.
    """
    entities = {'DATE': set(),
                'ORGANIZATION': set(),
                'LOCATION': set(),
                'TIME': set()}

    # groupby yields one chunk per run of adjacent tokens sharing a tag
    for tag, chunk in groupby(tagging_result, key=lambda pair: pair[1]):
        if tag in entities:
            entities[tag].add(' '.join(word.strip() for word, _ in chunk))

    # Sort before joining: the iteration order of a set of strings can differ
    # between interpreter runs, which would make the exported text unstable.
    return {tag: ', '.join(sorted(found)) for tag, found in entities.items()}

## Data preprocessing using Named Entity Recognition (NER)
<span style="color:red"><b>Important</b>: The NER process takes a long time (a few hours). It only needs to be run once: the result is saved to a local file, so this step is not required on every run.</span>    

Use the Stanford NER tagger to extract the following information:
* Organization
* Location
* Date
* Time

Input file: <span style="color:blue; font-weight:bold">01_data_extraction.csv</span>  
Output file: <span style="color:blue; font-weight:bold">02_data_preprocessing_ner.csv</span>

In [3]:
# Setup environment
# JAVAHOME tells NLTK where the Java binary lives; this absolute path is
# specific to the local machine and has to match the installed JRE.
java_path = 'C:\\Program Files\\Java\\jre1.8.0_121\\bin\\java.exe'
os.environ['JAVAHOME'] = java_path

# Define taggers
# Paths are assembled with os.path.join so they resolve on any operating system.
ner_dir = os.path.join(os.getcwd(), 'tools', 'stanford-ner-2017-06-09')
ner7_model_path = os.path.join(ner_dir, 'english.muc.7class.distsim.crf.ser.gz')
ner_jar_path = os.path.join(ner_dir, 'stanford-ner.jar')
st_ner7 = StanfordNERTagger(ner7_model_path, ner_jar_path)

# Import data
df = import_data()
# Try with small set of data. The explicit copy makes the subset an independent
# frame, so adding columns below does not raise SettingWithCopyWarning.
df = df[:20].copy()

# Loop through all rows, collecting one entity string per row and entity type
total = len(df.index)
date_list = []
organization_list = []
time_list = []
location_list = []

for count, (index, row) in enumerate(df.iterrows(), start=1):

    sent = row['description']
    # A missing description is not a string (pandas loads it as NaN) and would
    # crash the tokenizer; treat it as a sentence without tokens instead.
    tokens = word_tokenize(sent) if isinstance(sent, str) else []
    # Nothing to tag for an empty token list, so skip the external tagger call
    sent_ner7 = st_ner7.tag(tokens) if tokens else []
    entities = list_name_entities(sent_ner7)

    date_list.append(entities['DATE'])
    organization_list.append(entities['ORGANIZATION'])
    time_list.append(entities['TIME'])
    location_list.append(entities['LOCATION'])

    print(f"Processing row {count} out of {total} with index {index}.")

# Add new columns into original dataframe
df['date'] = date_list
df['organization'] = organization_list
df['time'] = time_list
df['location'] = location_list

# Export data
export_ner_data(df)

Processing row 1 out of 20 with index 202561825.
Processing row 2 out of 20 with index 200361855.
Processing row 3 out of 20 with index 200361863.
Processing row 4 out of 20 with index 201079324.
Processing row 5 out of 20 with index 202658258.
Processing row 6 out of 20 with index 202685947.
Processing row 7 out of 20 with index 202673471.
Processing row 8 out of 20 with index 202369575.
Processing row 9 out of 20 with index 202509832.
Processing row 10 out of 20 with index 201129681.
Processing row 11 out of 20 with index 202081899.
Processing row 12 out of 20 with index 202082020.
Processing row 13 out of 20 with index 201562840.
Processing row 14 out of 20 with index 202674610.
Processing row 15 out of 20 with index 202457990.
Processing row 16 out of 20 with index 201510823.
Processing row 17 out of 20 with index 202478632.
Processing row 18 out of 20 with index 200361178.
Processing row 19 out of 20 with index 202692364.
Processing row 20 out of 20 with index 202615324.


## Data preprocessing using grammar parser
Use the NLTK grammar parser to extract the following information:
* Activity
* Is fatal?

Input file: <span style="color:blue; font-weight:bold">01_data_extraction.csv</span>  
Output file: <span style="color:blue; font-weight:bold">02_data_preprocessing_grammar.csv</span>

In [4]:
# Import data: reload the extraction output via import_data()
df = import_data()

# TODO: Write your steps here...
# No grammar-parsing step is implemented yet, so the frame loaded above is
# passed to the export unchanged.

# Export data via export_grammar_data()
export_grammar_data(df)

## Data preprocessing using taxonomy matching
Apply taxonomy matching to extract the following information from keywords:
* Occupation
* Injured body parts

Input file: <span style="color:blue; font-weight:bold">01_data_extraction.csv</span>  
Output file: <span style="color:blue; font-weight:bold">02_data_preprocessing_keywords.csv</span>

In [5]:
# Import data: reload the extraction output via import_data()
df = import_data()

# TODO: Write your steps here...
# No taxonomy-matching step is implemented yet, so the frame loaded above is
# passed to the export unchanged.

# Export data via export_keywords_data()
export_keywords_data(df)

## Combine all features
Combine the columns from the separate files and export them to the final CSV file.

Input files: <span style="color:blue; font-weight:bold">02_data_preprocessing_ner.csv, 02_data_preprocessing_grammar.csv, 02_data_preprocessing_keywords.csv</span>  
Output file: <span style="color:blue; font-weight:bold">02_data_preprocessing.csv</span> 