## Import packages

In [14]:
import string
from itertools import groupby
from operator import itemgetter

import gensim
import nltk
import numpy as np
from gensim import corpora,models
from nltk import word_tokenize,pos_tag, ne_chunk,FreqDist,sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.tag.stanford import StanfordPOSTagger, StanfordNERTagger


## Read in Input Files from the Preprocessing Steps


In [15]:
# Pre-processed CSV produced by the previous pipeline step. The handle stays
# open at module level because the topic-modelling cell reads its rows later.
input_csv_path = 'C:\\NUS_ISS\\KE5205_Text_Mining\\Assignment\\02_data_preprocessing_final.csv'
csv_file = open(input_csv_path, mode='r')


## Create lists of different categories of customised stopwords and combine them with NLTK's English stopword list


In [16]:
# Time-related tokens: month names plus 'p' and 'm' (presumably the fragments
# left over when "p.m." is split on punctuation -- TODO confirm).
temporal_stopwords = ['p','m','january','february','march','april','may','june','july','august','september','october','november','december']

# Project-specific stopwords, one per line. The file is closed as soon as it
# has been read, and blank lines are skipped so no empty entry is added.
with open('custom_stopwords.txt','r') as stopword_file:
    custom_stopwords = [line.strip() for line in stopword_file if line.strip()]

# Role and place nouns excluded from the topic vocabulary.
entity_stopwords = ['hospital','employee','employer','worker','coworker','supervisor']

# Combined stopword list passed to preprocess().
stopset = stopwords.words('english') + temporal_stopwords + custom_stopwords + entity_stopwords

## Define helper functions

In [17]:
def preprocess(data,stopword_lis):
    """Turn raw text into a list of lemmatised, stopword-free tokens.

    The text is lower-cased and split into runs of word characters (which
    also discards punctuation). Stopwords and all-digit tokens are removed,
    the survivors are lemmatised with WordNet, and stopwords are filtered a
    second time because a lemma may itself be a stopword.

    Args:
        data: Text to process.
        stopword_lis: Iterable of stopword strings; entries are compared
            against lower-cased tokens.

    Returns:
        List of token strings in their original order.
    """
    wnl = nltk.WordNetLemmatizer()
    # Build a set once so each membership test is O(1) instead of a list scan.
    stopword_set = set(stopword_lis)

    # Raw string avoids the invalid-escape warning that '\w+' triggers.
    tokens = nltk.regexp_tokenize(data.strip().lower(), pattern=r'\w+')

    # Drop stopwords and purely numeric tokens before lemmatising.
    kept = [tok for tok in tokens if tok not in stopword_set and not tok.isdigit()]
    lemmas = [wnl.lemmatize(tok).lower() for tok in kept]

    # Second stopword pass: catches lemmas that are themselves stopwords.
    return [lemma for lemma in lemmas if lemma not in stopword_set]

def retag_documents(work_sheet,model_file,dictionary_ref,topic_list_off,filename,stopword_list=None):
    """Write a copy of the input rows with the most probable topic appended.

    The first row of ``work_sheet`` is treated as a header and skipped. For
    every other non-blank row, the third comma-separated field is
    pre-processed, converted to a bag of words and scored by the model; the
    row is then written out followed by the id of the highest-probability
    topic and that topic's description.

    NOTE(review): rows are split on every comma, so a quoted field that
    contains a comma would shift the columns -- confirm that the
    pre-processing step guarantees comma-free fields.

    Args:
        work_sheet: Iterable of CSV lines (e.g. an open text file).
        model_file: Topic model exposing ``get_document_topics(bow)``.
        dictionary_ref: Dictionary exposing ``doc2bow(tokens)``.
        topic_list_off: Sequence of topic descriptions indexed by topic id.
        filename: Path of the CSV file to create.
        stopword_list: Stopwords passed to ``preprocess``; defaults to the
            module-level ``stopset``.

    Returns:
        None. The result is written to ``filename``.
    """
    if stopword_list is None:
        stopword_list = stopset

    header = 'case_id,title,description,keywords,victims,activity,date,body_part,occupation,is_fatal,topics,topic_desc\n'

    # The context manager closes the output file even if a row fails.
    with open(filename,'w') as output_file:
        output_file.write(header)
        for row_number, rowz in enumerate(work_sheet):
            if row_number == 0:
                continue  # header row of the input
            if not rowz.strip():
                continue  # blank line (e.g. at end of file) has no columns to index
            rowz_value = rowz.split(',')
            summary_tag = rowz_value[2].lower().strip()
            text_list_tag = preprocess(summary_tag, stopword_list)
            result_doc_topics = model_file.get_document_topics(dictionary_ref.doc2bow(text_list_tag))
            # Assign only the highest-probability topic.
            best_topic_id = max(result_doc_topics, key=itemgetter(1))[0]
            output_file.write(rowz.replace('\n', '')+','+str(best_topic_id)+','+topic_list_off[best_topic_id]+'\n')


## Define Topic Modeling Related Functions

In [18]:
def create_document_list(input_file):
    """Build the list of tokenised documents used to train the topic model.

    The first row of ``input_file`` is treated as a header and skipped, as
    are blank lines. For each remaining row the third comma-separated field
    is pre-processed with the module-level ``stopset``; rows that yield no
    tokens are dropped.

    Args:
        input_file: Iterable of CSV lines (e.g. an open text file).

    Returns:
        List of token lists, one per retained row.
    """
    doc_list = []
    # Iterate the argument itself rather than a module-level file handle.
    for row_number, row in enumerate(input_file):
        if row_number == 0 or not row.strip():
            continue
        row_value = row.split(',')
        title = row_value[1].lower().strip()
        # Rows carrying this title are excluded (presumably a scraped table
        # header rather than a real report -- TODO confirm).
        if title == 'inspectionopen datesicestablishment name':
            continue
        text_list = preprocess(row_value[2].lower(), stopset)
        if text_list:
            doc_list.append(text_list)
    return doc_list

def create_dictionary(docu_list):
    """Build the word-to-id mapping for a collection of tokenised documents.

    Args:
        docu_list: List of token lists.

    Returns:
        A ``corpora.Dictionary`` built from ``docu_list``.
    """
    return corpora.Dictionary(docu_list)

def create_dictionary_bow(dict_list,docs_list,no_below=15,no_above=0.85):
    """Prune the dictionary and convert every document to a bag of words.

    The dictionary is printed, then ``filter_extremes`` is applied to it.
    The call is made on ``dict_list`` itself, so the caller's dictionary is
    the pruned one afterwards. Each document is then encoded with
    ``doc2bow``.

    Args:
        dict_list: Dictionary exposing ``filter_extremes`` and ``doc2bow``.
        docs_list: List of token lists.
        no_below: Passed to ``filter_extremes`` (per the gensim docs, the
            minimum number of documents a token must appear in).
        no_above: Passed to ``filter_extremes`` (per the gensim docs, the
            maximum fraction of documents a token may appear in).

    Returns:
        List of bag-of-words representations, one per document.
    """
    print(dict_list)
    dict_list.filter_extremes(no_below=no_below, no_above=no_above)
    return [dict_list.doc2bow(doc) for doc in docs_list]

def create_topic_model(training_rep,dictionary,num_topics=15,passes=20,random_state=None,model_path='lda_model1.mod'):
    """Train an LDA topic model, save it to disk and return it.

    Args:
        training_rep: Bag-of-words corpus to train on.
        dictionary: Id-to-word mapping passed as ``id2word``.
        num_topics: Number of topics to fit.
        passes: Number of training passes over the corpus.
        random_state: Seed forwarded to ``LdaModel``. The default ``None``
            leaves training unseeded; pass an integer to make the fitted
            topics reproducible between runs.
        model_path: File the trained model is saved to.

    Returns:
        The trained ``LdaModel``.
    """
    ldamodel1 = models.ldamodel.LdaModel(
        training_rep,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )
    ldamodel1.save(model_path)
    return ldamodel1

def clean_topic_list(topic_list_raw):
    """Reduce each topic description to its words joined by '+'.

    Each description is lower-cased and split into runs of word characters;
    tokens made only of digits (such as the pieces of the numeric weights)
    are discarded. Every topic id is printed alongside its cleaned
    description.

    Args:
        topic_list_raw: Iterable of ``(topic_id, description)`` pairs.

    Returns:
        List of cleaned description strings, in input order.
    """
    topic_list_official = []
    for topics, topic_desc in topic_list_raw:
        # Raw string avoids the invalid-escape warning that '\w+' triggers.
        topic_post_clean = nltk.regexp_tokenize(topic_desc.lower(), pattern=r'\w+')
        result = '+'.join(w for w in topic_post_clean if not w.isdigit())
        print(str(topics)+' '+str(result))
        topic_list_official.append(result)
    return topic_list_official

# Run Topic Modelling Process - Using Gensim

In [19]:
topic_output_filename='ocha_data_topics.csv'

# Create Document List. The input handle is fully consumed by this step, so
# it is closed afterwards instead of being left open for the kernel's lifetime.
try:
    document_list = create_document_list(csv_file)
finally:
    csv_file.close()

#Create Word to ID mappings
dictionary_list = create_dictionary(document_list)

#Create Bag of Words (this also prunes dictionary_list)
dict_bow = create_dictionary_bow(dictionary_list,document_list)

#Create Topic Model
topic_model = create_topic_model(dict_bow,dictionary_list)
# Request every topic: retag_documents indexes the cleaned list by topic id,
# so the list must hold one entry per topic.
topic_list_extracted = topic_model.show_topics(num_topics=topic_model.num_topics, num_words=8)

topic_list_clean = clean_topic_list(topic_list_extracted)

# Tag each ocha report to a topic. The input is re-read from the start using
# the same path as the first handle; the new handle is closed automatically.
with open(csv_file.name,'r') as csv_file2:
    retag_documents(csv_file2,topic_model,dictionary_list,topic_list_clean,topic_output_filename)

Dictionary(22390 unique tokens: ['edco', 'waste', 'recycling', 'service', 'operating']...)
0 work+investigation+time+incident+safety+stated+training+equipment
1 power+line+electrical+electric+energized+conductor+volt+electrocuted
2 degree+burn+water+second+hot+trench+third+arm
3 emergency+fire+room+transported+service+area+called+later
4 tree+crane+operator+struck+beam+load+fell+steel
5 pipe+air+pressure+valve+line+well+pump+water
6 line+lift+pole+cable+ground+rope+wire+tower
7 conveyor+machine+roller+caught+belt+arm+bin+hand
8 machine+finger+hand+press+number+operating+hospitalized+operator
9 wall+floor+foot+concrete+car+side+fell+rail
10 tank+fire+gas+explosion+burn+chemical+hospitalized+vapor
11 truck+forklift+trailer+vehicle+tire+side+tractor+struck
12 sheet+board+construction+work+gun+plywood+using+new
13 fell+foot+ladder+roof+hospitalized+fall+injury+fracture
14 cut+saw+piece+metal+blade+hand+cutting+inch


## Evaluate Topic Model

In [20]:
# Mean coherence across all topics; top_topics yields (topic, coherence) pairs.
# numpy is imported as np in the notebook's import cell.
lda_coherence = [coherence for _, coherence in topic_model.top_topics(dict_bow)]
print(np.mean(lda_coherence))

-2.50962415077


Once the documents in the ocha reports have been tagged with individual topics, we will manually examine the generated CSV file to gain an understanding of the topics and try to discover underlying causes. We will attempt to derive possible causes of accidents from the topics identified. This can be a highly iterative and subjective process, especially when searching for the best number of topics for the ocha data.