# Library and Data Preparation 

In [1]:
import os
import fnmatch
import TWB.xliff as xliff
import spacy
import pandas as pd
import queue
import numpy as np
import unicodedata

In [2]:
# normalize unicode characters
def NFD(s):
        return unicodedata.normalize('NFD', s)

# put your data path here
data_dir = os.getcwd()
metadata = pd.read_excel(data_dir + 'Hackathon-for-Good-2019_TWB-Challenge_Metadata.xlsx')

# choose only text documents
#accepted_documents = metadata.loc[(metadata['Format'] == 'doc') | (metadata['Format'] == 'pdf')].drop_duplicates()

# fix some problems with the encoding of special characters in filenames
#accepted_documents['Filename'] = accepted_documents['Filename'].apply(NFD)
# we use all documents after all
metadata['Filename'] = metadata['Filename'].apply(NFD)

# update data path with the sdlxliff directory
data_dir += '/hackathon-for-good-2019_TWB-challenge_files/'

In [3]:
metadata

Unnamed: 0,Date,Wordcount,Filename,Source_lang,Source_country,Target_lang,Target_country,NGO,Format,System
0,2016-12-12 00:56:11,109.0,347words.docx,en,US,fr,FR,British Red Cross,doc,Kató 1
1,2016-12-12 01:53:23,121.0,387words.docx,en,US,fr,FR,Translators without Borders,doc,Kató 1
2,2016-12-12 09:30:28,109.0,347words.docx,en,US,fr,FR,British Red Cross,doc,Kató 1
3,2016-12-12 12:57:12,121.0,387words.docx,en,US,fr,FR,British Red Cross,doc,Kató 1
4,2016-12-12 19:04:04,45371.3,gtr_charts_FINAL.PPTX,en,GB,ar,SA,British Red Cross,ppt,Kató 1
5,2016-12-12 19:04:04,45371.3,gtr_charts_FINAL.PPTX,en,GB,fr,FR,British Red Cross,ppt,Kató 1
6,2016-12-12 19:04:04,45371.3,gtr_charts_FINAL.PPTX,en,GB,es,ES,British Red Cross,ppt,Kató 1
7,2016-12-12 19:04:04,45371.3,gtr_graphs_FINAL.XLSX,en,GB,ar,SA,British Red Cross,xls,Kató 1
8,2016-12-12 19:04:04,45371.3,gtr_graphs_FINAL.XLSX,en,GB,fr,FR,British Red Cross,xls,Kató 1
9,2016-12-12 19:04:04,45371.3,gtr_graphs_FINAL.XLSX,en,GB,es,ES,British Red Cross,xls,Kató 1


In [4]:
# store all sdlxliff filenames into a list
document_names = []

# trying to get all doc documents based on extensions
for document in os.listdir(data_dir):    
    if fnmatch.fnmatch(document,'*.sdlxliff'):
        '''
        if fnmatch.fnmatch(document,'*.doc*'):
            document_names.append(document)
        elif fnmatch.fnmatch(document,'*.DOC*'):
            document_names.append(document)
        elif fnmatch.fnmatch(document,'*.txt*'):
            document_names.append(document)    
        elif fnmatch.fnmatch(document,'*.pdf*'):
            document_names.append(document)
        elif fnmatch.fnmatch(document,'*.PDF*'):
            document_names.append(document)
        elif fnmatch.fnmatch(document,'*.odt*'):
            document_names.append(document)
        elif fnmatch.fnmatch(document,'*.rtf*'):
            document_names.append(document)
        elif fnmatch.fnmatch(document,'*.dotx*'):
            document_names.append(document)
        '''
        document_names.append(document)

In [5]:
# get the translated contents of all documents
document_contents = []
document_source_langs = []
document_target_langs = []

for i in range(len(document_names)):
    document = document_names[i]
    temp_xliff = xliff.XLIFF(data_dir + document)
    document_contents.append(temp_xliff.source)
    document_source_langs.append(temp_xliff.source_lang)
    document_target_langs.append(temp_xliff.target_lang)

In [6]:
# same unicode fix as for accepted_documents
for i in range(len(document_names)):
    document_names[i] = NFD(document_names[i])

In [7]:
# remove the sdlxliff extension as most files in the metadata are without it
filenames = []
for i in range(len(document_names)):
    filenames.append(document_names[i].replace('.sdlxliff', ''))

In [8]:
# find the documents that most probably have no metadata, they are a lot
documents_without_metadata = []

count = 0
for name in filenames:
    temp = metadata.loc[metadata['Filename'] == name]
    if temp.empty:
        # for the special case that the translated version's extension is used
        temp2 = metadata.loc[metadata['Filename'] == name + '.sdlxliff']
        if temp2.empty:
            count += 1       
            documents_without_metadata.append(name)

print(count)

3151


In [9]:
# find the few entries in the metadata that are not in the documents 
# and the corresponding document filenames for the metadata filenames with wrong encoding
import difflib

filename_correspondence = {}
count = 0
for name in list(metadata['Filename'].drop_duplicates()):
    if name not in filenames and name not in document_names:
        max_similarity = 0.0
        corresponding_document = None
        for filename in filenames:
            # filename letter similarity
            seq = difflib.SequenceMatcher(None, filename, name)
            if (seq.ratio() > max_similarity):
                max_similarity = seq.ratio()
                corresponding_document = filename
        # this threshold was manually checked and it produces only one false positive
        if max_similarity > 0.5:
            filename_correspondence[name] = corresponding_document
        else:
            count += 1
            print(name)

# delete the false positive 
del filename_correspondence['Patient_Release_Form_-_Final_4.25.docx']                
print(count)

Ø·Ø¹Ø§Ù_x0085_Ù_x0083_Ù_x0085__Ø§Ù_x0084_Ù_x008a_Ù_x0088_Ù_x0085_.docx
å_x0080__x008b_äººè_x0087_ªå_x0082_³.docx
2


In [10]:
# helper function that changes the wrongly encoded metadata filenames
def change_names(x):
    if x in filename_correspondence.keys():
        return filename_correspondence[x]
    else:
        return x

# start combining all present features/data per document
all_document_data = metadata
all_document_data['Filename'] = metadata['Filename'].apply(lambda x: change_names(x))

# combine document lines into one list instead of a list of lists for use in a DataFrame column
contents = []
for i in range(len(document_contents)):
    temp = ''
    for j in range(len(document_contents[i])):
        temp += document_contents[i][j] + ' '
    contents.append(temp)

# left inner join of DataFrames
# all_document_data contains information on all documents with content that have also metadata
name_contents = pd.DataFrame(columns=['Filename','Content'])
name_contents['Filename'] = filenames
name_contents['Content'] = contents
all_document_data = pd.merge(all_document_data,name_contents)

In [11]:
# verify which documents are missing from the previous DataFrame
s1 = set(all_document_data['Filename'])
s2 = set(name_contents['Filename'])
no_metadata = list(s2.difference(s1))

# create another DataFrame that contains only the filename, 
# source language, target language and content of these documents
no_metadata_source_langs = []
no_metadata_target_langs = []
no_metadata_contents = []

for i in no_metadata:
    j = filenames.index(i)
    no_metadata_source_langs.append(document_source_langs[j])
    no_metadata_target_langs.append(document_target_langs[j])
    no_metadata_contents.append(contents[j])

no_metadata_df = pd.DataFrame(columns=['Filename', 'Source_lang', 'Target_lang', 'Content'])
no_metadata_df['Source_lang'] = no_metadata_source_langs
no_metadata_df['Target_lang'] = no_metadata_target_langs
no_metadata_df['Content'] = no_metadata_contents
no_metadata_df['Filename'] = no_metadata

In [12]:
# find only documents in the english language
from langdetect import detect

# detect the document contents language if possible
def detect_helper(x):
    try:
        temp = detect(x)
    except:
        temp = 'No'
    return temp

# keep only the english documents in both datasets
languages = all_document_data['Content'].apply(lambda x: detect_helper(x))
english_document_data = all_document_data[languages == 'en']

languages2 = no_metadata_df['Content'].apply(detect_helper)
english_no_metadata_df = no_metadata_df[languages2 == 'en']

In [14]:
import nltk

# add average sentence length as a feature(it was mentioned by the challenge owners, thought it could be interesting)
def avg_sent_len(x):
    acc = 0
    sents = nltk.sent_tokenize(x)
    if len(sents) == 0:
        return 0
    for sent in sents:
        acc += len(nltk.word_tokenize(sent))
    return acc/len(sents)
                                         
english_document_data['Average Sentence Length'] = english_document_data['Content'].apply(avg_sent_len)
english_no_metadata_df['Average Sentence Length'] = english_no_metadata_df['Content'].apply(avg_sent_len)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


Unnamed: 0,Filename,Source_lang,Target_lang,Content
0,Master_HRMI_Survey_-_V3-EN.xml,en-GB,sr-Latn-RS,K Chad Clay K Chad Clay 2018-02-28T22:06:56Z 1...
1,drive-download-20180913T132628Z-001.zip___SEP_...,en-GB,fr-FR,Subject Matter Expert: Authored by: (Print) Au...
2,Vaccines_for_preventing_cholera_-_killed_whole...,en-GB,fr-FR,Vaccines for preventing cholera: killed whole ...
3,RSF-FSAL.docx,es-es,en-gb,El fondo de Solidaridad América Latina (FSAL) ...
4,IDP_16220_Ruan_EN.docx,en-us,ru-ru,Genetic diversity of Plasmodium Vivax revealed...
5,Strategic_Gifts_Overview_September_2017.pptx,en-US,ru-RU,Strategic Gifts\n2017 Guiding Principles Focus...
6,cccm.zip___SEP___1._MODULE_7-03-REV.docx,en-gb,ar-sa,Exercise 6: Final site selection Based on the...
7,Photo_Release_(OPTIONAL).doc,en,fr,PHOTOGRAPH AND INFORMATION RELEASE FORM PROPER...
8,AR_T1_F1_Introducao_investigacao_accao.doc,en,fr,Introdução à Investigação-Acção Um dos fundado...
9,Exit_Survey_Cameroon.zip___SEP___IMG_20171027_...,fr-FR,en-US,"(} :[FxIfIL_H,..o:?uNcrL 1. QuelthBme 6tait le..."


In [27]:
# cache to disk for easier use
english_document_data.to_pickle('data/english_document_data.pkl')
english_no_metadata_df.to_pickle('data/english_no_metadata.pkl')

## Stemming

In [29]:
import nltk
from nltk.stem import PorterStemmer
#from nltk.stem import LancasterStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
porter = PorterStemmer()

from nltk.tokenize import sent_tokenize, word_tokenize
def stemSentence(sentence):
    token_words=tokenizer.tokenize(sentence)
    token_words
    stop_words = set(stopwords.words('english'))
    #print(stop_words)
    filter_sentense = [w for w in token_words if not w in stop_words]
    
    stem_sentence=[]
    for word in filter_sentense:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mixalis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
# testing tokenizer function
n=16880
filename = all_document_data['Filename'].iloc[n]
sentence = all_document_data['Content'].iloc[n]
token_words = tokenizer.tokenize(filename)
print(filename)
print(sentence)
print(token_words)

Instructions-Leaflet2_ENglish_(2).docx
THIS IS YOUR FREE MICRO SD CARD FROM REFUCOMM IT CONTAINS ESSENTIAL INFORMATION FOR YOU  There are many different rules and procedures in Greece. This card contains VITAL information on these procedures and how you can prepare for them. You will NOT get this information from the authorities. All our information has been checked by lawyers. You do not need to use the internet to access the information. It is free! There is information in document and film format on this SD card in Arabic, Farsi, Dari, Urdu, French, Sorani and Kurmanji.  HOW TO USE Look for a place on your phone to insert the SD card; see images below for the usual locations. Most phones have one of these, if you can’t find it, ask a friend for help. 		 in the side of your phone	in the back of your phone	in the top of your phone If your phone does not have a place for SD cards you can get the information from a friend’s phone via Bluetooth (see overleaf). If you cannot open the docu

In [31]:
# testing stemSentence function
x=tokenizer.tokenize(stemSentence(sentence))
print(x)

['thi', 'IS', 'your', 'free', 'micro', 'SD', 'card', 'from', 'refucomm', 'IT', 'contain', 'essenti', 'inform', 'for', 'you', 'there', 'mani', 'differ', 'rule', 'procedur', 'greec', 'thi', 'card', 'contain', 'vital', 'inform', 'procedur', 'prepar', 'you', 'not', 'get', 'inform', 'author', 'all', 'inform', 'check', 'lawyer', 'you', 'need', 'use', 'internet', 'access', 'inform', 'It', 'free', 'there', 'inform', 'document', 'film', 'format', 'SD', 'card', 'arab', 'farsi', 'dari', 'urdu', 'french', 'sorani', 'kurmanji', 'how', 'TO', 'use', 'look', 'place', 'phone', 'insert', 'SD', 'card', 'see', 'imag', 'usual', 'locat', 'most', 'phone', 'one', 'find', 'ask', 'friend', 'help', 'side', 'phone', 'back', 'phone', 'top', 'phone', 'If', 'phone', 'place', 'SD', 'card', 'get', 'inform', 'friend', 'phone', 'via', 'bluetooth', 'see', 'overleaf', 'If', 'cannot', 'open', 'document', 'need', 'download', 'pdf', 'viewer', 'phone', 'To', 'go', 'googl', 'play', 'store', 'search', 'pdf', 'viewer', 'select',

In [32]:
# create pd data frame "Stems" and save all the stemed Filenames and Content
stems = pd.DataFrame(index=all_document_data.index, columns=['Filename','Content'])
for i in all_document_data.index:
    filename = all_document_data['Filename'].iloc[i]
    content = all_document_data['Content'].iloc[i]
    stems.Filename[i] = stemSentence(filename.replace("_", " "))
    stems.Content[i] = stemSentence(content.replace("_", " "))

In [33]:
stems.Content.describe

<bound method NDFrame.describe of 0        thi test thi test thi test thi test thi test t...
1        thi test thi test thi test thi test thi test t...
2        thi test thi test thi test thi test thi test t...
3        thi test thi test thi test thi test thi test t...
4        thi test thi test thi test thi test thi test t...
5        thi test thi test thi test thi test thi test t...
6        thi test thi test thi test thi test thi test t...
7        thi test thi test thi test thi test thi test t...
8        thi test thi test thi test thi test thi test t...
9        thi test thi test thi test thi test thi test t...
10       thi test thi test thi test thi test thi test t...
11       thi test thi test thi test thi test thi test t...
12       thi test thi test thi test thi test thi test t...
13       grt strength high qualiti resourc scale scope ...
14       grt strength high qualiti resourc scale scope ...
15       grt strength high qualiti resourc scale scope ...
16       fact individu

In [34]:
tags = ['humanitarian crisis',
        'natural disaster',
        'environmental crisis',        
        'disability',
        'gender',
        'genital mutilation',
        'racism',
        'genocide',
        'civil war',
        'terrorism',
        'infectious disease',
        'political revolution',
        'political prisoner',
        'amnesty',
        'corruption',
        'health awareness',
        'gender inequality',
        'rape',
        'ebola',
        'aids',
        'first aid',
        'emergency',
        'disease',
        'operation',
        'virus',
        'response',
        'protocol',
        'certificate',
        'disability', 
        'medication'
       ]

In [35]:
stemSentence(tags[0])

'humanitarian crisi '

In [36]:
# stemming the tags and saving them to a df 
impactful_dict = pd.DataFrame(index=range(len(tags)), columns=['Stems'])
i =0
for w in tags:
    impactful_dict.Stems[i]= stemSentence(w)
    i += 1

In [37]:
impactful_dict.describe

<bound method NDFrame.describe of                   Stems
0   humanitarian crisi 
1         natur disast 
2    environment crisi 
3               disabl 
4               gender 
5          genit mutil 
6               racism 
7              genocid 
8            civil war 
9               terror 
10      infecti diseas 
11       polit revolut 
12        polit prison 
13             amnesti 
14             corrupt 
15         health awar 
16        gender inequ 
17                rape 
18               ebola 
19                 aid 
20           first aid 
21               emerg 
22              diseas 
23                oper 
24                viru 
25             respons 
26            protocol 
27              certif 
28              disabl 
29               medic >

## Compute Relative Freqs of tags in the Contents and Filenames

In [38]:
import re

In [39]:
# testing word counts and freqs
text = stems.Content[0]
# text = 'paok paok paok paok paok paok paok test'
givenWord = 'test'
total = len(re.findall(r'\w+', text)) 
count = len(re.findall('\w*'+ givenWord +'\w*', text))
print(total)
print(count)
print(count/total)

173
87
0.5028901734104047


In [40]:
# function that returns the relative freq of spercific word in str
def word_relativ_Freq(text, givenWord):
    total = len(re.findall(r'\w+', text)) 
    count = len(re.findall('\w*'+ givenWord +'\w*', text))
    if total==0:
        return 0
    relFreq = count/total
    return relFreq

In [41]:
# create a data frame with the relative freqs of stemed tags per document content
freq_content = pd.DataFrame(index= all_document_data.index, columns=impactful_dict.Stems)

In [42]:
import time
start_time = time.time()

for j in impactful_dict.Stems:
    freq_content[j] = stems.Content.apply(lambda x: word_relativ_Freq(x,j))

print("--- %s seconds ---" % (time.time() - start_time))

--- 1457.4833626747131 seconds ---


In [43]:
freq_content.describe()

Stems,humanitarian crisi,natur disast,environment crisi,disabl,gender,genit mutil,racism,genocid,civil war,terror,...,first aid,emerg,diseas,oper,viru,respons,protocol,certif,disabl.1,medic
count,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,...,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0
mean,1.5e-05,5.7e-05,3.524379e-08,0.001387,0.000483,1.1e-05,6e-06,4e-06,1e-05,1.1e-05,...,0.000405,0.002024,0.000873,0.002187,0.000257,0.002697,0.000192,0.000191,0.001387,0.000843
std,0.000213,0.000659,1.890121e-06,0.006728,0.002382,0.000185,0.000134,0.000123,0.000211,0.000143,...,0.004412,0.006836,0.004109,0.005829,0.002609,0.007644,0.001081,0.00183,0.006728,0.003409
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,9.8e-05,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000801,0.0,0.001397,0.0,0.002622,0.0,0.0,0.0,0.0
max,0.023256,0.04717,0.0001013993,0.125874,0.068531,0.013333,0.005726,0.006494,0.01087,0.004057,...,0.166667,0.111111,0.103093,0.066667,0.061856,0.166667,0.022543,0.064516,0.125874,0.119565


In [44]:
# create a data frame with the relative freqs of stemed tags per document filename
freq_filename = pd.DataFrame(index= all_document_data.index, columns=impactful_dict.Stems)
for j in impactful_dict.Stems:
    freq_filename[j] = stems.Filename.apply(lambda x: word_relativ_Freq(x,j))

In [45]:
freq_filename.describe()

Stems,humanitarian crisi,natur disast,environment crisi,disabl,gender,genit mutil,racism,genocid,civil war,terror,...,first aid,emerg,diseas,oper,viru,respons,protocol,certif,disabl.1,medic
count,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,...,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0
mean,3e-06,2.1e-05,0.0,0.00094,0.000296,0.0,0.0,0.0,4e-06,0.0,...,0.000977,0.001522,0.000848,0.000954,2.2e-05,0.001653,0.000665,0.000334,0.00094,0.000344
std,0.000538,0.00178,0.0,0.011007,0.007487,0.0,0.0,0.0,0.000483,0.0,...,0.012845,0.015369,0.013366,0.011169,0.001378,0.015919,0.010871,0.009086,0.011007,0.007151
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.1,0.166667,0.0,0.25,0.333333,0.0,0.0,0.0,0.0625,0.0,...,0.25,0.333333,0.333333,0.25,0.142857,0.5,0.25,0.333333,0.25,0.25


Finding labels of the documents

In [50]:
# defining function to check buzz word 
def exist_(temp,j): 
    temp2=temp.lower()
    temp3=temp2.find(j)
    return temp3>0
    
dictionary2=['certificate','instruction','poster','protocol','response','letter','report','map','manual','banner']

for j in dictionary2:
    jj=english_document_data.Filename.apply(lambda x: exist_(x,j))
    rr=pd.DataFrame(columns=[str(j)])
    rr[str(j)]=jj
    english_document_data=pd.concat([english_document_data,rr ], axis=1)

In [49]:
from TWB.nlp import tfidf, Dictionary

# create tokens per document count DataFrame for tf-idf 
# give memory error
tfidf_df = tfidf(Dictionary(list(stems['Content'])).corpus_annotation)

MemoryError: 

In [52]:
#stems.to_pickle('data/stemmed_data.pkl')
freq_content.to_pickle('data/freq_content.pkl')
freq_filename.to_pickle('data/freq_filename.pkl')
#tfidf_df.to_pickle('data/tfidf.pkl')