# Library and Data Preparation 

In [3]:
import os
import fnmatch
import TWB.xliff as xliff
import spacy
import pandas as pd
import queue
import numpy as np
import unicodedata

In [5]:
# normalize unicode characters
def NFD(s):
        return unicodedata.normalize('NFD', s)

# put your data path here
# put your data path here
data_dir = os.getcwd()
data_dir='/home/antonis/TWB_translators/'
metadata = pd.read_excel(data_dir + 'Hackathon-for-Good-2019_TWB-Challenge_Metadata.xlsx')

# choose only text documents
# TODO The metadata contain some files multiple times and the table is almost 3 times the number of documents
# Using various combinations of subsets of the columns to get all the 12156 documents did not succeed
accepted_documents = metadata.loc[(metadata['Format'] == 'doc') | (metadata['Format'] == 'pdf')].drop_duplicates()

# fix some problems with the encoding of special characters in filenames
accepted_documents['Filename'] = accepted_documents['Filename'].apply(NFD)

# update data path with the sdlxliff directory
data_dir += '/hackathon-for-good-2019_TWB-challenge_files/'

In [6]:
accepted_documents

Unnamed: 0,Date,Wordcount,Filename,Source_lang,Source_country,Target_lang,Target_country,NGO,Format,System
0,2016-12-12 00:56:11,109.0,347words.docx,en,US,fr,FR,British Red Cross,doc,Kató 1
1,2016-12-12 01:53:23,121.0,387words.docx,en,US,fr,FR,Translators without Borders,doc,Kató 1
2,2016-12-12 09:30:28,109.0,347words.docx,en,US,fr,FR,British Red Cross,doc,Kató 1
3,2016-12-12 12:57:12,121.0,387words.docx,en,US,fr,FR,British Red Cross,doc,Kató 1
10,2016-12-12 19:04:04,45371.3,Global_Tools_Review_FINAL_Nov2016_graphs_corre...,en,GB,ar,SA,British Red Cross,doc,Kató 1
11,2016-12-12 19:04:04,45371.3,Global_Tools_Review_FINAL_Nov2016_graphs_corre...,en,GB,fr,FR,British Red Cross,doc,Kató 1
12,2016-12-12 19:04:04,45371.3,Global_Tools_Review_FINAL_Nov2016_graphs_corre...,en,GB,es,ES,British Red Cross,doc,Kató 1
13,2016-12-19 11:04:31,0.0,347words.docx,en,US,fr,FR,British Red Cross,doc,Kató 1
14,2016-12-19 12:09:22,109.0,347words.docx,en,US,fr,FR,Translators without Borders,doc,Kató 1
15,2016-12-19 12:10:10,109.0,347words.docx,en,US,fr,FR,British Red Cross,doc,Kató 1


In [7]:
# store all sdlxliff filenames into a list
document_names = []

# trying to get all doc documents based on extensions
for document in os.listdir(data_dir):    
    if fnmatch.fnmatch(document,'*.sdlxliff'):
        if fnmatch.fnmatch(document,'*.doc*'):
            document_names.append(document)
        elif fnmatch.fnmatch(document,'*.DOC*'):
            document_names.append(document)
        elif fnmatch.fnmatch(document,'*.txt*'):
            document_names.append(document)    
        elif fnmatch.fnmatch(document,'*.pdf*'):
            document_names.append(document)
        elif fnmatch.fnmatch(document,'*.PDF*'):
            document_names.append(document)
        elif fnmatch.fnmatch(document,'*.odt*'):
            document_names.append(document)
        elif fnmatch.fnmatch(document,'*.rtf*'):
            document_names.append(document)
        elif fnmatch.fnmatch(document,'*.dotx*'):
            document_names.append(document)

In [8]:
# get the translated contents of all documents
document_contents = []
document_source_langs = []
document_target_langs = []

for i in range(len(document_names)):
    document = document_names[i]
    temp_xliff = xliff.XLIFF(data_dir + document)
    document_contents.append(temp_xliff.target)
    document_source_langs.append(temp_xliff.source_lang)
    document_target_langs.append(temp_xliff.target_lang)

In [9]:
# same unicode fix as for accepted_documents
for i in range(len(document_names)):
    document_names[i] = NFD(document_names[i])

In [10]:
# remove the sdlxliff extension as most files in the metadata are without it
filenames = []
for i in range(len(document_names)):
    filenames.append(document_names[i].replace('.sdlxliff', ''))

In [11]:
# find the documents that most probably have no metadata, they are a lot
documents_without_metadata = []

count = 0
for name in filenames:
    temp = accepted_documents.loc[accepted_documents['Filename'] == name]
    if temp.empty:
        # for the special case that the translated version's extension is used
        temp2 = accepted_documents.loc[accepted_documents['Filename'] == name + '.sdlxliff']
        if temp2.empty:
            count += 1       
            documents_without_metadata.append(name)

print(count)

2733


In [12]:
# find the few entries in the metadata that are not in the documents 
# and the corresponding document filenames for the metadata filenames with wrong encoding
import difflib

filename_correspondence = {}
count = 0
for name in list(accepted_documents['Filename'].drop_duplicates()):
    if name not in filenames and name not in document_names:
        max_similarity = 0.0
        corresponding_document = None
        for filename in filenames:
            # filename letter similarity
            seq = difflib.SequenceMatcher(None, filename, name)
            if (seq.ratio() > max_similarity):
                max_similarity = seq.ratio()
                corresponding_document = filename
        # this threshold was manually checked and it produces only one false positive
        if max_similarity > 0.5:
            filename_correspondence[name] = corresponding_document
        else:
            count += 1
            print(name)

# delete the false positive 
del filename_correspondence['Patient_Release_Form_-_Final_4.25.docx']                
print(count)

Ø·Ø¹Ø§ÙÙÙ_Ø§ÙÙÙÙ.docx
åäººèªå³.docx
2


In [13]:
# helper function that changes the wrongly encoded metadata filenames
def change_names(x):
    if x in filename_correspondence.keys():
        return filename_correspondence[x]
    else:
        return x

# start combining all present features/data per document
all_document_data = accepted_documents
all_document_data['Filename'] = accepted_documents['Filename'].apply(lambda x: change_names(x))

# combine document lines into one list instead of a list of lists for use in a DataFrame column
contents = []
for i in range(len(document_contents)):
    temp = ''
    for j in range(len(document_contents[i])):
        temp += document_contents[i][j] + ' '
    contents.append(temp)

# left inner join of DataFrames
# all_document_data contains information on all documents with content that have also metadata
name_contents = pd.DataFrame(columns=['Filename','Content'])
name_contents['Filename'] = filenames
name_contents['Content'] = contents
all_document_data = pd.merge(all_document_data,name_contents)

In [14]:
# verify which documents are missing from the previous DataFrame
s1 = set(all_document_data['Filename'])
s2 = set(name_contents['Filename'])
no_metadata = list(s2.difference(s1))

# create another DataFrame that contains only the filename, 
# source language, target language and content of these documents
no_metadata_source_langs = []
no_metadata_target_langs = []
no_metadata_contents = []

for i in no_metadata:
    j = filenames.index(i)
    no_metadata_source_langs.append(document_source_langs[j])
    no_metadata_target_langs.append(document_target_langs[j])
    no_metadata_contents.append(contents[j])

no_metadata_df = pd.DataFrame(columns=['Filename', 'Source_lang', 'Target_lang', 'Content'])
no_metadata_df['Source_lang'] = no_metadata_source_langs
no_metadata_df['Target_lang'] = no_metadata_target_langs
no_metadata_df['Content'] = no_metadata_contents
no_metadata_df['Filename'] = no_metadata

## Stemming

In [15]:
import nltk
from nltk.stem import PorterStemmer
#from nltk.stem import LancasterStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
porter = PorterStemmer()

from nltk.tokenize import sent_tokenize, word_tokenize
def stemSentence(sentence):
    token_words=tokenizer.tokenize(sentence)
    token_words
    stop_words = set(stopwords.words('english'))
    #print(stop_words)
    filter_sentense = [w for w in token_words if not w in stop_words]
    
    stem_sentence=[]
    for word in filter_sentense:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/antonis/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [16]:
# testing tokenizer function
n=16880
filename = all_document_data['Filename'].iloc[n]
sentence = all_document_data['Content'].iloc[n]
token_words = tokenizer.tokenize(filename)
print(filename)
print(sentence)
print(token_words)

REACH_BGD_JENA_C26_BT_F_2.docx
Name:  নাম  নিগার সোলতানা  Venue: স্থান  ক্যাম্প ২৬  Number of Participants: 
অংশগ্রহণকারীদের সংখ্যা ১০  Type of Participants: 
অংশগ্রহণকারীদের প্রকার Time: সময়  ১১.৩০ সকাল বেলা   Date: তারিখ  ২৪.০২.২০১৯  Note: Note: নির্দেশ আইসিই বিরতি অংশ INSERT SECTION NAME ইনসার্ট বিভাগ নাম INSERT PART ইনসার্ট অংশ Respondent উত্তরদাতা #   R9 উনি মনে করেন উনি ১০০% নিরাপদ R1 sapacc টা বড় না। ওখানে খেলা ধুলা করার জায়গা বড় না। R7 বাচ্চারা লেখাপড়া করতে চাই যদি খেলাধুলা বেশি দিলে ওরা বেশি করে পড়ালেখা করতে চাই। R5 স্কুলের পাশে জায়গা আছে। আগে ওখানে গেলে তারা আর স্কুলে আসত না। লেখাপড়া করতে আসেনা কারণ ওরা খেলা করে।  টাইম টা সেইম তাই R3 স্কুলে যেতে অনেক কষ্ট হয়। ওখানে স্কুলে চুর ডুকে। স্কুল্টা নিরাপদ না। রাস্তার পাশে স্কুল, বিভিন্ন লোক ওসে আড্ডা দেয়।  R5 স্কুলটা রাস্তার পাশে। ওখান কার ছেলেরা এসে ডিস্টার্ব করে। ওখানে পানির সমস্যা অনেক। আচর ভেঙ্গে পড়েছে।  R3 ক্লাস রোম টা যদি বড় হত তাইলে ভাল হত। ওদের জন্য সুবিধা হত পড়ত। R9 বড় হত তাইলে ভাল হত। বাইরে বারান্দাটা বড় হলে আরেকটু খেলা করত

In [17]:
# testing stemSentence function
x=tokenizer.tokenize(stemSentence(sentence))
print(x)

['name', 'ন', 'ম', 'ন', 'গ', 'র', 'স', 'লত', 'ন', 'venu', 'স', 'থ', 'ন', 'ক', 'য', 'ম', 'প', '২৬', 'number', 'particip', 'অ', 'শগ', 'রহণক', 'র', 'দ', 'র', 'স', 'খ', 'য', '১০', 'type', 'particip', 'অ', 'শগ', 'রহণক', 'র', 'দ', 'র', 'প', 'রক', 'র', 'time', 'সময়', '১১', '৩০', 'সক', 'ল', 'ব', 'ল', 'date', 'ত', 'র', 'খ', '২৪', '০২', '২০১৯', 'note', 'note', 'ন', 'র', 'দ', 'শ', 'আইস', 'ই', 'ব', 'রত', 'অ', 'শ', 'insert', 'section', 'name', 'ইনস', 'র', 'ট', 'ব', 'ভ', 'গ', 'ন', 'ম', 'insert', 'part', 'ইনস', 'র', 'ট', 'অ', 'শ', 'respond', 'উত', 'তরদ', 'ত', 'R9', 'উন', 'মন', 'কর', 'ন', 'উন', '১০০', 'ন', 'র', 'পদ', 'R1', 'sapacc', 'ট', 'বড়', 'ন', 'ওখ', 'ন', 'খ', 'ল', 'ধ', 'ল', 'কর', 'র', 'জ', 'য়গ', 'বড়', 'ন', 'R7', 'ব', 'চ', 'চ', 'র', 'ল', 'খ', 'পড়', 'করত', 'চ', 'ই', 'যদ', 'খ', 'ল', 'ধ', 'ল', 'ব', 'শ', 'দ', 'ল', 'ওর', 'ব', 'শ', 'কর', 'পড়', 'ল', 'খ', 'করত', 'চ', 'ই', 'R5', 'স', 'ক', 'ল', 'র', 'প', 'শ', 'জ', 'য়গ', 'আছ', 'আগ', 'ওখ', 'ন', 'গ', 'ল', 'ত', 'র', 'আর', 'স', 'ক', 'ল', 'আসত', 'ন', 'ল', 'খ', 'প

In [18]:
# create pd data frame "Stems" and save all the stemed Filenames and Content
stems = pd.DataFrame(index=all_document_data.index, columns=['Filename','Content'])
for i in all_document_data.index:
    filename = all_document_data['Filename'].iloc[i]
    content = all_document_data['Content'].iloc[i]
    stems.Filename[i] = stemSentence(filename.replace("_", " "))
    stems.Content[i] = stemSentence(content.replace("_", " "))

In [19]:
stems.Content.describe

<bound method NDFrame.describe of 0        thi test thi test thi test thi test thi test t...
1        thi test thi test thi test thi test thi test t...
2        thi test thi test thi test thi test thi test t...
3        thi test thi test thi test thi test thi test t...
4        thi test thi test thi test thi test thi test t...
5        thi test thi test thi test thi test thi test t...
6        thi test thi test thi test thi test thi test t...
7        thi test thi test thi test thi test thi test t...
8        thi test thi test thi test thi test thi test t...
9        thi test thi test thi test thi test thi test t...
10       thi test thi test thi test thi test thi test t...
11       thi test thi test thi test thi test thi test t...
12       global tool review final report 30 novemb 2016...
13       global tool review final report 30 novemb 2016...
14       global tool review final report 30 novemb 2016...
15       includ AN imag map OR pictur format cover page...
16       execut summar

In [20]:
tags = ['humanitarian crisis',
        'natural disaster',
        'environmental crisis',        
        'disability',
        'gender',
        'genital mutilation',
        'racism',
        'genocide',
        'civil war',
        'terrorism',
        'infectious disease',
        'political revolution',
        'political prisoner',
        'amnesty',
        'corruption',
        'health awareness',
        'gender inequality',
        'rape',
        'ebola',
        'aids',
        'first aid',
        'emergency',
        'disease',
        'operation',
        'virus',
        'response',
        'protocol',
        'certificate',
        'disability', 
        'medication'
       ]

In [21]:
stemSentence(tags[0])

'humanitarian crisi '

In [22]:
# stemming the tags and saving them to a df 
impactful_dict = pd.DataFrame(index=range(len(tags)), columns=['Stems'])
i =0
for w in tags:
    impactful_dict.Stems[i]= stemSentence(w)
    i += 1

In [23]:
impactful_dict.describe

<bound method NDFrame.describe of                   Stems
0   humanitarian crisi 
1         natur disast 
2    environment crisi 
3               disabl 
4               gender 
5          genit mutil 
6               racism 
7              genocid 
8            civil war 
9               terror 
10      infecti diseas 
11       polit revolut 
12        polit prison 
13             amnesti 
14             corrupt 
15         health awar 
16        gender inequ 
17                rape 
18               ebola 
19                 aid 
20           first aid 
21               emerg 
22              diseas 
23                oper 
24                viru 
25             respons 
26            protocol 
27              certif 
28              disabl 
29               medic >

## Compute Relative Freqs of tags in the Contents and Filenames

In [24]:
import re

In [25]:
# testing word counts and freqs
text = stems.Content[0]
# text = 'paok paok paok paok paok paok paok test'
givenWord = 'test'
total = len(re.findall(r'\w+', text)) 
count = len(re.findall('\w*'+ givenWord +'\w*', text))
print(total)
print(count)
print(count/total)

173
87
0.5028901734104047


In [26]:
# function that returns the relative freq of spercific word in str
def word_relativ_Freq(text, givenWord):
    total = len(re.findall(r'\w+', text)) 
    count = len(re.findall('\w*'+ givenWord +'\w*', text))
    if total==0:
        return 0
    relFreq = count/total
    return relFreq

In [27]:
# create a data frame with the relative freqs of stemed tags per document content
freq_content = pd.DataFrame(index= all_document_data.index, columns=impactful_dict.Stems)

In [28]:
import time
start_time = time.time()

for j in impactful_dict.Stems:
    freq_content[j] = stems.Content.apply(lambda x: word_relativ_Freq(x,j))

print("--- %s seconds ---" % (time.time() - start_time))

--- 411.5003261566162 seconds ---


In [29]:
freq_content.describe()

Stems,humanitarian crisi,natur disast,environment crisi,disabl,gender,genit mutil,racism,genocid,civil war,terror,...,first aid,emerg,diseas,oper,viru,respons,protocol,certif,disabl.1,medic
count,16896.0,16896.0,16896.0,16896.0,16896.0,16896.0,16896.0,16896.0,16896.0,16896.0,...,16896.0,16896.0,16896.0,16896.0,16896.0,16896.0,16896.0,16896.0,16896.0,16896.0
mean,1.4e-05,5.8e-05,1.200276e-08,0.00153,0.000531,1.2e-05,5e-06,2e-06,1.1e-05,1.2e-05,...,0.000355,0.001115,0.00122,0.001218,0.000367,0.002032,0.000143,0.000256,0.00153,0.000921
std,0.000258,0.000872,1.103177e-06,0.007228,0.002931,0.00022,0.000106,8.6e-05,0.000237,0.000164,...,0.003567,0.003953,0.00555,0.004004,0.00337,0.008536,0.000902,0.002301,0.007228,0.004079
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000296,0.0,0.001585,0.0,0.0,0.0,0.0
max,0.023256,0.04717,0.0001013993,0.125874,0.068531,0.013333,0.005726,0.006494,0.01087,0.004057,...,0.166667,0.111111,0.103093,0.066667,0.061856,0.166667,0.022543,0.064516,0.125874,0.119565


In [30]:
# create a data frame with the relative freqs of stemed tags per document filename
freq_filename = pd.DataFrame(index= all_document_data.index, columns=impactful_dict.Stems)
for j in impactful_dict.Stems:
    freq_filename[j] = stems.Filename.apply(lambda x: word_relativ_Freq(x,j))

In [31]:
freq_filename.describe()

Stems,humanitarian crisi,natur disast,environment crisi,disabl,gender,genit mutil,racism,genocid,civil war,terror,...,first aid,emerg,diseas,oper,viru,respons,protocol,certif,disabl.1,medic
count,16896.0,16896.0,16896.0,16896.0,16896.0,16896.0,16896.0,16896.0,16896.0,16896.0,...,16896.0,16896.0,16896.0,16896.0,16896.0,16896.0,16896.0,16896.0,16896.0,16896.0
mean,6e-06,1e-05,0.0,0.000891,0.000373,0.0,0.0,0.0,9e-06,0.0,...,0.000598,0.001309,0.001644,0.00029,3.3e-05,0.001618,0.000566,0.000433,0.000891,0.000393
std,0.000769,0.001282,0.0,0.011385,0.008677,0.0,0.0,0.0,0.00069,0.0,...,0.011961,0.015468,0.018674,0.006241,0.001813,0.017025,0.010044,0.009764,0.011385,0.007842
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.1,0.166667,0.0,0.25,0.333333,0.0,0.0,0.0,0.0625,0.0,...,0.25,0.333333,0.333333,0.2,0.142857,0.5,0.25,0.333333,0.25,0.25


Finding labels of the documents

In [32]:
# defining function to check buzz word 
def exist_(temp,j): 
    temp2=temp.lower()
    temp3=temp2.find(j)
    return temp3>0
    
dictionary2=['certificate','instruction','poster','protocol','response','letter','report','map','manual']

for j in dictionary2:
    jj=all_document_data.Filename.apply(lambda x: exist_(x,j))
    rr=pd.DataFrame(columns=[str(j)])
    rr[str(j)]=jj
    all_document_data=pd.concat([all_document_data,rr ], axis=1)