# Library and Data Preparation 

In [1]:
import os
import fnmatch
import TWB.xliff as xliff
import spacy
import pandas as pd
import queue
import numpy as np
import unicodedata

In [2]:
# normalize unicode characters
def NFD(s):
    """Return *s* in Unicode canonical decomposition form (NFD)."""
    decomposed = unicodedata.normalize('NFD', s)
    return decomposed

# put your data path here
data_dir = ''
# os.path.join inserts the separator only when it is missing, so data_dir may
# be given with or without a trailing slash (an empty data_dir still yields
# the bare file name, exactly as before)
metadata = pd.read_excel(
    os.path.join(data_dir, 'Hackathon-for-Good-2019_TWB-Challenge_Metadata.xlsx')
)

# choose only text documents
# TODO The metadata contain some files multiple times and the table is almost 3 times the number of documents
# Using various combinations of subsets of the columns to get all the 12156 documents did not succeed
# .copy() makes the selection an independent frame, so the column assignment
# below does not operate on a slice of `metadata`
accepted_documents = (
    metadata.loc[metadata['Format'] == 'doc'].drop_duplicates().copy()
)

# fix some problems with the encoding of special characters in filenames
accepted_documents['Filename'] = accepted_documents['Filename'].apply(NFD)

# update data path with the sdlxliff directory
# the trailing '/' is kept on purpose: the document loading code builds its
# paths as data_dir + filename
data_dir = os.path.join(data_dir, 'hackathon-for-good-2019_TWB-challenge_files/')

In [3]:
# store all sdlxliff filenames into a list
document_names = []

# trying to get all doc documents based on extensions
# the patterns accept the original extension anywhere in the sdlxliff file name
text_document_patterns = (
    '*.doc*',
    '*.DOC*',
    '*.txt*',
    '*.pdf*',
    '*.PDF*',
    '*.odt*',
    '*.rtf*',
    '*.dotx*',
)

for document in os.listdir(data_dir):
    if not fnmatch.fnmatch(document, '*.sdlxliff'):
        continue
    # any() stops at the first matching pattern, so a file whose name matches
    # several patterns is stored once instead of once per pattern
    if any(fnmatch.fnmatch(document, pattern) for pattern in text_document_patterns):
        document_names.append(document)

In [4]:
# get the translated contents of all documents
document_contents = []

# iterate over a snapshot because failing entries are removed from
# document_names while the loop is running
for document in tuple(document_names):
    sdlxliff_path = data_dir + document
    try:
        translated_content = xliff.XLIFF(sdlxliff_path).target
    except AttributeError:
        # Avoid a couple of (probably) malformed xml documents(No root found by the parser):
        # report the file and drop it from the list of usable documents
        print(document)
        document_names.remove(document)
    else:
        document_contents.append(translated_content)

In [5]:
# same unicode fix as for accepted_documents
# slice assignment keeps the same list object and swaps in the normalized names
document_names[:] = [NFD(sdlxliff_name) for sdlxliff_name in document_names]

In [6]:
# remove the sdlxliff extension as most files in the metadata are without it
# only a trailing '.sdlxliff' is cut off; the same text occurring elsewhere in
# a name is part of the original file name and is left alone
sdlxliff_extension = '.sdlxliff'
filenames = [
    name[:-len(sdlxliff_extension)] if name.endswith(sdlxliff_extension) else name
    for name in document_names
]

In [7]:
# find the documents that most probably have no metadata, they are a lot
documents_without_metadata = []

# collect the metadata filenames once, so each lookup below is a set
# membership test instead of a scan over the whole table
metadata_filenames = set(accepted_documents['Filename'])

count = 0
for name in filenames:
    if name in metadata_filenames:
        continue
    # for the special case that the translated version's extension is used
    if name + '.sdlxliff' in metadata_filenames:
        continue
    # neither spelling occurs in the metadata: count the document and remember
    # it, so that count always equals len(documents_without_metadata)
    count += 1
    documents_without_metadata.append(name)

print(count)

2788


In [8]:
# find the few entries in the metadata that are not in the documents
# and the corresponding document filenames for the metadata filenames with weird special characters
import difflib

filename_correspondence = {}
count = 0

# names that already have an exact counterpart among the documents
known_names = set(filenames) | set(document_names)

# SequenceMatcher caches its analysis of the second sequence, so one matcher is
# reused: the metadata name is set once per outer iteration and only the
# candidate document name changes in the inner loop
matcher = difflib.SequenceMatcher(None)

for name in accepted_documents['Filename'].drop_duplicates():
    if name in known_names:
        continue
    matcher.set_seq2(name)
    max_similarity = 0.0
    corresponding_document = None
    for filename in filenames:
        # filename letter similarity
        matcher.set_seq1(filename)
        similarity = matcher.ratio()
        # strict comparison: on ties the first candidate found is kept
        if similarity > max_similarity:
            max_similarity = similarity
            corresponding_document = filename
    # this threshold was manually checked and it produces only one false positive
    if max_similarity > 0.5:
        filename_correspondence[name] = corresponding_document
    else:
        count += 1
        print(name)

# delete the false positive
# pop with a default so the cell does not fail when this pair was never matched
filename_correspondence.pop('Patient_Release_Form_-_Final_4.25.docx', None)
count

Ø·Ø¹Ø§Ù_x0085_Ù_x0083_Ù_x0085__Ø§Ù_x0084_Ù_x008a_Ù_x0088_Ù_x0085_.docx
å_x0080__x008b_äººè_x0087_ªå_x0082_³.docx


2