# Library and Data Preparation 

In [1]:
import os
import fnmatch
import TWB.xliff as xliff
import spacy
import pandas as pd
import queue
import numpy as np
import unicodedata

In [2]:
# normalize unicode characters
def NFD(s):
        return unicodedata.normalize('NFD', s)

# put your data path here
#data_dir = os.getcwd()
data_dir = '/home/mixalis/Downloads/Translators Without Borders/'
metadata = pd.read_excel(data_dir + 'Hackathon-for-Good-2019_TWB-Challenge_Metadata.xlsx')

# choose only text documents
#accepted_documents = metadata.loc[(metadata['Format'] == 'doc') | (metadata['Format'] == 'pdf')].drop_duplicates()

# fix some problems with the encoding of special characters in filenames
#accepted_documents['Filename'] = accepted_documents['Filename'].apply(NFD)
# we use all documents after all
metadata['Filename'] = metadata['Filename'].apply(NFD)

# update data path with the sdlxliff directory
data_dir += '/hackathon-for-good-2019_TWB-challenge_files/'

In [3]:
metadata

Unnamed: 0,Date,Wordcount,Filename,Source_lang,Source_country,Target_lang,Target_country,NGO,Format,System
0,2016-12-12 00:56:11,109.0,347words.docx,en,US,fr,FR,British Red Cross,doc,Kató 1
1,2016-12-12 01:53:23,121.0,387words.docx,en,US,fr,FR,Translators without Borders,doc,Kató 1
2,2016-12-12 09:30:28,109.0,347words.docx,en,US,fr,FR,British Red Cross,doc,Kató 1
3,2016-12-12 12:57:12,121.0,387words.docx,en,US,fr,FR,British Red Cross,doc,Kató 1
4,2016-12-12 19:04:04,45371.3,gtr_charts_FINAL.PPTX,en,GB,ar,SA,British Red Cross,ppt,Kató 1
5,2016-12-12 19:04:04,45371.3,gtr_charts_FINAL.PPTX,en,GB,fr,FR,British Red Cross,ppt,Kató 1
6,2016-12-12 19:04:04,45371.3,gtr_charts_FINAL.PPTX,en,GB,es,ES,British Red Cross,ppt,Kató 1
7,2016-12-12 19:04:04,45371.3,gtr_graphs_FINAL.XLSX,en,GB,ar,SA,British Red Cross,xls,Kató 1
8,2016-12-12 19:04:04,45371.3,gtr_graphs_FINAL.XLSX,en,GB,fr,FR,British Red Cross,xls,Kató 1
9,2016-12-12 19:04:04,45371.3,gtr_graphs_FINAL.XLSX,en,GB,es,ES,British Red Cross,xls,Kató 1


In [4]:
# store all sdlxliff filenames into a list
document_names = []

# trying to get all doc documents based on extensions
for document in os.listdir(data_dir):    
    if fnmatch.fnmatch(document,'*.sdlxliff'):
        '''
        if fnmatch.fnmatch(document,'*.doc*'):
            document_names.append(document)
        elif fnmatch.fnmatch(document,'*.DOC*'):
            document_names.append(document)
        elif fnmatch.fnmatch(document,'*.txt*'):
            document_names.append(document)    
        elif fnmatch.fnmatch(document,'*.pdf*'):
            document_names.append(document)
        elif fnmatch.fnmatch(document,'*.PDF*'):
            document_names.append(document)
        elif fnmatch.fnmatch(document,'*.odt*'):
            document_names.append(document)
        elif fnmatch.fnmatch(document,'*.rtf*'):
            document_names.append(document)
        elif fnmatch.fnmatch(document,'*.dotx*'):
            document_names.append(document)
        '''
        document_names.append(document)

In [5]:
# get the translated contents of all documents
document_contents = []
document_source_langs = []
document_target_langs = []

for i in range(len(document_names)):
    document = document_names[i]
    temp_xliff = xliff.XLIFF(data_dir + document)
    document_contents.append(temp_xliff.source)
    document_source_langs.append(temp_xliff.source_lang)
    document_target_langs.append(temp_xliff.target_lang)

In [6]:
# same unicode fix as for accepted_documents
for i in range(len(document_names)):
    document_names[i] = NFD(document_names[i])

In [7]:
# remove the sdlxliff extension as most files in the metadata are without it
filenames = []
for i in range(len(document_names)):
    filenames.append(document_names[i].replace('.sdlxliff', ''))

In [8]:
# find the documents that most probably have no metadata, they are a lot
documents_without_metadata = []

count = 0
for name in filenames:
    temp = metadata.loc[metadata['Filename'] == name]
    if temp.empty:
        # for the special case that the translated version's extension is used
        temp2 = metadata.loc[metadata['Filename'] == name + '.sdlxliff']
        if temp2.empty:
            count += 1       
            documents_without_metadata.append(name)

print(count)

3151


In [9]:
# find the few entries in the metadata that are not in the documents 
# and the corresponding document filenames for the metadata filenames with wrong encoding
import difflib

filename_correspondence = {}
count = 0
for name in list(metadata['Filename'].drop_duplicates()):
    if name not in filenames and name not in document_names:
        max_similarity = 0.0
        corresponding_document = None
        for filename in filenames:
            # filename letter similarity
            seq = difflib.SequenceMatcher(None, filename, name)
            if (seq.ratio() > max_similarity):
                max_similarity = seq.ratio()
                corresponding_document = filename
        # this threshold was manually checked and it produces only one false positive
        if max_similarity > 0.5:
            filename_correspondence[name] = corresponding_document
        else:
            count += 1
            print(name)

# delete the false positive 
del filename_correspondence['Patient_Release_Form_-_Final_4.25.docx']                
print(count)

Ø·Ø¹Ø§Ù_x0085_Ù_x0083_Ù_x0085__Ø§Ù_x0084_Ù_x008a_Ù_x0088_Ù_x0085_.docx
å_x0080__x008b_äººè_x0087_ªå_x0082_³.docx
2


In [10]:
# helper function that changes the wrongly encoded metadata filenames
def change_names(x):
    if x in filename_correspondence.keys():
        return filename_correspondence[x]
    else:
        return x

# start combining all present features/data per document
all_document_data = metadata
all_document_data['Filename'] = metadata['Filename'].apply(lambda x: change_names(x))

# combine document lines into one list instead of a list of lists for use in a DataFrame column
contents = []
for i in range(len(document_contents)):
    temp = ''
    for j in range(len(document_contents[i])):
        temp += document_contents[i][j] + ' '
    contents.append(temp)

# left inner join of DataFrames
# all_document_data contains information on all documents with content that have also metadata
name_contents = pd.DataFrame(columns=['Filename','Content'])
name_contents['Filename'] = filenames
name_contents['Content'] = contents
all_document_data = pd.merge(all_document_data,name_contents)

In [11]:
# verify which documents are missing from the previous DataFrame
s1 = set(all_document_data['Filename'])
s2 = set(name_contents['Filename'])
no_metadata = list(s2.difference(s1))

# create another DataFrame that contains only the filename, 
# source language, target language and content of these documents
no_metadata_source_langs = []
no_metadata_target_langs = []
no_metadata_contents = []

for i in no_metadata:
    j = filenames.index(i)
    no_metadata_source_langs.append(document_source_langs[j])
    no_metadata_target_langs.append(document_target_langs[j])
    no_metadata_contents.append(contents[j])

no_metadata_df = pd.DataFrame(columns=['Filename', 'Source_lang', 'Target_lang', 'Content'])
no_metadata_df['Source_lang'] = no_metadata_source_langs
no_metadata_df['Target_lang'] = no_metadata_target_langs
no_metadata_df['Content'] = no_metadata_contents
no_metadata_df['Filename'] = no_metadata

In [12]:
# find only documents in the english language
from langdetect import detect

# detect the document contents language if possible
def detect_helper(x):
    try:
        temp = detect(x)
    except:
        temp = 'No'
    return temp

# keep only the english documents in both datasets
languages = all_document_data['Content'].apply(lambda x: detect_helper(x))
english_document_data = all_document_data[languages == 'en']

languages2 = no_metadata_df['Content'].apply(detect_helper)
english_no_metadata_df = no_metadata_df[languages2 == 'en']

In [13]:
import nltk

# add average sentence length as a feature(it was mentioned by the challenge owners, thought it could be interesting)
def avg_sent_len(x):
    acc = 0
    sents = nltk.sent_tokenize(x)
    if len(sents) == 0:
        return 0
    for sent in sents:
        acc += len(nltk.word_tokenize(sent))
    return acc/len(sents)
                                         
english_document_data['Average Sentence Length'] = english_document_data['Content'].apply(avg_sent_len)
english_no_metadata_df['Average Sentence Length'] = english_no_metadata_df['Content'].apply(avg_sent_len)

In [14]:
# cache to disk for easier use
english_document_data.to_pickle('data/english_document_data.pkl')
english_no_metadata_df.to_pickle('data/english_no_metadata.pkl')

## Stemming

In [11]:
import nltk
from nltk.stem import PorterStemmer
#from nltk.stem import LancasterStemmer
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
porter = PorterStemmer()

from nltk.tokenize import sent_tokenize, word_tokenize
def stemSentence(sentence):
    token_words=tokenizer.tokenize(sentence)
    token_words
    stop_words = set(stopwords.words('english'))
    #print(stop_words)
    filter_sentense = [w for w in token_words if not w in stop_words]
    
    stem_sentence=[]
    for word in filter_sentense:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

In [30]:
# testing tokenizer function
n=16880
filename = all_document_data['Filename'].iloc[n]
sentence = all_document_data['Content'].iloc[n]
token_words = tokenizer.tokenize(filename)
print(filename)
print(sentence)
print(token_words)

Instructions-Leaflet2_ENglish_(2).docx
THIS IS YOUR FREE MICRO SD CARD FROM REFUCOMM IT CONTAINS ESSENTIAL INFORMATION FOR YOU  There are many different rules and procedures in Greece. This card contains VITAL information on these procedures and how you can prepare for them. You will NOT get this information from the authorities. All our information has been checked by lawyers. You do not need to use the internet to access the information. It is free! There is information in document and film format on this SD card in Arabic, Farsi, Dari, Urdu, French, Sorani and Kurmanji.  HOW TO USE Look for a place on your phone to insert the SD card; see images below for the usual locations. Most phones have one of these, if you can’t find it, ask a friend for help. 		 in the side of your phone	in the back of your phone	in the top of your phone If your phone does not have a place for SD cards you can get the information from a friend’s phone via Bluetooth (see overleaf). If you cannot open the docu

In [31]:
# testing stemSentence function
x=tokenizer.tokenize(stemSentence(sentence))
print(x)

['thi', 'IS', 'your', 'free', 'micro', 'SD', 'card', 'from', 'refucomm', 'IT', 'contain', 'essenti', 'inform', 'for', 'you', 'there', 'mani', 'differ', 'rule', 'procedur', 'greec', 'thi', 'card', 'contain', 'vital', 'inform', 'procedur', 'prepar', 'you', 'not', 'get', 'inform', 'author', 'all', 'inform', 'check', 'lawyer', 'you', 'need', 'use', 'internet', 'access', 'inform', 'It', 'free', 'there', 'inform', 'document', 'film', 'format', 'SD', 'card', 'arab', 'farsi', 'dari', 'urdu', 'french', 'sorani', 'kurmanji', 'how', 'TO', 'use', 'look', 'place', 'phone', 'insert', 'SD', 'card', 'see', 'imag', 'usual', 'locat', 'most', 'phone', 'one', 'find', 'ask', 'friend', 'help', 'side', 'phone', 'back', 'phone', 'top', 'phone', 'If', 'phone', 'place', 'SD', 'card', 'get', 'inform', 'friend', 'phone', 'via', 'bluetooth', 'see', 'overleaf', 'If', 'cannot', 'open', 'document', 'need', 'download', 'pdf', 'viewer', 'phone', 'To', 'go', 'googl', 'play', 'store', 'search', 'pdf', 'viewer', 'select',

In [18]:
from TWB.nlp import extract_words

# create pd data frame "Stems" and save all the stemed Filenames and Content
stems = pd.DataFrame(index=all_document_data.index, columns=['Filename','Content'])
for i in all_document_data.index:
    filename = all_document_data['Filename'].iloc[i]
    content = all_document_data['Content'].iloc[i]
    stems.Filename[i] = " ".join(extract_words(filename.replace("_", " ")))
    stems.Content[i] = " ".join(extract_words(content.replace("_", " ")))

In [19]:
stems.Content.describe

<bound method NDFrame.describe of 0        thi test thi test thi test thi test thi test t...
1        thi test thi test thi test thi test thi test t...
2        thi test thi test thi test thi test thi test t...
3        thi test thi test thi test thi test thi test t...
4        thi test thi test thi test thi test thi test t...
5        thi test thi test thi test thi test thi test t...
6        thi test thi test thi test thi test thi test t...
7        thi test thi test thi test thi test thi test t...
8        thi test thi test thi test thi test thi test t...
9        thi test thi test thi test thi test thi test t...
10       thi test thi test thi test thi test thi test t...
11       thi test thi test thi test thi test thi test t...
12       thi test thi test thi test thi test thi test t...
13       grt strength high qualiti resourc scale scope ...
14       grt strength high qualiti resourc scale scope ...
15       grt strength high qualiti resourc scale scope ...
16       fact individu

In [20]:
tags = ['humanitarian crisis',
        'natural disaster',
        'environmental crisis',        
        'disability',
        'gender',
        'genital mutilation',
        'racism',
        'genocide',
        'civil war',
        'terrorism',
        'infectious disease',
        'political revolution',
        'political prisoner',
        'amnesty',
        'corruption',
        'health awareness',
        'gender inequality',
        'rape',
        'ebola',
        'aids',
        'first aid',
        'emergency',
        'disease',
        'operation',
        'virus',
        'response',
        'protocol',
        'certificate',
        'medication',
        'elderly',
        'children',
        'adults',
        'men',
        'women',
        'disabled',
        'old',
        'teenagers',
        'kids',
        'male',
        'female'
       ]

# remove possible duplicates
tags = set(tags)
tags = list(tags)

In [21]:
stemSentence(tags[0])

NameError: name 'stemSentence' is not defined

In [27]:
# stemming the tags and saving them to a df 
impactful_dict = pd.DataFrame(index=range(len(tags)), columns=['Stems'])
i =0
for w in tags:
    impactful_dict.Stems[i]= " ".join(extract_words(w))
    i += 1

In [31]:
impactful_dict.describe

<bound method NDFrame.describe of                  Stems
0            civil war
1               racism
2                  kid
3                emerg
4         natur disast
5                ebola
6               terror
7              respons
8             children
9                femal
10             genocid
11               women
12                 aid
13  humanitarian crisi
14                rape
15             elderli
16      infecti diseas
17               adult
18              teenag
19         genit mutil
20               medic
21                 old
22                 men
23              gender
24        polit prison
25                oper
26              certif
27              diseas
28                male
29              disabl
30             corrupt
31                viru
32            protocol
33           first aid
34        gender inequ
35             amnesti
36         health awar
37   environment crisi
38              disabl
39       polit revolut>

## Compute Relative Freqs of tags in the Contents and Filenames

In [24]:
import re

In [25]:
# testing word counts and freqs
text = stems.Content[0]
# text = 'paok paok paok paok paok paok paok test'
givenWord = 'test'
total = len(re.findall(r'\w+', text)) 
count = len(re.findall('\w*'+ givenWord +'\w*', text))
print(total)
print(count)
print(count/total)

173
87
0.5028901734104047


In [28]:
# function that returns the relative freq of spercific word in str
def word_relativ_Freq(text, givenWord):
    total = len(re.findall(r'\w+', text)) 
    count = len(re.findall('\w*'+ givenWord +'\w*', text))
    if total==0:
        return 0
    relFreq = count/total
    return relFreq

In [29]:
# create a data frame with the relative freqs of stemed tags per document content
freq_content = pd.DataFrame(index= all_document_data.index, columns=impactful_dict.Stems)

In [30]:
import time
start_time = time.time()

for j in impactful_dict.Stems:
    freq_content[j] = stems.Content.apply(lambda x: word_relativ_Freq(x,j))

print("--- %s seconds ---" % (time.time() - start_time))

--- 1908.7024607658386 seconds ---


In [32]:
freq_content.describe()

Stems,civil war,racism,kid,emerg,natur disast,ebola,terror,respons,children,femal,...,corrupt,viru,protocol,first aid,gender inequ,amnesti,health awar,environment crisi,disabl,polit revolut
count,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,...,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0
mean,1e-05,6e-06,0.000155,0.002054,5.8e-05,0.000213,1.9e-05,0.002752,0.00334,0.000254,...,0.000167,0.000308,0.000198,0.000409,1.6e-05,1.3e-05,3e-06,3.524379e-08,0.00139,0.0
std,0.000211,0.000134,0.002565,0.006872,0.000681,0.002504,0.00022,0.00772,0.008386,0.001446,...,0.002248,0.002945,0.001089,0.004437,0.000262,0.000361,0.000418,1.890121e-06,0.006749,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000278,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.000878,0.0,0.0,0.0,0.00274,0.002133,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.01087,0.005726,0.12766,0.111111,0.056604,0.166667,0.008696,0.166667,0.125,0.090909,...,0.056805,0.072072,0.022543,0.166667,0.023256,0.021739,0.076923,0.0001013993,0.125874,0.0


In [33]:
# create a data frame with the relative freqs of stemed tags per document filename
freq_filename = pd.DataFrame(index= all_document_data.index, columns=impactful_dict.Stems)
for j in impactful_dict.Stems:
    freq_filename[j] = stems.Filename.apply(lambda x: word_relativ_Freq(x,j))

In [34]:
freq_filename.describe()

Stems,civil war,racism,kid,emerg,natur disast,ebola,terror,respons,children,femal,...,corrupt,viru,protocol,first aid,gender inequ,amnesti,health awar,environment crisi,disabl,polit revolut
count,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,...,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0,34525.0
mean,4e-06,0.0,0.000135,0.001527,2.1e-05,0.00046,0.0,0.001669,0.001044,0.000329,...,0.000412,2.5e-05,0.000687,0.000977,0.0,0.0,6e-06,0.0,0.00094,0.0
std,0.000483,0.0,0.004589,0.015395,0.00178,0.009115,0.0,0.015973,0.012664,0.005408,...,0.006682,0.001533,0.011074,0.012845,0.0,0.0,0.001076,0.0,0.011007,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0625,0.0,0.166667,0.333333,0.166667,0.333333,0.0,0.5,0.25,0.142857,...,0.333333,0.142857,0.25,0.25,0.0,0.0,0.2,0.0,0.25,0.0


Finding labels of the documents

In [35]:
# defining function to check buzz word 
def exist_(temp,j): 
    temp2=temp.lower()
    temp3=temp2.find(j)
    return temp3>0
    
dictionary2=['certificate','instruction','poster','protocol','response','letter','report','map','manual','banner']

for j in dictionary2:
    jj=english_document_data.Filename.apply(lambda x: exist_(x,j))
    rr=pd.DataFrame(columns=[str(j)])
    rr[str(j)]=jj
    english_document_data=pd.concat([english_document_data,rr ], axis=1)

In [36]:
# cache intermediate data
stems.to_pickle('data/stemmed_data.pkl')
freq_content.to_pickle('data/freq_content.pkl')
freq_filename.to_pickle('data/freq_filename.pkl')
english_document_data.to_pickle('data/english_document_data_categories.pkl')
#tfidf_df.to_pickle('data/tfidf.pkl')

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

stems = pd.read_pickle('data/stemmed_data.pkl')

vectorizer = TfidfVectorizer()
tfidfs = vectorizer.fit_transform(stems['Content'])

# this also has a memory error
print(tfidfs.toarray())

In [38]:
# read cached data
#english_document_data = pd.read_pickle('data/english_document_data.pkl')
#freq_content = pd.read_pickle('data/freq_content.pkl')

# aggregate tags that refer to the same people group
def combine_tags(tag_list, entry):
    max_tag = 0.0
    tag = -1
    for i in range(len(tag_list)):
        temp = 0
        for j in tag_list[i]:
            temp += entry[j]
        if temp > max_tag:
            max_tag = temp
            tag = i
    return tag


# tags refer to: -1 -> Undefined, 0 -> 65+, 1 -> 0-18, 2 -> 18-65 
age_tag_pairs = [['elderli','old'],['children','kid','teenag'],['adult']]
age_tag_pair_inds = []
for tag_pair in age_tag_pairs:
    temp = []
    for tag in tag_pair:
        temp.append(freq_content.columns.get_loc(tag))
    age_tag_pair_inds.append(temp)
    
print(age_tag_pair_inds)

# tags refer to: -1 -> Undefined, 0 -> Male, 1 -> Female
gender_tag_pairs = [['male','men'], ['femal','women']]
gender_tag_pair_inds = []
for tag_pair in gender_tag_pairs:
    temp = []
    for tag in tag_pair:
        temp.append(freq_content.columns.get_loc(tag))
    gender_tag_pair_inds.append(temp)

print(gender_tag_pair_inds)

# populate a list with age and gender values for each row in the english documents
gender = []
age = []
for i in range(len(freq_content)):
    gender.append(combine_tags(gender_tag_pair_inds,freq_content.iloc[i]))
    age.append(combine_tags(age_tag_pair_inds,freq_content.iloc[i]))

[[15, 21], [8, 2, 18], [17]]
[[28, 22], [9, 11]]


In [41]:
# put the lists into the tag frequencies per document DataFrame
gender2 = pd.DataFrame(gender)
age2 = pd.DataFrame(age) 
freq_content['Age'] = age2
freq_content['Gender'] = gender2

In [42]:
# find documents without NGO values
metadata.loc[metadata['NGO'].isna(),'NGO'] = 'Undefined'

# calculate the amount of translations requested by each document
ngo_docs = metadata.groupby('NGO').aggregate('count')['System']

# sanity check
print(len(metadata['NGO'].unique()))

# sanity check
total_docs = 0
for i in ngo_docs:
    total_docs += i
print(total_docs)

# sanity check
print(len(metadata))
sc = (metadata['NGO'].apply(pd.isna))
metadata.loc[sc]

# calculate each NGO's portion of documents from the corpus to be used as a weight
ngo_docs = ngo_docs/total_docs

258
34549
34549


In [290]:
# cache intermediate data
ngo_docs.to_pickle('data/ngo_weights.pkl')
freq_content.to_pickle('data/freq_content_updated.pkl')

In [45]:
# create labels for the topic of documents(as given by the challenge owners) based on the title
dictionary3=['health','logistics','nutrition','protection','shelter','water','sanitation','hygiene','camp coordination','camp management','early recovery','education','emergency telecommunications','food security']

extra_features = pd.DataFrame(index=english_document_data.index)
for j in dictionary3:
    jj=english_document_data.Filename.apply(lambda x: exist_(x,j))
    rr=pd.DataFrame(columns=[str(j)])
    rr[str(j)]=jj
    extra_features=pd.concat([extra_features,rr ], axis=1)

In [46]:
# combine some topics that were given together
extra_features['water/sanitation/hygiene'] = (extra_features['water'] | extra_features['sanitation'] | extra_features['hygiene'])
extra_features['camp coordination/camp management'] = (extra_features['camp coordination'] | extra_features['camp management'])

# check the amount of documents per topic
for i in dictionary3:
    print(i,len(extra_features[extra_features[i]]))

# drop the single columns that are part of the combined ones
extra_features = extra_features.loc[:,~np.array([False, False, False, False, False, True, True, True, True,True, False, False, False, False, False, False])]
dictionary4 = ['health','logistics','nutrition','protection','shelter','water/sanitation/hygiene','camp coordination/camp management','early recovery','education','emergency telecommunications','food security']

# sanity check
for i in dictionary4:
    print(i,len(extra_features[extra_features[i]]))

health 431
logistics 18
nutrition 160
protection 157
shelter 95
water 47
sanitation 0
hygiene 19
camp coordination 0
camp management 0
early recovery 0
education 345
emergency telecommunications 0
food security 0
health 431
logistics 18
nutrition 160
protection 157
shelter 95
water/sanitation/hygiene 66
camp coordination/camp management 0
early recovery 0
education 345
emergency telecommunications 0
food security 0


In [48]:
extra_features.to_pickle('data/english_document_data_topics.pkl')

In [47]:
extra_features

Unnamed: 0,health,logistics,nutrition,protection,shelter,early recovery,education,emergency telecommunications,food security,water/sanitation/hygiene,camp coordination/camp management
0,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,False
