In [4]:
!pip3 install pandas numpy --user

Collecting pandas
  Using cached https://files.pythonhosted.org/packages/67/a7/12261a51ac2e7be4c698ca27cbe364ca5f16d64999456ee47ea8c7b44417/pandas-0.23.4-cp37-cp37m-manylinux1_x86_64.whl
Collecting numpy
  Using cached https://files.pythonhosted.org/packages/98/44/94cc2e139b611b16458384ff3b9c87f217144b5915b0a9798c07a7295437/numpy-1.15.2-cp37-cp37m-manylinux1_x86_64.whl
Collecting pytz>=2011k (from pandas)
  Using cached https://files.pythonhosted.org/packages/30/4e/27c34b62430286c6d59177a0842ed90dc789ce5d1ed740887653b898779a/pytz-2018.5-py2.py3-none-any.whl
Installing collected packages: numpy, pytz, pandas
Successfully installed numpy-1.15.2 pandas-0.23.4 pytz-2018.5


In [5]:
import pandas as pd
import numpy as np

In [10]:
df = pd.read_csv("../../data/processed_new_data.csv", header=0)

## Text Preprocessing

Before the text is fed into the classifier, the following steps are taken:

1. Detect language
2. Translate into English
3. Lemmatization
4. Remove punctuation but not numbers
5. Stop word removal
6. Join strings separated by a space
7. Converts the string into lower case characters
8. Converts everything into a string

In [21]:
!pip3 install nltk --user

Collecting nltk
  Using cached https://files.pythonhosted.org/packages/50/09/3b1755d528ad9156ee7243d52aa5cd2b809ef053a0f31b53d92853dd653a/nltk-3.3.0.zip
Installing collected packages: nltk
  Running setup.py install for nltk ... [?25ldone
[?25hSuccessfully installed nltk-3.3


In [24]:
!pip3 install googletrans langid --user

Collecting googletrans
  Using cached https://files.pythonhosted.org/packages/49/0f/eac53560e99a6b1e3b3e18e8b98f0e0041bfc4f1ee1e1b74cdf5530786d0/googletrans-2.3.0.tar.gz
Collecting langid
  Using cached https://files.pythonhosted.org/packages/ea/4c/0fb7d900d3b0b9c8703be316fbddffecdab23c64e1b46c7a83561d78bd43/langid-1.1.6.tar.gz
Collecting requests (from googletrans)
  Using cached https://files.pythonhosted.org/packages/65/47/7e02164a2a3db50ed6d8a6ab1d6d60b69c4c3fdf57a284257925dfc12bda/requests-2.19.1-py2.py3-none-any.whl
Collecting certifi>=2017.4.17 (from requests->googletrans)
  Using cached https://files.pythonhosted.org/packages/df/f7/04fee6ac349e915b82171f8e23cee63644d83663b34c539f7a09aed18f9e/certifi-2018.8.24-py2.py3-none-any.whl
Collecting urllib3<1.24,>=1.21.1 (from requests->googletrans)
  Using cached https://files.pythonhosted.org/packages/bd/c9/6fdd990019071a4a32a5e7cb78a1d92c53851ef4f56f62a3486e6a7d8ffb/urllib3-1.23-py2.py3-none-any.whl
Collecting idna<2.8,>=2.5 (from re

In [42]:
import nltk

from nltk.stem.wordnet import WordNetLemmatizer
import string
import re
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /home/deepl/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/deepl/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [45]:
nltk_wordnet_tag_map = {
    'NN': wn.NOUN,
    'NNS': wn.NOUN,
    'VBP': wn.VERB,
    'VBG': wn.VERB,
    'JJ': wn.ADJ,
}

from functools import reduce

def compose(*functions):
    def compose2(f1, f2):
        """Compose two functions"""
        return lambda *args: f1(f2(*args))
    return reduce(compose2, functions)

def translate_to_english_txt(row):
    text = row["excerpt"]
    try:
        if langid.classify(text)[0] != 'en':
            trans = googletrans.client.Translator()
            return trans.translate(text, 'en').text
        return text
    except Exception as e:
        return ''
    
def lemmatize(row, lemmatizer=WordNetLemmatizer()):
    text = row
    splitted = text if type(text) == list else str(text).split()
    splitted = list(map(lambda x: str(x).lower(), splitted))
    tagged = nltk.pos_tag(splitted)
    lemmatized = []
    for word, tag in tagged:
        wnet_tag = nltk_wordnet_tag_map.get(tag)
        if wnet_tag:
            lemmatized.append(lemmatizer.lemmatize(word, wnet_tag))
        else:
            lemmatized.append(word)
    return ' '.join(lemmatized)

def rm_punc_not_nums(inp, col=None):
    """Remove punctuation unless it's a number for either a df (and col)
    or single entry
    """
    punc = string.punctuation
    transtable = str.maketrans("", "", punc)

    def sing_rm(phr):
        """Remove for a single entity"""
        return ' '.join([re.sub('\W+', '', i).translate(transtable) if not (
                    all(j.isdigit() or j in punc for j in i)
                    and
                    any(j.isdigit() for j in i)
                ) else re.sub('\W+', '', i)
                for i in str(phr).split(' ')]
        )
    if col and isinstance(inp, pd.core.frame.DataFrame):
        return inp.filter(like=col).applymap(lambda phr: sing_rm(phr))
    elif isinstance(inp, str):
        return sing_rm(inp)
    else:
        raise Exception('Not a vaild type')


def rm_stop_words_txt(txt, swords=nltk.corpus.stopwords.words('english')):
    """ Remove stop words from given text """
    return ' '.join(
        [token for token in str(txt).split(' ')
            if token.lower() not in swords]
    )

## Sanity Check

In [46]:
data = {"excerpt": "The 2 quick brown foxes jumped over the lazy dogs!"}

def preprocess(row):
        inp = row["excerpt"]
        inp = lemmatize(inp)
        func = compose(
            rm_punc_not_nums,
            rm_stop_words_txt,
            ' '.join,
            str.split,
            str.lower,
            str
        )
        
        return func(inp)

preprocess(data)

'2 quick brown fox jumped lazy dogs'

In [47]:
df = pd.read_csv("../../data/all_en_processed_sectors_subsectors.csv")

In [49]:
df.drop(["Unnamed: 0", "Unnamed: 0.1"], axis=1, inplace=True)

In [50]:
df["preprocessed_excerpt"] = df.apply(preprocess, axis=1)

In [55]:
df

Unnamed: 0,sector,subsector,excerpt,preprocessed_excerpt
0,Nutrition,Breastfeeding,"112,500 children under five are at risk of acu...",112500 child five risk acute malnutrition
1,Health,Health status and risk,"2,271 cholera cases suspected between 4-19 Oct...",2271 cholera case suspected 419 october alone
2,Food,,"806,000 people need urgent food assistance.",806000 people need urgent food assistance
3,Health,,36 health facilities destroyed.,36 health facility destroyed
4,Cross,,1.4 million People need aid,14 million people need aid
5,Cross,,Humanitarian needs are said to include access ...,humanitarian need said include access sufficie...
6,Food,,The people in urgent food insecurity are locat...,people urgent food insecurity located area 75 ...
7,Shelter,Shelter infrastructure and material,Following the government’s announcement to clo...,follow governments announcement close temporar...
8,Protection,Sexual violence and other forms of Gender Base...,Exacerbating the pre-existing displacement cri...,exacerbate preexisting displacement crisis ten...
9,Protection,Sexual violence and other forms of Gender Base...,Agencies have expressed serious concerns about...,agency expressed serious concern risk increase...


In [90]:
merged_df = pd.DataFrame(df.groupby('preprocessed_excerpt')["sector"].apply(set)).reset_index()

In [76]:
!pip3 install sklearn --user

Collecting sklearn
  Using cached https://files.pythonhosted.org/packages/1e/7a/dbb3be0ce9bd5c8b7e3d87328e79063f8b263b2b1bfa4774cb1147bfcd3f/sklearn-0.0.tar.gz
Collecting scikit-learn (from sklearn)
  Using cached https://files.pythonhosted.org/packages/56/7d/0737aed3e157fb90a1eaecb1cbfa5742fa4206fb305c8f157b666b71da14/scikit_learn-0.20.0-cp37-cp37m-manylinux1_x86_64.whl
Collecting scipy>=0.13.3 (from scikit-learn->sklearn)
  Using cached https://files.pythonhosted.org/packages/40/de/0c22c6754370ba6b1fa8e53bd6e514d4a41a181125d405a501c215cbdbd6/scipy-1.1.0-cp37-cp37m-manylinux1_x86_64.whl
Installing collected packages: scipy, scikit-learn, sklearn
  Running setup.py install for sklearn ... [?25ldone
[?25hSuccessfully installed scikit-learn-0.20.0 scipy-1.1.0 sklearn-0.0


In [91]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [92]:
text_clf = Pipeline([
            ('vect', CountVectorizer(ngram_range=(1, 2))),
            ('tfidf', TfidfTransformer(use_idf=False)),
            ('clf', MultinomialNB(alpha=0.01, fit_prior=False))
        ])

In [100]:
temp_list = [list(x) for x in list(merged_df["sector"])]

In [107]:
category_columns = [j for sub in temp_list for j in sub]
category_columns = list(set(category_columns))

In [111]:
category_df = pd.concat([merged_df,pd.DataFrame(columns=category_columns)])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [113]:
category_df.fillna(value=0, inplace=True)

In [None]:
! git config --global user.email "aayush.chadha@student.manchester.ac.uk"
! git config --global user.name "achadha0111"
! git add Benchmarks.ipynb
! git commit -m "Add benchmark notebook"
! git push --set-upstream origin benchmarks