<h1><center>This is a notebook, which illustrates the ideas behind main NLP preprocessing techniques</center></h1>

<h1><center>The code is made by Taras Svystun</center></h1>
tatarik.sv@gmail.com

https://github.com/taras-svystun

In [3]:
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_md
from gensim import corpora
from gensim import models
from collections import OrderedDict
from random import shuffle

import warnings
warnings.filterwarnings('ignore')

## Part-Of-Speech

In [2]:
nlp = en_core_web_md.load()
doc = nlp('High above the city.')
POS_tagging = [(token.text, token.pos_) for token in doc]
print(POS_tagging)

[('High', 'ADJ'), ('above', 'ADP'), ('the', 'DET'), ('city', 'NOUN'), ('.', 'PUNCT')]


## Named entity recognition

In [3]:
doc = nlp('Taras claims, UCU is the best university in Europe')
NER = [(ent.text, ent.label_) for ent in doc.ents]
print(NER)

[('Taras', 'PERSON'), ('UCU', 'ORG'), ('Europe', 'LOC')]


## Lemmatization

In [4]:
doc = nlp('I liked my Ford 67, but now I like and likes and liking Ford 69.')
lemmas = [token.lemma_ for token in doc if 
          not token.is_stop and 
          not token.is_punct and 
          not token.is_digit]
print(lemmas)

['like', 'Ford', 'like', 'like', 'like', 'Ford']


## Vectorizing

## Bag-of-words

In [5]:
def lemmatize(*sentences):
    for sentence in sentences:
        doc = nlp(sentence)
        lemmas = [token.lemma_ for token in doc if 
          not token.is_stop and 
          not token.is_punct and 
          not token.is_digit]
        yield lemmas

In [6]:
def make_vocab(lemmas):
    vocab = set()
    for lemma in lemmas:
        unique_lemma = set(lemma)
        vocab = vocab.union(unique_lemma)
    return list(vocab)

In [7]:
def doc2bow(lemmas, vocab):
    for lemma in lemmas:
        yield [lemma.count(word) for word in vocab]

In [8]:
sent1 = 'The cow eats grass and then produces milk.'
sent2 = 'The dog with other dogs bark at the cow.'

lemmas = list(lemmatize(sent1, sent2))
vocab = make_vocab(lemmas)
print(vocab)
for bow in doc2bow(lemmas, vocab):
    print(bow)

['dog', 'eat', 'produce', 'bark', 'milk', 'cow', 'grass']
[0, 1, 1, 0, 1, 1, 1]
[2, 0, 0, 1, 0, 1, 0]


Just another way to create BOW

In [9]:
def make_bow(texts):
    dictionary = OrderedDict()
    for text in texts:
        for word in text:
            if word in dictionary:
                dictionary[word] += 1
            else:
                dictionary[word] = 1
    return dictionary

## TF-iDF

In [10]:
documents = ["Football club club club Arsenal defeat local rivals this weekend.",
             "Weekend football frenzy takes over London.",
             "Bank open for take over bids after losing millions.",
             "London football clubs bid to move to Wembley stadium.",
             "Arsenal bid 50 million pounds bid bid bid for striker Kane.",
             "Financial troubles result in loss of millions for bank.",
             "Western bank files for bankruptcy after financial losses.",
             "London football club is taken over by oil millionaire from Russia.",
             "Banking on finances not working for Russia."]

In [11]:
texts = list(lemmatize(*documents))
for text in texts:
    print(text)

['football', 'club', 'club', 'club', 'Arsenal', 'defeat', 'local', 'rival', 'weekend']
['Weekend', 'football', 'frenzy', 'take', 'London']
['bank', 'open', 'bid', 'lose', 'million']
['London', 'football', 'club', 'bid', 'Wembley', 'stadium']
['arsenal', 'bid', 'million', 'pound', 'bid', 'bid', 'bid', 'striker', 'Kane']
['financial', 'trouble', 'result', 'loss', 'million', 'bank']
['western', 'bank', 'file', 'bankruptcy', 'financial', 'loss']
['London', 'football', 'club', 'take', 'oil', 'millionaire', 'Russia']
['banking', 'finance', 'work', 'Russia']


### Test our manual implementation of Dictionary vs. Gensim
1. Make dictionary

In [12]:
def make_dictionary2(texts):
    index = 0
    result = {}
    for text in texts:
        for word in text:
            if word not in result:
                result[word] = index
                index += 1
    return result

def make_dictionary(texts):
    unique_words = {word for text in texts for word in text}
    return dict(zip(unique_words, range(len(unique_words))))

In [13]:
dictionary = make_dictionary(texts)
print(dictionary)

{'striker': 0, 'result': 1, 'pound': 2, 'football': 3, 'bankruptcy': 4, 'oil': 5, 'local': 6, 'work': 7, 'Weekend': 8, 'western': 9, 'Wembley': 10, 'club': 11, 'open': 12, 'millionaire': 13, 'Russia': 14, 'million': 15, 'financial': 16, 'bank': 17, 'London': 18, 'loss': 19, 'take': 20, 'finance': 21, 'lose': 22, 'Arsenal': 23, 'trouble': 24, 'weekend': 25, 'stadium': 26, 'defeat': 27, 'rival': 28, 'arsenal': 29, 'file': 30, 'banking': 31, 'Kane': 32, 'frenzy': 33, 'bid': 34}


##### vs.

In [14]:
dictionary2 = corpora.Dictionary(texts)
print(dictionary2.token2id)

{'Arsenal': 0, 'club': 1, 'defeat': 2, 'football': 3, 'local': 4, 'rival': 5, 'weekend': 6, 'London': 7, 'Weekend': 8, 'frenzy': 9, 'take': 10, 'bank': 11, 'bid': 12, 'lose': 13, 'million': 14, 'open': 15, 'Wembley': 16, 'stadium': 17, 'Kane': 18, 'arsenal': 19, 'pound': 20, 'striker': 21, 'financial': 22, 'loss': 23, 'result': 24, 'trouble': 25, 'bankruptcy': 26, 'file': 27, 'western': 28, 'Russia': 29, 'millionaire': 30, 'oil': 31, 'banking': 32, 'finance': 33, 'work': 34}


2. Transform texts to BOW according to dictionary

In [15]:
def dict_doc2bow(texts, dictionary):
    result = []
    for text in texts:
        bow = []
        for word in set(text):
            bow.append((dictionary[word], text.count(word)))
        result.append(bow)
    return result

In [16]:
corpus = dict_doc2bow(texts, dictionary)
print('\n'.join(map(str, corpus)))

[(23, 1), (3, 1), (11, 3), (25, 1), (6, 1), (27, 1), (28, 1)]
[(18, 1), (20, 1), (33, 1), (3, 1), (8, 1)]
[(17, 1), (22, 1), (12, 1), (15, 1), (34, 1)]
[(18, 1), (10, 1), (3, 1), (11, 1), (26, 1), (34, 1)]
[(0, 1), (32, 1), (2, 1), (15, 1), (34, 4), (29, 1)]
[(16, 1), (17, 1), (1, 1), (24, 1), (15, 1), (19, 1)]
[(16, 1), (9, 1), (17, 1), (4, 1), (19, 1), (30, 1)]
[(18, 1), (20, 1), (3, 1), (11, 1), (14, 1), (13, 1), (5, 1)]
[(31, 1), (7, 1), (14, 1), (21, 1)]


#### vs.

In [17]:
corpus2 = [dictionary2.doc2bow(text) for text in texts]
print('\n'.join(map(str, corpus)))

[(23, 1), (3, 1), (11, 3), (25, 1), (6, 1), (27, 1), (28, 1)]
[(18, 1), (20, 1), (33, 1), (3, 1), (8, 1)]
[(17, 1), (22, 1), (12, 1), (15, 1), (34, 1)]
[(18, 1), (10, 1), (3, 1), (11, 1), (26, 1), (34, 1)]
[(0, 1), (32, 1), (2, 1), (15, 1), (34, 4), (29, 1)]
[(16, 1), (17, 1), (1, 1), (24, 1), (15, 1), (19, 1)]
[(16, 1), (9, 1), (17, 1), (4, 1), (19, 1), (30, 1)]
[(18, 1), (20, 1), (3, 1), (11, 1), (14, 1), (13, 1), (5, 1)]
[(31, 1), (7, 1), (14, 1), (21, 1)]


### Now it's time to implement TF-iDF itself

In [18]:
def get_tf(corpus):
    nDocuments = len(corpus)
    TF = dict.fromkeys(range(1, nDocuments + 1), dict())
    
    for idx, document in enumerate(corpus):
        total_frequency, tf = 0, dict()
        for term, frequency in document:
            tf[term] = tf.get(term, 0) + frequency
            total_frequency += frequency

        for term, frequency in tf.items():
            tf[term] = frequency / total_frequency
        TF[idx + 1] = tf

    return TF

TF = get_tf(corpus)
for document in TF.values():
    for term, freq in document.items():
        print(f'term - {term}, frequency - {round(freq, 2)}')
    print('-----------------------------')

term - 23, frequency - 0.11
term - 3, frequency - 0.11
term - 11, frequency - 0.33
term - 25, frequency - 0.11
term - 6, frequency - 0.11
term - 27, frequency - 0.11
term - 28, frequency - 0.11
-----------------------------
term - 18, frequency - 0.2
term - 20, frequency - 0.2
term - 33, frequency - 0.2
term - 3, frequency - 0.2
term - 8, frequency - 0.2
-----------------------------
term - 17, frequency - 0.2
term - 22, frequency - 0.2
term - 12, frequency - 0.2
term - 15, frequency - 0.2
term - 34, frequency - 0.2
-----------------------------
term - 18, frequency - 0.17
term - 10, frequency - 0.17
term - 3, frequency - 0.17
term - 11, frequency - 0.17
term - 26, frequency - 0.17
term - 34, frequency - 0.17
-----------------------------
term - 0, frequency - 0.11
term - 32, frequency - 0.11
term - 2, frequency - 0.11
term - 15, frequency - 0.11
term - 34, frequency - 0.44
term - 29, frequency - 0.11
-----------------------------
term - 16, frequency - 0.17
term - 17, frequency - 0.17

In [19]:
def get_term_container(corpus):
    term_container = dict()
    for idx, document in enumerate(corpus):
        for term, frequency in document:
            if term in term_container:
                term_container[term].append(idx + 1)
            else:
                term_container[term] = [idx + 1]
    return term_container

term_container = get_term_container(corpus)
print(term_container)

{23: [1], 3: [1, 2, 4, 8], 11: [1, 4, 8], 25: [1], 6: [1], 27: [1], 28: [1], 18: [2, 4, 8], 20: [2, 8], 33: [2], 8: [2], 17: [3, 6, 7], 22: [3], 12: [3], 15: [3, 5, 6], 34: [3, 4, 5], 10: [4], 26: [4], 0: [5], 32: [5], 2: [5], 29: [5], 16: [6, 7], 1: [6], 24: [6], 19: [6, 7], 9: [7], 4: [7], 30: [7], 14: [8, 9], 13: [8], 5: [8], 31: [9], 7: [9], 21: [9]}


In [20]:
def get_idf(corpus, term_container):
    IDF = dict()
    nDocuments = len(corpus)
    for term, documents_with_term in term_container.items():
        IDF[term] = np.log(nDocuments / len(documents_with_term))
    return IDF


IDF = get_idf(corpus, term_container)
for term, idf in IDF.items():
    print(f'term - {term}, inverse document frequency - {round(idf, 2)}')

term - 23, inverse document frequency - 2.2
term - 3, inverse document frequency - 0.81
term - 11, inverse document frequency - 1.1
term - 25, inverse document frequency - 2.2
term - 6, inverse document frequency - 2.2
term - 27, inverse document frequency - 2.2
term - 28, inverse document frequency - 2.2
term - 18, inverse document frequency - 1.1
term - 20, inverse document frequency - 1.5
term - 33, inverse document frequency - 2.2
term - 8, inverse document frequency - 2.2
term - 17, inverse document frequency - 1.1
term - 22, inverse document frequency - 2.2
term - 12, inverse document frequency - 2.2
term - 15, inverse document frequency - 1.1
term - 34, inverse document frequency - 1.1
term - 10, inverse document frequency - 2.2
term - 26, inverse document frequency - 2.2
term - 0, inverse document frequency - 2.2
term - 32, inverse document frequency - 2.2
term - 2, inverse document frequency - 2.2
term - 29, inverse document frequency - 2.2
term - 16, inverse document frequenc

In [21]:
def tfidf_model(corpus=None, TF=None, IDF=None):
    nDocuments, TFIDF = len(corpus), dict()
    
    for idx in range(nDocuments):
        TFIDF[idx + 1] = dict()

    for idx, tf in TF.items():
        for term, freq in tf.items():
            TFIDF[idx][term] = freq * IDF[term]

    return TFIDF

TFIDF = tfidf_model(corpus=corpus, TF=TF, IDF=IDF)

In [22]:
example = 0 + 1
for term, tfidf in TFIDF[example].items():
    print((term, tfidf))

(23, 0.24413606414846883)
(3, 0.09010335735736986)
(11, 0.3662040962227032)
(25, 0.24413606414846883)
(6, 0.24413606414846883)
(27, 0.24413606414846883)
(28, 0.24413606414846883)


In [23]:
tfidf = models.TfidfModel(corpus=corpus)
example = corpus[0]
for term, tfidf in tfidf[example]:
    print((term, tfidf))

(23, 0.3679502401984268)
(3, 0.1357994858234748)
(11, 0.5519253602976403)
(25, 0.3679502401984268)
(6, 0.3679502401984268)
(27, 0.3679502401984268)
(28, 0.3679502401984268)


## n-grams

#### Searching for bi-grams

In [28]:
bigram = models.Phrases(texts)
texts = [bigram[line] for line in texts]

#### An example of filtering words, leaving frequent ones, but not too much

In [27]:
dictionary2.filter_extremes(no_below=20, no_above=.5)

In [11]:
sentence = 'Clement and Mathieu are working at Apple.'
ne_tree = ne_chunk(pos_tag(word_tokenize(sentence)))
iob_tagged = tree2conlltags(ne_tree)
print(iob_tagged)

[('Clement', 'NN', 'B-GPE'), ('and', 'CC', 'O'), ('Mathieu', 'NNP', 'B-PERSON'), ('are', 'VBP', 'O'), ('working', 'VBG', 'O'), ('at', 'IN', 'O'), ('Apple', 'NNP', 'B-ORGANIZATION'), ('.', '.', 'O')]


In [12]:
ne_tree = conlltags2tree(iob_tagged)
print(ne_tree)

(S
  (GPE Clement/NN)
  and/CC
  (PERSON Mathieu/NNP)
  are/VBP
  working/VBG
  at/IN
  (ORGANIZATION Apple/NNP)
  ./.)
