# Natural Language Processing Demo

Yupeng Yang

Jan 24, 2019

In [1]:
from collections import Counter
import numpy as np

## Compile Documents

In [2]:
doc1 = 'Wise people think they are foolish'
doc2 = 'Foolish foolish people think they are wise wise'
doc3 = 'I am definitely wise so this irritates me'
doc4 = 'Trump is for sure like definitely foolish'

## Create Corpus

In [4]:
documents = [doc1, doc2, doc3, doc4]
documents

['Wise people think they are foolish',
 'Foolish foolish people think they are wise wise',
 'I am definitely wise so this irritates me',
 'Trump is for sure like definitely foolish']

## Tokenize and Lower case

In [14]:
from nltk.tokenize import word_tokenize

In [15]:
# import nltk
# nltk.download()

In [117]:
#TRY
v= 'Trump is for sure sure like children Definitely foolish'
[word_tokenize(v.lower())]

[['trump',
  'is',
  'for',
  'sure',
  'sure',
  'like',
  'children',
  'definitely',
  'foolish']]

In [16]:
tokenized = [word_tokenize(doc.lower()) for doc in documents]

In [17]:
tokenized

[['wise', 'people', 'think', 'they', 'are', 'foolish'],
 ['foolish', 'foolish', 'people', 'think', 'they', 'are', 'wise', 'wise'],
 ['i', 'am', 'definitely', 'wise', 'so', 'this', 'irritates', 'me'],
 ['trump', 'is', 'for', 'sure', 'like', 'definitely', 'foolish']]

## Remove Stop Words

In [27]:
# import nltk
# nltk.download('stopwords')

In [39]:
set(v)

{' ',
 'D',
 'T',
 'e',
 'f',
 'h',
 'i',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'r',
 's',
 't',
 'u',
 'y'}

In [32]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [31]:
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
stop

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [46]:
docs = [[word for word in words if word not in stop] 
        for words in tokenized]

In [47]:
docs

[['wise', 'people', 'think', 'foolish'],
 ['foolish', 'foolish', 'people', 'think', 'wise', 'wise'],
 ['definitely', 'wise', 'irritates'],
 ['trump', 'sure', 'like', 'definitely', 'foolish']]

## Stemming and Lemmatization

In [57]:
# import nltk
# nltk.download('wordnet')

In [53]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

porter = PorterStemmer()
wordnet = WordNetLemmatizer()

docs_stem = [[porter.stem(word) for word in words]
               for words in docs]
docs_lemma = [[wordnet.lemmatize(word) for word in doc]
                for doc in docs]

In [54]:
print(porter.stem('mice'))
print(wordnet.lemmatize('mice'))

mice
mouse


In [55]:
docs_stem

[['wise', 'peopl', 'think', 'foolish'],
 ['foolish', 'foolish', 'peopl', 'think', 'wise', 'wise'],
 ['definit', 'wise', 'irrit'],
 ['trump', 'sure', 'like', 'definit', 'foolish']]

In [56]:
docs_lemma

[['wise', 'people', 'think', 'foolish'],
 ['foolish', 'foolish', 'people', 'think', 'wise', 'wise'],
 ['definitely', 'wise', 'irritates'],
 ['trump', 'sure', 'like', 'definitely', 'foolish']]

## Vocabulary for our Corpus

In [74]:
vocabulary = [word for doc in docs_lemma for word in doc]
#vocabulary = sorted(list(set(word for doc in docs_lemma for word in doc)))

In [75]:
vocabulary

['wise',
 'people',
 'think',
 'foolish',
 'foolish',
 'foolish',
 'people',
 'think',
 'wise',
 'wise',
 'definitely',
 'wise',
 'irritates',
 'trump',
 'sure',
 'like',
 'definitely',
 'foolish']

In [76]:
vocabulary = sorted(list(set(vocabulary)))
vocabulary

['definitely',
 'foolish',
 'irritates',
 'like',
 'people',
 'sure',
 'think',
 'trump',
 'wise']

In [77]:
print('Vocabulary (features):', vocabulary)

Vocabulary (features): ['definitely', 'foolish', 'irritates', 'like', 'people', 'sure', 'think', 'trump', 'wise']


## Bag of Words

In [78]:
from collections import Counter

In [79]:
def bow_vectorize(doc, vocabulary):
    bag_of_words = Counter(doc) #count the number of words appeared
    doc_vector = np.zeros(len(vocabulary)) #set 9 '0' for positions in a list
    for word_index, word in enumerate(vocabulary): #word_index is from 0 to 8; word has 9 words
        if word in bag_of_words:
            doc_vector[word_index] += bag_of_words[word] #add the count number on this position.
    return doc_vector

In [104]:
docs_lemma

[['wise', 'people', 'think', 'foolish'],
 ['foolish', 'foolish', 'people', 'think', 'wise', 'wise'],
 ['definitely', 'wise', 'irritates'],
 ['trump', 'sure', 'like', 'definitely', 'foolish']]

In [111]:
Counter(docs_lemma[0])['foolish']
Counter(docs_lemma[0])

Counter({'wise': 1, 'people': 1, 'think': 1, 'foolish': 1})

In [105]:
bow_matrix = list()
for doc in docs_lemma:
    bow_matrix.append(bow_vectorize(doc, vocabulary))


In [108]:
bow_matrix

[array([0., 1., 0., 0., 1., 0., 1., 0., 1.]),
 array([0., 2., 0., 0., 1., 0., 1., 0., 2.]),
 array([1., 0., 1., 0., 0., 0., 0., 0., 1.]),
 array([1., 1., 0., 1., 0., 1., 0., 1., 0.])]

In [109]:
print('features:',vocabulary)
for i in range(len(bow_matrix)):
    print('"%s":'% docs_lemma[i], '\n', bow_matrix[i], '\n')
          
        
print('feature matrix:')
print(bow_matrix)

features: ['definitely', 'foolish', 'irritates', 'like', 'people', 'sure', 'think', 'trump', 'wise']
"['wise', 'people', 'think', 'foolish']": 
 [0. 1. 0. 0. 1. 0. 1. 0. 1.] 

"['foolish', 'foolish', 'people', 'think', 'wise', 'wise']": 
 [0. 2. 0. 0. 1. 0. 1. 0. 2.] 

"['definitely', 'wise', 'irritates']": 
 [1. 0. 1. 0. 0. 0. 0. 0. 1.] 

"['trump', 'sure', 'like', 'definitely', 'foolish']": 
 [1. 1. 0. 1. 0. 1. 0. 1. 0.] 

feature matrix:
[array([0., 1., 0., 0., 1., 0., 1., 0., 1.]), array([0., 2., 0., 0., 1., 0., 1., 0., 2.]), array([1., 0., 1., 0., 0., 0., 0., 0., 1.]), array([1., 1., 0., 1., 0., 1., 0., 1., 0.])]


### Bag of Words with CountVectorizer

In [119]:
def lemmatize(doc):
    return [wordnet.lemmatize(word) for word in word_tokenize(doc.lower())]
lemmatize(v) #'children' to 'child'

['trump',
 'is',
 'for',
 'sure',
 'sure',
 'like',
 'child',
 'definitely',
 'foolish']

In [113]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(stop_words=stopwords.words('english'),
                                   vocabulary=vocabulary,
                                   tokenizer=lemmatize)

feature_matrix = count_vectorizer.fit_transform([doc1])

In [114]:
feature_matrix.toarray()

array([[0, 1, 0, 0, 1, 0, 1, 0, 1]])

In [124]:
count_vectorizer.fit_transform(documents).todense()

matrix([[0, 1, 0, 0, 1, 0, 1, 0, 1],
        [0, 2, 0, 0, 1, 0, 1, 0, 2],
        [1, 0, 1, 0, 0, 0, 0, 0, 1],
        [1, 1, 0, 1, 0, 1, 0, 1, 0]])

In [120]:
print('Vectorize:',doc1)
print('Lemmatized:',docs_lemma[0])
print('Features:', vocabulary)
print('\n')
print('sklearn result',feature_matrix.toarray())
print('our result',bow_vectorize(docs_lemma[0], vocabulary))
print('\n')
print('feature matrix')
print(count_vectorizer.fit_transform(documents).todense())

Vectorize: Wise people think they are foolish
Lemmatized: ['wise', 'people', 'think', 'foolish']
Features: ['definitely', 'foolish', 'irritates', 'like', 'people', 'sure', 'think', 'trump', 'wise']


sklearn result [[0 1 0 0 1 0 1 0 1]]
our result [0. 1. 0. 0. 1. 0. 1. 0. 1.]


feature matrix
[[0 1 0 0 1 0 1 0 1]
 [0 2 0 0 1 0 1 0 2]
 [1 0 1 0 0 0 0 0 1]
 [1 1 0 1 0 1 0 1 0]]


## Term Frequency (Tf)

In [125]:
def tf_vectorize(doc, vocabulary):
    bow_vector = bow_vectorize(doc, vocabulary)
    tf_vector = np.zeros(len(vocabulary))
    for idx, vec in enumerate(bow_vector):
        tf_vector[idx] = vec / len(doc)
    return tf_vector

In [126]:
tf_matrix = list()
for doc in docs_lemma:
    tf_matrix.append(tf_vectorize(doc, vocabulary))

In [127]:
print('features:', vocabulary)

for i in range(len(tf_matrix)):
    print('"%s":'%docs_lemma[i], '\n', tf_matrix[i], '\n')

features: ['definitely', 'foolish', 'irritates', 'like', 'people', 'sure', 'think', 'trump', 'wise']
"['wise', 'people', 'think', 'foolish']": 
 [0.   0.25 0.   0.   0.25 0.   0.25 0.   0.25] 

"['foolish', 'foolish', 'people', 'think', 'wise', 'wise']": 
 [0.         0.33333333 0.         0.         0.16666667 0.
 0.16666667 0.         0.33333333] 

"['definitely', 'wise', 'irritates']": 
 [0.33333333 0.         0.33333333 0.         0.         0.
 0.         0.         0.33333333] 

"['trump', 'sure', 'like', 'definitely', 'foolish']": 
 [0.2 0.2 0.  0.2 0.  0.2 0.  0.2 0. ] 



## Some Tf-Idf 

In [128]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'),
                                   vocabulary=vocabulary)
tfidf_matrix = tfidf_vectorizer.fit_transform(documents).todense()

In [129]:
tfidf_matrix

matrix([[0.        , 0.44493104, 0.        , 0.        , 0.54957835,
         0.        , 0.54957835, 0.        , 0.44493104],
        [0.        , 0.60161783, 0.        , 0.        , 0.37155886,
         0.        , 0.37155886, 0.        , 0.60161783],
        [0.55349232, 0.        , 0.70203482, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.44809973],
        [0.39278432, 0.31799276, 0.        , 0.49819711, 0.        ,
         0.49819711, 0.        , 0.49819711, 0.        ]])

In [130]:
print('features:',vocabulary)

for i in range(len(tfidf_matrix)):
    print('"%s":'%docs_lemma[i], '\n', tfidf_matrix[i], '\n')

features: ['definitely', 'foolish', 'irritates', 'like', 'people', 'sure', 'think', 'trump', 'wise']
"['wise', 'people', 'think', 'foolish']": 
 [[0.         0.44493104 0.         0.         0.54957835 0.
  0.54957835 0.         0.44493104]] 

"['foolish', 'foolish', 'people', 'think', 'wise', 'wise']": 
 [[0.         0.60161783 0.         0.         0.37155886 0.
  0.37155886 0.         0.60161783]] 

"['definitely', 'wise', 'irritates']": 
 [[0.55349232 0.         0.70203482 0.         0.         0.
  0.         0.         0.44809973]] 

"['trump', 'sure', 'like', 'definitely', 'foolish']": 
 [[0.39278432 0.31799276 0.         0.49819711 0.         0.49819711
  0.         0.49819711 0.        ]] 



## Euclidian Distance Comparison

In [131]:
from sklearn.metrics.pairwise import euclidean_distances

In [132]:
bow_matrix[0]

array([0., 1., 0., 0., 1., 0., 1., 0., 1.])

In [133]:
bow_matrix[1]

array([0., 2., 0., 0., 1., 0., 1., 0., 2.])

In [134]:
print('Compare "%s" \nwith "%s"'%(doc1, doc2))
print(euclidean_distances(bow_matrix[0].reshape(1, -1), bow_matrix[1].reshape(1, -1)))

Compare "Wise people think they are foolish" 
with "Foolish foolish people think they are wise wise"
[[1.41421356]]


In [135]:
print('Compare "%s" \nwith "%s"'%(doc1, doc2))
print(euclidean_distances(tf_matrix[0].reshape(1, -1), tf_matrix[1].reshape(1, -1)))

Compare "Wise people think they are foolish" 
with "Foolish foolish people think they are wise wise"
[[0.16666667]]


In [136]:
print('Compare "%s" \nwith "%s"'%(doc1, doc2))
print(euclidean_distances(tfidf_matrix[0].reshape(1, -1), tfidf_matrix[1].reshape(1, -1)))

Compare "Wise people think they are foolish" 
with "Foolish foolish people think they are wise wise"
[[0.33538543]]


## Cosine Similarity Comparison

In [137]:
from sklearn.metrics.pairwise import cosine_similarity

In [138]:
print('Compare "%s" \nwith "%s"'%(doc1, doc2))
print(cosine_similarity(bow_matrix[0].reshape(1, -1), bow_matrix[1].reshape(1, -1)))

Compare "Wise people think they are foolish" 
with "Foolish foolish people think they are wise wise"
[[0.9486833]]


In [139]:
print('Compare "%s" \nwith "%s"'%(doc1, doc2))
print(cosine_similarity(tf_matrix[0].reshape(1, -1), tf_matrix[1].reshape(1, -1)))

Compare "Wise people think they are foolish" 
with "Foolish foolish people think they are wise wise"
[[0.9486833]]


In [140]:
print('Compare "%s" \nwith "%s"'%(doc1, doc2))
print(cosine_similarity(tfidf_matrix[0].reshape(1, -1), tfidf_matrix[1].reshape(1, -1)))

Compare "Wise people think they are foolish" 
with "Foolish foolish people think they are wise wise"
[[0.94375831]]


# Search Engine Query Example

In [141]:
query = 'The foolish Trump'

In [142]:
query_vectorized = tfidf_vectorizer.transform([query]).todense()
print("Query:", query)
print("Vectorized query:", query_vectorized)

Query: The foolish Trump
Vectorized query: [[0.         0.53802897 0.         0.         0.         0.
  0.         0.84292635 0.        ]]


In [143]:
for doc, tf_doc in zip(documents, tfidf_matrix):
    print(doc, cosine_similarity(query_vectorized.reshape(1, -1), tf_doc.reshape(1, -1)))

Wise people think they are foolish [[0.23938579]]
Foolish foolish people think they are wise wise [[0.32368782]]
I am definitely wise so this irritates me [[0.]]
Trump is for sure like definitely foolish [[0.59103279]]


## A Final Cosine Similiarity Thingy

In [144]:
for index in range(len(documents[1:])):
    print('"%s" compared with "%s"'%(documents[0], documents[index+1]))
    print('TF cosine similarity:', cosine_similarity(tf_matrix[0].reshape(1, -1),
                                                     tf_matrix[index+1].reshape(1, -1)))
    print('TF-IDF cosine similarity:', cosine_similarity(tfidf_matrix[0].reshape(1, -1),
                                                         tfidf_matrix[index+1].reshape(1, -1)))
    

"Wise people think they are foolish" compared with "Foolish foolish people think they are wise wise"
TF cosine similarity: [[0.9486833]]
TF-IDF cosine similarity: [[0.94375831]]
"Wise people think they are foolish" compared with "I am definitely wise so this irritates me"
TF cosine similarity: [[0.28867513]]
TF-IDF cosine similarity: [[0.19937348]]
"Wise people think they are foolish" compared with "Trump is for sure like definitely foolish"
TF cosine similarity: [[0.2236068]]
TF-IDF cosine similarity: [[0.14148485]]
