In [1]:
from collections import Counter
import numpy as np

## Compile Documents

In [2]:
# Four toy documents. They deliberately share words ("wise", "foolish",
# "definitely") so that the vectors built later overlap in some features.
doc1 = 'Wise people think they are foolish'
doc2 = 'Foolish foolish people think they are wise wise'
doc3 = 'I am definitely wise so this irritates me'
doc4 = 'Ryan is for sure like definitely foolish'

## Create Corpus

In [3]:
# The corpus, in a fixed order: index i holds doc(i+1).
documents = [doc1, doc2, doc3, doc4]

## Tokenize and Lowercase

In [4]:
from nltk.tokenize import word_tokenize

# Lower-case each raw document first, then split it into word tokens.
tokenized = [word_tokenize(text.lower()) for text in documents]

## Remove Stop Words

In [5]:
from nltk.corpus import stopwords

# A set gives constant-time membership checks for the per-token filter.
stop = set(stopwords.words('english'))

# One filtered token list per document, in corpus order.
docs = []
for tokens in tokenized:
    docs.append([token for token in tokens if token not in stop])

## Stemming and Lemmatization

In [6]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# Two alternative normalizers, both applied to the stop-word-filtered tokens.
porter = PorterStemmer()
wordnet = WordNetLemmatizer()  # NOTE: this name is the lemmatizer instance

docs_stem = [list(map(porter.stem, tokens)) for tokens in docs]
docs_lemma = [list(map(wordnet.lemmatize, tokens)) for tokens in docs]

In [7]:
print porter.stem('mice')
print wordnet.lemmatize('mice')

mice
mouse


In [8]:
# Inspect the stemmed tokens; stems can be truncated forms such as 'peopl'
# or 'definit' rather than dictionary words.
docs_stem

[[u'wise', u'peopl', u'think', u'foolish'],
 [u'foolish', u'foolish', u'peopl', u'think', u'wise', u'wise'],
 [u'definit', u'wise', u'irrit'],
 [u'ryan', u'sure', u'like', u'definit', u'foolish']]

In [10]:
# Inspect the lemmatized tokens; these are what the vocabulary and the
# hand-built vectors below are computed from.
docs_lemma

[['wise', 'people', 'think', 'foolish'],
 ['foolish', 'foolish', 'people', 'think', 'wise', 'wise'],
 ['definitely', 'wise', 'irritates'],
 ['ryan', 'sure', 'like', 'definitely', 'foolish']]

## Vocabulary for our Corpus

In [11]:
# Collect every distinct lemma in the corpus, then sort so that each word
# gets a stable column position in the feature vectors.
unique_words = set()
for doc in docs_lemma:
    unique_words.update(doc)
vocabulary = sorted(unique_words)

In [12]:
print 'Vocabulary (features):',vocabulary

Vocabulary (features): ['definitely', 'foolish', 'irritates', 'like', 'people', 'ryan', 'sure', 'think', 'wise']


## Bag of Words

In [13]:
def bow_vectorize(doc, vocabulary):
    """Return the bag-of-words count vector of ``doc`` over ``vocabulary``.

    Parameters
    ----------
    doc : iterable of hashable tokens
        The tokenized document.
    vocabulary : sequence of tokens
        Feature words; position ``i`` of the result counts ``vocabulary[i]``.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(vocabulary)``. Tokens of ``doc`` that
        are not in ``vocabulary`` are ignored; vocabulary words absent from
        ``doc`` count as 0.
    """
    counts = Counter(doc)
    # Counter yields 0 for unseen keys, so no membership test is needed.
    return np.array([counts[term] for term in vocabulary], dtype=float)

In [14]:
# One count vector per lemmatized document, kept as a list of 1-D arrays.
bow_matrix = [bow_vectorize(tokens, vocabulary) for tokens in docs_lemma]


In [15]:
print 'features:',vocabulary
print
for i in range(len(bow_matrix)):
    print '"%s":'%docs_lemma[i], '\n', bow_matrix[i], '\n'
print
print 'feature matrix:'
print bow_matrix

features: ['definitely', 'foolish', 'irritates', 'like', 'people', 'ryan', 'sure', 'think', 'wise']

"['wise', 'people', 'think', 'foolish']": 
[ 0.  1.  0.  0.  1.  0.  0.  1.  1.] 

"['foolish', 'foolish', 'people', 'think', 'wise', 'wise']": 
[ 0.  2.  0.  0.  1.  0.  0.  1.  2.] 

"['definitely', 'wise', 'irritates']": 
[ 1.  0.  1.  0.  0.  0.  0.  0.  1.] 

"['ryan', 'sure', 'like', 'definitely', 'foolish']": 
[ 1.  1.  0.  1.  0.  1.  1.  0.  0.] 


feature matrix:
[array([ 0.,  1.,  0.,  0.,  1.,  0.,  0.,  1.,  1.]), array([ 0.,  2.,  0.,  0.,  1.,  0.,  0.,  1.,  2.]), array([ 1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.]), array([ 1.,  1.,  0.,  1.,  0.,  1.,  1.,  0.,  0.])]


### Bag of Words with CountVectorizer

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
def lemmatize(doc):
    """Tokenize a raw document string and lemmatize every token.

    The text is lower-cased, split with ``word_tokenize``, and each token is
    passed through the notebook-level ``wordnet`` lemmatizer instance.
    Returns the lemmas as a list, in token order. Stop words are not removed
    here.
    """
    tokens = word_tokenize(doc.lower())
    return list(map(wordnet.lemmatize, tokens))

# Passing `vocabulary` pins the columns to the hand-built feature list, and
# `tokenizer=lemmatize` reuses the tokenize-then-lemmatize step defined in
# this cell, so the result can be compared with bow_vectorize.
count_vectorizer = CountVectorizer(
    tokenizer=lemmatize,
    vocabulary=vocabulary,
    stop_words=stopwords.words('english'),
)
feature_matrix = count_vectorizer.fit_transform([doc1]).todense()

In [17]:
print 'Vectorize:',doc1
print 'Lemmatized:',docs_lemma[0]
print 'Features:', vocabulary
print
print 'sklearn result',feature_matrix
print 'our result',bow_vectorize(docs_lemma[0], vocabulary)
print 
print 'feature matrix'
print count_vectorizer.fit_transform(documents).todense()

Vectorize: Wise people think they are foolish
Lemmatized: ['wise', 'people', 'think', 'foolish']
Features: ['definitely', 'foolish', 'irritates', 'like', 'people', 'ryan', 'sure', 'think', 'wise']

sklearn result [[0 1 0 0 1 0 0 1 1]]
our result [ 0.  1.  0.  0.  1.  0.  0.  1.  1.]

feature matrix
[[0 1 0 0 1 0 0 1 1]
 [0 2 0 0 1 0 0 1 2]
 [1 0 1 0 0 0 0 0 1]
 [1 1 0 1 0 1 1 0 0]]


## Term Frequency (Tf)

In [18]:
def tf_vectorize(doc, vocabulary):
    """Return the term-frequency vector of ``doc`` over ``vocabulary``.

    Each entry is the count of the corresponding vocabulary word in ``doc``
    divided by the total number of tokens in ``doc``.

    Parameters
    ----------
    doc : sequence of tokens
        The tokenized document; its length is the normalizer.
    vocabulary : sequence of tokens
        Feature words defining the vector positions.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(vocabulary)``. An empty ``doc``
        yields an all-zero vector.
    """
    bow_vector = bow_vectorize(doc, vocabulary)
    if len(doc) == 0:
        # Dividing the all-zero counts by a length of 0 would produce NaNs;
        # an empty document simply has zero frequency for every term.
        return bow_vector
    # Scale all counts at once; float() keeps this true division on Python 2.
    return bow_vector / float(len(doc))
            

In [19]:
# One term-frequency vector per lemmatized document, as a list of 1-D arrays.
tf_matrix = [tf_vectorize(tokens, vocabulary) for tokens in docs_lemma]

In [20]:
print 'features:',vocabulary
print
for i in range(len(tf_matrix)):
    print '"%s":'%docs_lemma[i], '\n', tf_matrix[i], '\n'

features: ['definitely', 'foolish', 'irritates', 'like', 'people', 'ryan', 'sure', 'think', 'wise']

"['wise', 'people', 'think', 'foolish']": 
[ 0.    0.25  0.    0.    0.25  0.    0.    0.25  0.25] 

"['foolish', 'foolish', 'people', 'think', 'wise', 'wise']": 
[ 0.          0.33333333  0.          0.          0.16666667  0.          0.
  0.16666667  0.33333333] 

"['definitely', 'wise', 'irritates']": 
[ 0.33333333  0.          0.33333333  0.          0.          0.          0.
  0.          0.33333333] 

"['ryan', 'sure', 'like', 'definitely', 'foolish']": 
[ 0.2  0.2  0.   0.2  0.   0.2  0.2  0.   0. ] 



In [21]:
# Inspect the NLTK English stop-word list that the filtering cell and both
# sklearn vectorizers in this notebook are given.
stopwords.words('english')

[u'i',
 u'me',
 u'my',
 u'myself',
 u'we',
 u'our',
 u'ours',
 u'ourselves',
 u'you',
 u'your',
 u'yours',
 u'yourself',
 u'yourselves',
 u'he',
 u'him',
 u'his',
 u'himself',
 u'she',
 u'her',
 u'hers',
 u'herself',
 u'it',
 u'its',
 u'itself',
 u'they',
 u'them',
 u'their',
 u'theirs',
 u'themselves',
 u'what',
 u'which',
 u'who',
 u'whom',
 u'this',
 u'that',
 u'these',
 u'those',
 u'am',
 u'is',
 u'are',
 u'was',
 u'were',
 u'be',
 u'been',
 u'being',
 u'have',
 u'has',
 u'had',
 u'having',
 u'do',
 u'does',
 u'did',
 u'doing',
 u'a',
 u'an',
 u'the',
 u'and',
 u'but',
 u'if',
 u'or',
 u'because',
 u'as',
 u'until',
 u'while',
 u'of',
 u'at',
 u'by',
 u'for',
 u'with',
 u'about',
 u'against',
 u'between',
 u'into',
 u'through',
 u'during',
 u'before',
 u'after',
 u'above',
 u'below',
 u'to',
 u'from',
 u'up',
 u'down',
 u'in',
 u'out',
 u'on',
 u'off',
 u'over',
 u'under',
 u'again',
 u'further',
 u'then',
 u'once',
 u'here',
 u'there',
 u'when',
 u'where',
 u'why',
 u'how',
 u'all

## Some Tf-Idf 

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Pin the columns to `vocabulary` so the TF-IDF rows line up with the
# hand-built vectors. No custom tokenizer is passed here, unlike the
# CountVectorizer cell, so sklearn's default tokenization applies.
tfidf_vectorizer = TfidfVectorizer(
    vocabulary=vocabulary,
    stop_words=stopwords.words('english'),
)
tfidf_matrix = tfidf_vectorizer.fit_transform(documents).todense()

In [23]:
print 'features:',vocabulary
print
for i in range(len(tfidf_matrix)):
    print '"%s":'%docs_lemma[i], '\n', tfidf_matrix[i], '\n'

features: ['definitely', 'foolish', 'irritates', 'like', 'people', 'ryan', 'sure', 'think', 'wise']

"['wise', 'people', 'think', 'foolish']": 
[[ 0.          0.44493104  0.          0.          0.54957835  0.          0.
   0.54957835  0.44493104]] 

"['foolish', 'foolish', 'people', 'think', 'wise', 'wise']": 
[[ 0.          0.60161783  0.          0.          0.37155886  0.          0.
   0.37155886  0.60161783]] 

"['definitely', 'wise', 'irritates']": 
[[ 0.55349232  0.          0.70203482  0.          0.          0.          0.
   0.          0.44809973]] 

"['ryan', 'sure', 'like', 'definitely', 'foolish']": 
[[ 0.39278432  0.31799276  0.          0.49819711  0.          0.49819711
   0.49819711  0.          0.        ]] 



## Euclidean Distance Comparison

In [24]:
from sklearn.metrics.pairwise import euclidean_distances

In [25]:
print 'Compare "%s" \nwith "%s"'%(doc1, doc2)
print euclidean_distances(bow_matrix[0].reshape(1,-1), bow_matrix[1].reshape(1,-1))

Compare "Wise people think they are foolish" 
with "Foolish foolish people think they are wise wise"
[[ 1.41421356]]


In [26]:
print 'Compare "%s" \nwith "%s"'%(doc1, doc2)
print euclidean_distances(tf_matrix[0].reshape(1,-1), tf_matrix[1].reshape(1,-1))

Compare "Wise people think they are foolish" 
with "Foolish foolish people think they are wise wise"
[[ 0.16666667]]


In [28]:
print 'Compare "%s" \nwith "%s"'%(doc1, doc2)
print euclidean_distances(tfidf_matrix[0].reshape(1,-1), tfidf_matrix[1].reshape(1,-1))

Compare "Wise people think they are foolish" 
with "Foolish foolish people think they are wise wise"
[[ 0.33538543]]


## Cosine Similarity Comparison

In [29]:
from sklearn.metrics.pairwise import cosine_similarity

In [30]:
print 'Compare "%s" \nwith "%s"'%(doc1, doc2)
print cosine_similarity(bow_matrix[0].reshape(1,-1), bow_matrix[1].reshape(1,-1))

Compare "Wise people think they are foolish" 
with "Foolish foolish people think they are wise wise"
[[ 0.9486833]]


In [31]:
print 'Compare "%s" \nwith "%s"'%(doc1, doc2)
print cosine_similarity(tf_matrix[0].reshape(1,-1), tf_matrix[1].reshape(1,-1))

Compare "Wise people think they are foolish" 
with "Foolish foolish people think they are wise wise"
[[ 0.9486833]]


In [32]:
print 'Compare "%s" \nwith "%s"'%(doc1, doc2)
print cosine_similarity(tfidf_matrix[0].reshape(1,-1), tfidf_matrix[1].reshape(1,-1))

Compare "Wise people think they are foolish" 
with "Foolish foolish people think they are wise wise"
[[ 0.94375831]]


# Search Engine Query Example

In [35]:
# Free-text search query. 'the' is a stop word and is not in `vocabulary`,
# so only 'foolish' and 'ryan' can map onto feature columns.
query = 'the foolish Ryan'

In [36]:
query_vectorized = tfidf_vectorizer.fit_transform([query]).todense()
print "Query:",query
print "Vectorized query:",query_vectorized

Query: the foolish Ryan
Vectorized query: [[ 0.          0.70710678  0.          0.          0.          0.70710678
   0.          0.          0.        ]]


In [37]:
for doc, tf_doc in zip(documents, tfidf_matrix):
    print doc, cosine_similarity(query_vectorized.reshape(1, -1), tf_doc.reshape(1, -1))

Wise people think they are foolish [[ 0.31461376]]
Foolish foolish people think they are wise wise [[ 0.42540805]]
I am definitely wise so this irritates me [[ 0.]]
Ryan is for sure like definitely foolish [[ 0.57713339]]


## A Final Cosine Similarity Thingy

In [38]:
for index in xrange(len(documents[1:])):
    print '"%s" compared with "%s"'%(documents[0], documents[index+1])
    print 'TF cosine similarity:', cosine_similarity(tf_matrix[0].reshape(1, -1),
                                                     tf_matrix[index+1].reshape(1, -1))
    print 'TF-IDF cosine similarity:', cosine_similarity(tfidf_matrix[0].reshape(1, -1),
                                                         tfidf_matrix[index+1].reshape(1, -1))
    

"Wise people think they are foolish" compared with "Foolish foolish people think they are wise wise"
TF cosine similarity: [[ 0.9486833]]
TF-IDF cosine similarity: [[ 0.94375831]]
"Wise people think they are foolish" compared with "I am definitely wise so this irritates me"
TF cosine similarity: [[ 0.28867513]]
TF-IDF cosine similarity: [[ 0.19937348]]
"Wise people think they are foolish" compared with "Ryan is for sure like definitely foolish"
TF cosine similarity: [[ 0.2236068]]
TF-IDF cosine similarity: [[ 0.14148485]]
