## Compile Documents and Create Corpus

In [1]:
doc1 = 'Wise people thinking mice they shop  are foolish!'
doc2 = 'Foolish think shopping foolish people 4 * ( think they are wise wise'
doc3 = 'I am definitely wise; so this viirritates me'
doc4 = '-- Trump is for sure like definitely foolish'
corpus = [doc1, doc2, doc3, doc4]

### Bag of Words with CountVectorizer + TfidfTransformer -----> Tfidf matrix

In [2]:
import numpy as np
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [3]:
lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
def lemmize(doc, lemmatizer, tokenizer):
    return [lemmatizer.lemmatize(word) for word in tokenizer.tokenize(doc)]

In [4]:
count_vectorizer = CountVectorizer(strip_accents = 'unicode', stop_words = 'english', tokenizer = lambda x: lemmize(x, lemmatizer,tokenizer))
bow_matrix = count_vectorizer.fit_transform(corpus).toarray()

In [5]:
bow_matrix

array([[0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1],
       [0, 2, 0, 0, 1, 0, 1, 0, 2, 0, 0, 0, 2],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
       [1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0]], dtype=int64)

In [6]:
tfidf_former = TfidfTransformer()
tfidf_matrix = tfidf_former.fit_transform(bow_matrix)

In [7]:
tfidf_matrix.toarray()

array([[ 0.        ,  0.30304005,  0.        ,  0.4747708 ,  0.37431475,
         0.4747708 ,  0.        ,  0.        ,  0.        ,  0.4747708 ,
         0.        ,  0.        ,  0.30304005],
       [ 0.        ,  0.4283691 ,  0.        ,  0.        ,  0.26456053,
         0.        ,  0.33556149,  0.        ,  0.67112297,  0.        ,
         0.        ,  0.        ,  0.4283691 ],
       [ 0.55349232,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.70203482,  0.44809973],
       [ 0.39278432,  0.31799276,  0.49819711,  0.        ,  0.        ,
         0.        ,  0.        ,  0.49819711,  0.        ,  0.        ,
         0.49819711,  0.        ,  0.        ]])

## TfidfVectorizer ----> tfidf matrix

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
tfidf_vectorizer = TfidfVectorizer(stop_words = 'english')
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus).todense()
# tf_idf_tra = TfidfTransformer()
# tfidf_matrix = tf_idf_tra.fit_transform(bow_matrix)

In [10]:
tfidf_matrix

matrix([[ 0.        ,  0.30304005,  0.        ,  0.4747708 ,  0.37431475,
          0.4747708 ,  0.        ,  0.        ,  0.        ,  0.4747708 ,
          0.        ,  0.        ,  0.30304005],
        [ 0.        ,  0.4283691 ,  0.        ,  0.        ,  0.26456053,
          0.        ,  0.33556149,  0.        ,  0.67112297,  0.        ,
          0.        ,  0.        ,  0.4283691 ],
        [ 0.55349232,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.70203482,  0.44809973],
        [ 0.39278432,  0.31799276,  0.49819711,  0.        ,  0.        ,
          0.        ,  0.        ,  0.49819711,  0.        ,  0.        ,
          0.49819711,  0.        ,  0.        ]])

## Euclidian Distance Comparison

In [11]:
from sklearn.metrics.pairwise import euclidean_distances

In [12]:
bow_matrix[0]

array([0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1], dtype=int64)

In [13]:
# shaped it n_samples_1 by n_features, one dimension can be -1, which means automatically calculate this dimension
bow_matrix[0].reshape(1, -1) 

array([[0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1]], dtype=int64)

In [14]:
bow_matrix[1]

array([0, 2, 0, 0, 1, 0, 1, 0, 2, 0, 0, 0, 2], dtype=int64)

In [15]:
print('Compare "%s" \nwith "%s"'%(doc1, doc2))
print(euclidean_distances(bow_matrix[0].reshape(1, -1), bow_matrix[1].reshape(1, -1)))

Compare "Wise people thinking mice they shop  are foolish!" 
with "Foolish think shopping foolish people 4 * ( think they are wise wise"
[[ 3.16227766]]


In [17]:
print('Compare "%s" \nwith "%s"'%(doc1, doc2))
print(euclidean_distances(tfidf_matrix[0].reshape(1, -1), tfidf_matrix[1].reshape(1, -1)))

Compare "Wise people thinking mice they shop  are foolish!" 
with "Foolish think shopping foolish people 4 * ( think they are wise wise"
[[ 1.13255914]]


## Cosine Similarity Comparison

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

In [20]:
print('Compare "%s" \nwith "%s"'%(doc1, doc2))
print(cosine_similarity(bow_matrix[0].reshape(1, -1), bow_matrix[1].reshape(1, -1)))

Compare "Wise people thinking mice they shop  are foolish!" 
with "Foolish think shopping foolish people 4 * ( think they are wise wise"
[[ 0.54554473]]


In [22]:
print('Compare "%s" \nwith "%s"'%(doc1, doc2))
print(cosine_similarity(tfidf_matrix[0].reshape(1, -1), tfidf_matrix[1].reshape(1, -1)))

Compare "Wise people thinking mice they shop  are foolish!" 
with "Foolish think shopping foolish people 4 * ( think they are wise wise"
[[ 0.35865489]]


# Search Engine Query Example

In [23]:
query = 'The foolish Trump'

In [25]:
query_vectorized = tfidf_vectorizer.transform([query]).todense()
print "Query:", query
print "Vectorized query:", query_vectorized

Query: The foolish Trump
Vectorized query: [[ 0.          0.53802897  0.          0.          0.          0.          0.
   0.          0.          0.          0.84292635  0.          0.        ]]


In [27]:
for doc, tf_doc in zip(corpus, tfidf_matrix):
    print(doc, cosine_similarity(query_vectorized.reshape(1, -1), tf_doc.reshape(1, -1)))

('Wise people thinking mice they shop  are foolish!', array([[ 0.16304433]]))
('Foolish think shopping foolish people 4 * ( think they are wise wise', array([[ 0.23047499]]))
('I am definitely wise; so this viirritates me', array([[ 0.]]))
('-- Trump is for sure like definitely foolish', array([[ 0.59103279]]))


## A Final Cosine Similiarity Thingy

In [28]:
for index in xrange(len(corpus[1:])):
    print('"%s" compared with "%s"'%(corpus[0], corpus[index+1])) 
#     print('TF cosine similarity:', cosine_similarity(tf_matrix[0].reshape(1, -1),
#                                                      tf_matrix[index+1].reshape(1, -1)))
    print('TF-IDF cosine similarity:', cosine_similarity(tfidf_matrix[0].reshape(1, -1),
                                                         tfidf_matrix[index+1].reshape(1, -1)))
    

"Wise people thinking mice they shop  are foolish!" compared with "Foolish think shopping foolish people 4 * ( think they are wise wise"
('TF-IDF cosine similarity:', array([[ 0.35865489]]))
"Wise people thinking mice they shop  are foolish!" compared with "I am definitely wise; so this viirritates me"
('TF-IDF cosine similarity:', array([[ 0.13579216]]))
"Wise people thinking mice they shop  are foolish!" compared with "-- Trump is for sure like definitely foolish"
('TF-IDF cosine similarity:', array([[ 0.09636454]]))
