## Bag of Words

In [1]:
# Two short toy documents for the bag-of-words demo. The stray leading and
# trailing spaces in the second string are part of the input text.
content = ["How to format my hard disk", " Hard disk format problems "]
content

['How to format my hard disk', ' Hard disk format problems ']

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
?CountVectorizer

In [5]:
vectorizer = CountVectorizer(min_df = 1)

In [6]:
X = vectorizer.fit_transform(content)

In [7]:
X

<2x7 sparse matrix of type '<class 'numpy.int64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [10]:
# Vocabulary learned from `content`; each term corresponds to one column of X.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement, and .tolist() keeps the printed output a plain list.
print(vectorizer.get_feature_names_out().tolist())
X.toarray()

['disk', 'format', 'hard', 'how', 'my', 'problems', 'to']


array([[1, 1, 1, 1, 1, 0, 1],
       [1, 1, 1, 0, 0, 1, 0]], dtype=int64)

## 計算查詢關鍵字對文章的距離

In [13]:
%ls data/toy

01.txt  02.txt  03.txt  04.txt  05.txt


In [17]:
import os
path  = 'data/toy'
[os.path.join(path, f) for f in os.listdir(path)]

['data/toy/01.txt',
 'data/toy/02.txt',
 'data/toy/03.txt',
 'data/toy/04.txt',
 'data/toy/05.txt']

In [19]:
# Load each file under `path` as one document. The listing is sorted because
# os.listdir() returns entries in arbitrary order and later cells pair
# posts[i] with row i of the count matrix. Each file is opened in a `with`
# block so its handle is closed right after reading.
posts = []
for f in sorted(os.listdir(path)):
    with open(os.path.join(path, f)) as fh:
        posts.append(fh.read())

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the toy posts and build the document-term count
# matrix. Note that this rebinds `vectorizer`, replacing the instance that
# was fitted on `content` earlier in the notebook.
vectorizer = CountVectorizer(min_df = 1)
X_train = vectorizer.fit_transform(posts)

In [25]:
X_train

<5x25 sparse matrix of type '<class 'numpy.int64'>'
	with 33 stored elements in Compressed Sparse Row format>

In [22]:
# Show the vocabulary learned from the posts (the terms themselves, not a count).
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement, and .tolist() keeps the printed output a plain list.
print(vectorizer.get_feature_names_out().tolist())

['about', 'actually', 'capabilities', 'contains', 'data', 'databases', 'images', 'imaging', 'interesting', 'is', 'it', 'learning', 'machine', 'most', 'much', 'not', 'permanently', 'post', 'provide', 'safe', 'storage', 'store', 'stuff', 'this', 'toy']


In [26]:
X_train.toarray()

array([[1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
        1, 1, 1],
       [0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0],
       [0, 0, 0, 0, 3, 3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,
        0, 0, 0]], dtype=int64)

In [27]:
X_train.shape

(5, 25)

In [28]:
# The count matrix has one row per post and one column per vocabulary term.
num_samples, num_features = X_train.shape

print("#samples: %d, #features: %d" % (num_samples,num_features))
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement, and .tolist() keeps the printed output a plain list.
print(vectorizer.get_feature_names_out().tolist())


#samples: 5, #features: 25
['about', 'actually', 'capabilities', 'contains', 'data', 'databases', 'images', 'imaging', 'interesting', 'is', 'it', 'learning', 'machine', 'most', 'much', 'not', 'permanently', 'post', 'provide', 'safe', 'storage', 'store', 'stuff', 'this', 'toy']


In [31]:
# Query text to compare against the posts. transform() reuses the vocabulary
# fitted on the posts, so tokens outside it contribute nothing: 'database' is
# not in the vocabulary (only 'databases' is), which leaves 'imaging' as the
# single non-zero entry of the query vector.
new_post = 'imaging database'
new_post_vec = vectorizer.transform([new_post])

In [32]:
new_post_vec

<1x25 sparse matrix of type '<class 'numpy.int64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [34]:
# Show the vocabulary again to map the query vector's columns back to terms.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement, and .tolist() keeps the printed output a plain list.
print(vectorizer.get_feature_names_out().tolist())

['about', 'actually', 'capabilities', 'contains', 'data', 'databases', 'images', 'imaging', 'interesting', 'is', 'it', 'learning', 'machine', 'most', 'much', 'not', 'permanently', 'post', 'provide', 'safe', 'storage', 'store', 'stuff', 'this', 'toy']


In [33]:
new_post_vec.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0]])

## 計算查詢關鍵字對文章的距離 (距離一：歐式距離)

In [38]:
import scipy as sp

def dist_raw(v1, v2):
    """Return the Euclidean distance between two dense vectors.

    v1 and v2 must support subtraction (e.g. NumPy arrays of equal shape);
    the result is the norm of their element-wise difference.
    """
    return sp.linalg.norm(v1 - v2)

In [39]:
import numpy 
# Two dense vectors that differ in exactly two positions (by 1 each), so
# their Euclidean distance is sqrt(2).
a = numpy.array([0,0,1,1,0])
b = numpy.array([1,0,1,0,0])
dist_raw(a,b)

1.4142135623730951

In [44]:
import math
# The same Euclidean distance computed by hand on plain Python lists.
a = [0,0,1,1,0]
b = [1,0,1,0,0]
# Element-wise differences between the two vectors.
d = [x - y for x, y in zip(a, b)]
# Square root of the summed squares. The space between `2` and `for` matters:
# a numeric literal immediately followed by a keyword is deprecated syntax
# and triggers a SyntaxWarning on recent Python versions.
math.sqrt(sum(ele ** 2 for ele in d))

1.4142135623730951

In [48]:
# Same computation vectorised with NumPy: (a - b) ** 2 squares the
# element-wise differences, which are then summed and square-rooted.
a = numpy.array([0,0,1,1,0])
b = numpy.array([1,0,1,0,0])
math.sqrt(sum((a - b)** 2) )

1.4142135623730951

In [49]:
import scipy as sp

def dist_raw(v1, v2):
    """Return the Euclidean distance between two count vectors.

    Accepts SciPy sparse rows (such as rows of the CountVectorizer matrix) as
    well as dense NumPy arrays, so this redefinition does not break the
    earlier cell that calls dist_raw on plain arrays.

    Parameters:
        v1, v2: vectors of the same shape that support subtraction.

    Returns:
        The norm of ``v1 - v2`` as a float.
    """
    delta = v1 - v2
    # Sparse results are densified before taking the norm; dense arrays have
    # no toarray() method and are used as they are.
    if hasattr(delta, "toarray"):
        delta = delta.toarray()
    return sp.linalg.norm(delta)

In [None]:
import sys
# Find the post closest to the query using the raw Euclidean distance.
# dist_raw is used because this section covers the unnormalised distance and
# `dist` is only defined in a later cell (calling it here raises NameError on
# a fresh kernel).
best_doc = None
best_dist = float("inf")  # any real distance compares smaller than this
best_i = None
num_samples = len(posts)

for i in range(num_samples):
    post = posts[i]
    # Skip a post that is textually identical to the query itself.
    if post == new_post:
        continue
    post_vec = X_train.getrow(i)
    d = dist_raw(post_vec, new_post_vec)
    print("=== Post %i with dist=%.2f: %s"%(i, d, post))
    if d < best_dist:
        best_dist = d
        best_i = i
        best_doc = post

# best_i stays None when there were no candidate posts; "%i" cannot format None.
if best_i is not None:
    print("Best post is %i with dist=%.2f"%(best_i, best_dist))


In [56]:
# Raw Euclidean distance from every post to the query vector. Iterating over
# `posts` avoids hard-coding the number of documents.
for i, post in enumerate(posts):
    print(i, post, dist_raw(X_train[i], new_post_vec))

0 This is a toy post about machine learning. Actually, it contains not much interesting stuff. 3.872983346207417
1 Imaging databases provide storage capabilities. 2.0
2 Most imaging databases safe images permanently. 2.23606797749979
3 Imaging databases store data. 1.7320508075688772
4 Imaging databases store data. Imaging databases store data. Imaging databases store data. 5.5677643628300215


## 計算查詢關鍵字對文章的距離 (距離二：標準化後的歐式距離)

In [57]:
def dist(v1, v2):
    """Return the Euclidean distance between v1 and v2 after length normalisation.

    Each vector is scaled to unit length first, so vectors that point in the
    same direction (for example a text and the same text repeated three
    times) end up at distance 0.

    Parameters:
        v1, v2: SciPy sparse rows or dense NumPy arrays of the same shape.

    Returns:
        The norm of the difference of the two normalised vectors, as a float.
    """
    def _unit(vec):
        # Sparse rows expose toarray(); dense arrays are used as they are.
        dense = vec.toarray() if hasattr(vec, "toarray") else vec
        length = sp.linalg.norm(dense)
        # An all-zero vector (e.g. a query containing no known word) has no
        # direction; leave it unscaled instead of dividing by zero.
        return dense / length if length > 0 else dense

    return sp.linalg.norm(_unit(v1) - _unit(v2))

In [64]:
# a has three ones, so its norm is sqrt(3) and every non-zero entry becomes
# 1/sqrt(3) (about 0.577) after dividing by the norm.
a = numpy.array([1,1,1,0,0])
a / sp.linalg.norm(a)

array([0.57735027, 0.57735027, 0.57735027, 0.        , 0.        ])

In [65]:
# b is [1,1,1,0,0] scaled by 3: same direction, different length, so dividing
# by its norm yields the same unit vector.
b = numpy.array([3,3,3,0,0])
b / sp.linalg.norm(b)

array([0.57735027, 0.57735027, 0.57735027, 0.        , 0.        ])

In [59]:
# Length-normalised Euclidean distance from every post to the query vector.
# Iterating over `posts` avoids hard-coding the number of documents.
for i, post in enumerate(posts):
    print(i, post, dist(X_train[i], new_post_vec))

0 This is a toy post about machine learning. Actually, it contains not much interesting stuff. 1.4142135623730951
1 Imaging databases provide storage capabilities. 1.0514622242382672
2 Most imaging databases safe images permanently. 1.0878894332937856
3 Imaging databases store data. 1.0
4 Imaging databases store data. Imaging databases store data. Imaging databases store data. 1.0
