## 詞頻矩陣

In [2]:
content = ["How to format my hard disk", 
           "Hard disk format problems "]

In [6]:
words0 = content[0].lower().split()
words0

['how', 'to', 'format', 'my', 'hard', 'disk']

In [7]:
words1 = content[1].lower().split()
words1

['hard', 'disk', 'format', 'problems']

In [9]:
words = set(words0) | set(words1)

In [10]:
len(words)

7

In [14]:
import numpy 
m = numpy.zeros((2,7))
m

array([[0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0.]])

In [17]:
word_to_id = {}
id_to_word = {}
for idx, w in enumerate(words):
    #print(idx, w)
    word_to_id[w]   = idx
    id_to_word[idx] = w

In [18]:
word_to_id

{'how': 0, 'to': 1, 'my': 2, 'disk': 3, 'hard': 4, 'problems': 5, 'format': 6}

In [19]:
id_to_word

{0: 'how', 1: 'to', 2: 'my', 3: 'disk', 4: 'hard', 5: 'problems', 6: 'format'}

In [22]:
for w in words0:
    m[0,word_to_id[w]] = 1

In [24]:
word_to_id

{'how': 0, 'to': 1, 'my': 2, 'disk': 3, 'hard': 4, 'problems': 5, 'format': 6}

In [23]:
m

array([[1., 1., 1., 1., 1., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0.]])

In [25]:
for w in words1:
    m[1,word_to_id[w]] = 1

In [26]:
m

array([[1., 1., 1., 1., 1., 0., 1.],
       [0., 0., 0., 1., 1., 1., 1.]])

In [27]:
m.shape

(2, 7)

In [32]:
import math
math.sqrt(((m[0] - m[1]) ** 2).sum())

2.0

## 使用 sklearn 的 CountVectorizer

In [None]:
content = ["How to format my hard disk", 
           "Hard disk format problems "]

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
# fit: 產生結構
# transform: 轉換
# fit_transform: 產生結構並轉換
X= vectorizer.fit_transform(content) 

In [35]:
X

<2x7 sparse matrix of type '<class 'numpy.int64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [37]:
print(vectorizer.get_feature_names())

['disk', 'format', 'hard', 'how', 'my', 'problems', 'to']


In [36]:
X.toarray()

array([[1, 1, 1, 1, 1, 0, 1],
       [1, 1, 1, 0, 0, 1, 0]], dtype=int64)

## 利用相似度檢索

In [38]:
contents = ['This is a toy post about machine learning. Actually, it contains not much interesting stuff.',
'Imaging databases can get huge.',
'Most imaging databases safe images permanently.',
'Imaging databases store images.',
'Imaging databases store images. Imaging databases store images. Imaging databases store images.']

In [39]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(contents)

In [40]:
X

<5x24 sparse matrix of type '<class 'numpy.int64'>'
	with 33 stored elements in Compressed Sparse Row format>

In [42]:
print(vectorizer.get_feature_names())

['about', 'actually', 'can', 'contains', 'databases', 'get', 'huge', 'images', 'imaging', 'interesting', 'is', 'it', 'learning', 'machine', 'most', 'much', 'not', 'permanently', 'post', 'safe', 'store', 'stuff', 'this', 'toy']


In [41]:
X.toarray()

array([[1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1,
        1, 1],
       [0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,
        0, 0],
       [0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0],
       [0, 0, 0, 0, 3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,
        0, 0]], dtype=int64)

In [44]:
new_post = 'imaging database'
new_post_vec = vectorizer.transform([new_post])

In [45]:
new_post_vec

<1x24 sparse matrix of type '<class 'numpy.int64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [46]:
new_post_vec.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]])

In [47]:
import scipy as sp

def dist_raw(v1, v2):
    delta = v1-v2
    return sp.linalg.norm(delta.toarray())


In [54]:
math.sqrt(((X[0] - new_post_vec).toarray() ** 2).sum())

3.872983346207417

In [49]:
dist_raw(X[0], new_post_vec)

3.872983346207417

In [55]:
for i in range(5):
    d = dist_raw(X[i], new_post_vec)
    print(contents[i], d)

This is a toy post about machine learning. Actually, it contains not much interesting stuff. 3.872983346207417
Imaging databases can get huge. 2.0
Most imaging databases safe images permanently. 2.23606797749979
Imaging databases store images. 1.7320508075688772
Imaging databases store images. Imaging databases store images. Imaging databases store images. 5.5677643628300215


## 使用相對距離

In [56]:
def dist(v1, v2):
    v1_normalized  = v1 / sp.linalg.norm(v1.toarray()) 
    v2_normalized  = v2 / sp.linalg.norm(v2.toarray())
    delta = v1_normalized - v2_normalized
    return sp.linalg.norm(delta.toarray())

In [57]:
for i in range(5):
    d = dist(X[i], new_post_vec)
    print(contents[i], d)

This is a toy post about machine learning. Actually, it contains not much interesting stuff. 1.414213562373095
Imaging databases can get huge. 1.0514622242382672
Most imaging databases safe images permanently. 1.0878894332937856
Imaging databases store images. 1.0
Imaging databases store images. Imaging databases store images. Imaging databases store images. 1.0


## Stopwords

In [58]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(contents)

In [59]:
X

<5x15 sparse matrix of type '<class 'numpy.int64'>'
	with 24 stored elements in Compressed Sparse Row format>

In [60]:
new_post = 'imaging database'
new_post_vec = vectorizer.transform([new_post])

In [62]:
print(vectorizer.get_feature_names())

['actually', 'contains', 'databases', 'huge', 'images', 'imaging', 'interesting', 'learning', 'machine', 'permanently', 'post', 'safe', 'store', 'stuff', 'toy']


In [61]:
for i in range(5):
    d = dist(X[i], new_post_vec)
    print(contents[i], d)

This is a toy post about machine learning. Actually, it contains not much interesting stuff. 1.4142135623730951
Imaging databases can get huge. 0.9194016867619662
Most imaging databases safe images permanently. 1.0514622242382672
Imaging databases store images. 1.0
Imaging databases store images. Imaging databases store images. Imaging databases store images. 1.0


## Stemming

In [63]:
import nltk.stem
s = nltk.stem.SnowballStemmer('english')
s.stem('graphics')

'graphic'

In [64]:
print(s.stem("imaging"))
print(s.stem("image"))
print(s.stem("imagination"))
print(s.stem("imagine"))


imag
imag
imagin
imagin


In [65]:
import nltk.stem
english_stemmer = nltk.stem.SnowballStemmer('english')

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))


In [66]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = StemmedCountVectorizer(stop_words='english')
X = vectorizer.fit_transform(contents)

In [68]:
print(vectorizer.get_feature_names())

['actual', 'contain', 'databas', 'huge', 'imag', 'interest', 'learn', 'machin', 'perman', 'post', 'safe', 'store', 'stuff', 'toy']


In [67]:
X

<5x14 sparse matrix of type '<class 'numpy.int64'>'
	with 21 stored elements in Compressed Sparse Row format>

In [69]:
new_post = 'imaging database'
new_post_vec = vectorizer.transform([new_post])

In [70]:
new_post_vec

<1x14 sparse matrix of type '<class 'numpy.int64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [71]:
new_post_vec.toarray()

array([[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [72]:
for i in range(5):
    d = dist(X[i], new_post_vec)
    print(contents[i], d)

This is a toy post about machine learning. Actually, it contains not much interesting stuff. 1.414213562373095
Imaging databases can get huge. 0.6058108930553725
Most imaging databases safe images permanently. 0.6296288974669553
Imaging databases store images. 0.5176380902050415
Imaging databases store images. Imaging databases store images. Imaging databases store images. 0.5176380902050415
