## Bag of Words

In [1]:
content = ["How to format my hard disk", " Hard disk format problems "]
content

['How to format my hard disk', ' Hard disk format problems ']

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
?CountVectorizer

In [5]:
vectorizer = CountVectorizer(min_df = 1)

In [6]:
X = vectorizer.fit_transform(content)

In [7]:
X

<2x7 sparse matrix of type '<class 'numpy.int64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [10]:
print(vectorizer.get_feature_names())
X.toarray()

['disk', 'format', 'hard', 'how', 'my', 'problems', 'to']


array([[1, 1, 1, 1, 1, 0, 1],
       [1, 1, 1, 0, 0, 1, 0]], dtype=int64)

## 計算查詢關鍵字對文章的距離

In [13]:
%ls data/toy

01.txt  02.txt  03.txt  04.txt  05.txt


In [17]:
import os
# Directory that holds the toy corpus files.
path = 'data/toy'
# os.listdir() returns entries in arbitrary order; sort for a reproducible listing.
sorted(os.path.join(path, name) for name in os.listdir(path))

['data/toy/01.txt',
 'data/toy/02.txt',
 'data/toy/03.txt',
 'data/toy/04.txt',
 'data/toy/05.txt']

In [19]:
# Read every file under `path` into memory, one string per post.
# Sorting keeps the post indices stable between runs (os.listdir() order is
# arbitrary), and the `with` block closes each file handle instead of leaking it.
posts = []
for fname in sorted(os.listdir(path)):
    with open(os.path.join(path, fname)) as fh:
        posts.append(fh.read())

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df = 1)
X_train = vectorizer.fit_transform(posts)

In [25]:
X_train

<5x25 sparse matrix of type '<class 'numpy.int64'>'
	with 33 stored elements in Compressed Sparse Row format>

In [22]:
# Print the vocabulary (the unique tokens) learned by the vectorizer
print(vectorizer.get_feature_names())

['about', 'actually', 'capabilities', 'contains', 'data', 'databases', 'images', 'imaging', 'interesting', 'is', 'it', 'learning', 'machine', 'most', 'much', 'not', 'permanently', 'post', 'provide', 'safe', 'storage', 'store', 'stuff', 'this', 'toy']


In [26]:
X_train.toarray()

array([[1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
        1, 1, 1],
       [0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0],
       [0, 0, 0, 0, 3, 3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,
        0, 0, 0]], dtype=int64)

In [27]:
X_train.shape

(5, 25)

In [28]:
# Unpack the shape of the document-term matrix: one row per post, one column per token.
num_samples, num_features = X_train.shape

# Report the matrix dimensions, then the tokens that label its columns.
print("#samples: %d, #features: %d" % (num_samples,num_features)) 
print(vectorizer.get_feature_names())


#samples: 5, #features: 25
['about', 'actually', 'capabilities', 'contains', 'data', 'databases', 'images', 'imaging', 'interesting', 'is', 'it', 'learning', 'machine', 'most', 'much', 'not', 'permanently', 'post', 'provide', 'safe', 'storage', 'store', 'stuff', 'this', 'toy']


In [31]:
new_post = 'imaging database'
new_post_vec = vectorizer.transform([new_post])

In [32]:
new_post_vec

<1x25 sparse matrix of type '<class 'numpy.int64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [34]:
# Print the vocabulary (the unique tokens) learned by the vectorizer
print(vectorizer.get_feature_names())

['about', 'actually', 'capabilities', 'contains', 'data', 'databases', 'images', 'imaging', 'interesting', 'is', 'it', 'learning', 'machine', 'most', 'much', 'not', 'permanently', 'post', 'provide', 'safe', 'storage', 'store', 'stuff', 'this', 'toy']


In [33]:
new_post_vec.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0]])

## 計算查詢關鍵字對文章的距離 (距離一：歐式距離)

In [38]:
import scipy as sp

def dist_raw(v1, v2):
    """Return the Euclidean distance between two count vectors.

    Accepts dense NumPy arrays as well as SciPy sparse matrices, so the same
    helper works on hand-written examples and on vectorizer output.

    Parameters
    ----------
    v1, v2 : numpy.ndarray or scipy.sparse matrix
        Vectors of identical shape.

    Returns
    -------
    float
        The L2 norm of ``v1 - v2``.
    """
    delta = v1 - v2
    # A sparse difference must be densified before scipy.linalg.norm can
    # process it; dense arrays have no ``toarray`` and pass through unchanged.
    if hasattr(delta, "toarray"):
        delta = delta.toarray()
    return sp.linalg.norm(delta)

In [39]:
import numpy 
a = numpy.array([0,0,1,1,0])
b = numpy.array([1,0,1,0,0])
dist_raw(a,b)

1.4142135623730951

In [44]:
import math
# Euclidean distance computed by hand with plain Python lists.
a = [0,0,1,1,0]
b = [1,0,1,0,0]
# Element-wise difference of the two vectors.
d = [x - y for x, y in zip(a, b)]
# Square root of the sum of squared differences. Note the space in "2 for":
# a number fused to a keyword ("2for") is deprecated syntax.
math.sqrt(sum(ele ** 2 for ele in d))

1.4142135623730951

In [48]:
# Euclidean distance again, this time letting NumPy do the element-wise arithmetic.
a = numpy.array([0,0,1,1,0])
b = numpy.array([1,0,1,0,0])
squared_diff = (a - b) ** 2
math.sqrt(sum(squared_diff))

1.4142135623730951

In [49]:
import scipy as sp

def dist_raw(v1, v2):
    """Return the Euclidean distance between two count vectors.

    This redefinition targets the sparse rows produced by the vectorizer, but
    it still accepts dense NumPy arrays so that earlier example cells keep
    working if they are re-run after this one.

    Parameters
    ----------
    v1, v2 : scipy.sparse matrix or numpy.ndarray
        Vectors of identical shape.

    Returns
    -------
    float
        The L2 norm of ``v1 - v2``.
    """
    delta = v1 - v2
    # Only sparse results expose ``toarray``; dense arrays are used as they are.
    dense_delta = delta.toarray() if hasattr(delta, "toarray") else delta
    return sp.linalg.norm(dense_delta)

In [None]:
import sys
# Nearest-neighbour search: find the post whose count vector is closest to
# `new_post_vec`, using the raw Euclidean distance.
best_doc = None
# Start from infinity so that any real distance wins; a finite sentinel such
# as 999 could be exceeded by long documents.
best_dist = float('inf')
best_i = None
num_samples = len(posts)

for i, post in enumerate(posts):
    # Skip a post whose text is identical to the query itself.
    if post == new_post:
        continue
    post_vec = X_train.getrow(i)
    # The normalized `dist` helper is only defined later in the notebook, so
    # calling it here fails on a fresh kernel; dist_raw is already available.
    d = dist_raw(post_vec, new_post_vec)
    print("=== Post %i with dist=%.2f: %s"%(i, d, post))
    if d < best_dist:
        best_dist = d
        best_i = i
        best_doc = post

# Guard the summary: formatting None with %i would raise if nothing was compared.
if best_i is None:
    print("No candidate post found")
else:
    print("Best post is %i with dist=%.2f"%(best_i, best_dist))


In [56]:
# Raw Euclidean distance from the query to every post.
# Enumerating `posts` avoids hard-coding the corpus size.
for i, post in enumerate(posts):
    print(i, post, dist_raw(X_train[i], new_post_vec))

0 This is a toy post about machine learning. Actually, it contains not much interesting stuff. 3.872983346207417
1 Imaging databases provide storage capabilities. 2.0
2 Most imaging databases safe images permanently. 2.23606797749979
3 Imaging databases store data. 1.7320508075688772
4 Imaging databases store data. Imaging databases store data. Imaging databases store data. 5.5677643628300215


## 計算查詢關鍵字對文章的距離 (距離二：標準化後的歐式距離)

In [57]:
def dist(v1, v2):
    """Euclidean distance between two sparse vectors after unit-length scaling.

    Each vector is divided by its own L2 norm first, so the result depends on
    the relative token proportions rather than on the absolute counts.

    Parameters
    ----------
    v1, v2 : scipy.sparse matrix
        Row vectors of identical shape (they must provide ``toarray()``).

    Returns
    -------
    float
        The L2 norm of the difference between the two scaled vectors.
    """
    unit_vectors = [vec / sp.linalg.norm(vec.toarray()) for vec in (v1, v2)]
    difference = unit_vectors[0] - unit_vectors[1]
    return sp.linalg.norm(difference.toarray())

In [64]:
a = numpy.array([1,1,1,0,0])
#math.sqrt(3)
a / sp.linalg.norm(a)

array([0.57735027, 0.57735027, 0.57735027, 0.        , 0.        ])

In [65]:
b = numpy.array([3,3,3,0,0])
b / sp.linalg.norm(b)

array([0.57735027, 0.57735027, 0.57735027, 0.        , 0.        ])

In [59]:
# Normalized distance from the query to every post.
# Enumerating `posts` avoids hard-coding the corpus size.
for i, post in enumerate(posts):
    print(i, post, dist(X_train[i], new_post_vec))

0 This is a toy post about machine learning. Actually, it contains not much interesting stuff. 1.4142135623730951
1 Imaging databases provide storage capabilities. 1.0514622242382672
2 Most imaging databases safe images permanently. 1.0878894332937856
3 Imaging databases store data. 1.0
4 Imaging databases store data. Imaging databases store data. Imaging databases store data. 1.0


In [66]:
# Print the vocabulary (the unique tokens) learned by the vectorizer
print(vectorizer.get_feature_names())

['about', 'actually', 'capabilities', 'contains', 'data', 'databases', 'images', 'imaging', 'interesting', 'is', 'it', 'learning', 'machine', 'most', 'much', 'not', 'permanently', 'post', 'provide', 'safe', 'storage', 'store', 'stuff', 'this', 'toy']


## 移除停用詞

In [67]:
! pip install nltk

[33mYou are using pip version 9.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [69]:
vectorizer = CountVectorizer(min_df=1, stop_words='english')
X = vectorizer.fit_transform(posts)
print(vectorizer.get_feature_names())
print(X.shape)

['actually', 'capabilities', 'contains', 'data', 'databases', 'images', 'imaging', 'interesting', 'learning', 'machine', 'permanently', 'post', 'provide', 'safe', 'storage', 'store', 'stuff', 'toy']
(5, 18)


In [70]:
new_post = 'imaging database'
new_post_vec = vectorizer.transform([new_post])

In [72]:
new_post_vec.toarray()

array([[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [71]:
# Normalized distance from the query to every post, with stop words removed.
# Enumerating `posts` avoids hard-coding the corpus size.
for i, post in enumerate(posts):
    print(i, post, dist(X[i], new_post_vec))

0 This is a toy post about machine learning. Actually, it contains not much interesting stuff. 1.4142135623730951
1 Imaging databases provide storage capabilities. 1.0514622242382672
2 Most imaging databases safe images permanently. 1.0514622242382672
3 Imaging databases store data. 1.0
4 Imaging databases store data. Imaging databases store data. Imaging databases store data. 1.0


## Stemming

In [73]:
import nltk.stem
s = nltk.stem.SnowballStemmer('english')


In [74]:
s.stem('graphics')

'graphic'

In [75]:
print(s.stem("imaging"))
print(s.stem("image"))

print(s.stem("imagination"))
print(s.stem("imagine"))

imag
imag
imagin
imagin


In [76]:
import nltk.stem
english_stemmer = nltk.stem.SnowballStemmer('english')

class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant that stems every token before counting it."""

    def build_analyzer(self):
        """Return the parent analyzer wrapped so each token it yields is stemmed."""
        base_analyzer = super().build_analyzer()

        def stemmed_tokens(doc):
            # Lazily run each token from the stock analyzer through the stemmer.
            return (english_stemmer.stem(token) for token in base_analyzer(doc))

        return stemmed_tokens

In [77]:
vectorizer = StemmedCountVectorizer(min_df=1, stop_words='english')
X = vectorizer.fit_transform(posts)
print(vectorizer.get_feature_names())
print(X.shape)

['actual', 'capabl', 'contain', 'data', 'databas', 'imag', 'interest', 'learn', 'machin', 'perman', 'post', 'provid', 'safe', 'storag', 'store', 'stuff', 'toy']
(5, 17)


In [78]:
new_post = 'imaging database'
new_post_vec = vectorizer.transform([new_post])

In [80]:
new_post_vec.toarray()

array([[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [82]:
# Normalized distance from the query to every post, using stemmed tokens.
# Enumerating `posts` avoids hard-coding the corpus size.
for i, post in enumerate(posts):
    print(i, post, dist(X[i], new_post_vec))

0 This is a toy post about machine learning. Actually, it contains not much interesting stuff. 1.414213562373095
1 Imaging databases provide storage capabilities. 0.8573732768944039
2 Most imaging databases safe images permanently. 0.6296288974669553
3 Imaging databases store data. 0.7653668647301795
4 Imaging databases store data. Imaging databases store data. Imaging databases store data. 0.7653668647301795


## TF-IDF 詞頻矩陣

In [83]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
X = vectorizer.fit_transform(posts)
print(vectorizer.get_feature_names())
print(X.shape)

['actually', 'capabilities', 'contains', 'data', 'databases', 'images', 'imaging', 'interesting', 'learning', 'machine', 'permanently', 'post', 'provide', 'safe', 'storage', 'store', 'stuff', 'toy']
(5, 18)


In [86]:
#X.toarray()

In [88]:
new_post = 'imaging database'
new_post_vec = vectorizer.transform([new_post])

In [89]:
# Normalized distance from the query to every post, using TF-IDF weights.
# Enumerating `posts` avoids hard-coding the corpus size.
for i, post in enumerate(posts):
    print(i, post, dist(X[i], new_post_vec))

0 This is a toy post about machine learning. Actually, it contains not much interesting stuff. 1.4142135623730951
1 Imaging databases provide storage capabilities. 1.187009812033225
2 Most imaging databases safe images permanently. 1.187009812033225
3 Imaging databases store data. 1.091020922163783
4 Imaging databases store data. Imaging databases store data. Imaging databases store data. 1.091020922163783


In [90]:
from sklearn.feature_extraction.text import TfidfVectorizer

class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer variant that stems every token before weighting it."""

    def build_analyzer(self):
        """Return the parent analyzer wrapped so each token it yields is stemmed."""
        # Zero-argument super() starts the lookup at this class's direct parent.
        # Naming TfidfVectorizer in super(...) would skip TfidfVectorizer itself
        # and ignore any build_analyzer it defines.
        analyzer = super().build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))


In [91]:
vectorizer = StemmedTfidfVectorizer(min_df=1, stop_words='english')
X = vectorizer.fit_transform(posts)
print(vectorizer.get_feature_names())
print(X.shape)

['actual', 'capabl', 'contain', 'data', 'databas', 'imag', 'interest', 'learn', 'machin', 'perman', 'post', 'provid', 'safe', 'storag', 'store', 'stuff', 'toy']
(5, 17)


In [92]:
new_post = 'imaging database'
new_post_vec = vectorizer.transform([new_post])

In [93]:
# Normalized distance from the query to every post, using stemmed TF-IDF weights.
# Enumerating `posts` avoids hard-coding the corpus size.
for i, post in enumerate(posts):
    print(i, post, dist(X[i], new_post_vec))

0 This is a toy post about machine learning. Actually, it contains not much interesting stuff. 1.4142135623730951
1 Imaging databases provide storage capabilities. 1.0789758507558254
2 Most imaging databases safe images permanently. 0.859044512133176
3 Imaging databases store data. 0.924634506718001
4 Imaging databases store data. Imaging databases store data. Imaging databases store data. 0.924634506718001


## 中文詞頻矩陣

In [105]:
! pip uninstall jieba

Uninstalling jieba-0.39:
  /Users/davidchiu/.pyenv/versions/3.6.2/lib/python3.6/site-packages/jieba-0.39.dist-info/INSTALLER
  /Users/davidchiu/.pyenv/versions/3.6.2/lib/python3.6/site-packages/jieba-0.39.dist-info/METADATA
  /Users/davidchiu/.pyenv/versions/3.6.2/lib/python3.6/site-packages/jieba-0.39.dist-info/RECORD
  /Users/davidchiu/.pyenv/versions/3.6.2/lib/python3.6/site-packages/jieba-0.39.dist-info/WHEEL
  /Users/davidchiu/.pyenv/versions/3.6.2/lib/python3.6/site-packages/jieba-0.39.dist-info/top_level.txt
  /Users/davidchiu/.pyenv/versions/3.6.2/lib/python3.6/site-packages/jieba/__init__.py
  /Users/davidchiu/.pyenv/versions/3.6.2/lib/python3.6/site-packages/jieba/__init__.pyc
  /Users/davidchiu/.pyenv/versions/3.6.2/lib/python3.6/site-packages/jieba/__main__.py
  /Users/davidchiu/.pyenv/versions/3.6.2/lib/python3.6/site-packages/jieba/__pycache__/__init__.cpython-36.pyc
  /Users/davidchiu/.pyenv/versions/3.6.2/lib/python3.6/site-packages/jieba/__pycache__/__main__.cpython-36

## 繁體中文版jieba
- https://github.com/ldkrsi/jieba-zh_TW

In [3]:
import jieba
jieba.load_userdict('userdict.txt')

for w in jieba.cut('柯文哲為了大巨蛋一事找趙藤雄算帳'):
    print(w)

柯文哲
為
了
大巨蛋
一事
找
趙藤雄
算帳


In [7]:
a = ['柯文哲為了大巨蛋一事找趙藤雄算帳', '柯P將不在大巨蛋舉辦世運會']

# Segment each sentence with jieba and join the tokens with spaces, so every
# token becomes a separate space-delimited word.
corpus = []
for sentence in a:
    corpus.append(' '.join(jieba.cut(sentence)))
corpus

['柯文哲 為 了 大巨蛋 一事 找 趙藤雄 算帳', '柯P 將不在 大巨蛋 舉辦 世運會']

In [8]:
corpus = [' '.join(jieba.cut(s)) for s in a]
corpus

['柯文哲 為 了 大巨蛋 一事 找 趙藤雄 算帳', '柯P 將不在 大巨蛋 舉辦 世運會']

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df = 1)
X = vectorizer.fit_transform(corpus)

In [10]:
X

<2x9 sparse matrix of type '<class 'numpy.int64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [11]:
print(vectorizer.get_feature_names())

['一事', '世運會', '大巨蛋', '將不在', '柯p', '柯文哲', '算帳', '舉辦', '趙藤雄']


In [12]:
print(X.toarray())

[[1 0 1 0 0 1 1 0 1]
 [0 1 1 1 1 0 0 1 0]]


In [17]:
import scipy as sp
def dist(v1, v2):
    """Euclidean distance between two sparse vectors after scaling each to unit length.

    NOTE(review): this repeats the ``dist`` helper defined earlier in the
    notebook; presumably it is redefined because the kernel was restarted
    before this section — confirm and keep a single definition.
    """
    # Divide each vector by its own L2 norm so that only direction, not magnitude, matters.
    v1_normalized  = v1 / sp.linalg.norm(v1.toarray()) 
    v2_normalized  = v2 / sp.linalg.norm(v2.toarray())
    delta = v1_normalized - v2_normalized
    return sp.linalg.norm(delta.toarray())

In [18]:
dist(X[0],X[1])

1.2649110640673518

In [38]:
import requests
keyword = '郭雪芙'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of silently parsing an error page.
res = requests.get('https://zh.wikipedia.org/wiki/{}'.format(keyword), timeout=10)
res.raise_for_status()

In [39]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(res.text, 'lxml')

In [40]:
# Collect the text of every bold (<b>) element found inside the article's paragraphs.
synonym = [
    bold.text
    for paragraph in soup.select('.mw-parser-output p')
    for bold in paragraph.select('b')
]
synonym

['郭雪芙', 'Puff Kuo']

## synonyms.txt
- 柯文哲/柯P/柯p
- 郭雪芙/puff

In [42]:
# Build a lookup that maps each alias (lower-cased) to its canonical term.
# Each line is split on "/": the first field is the canonical term and the
# remaining fields are its aliases.
synonym_dic = {}
# Open with an explicit encoding instead of the platform default, and let the
# `with` block close the handle.
# NOTE(review): assumes synonyms.txt is UTF-8 encoded — confirm.
with open('synonyms.txt', encoding='utf-8') as synonym_file:
    for line in synonym_file:
        synonym = line.strip().split('/')
        for w in synonym[1:]:
            synonym_dic[w.lower()]  = synonym[0]
synonym_dic

{'puff': '郭雪芙', '柯p': '柯文哲'}

In [43]:
import nltk.stem

class SynonymCountVectorizer(CountVectorizer):
    """CountVectorizer variant that replaces known aliases with their canonical term."""

    def build_analyzer(self):
        """Return the parent analyzer wrapped with a ``synonym_dic`` lookup."""
        base_analyzer = super().build_analyzer()

        def canonical_tokens(doc):
            # Tokens without an entry in synonym_dic pass through unchanged.
            return (synonym_dic.get(token, token) for token in base_analyzer(doc))

        return canonical_tokens

vectorizer = SynonymCountVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())


['一事', '世運會', '大巨蛋', '將不在', '柯文哲', '算帳', '舉辦', '趙藤雄']


In [44]:
X.toarray()

array([[1, 0, 1, 0, 1, 1, 0, 1],
       [0, 1, 1, 1, 1, 0, 1, 0]], dtype=int64)

In [45]:
stopwords = ['為了', '一事', '將不在']

vectorizer = SynonymCountVectorizer(stop_words=stopwords)
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())


['世運會', '大巨蛋', '柯文哲', '算帳', '舉辦', '趙藤雄']


In [47]:
content = [ '2天才女童到澳洲深造 竟全身傷疑受虐',
'來自台灣、現在只有9歲與7歲的2名兒童，上周六(13日)流落於澳洲街頭時被當地警方發現帶回安置，懷疑受虐和人口販運有關，隨即通知我國警方，不過經調查發現，這2名高智商的女童，其實是由父親委託友人帶去澳洲自主學習，父親表示並非遭到人口販運，小姐妹數理方面優異，當初委託朋友帶到澳洲深造。']

In [49]:
corpus = [' '.join(jieba.cut(s)) for s in content]

In [52]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

In [57]:
import scipy as sp

def dist_raw(v1, v2):
    """Euclidean distance between two sparse vectors, without any normalization.

    Both arguments must be sparse: their difference is converted to a dense
    array with ``toarray()`` before the norm is taken.
    """
    return sp.linalg.norm((v1 - v2).toarray())

In [58]:
dist_raw(X[0], X[1])

7.681145747868608

In [59]:
X.toarray()

array([[0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0],
       [1, 1, 1, 2, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1,
        1, 1, 1]], dtype=int64)

In [60]:
content=  ['我喜歡看電視不喜歡看電影','我不喜歡看電視也不喜歡看電影']

In [61]:
corpus = [' '.join(jieba.cut(s)) for s in content]

In [67]:
corpus

['我 喜歡 看 電視 不 喜歡 看 電影', '我 不 喜歡 看 電視 也 不 喜歡 看 電影']

In [80]:
content = ['柯文哲為了大巨蛋一事找趙藤雄算帳', '柯P將不在大巨蛋舉辦世運會']
corpus = [' '.join(jieba.cut(s)) for s in content]

In [81]:
import nltk.stem

class SynonymCountVectorizer(CountVectorizer):
    """CountVectorizer variant that replaces known aliases with their canonical term.

    NOTE(review): identical to the SynonymCountVectorizer defined earlier in
    this notebook; the later definition silently shadows the earlier one.
    """
    def build_analyzer(self):
        """Return the parent analyzer wrapped with a ``synonym_dic`` lookup."""
        analyzer = super(SynonymCountVectorizer, self).build_analyzer()
        # Tokens without an entry in synonym_dic pass through unchanged.
        return lambda doc: (synonym_dic.get(w, w) for w in analyzer(doc))

vectorizer = SynonymCountVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())


['一事', '世運會', '大巨蛋', '將不在', '柯文哲', '算帳', '舉辦', '趙藤雄']


In [82]:
X.toarray()

array([[1, 0, 1, 0, 1, 1, 0, 1],
       [0, 1, 1, 1, 1, 0, 1, 0]], dtype=int64)

In [83]:
from sklearn.metrics.pairwise import cosine_similarity
cs = cosine_similarity(X)
cs

array([[1. , 0.4],
       [0.4, 1. ]])

In [84]:
from sklearn.metrics.pairwise import cosine_distances
cs = cosine_distances(X)
cs

array([[0. , 0.6],
       [0.6, 0. ]])