In [1]:
s = '柯文哲為了大巨蛋一事槓上遠雄'

In [4]:
import jieba
jieba.load_userdict('userdict.txt')
for w in jieba.cut(s):
    print(w)

柯文哲
為
了
大巨蛋
一事
槓上
遠雄


In [5]:
content = ["How to format my hard disk", " Hard disk format problems "]
content

['How to format my hard disk', ' Hard disk format problems ']

In [9]:
a = content[0].lower().split()
b = content[1].lower().split()

In [10]:
a

['how', 'to', 'format', 'my', 'hard', 'disk']

In [11]:
b

['hard', 'disk', 'format', 'problems']

In [13]:
words_set = set(a) | set(b)
words_set

{'disk', 'format', 'hard', 'how', 'my', 'problems', 'to'}

In [16]:
word_to_id = {}
id_to_word = {}
for i in enumerate(words_set):
    #print(i)
    word_to_id[i[1]] = i[0]
    id_to_word[i[0]] = i[1]

In [18]:
word_to_id

{'how': 0, 'hard': 1, 'disk': 2, 'problems': 3, 'format': 4, 'to': 5, 'my': 6}

In [19]:
id_to_word

{0: 'how', 1: 'hard', 2: 'disk', 3: 'problems', 4: 'format', 5: 'to', 6: 'my'}

In [21]:
a_ary = [word_to_id.get(w) for w in a]
b_ary = [word_to_id.get(w) for w in b]

In [23]:
a_ary

[0, 5, 4, 6, 1, 2]

In [25]:
b_ary

[1, 2, 4, 3]

In [27]:
import numpy as np
m = np.zeros((2,7))

In [28]:
for e in a_ary:
    m[0,e] = 1
for e in b_ary:
    m[1,e] = 1

In [29]:
m

array([[1., 1., 1., 0., 1., 1., 1.],
       [0., 1., 1., 1., 1., 0., 0.]])

In [33]:
import math
math.sqrt(((m[0] - m[1])**2).sum())

2.0

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(content)

In [35]:
X

<2x7 sparse matrix of type '<class 'numpy.int64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [37]:
print(vectorizer.get_feature_names())

['disk', 'format', 'hard', 'how', 'my', 'problems', 'to']


In [36]:
X.toarray()

array([[1, 1, 1, 1, 1, 0, 1],
       [1, 1, 1, 0, 0, 1, 0]], dtype=int64)

## 計算文字對文字的距離

In [38]:
posts = [
'This is a toy post about machine learning. Actually, it contains not much interesting stuff.',
'Imaging databases provide storage capabilities.' ,
'Most imaging databases safe images permanently.',
'Imaging databases store data.',
'Imaging databases store data. Imaging databases store data. Imaging databases store data.',
]

In [39]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(posts)

In [40]:
print(vectorizer.get_feature_names())

['about', 'actually', 'capabilities', 'contains', 'data', 'databases', 'images', 'imaging', 'interesting', 'is', 'it', 'learning', 'machine', 'most', 'much', 'not', 'permanently', 'post', 'provide', 'safe', 'storage', 'store', 'stuff', 'this', 'toy']


In [41]:
X

<5x25 sparse matrix of type '<class 'numpy.int64'>'
	with 33 stored elements in Compressed Sparse Row format>

In [43]:
#X.toarray()

In [46]:
query = 'Imaging database'

In [48]:
Y = vectorizer.transform([query])

In [49]:
Y

<1x25 sparse matrix of type '<class 'numpy.int64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [52]:
(Y - X[0]).toarray()

array([[-1, -1,  0, -1,  0,  0,  0,  1, -1, -1, -1, -1, -1,  0, -1, -1,
         0, -1,  0,  0,  0,  0, -1, -1, -1]], dtype=int64)

## 使用Euclidean Distance 計算距離

In [61]:
import scipy as sp

def euclidean_dist_raw(v1, v2):
    delta = v1-v2
    return sp.linalg.norm(delta)

In [55]:
?sp.linalg.norm

In [58]:
sp.linalg.norm(np.array([-1,1,-1,1]))

2.0

In [62]:
for i in range(5):
    print(posts[i], euclidean_dist_raw(X[i].toarray(), Y.toarray()))

This is a toy post about machine learning. Actually, it contains not much interesting stuff. 3.872983346207417
Imaging databases provide storage capabilities. 2.0
Most imaging databases safe images permanently. 2.23606797749979
Imaging databases store data. 1.7320508075688772
Imaging databases store data. Imaging databases store data. Imaging databases store data. 5.5677643628300215


In [63]:
sp.linalg.norm(np.array([1,1,1,1]))

2.0

In [64]:
sp.linalg.norm(np.array([0,0,1,1]))

1.4142135623730951

In [65]:
sp.linalg.norm(np.array([1,1,1,1,1,1,1,1,1,1,1,1]))

3.4641016151377544

## 將每個詞向量先做正規化後再計算距離

In [70]:
def dist(v1, v2):
    v1_normalized  = v1 / sp.linalg.norm(v1) 
    v2_normalized  = v2 / sp.linalg.norm(v2)
    delta = v1_normalized - v2_normalized
    return sp.linalg.norm(delta)

In [71]:
for i in range(5):
    print(posts[i], dist(X[i].toarray(), Y.toarray()))

This is a toy post about machine learning. Actually, it contains not much interesting stuff. 1.4142135623730951
Imaging databases provide storage capabilities. 1.0514622242382672
Most imaging databases safe images permanently. 1.0878894332937856
Imaging databases store data. 1.0
Imaging databases store data. Imaging databases store data. Imaging databases store data. 1.0


## 移除停用詞

In [72]:
print(vectorizer.get_feature_names())

['about', 'actually', 'capabilities', 'contains', 'data', 'databases', 'images', 'imaging', 'interesting', 'is', 'it', 'learning', 'machine', 'most', 'much', 'not', 'permanently', 'post', 'provide', 'safe', 'storage', 'store', 'stuff', 'this', 'toy']


In [73]:
from sklearn.feature_extraction.text import CountVectorizer
?CountVectorizer

In [74]:

vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(posts)

In [75]:
print(vectorizer.get_feature_names())

['actually', 'capabilities', 'contains', 'data', 'databases', 'images', 'imaging', 'interesting', 'learning', 'machine', 'permanently', 'post', 'provide', 'safe', 'storage', 'store', 'stuff', 'toy']


In [77]:
X

<5x18 sparse matrix of type '<class 'numpy.int64'>'
	with 26 stored elements in Compressed Sparse Row format>

In [79]:
Y = vectorizer.transform([query])

In [80]:
for i in range(5):
    print(posts[i], dist(X[i].toarray(), Y.toarray()))

This is a toy post about machine learning. Actually, it contains not much interesting stuff. 1.414213562373095
Imaging databases provide storage capabilities. 1.0514622242382672
Most imaging databases safe images permanently. 1.0514622242382672
Imaging databases store data. 1.0
Imaging databases store data. Imaging databases store data. Imaging databases store data. 1.0


## Stemming

In [81]:
import nltk.stem
s = nltk.stem.SnowballStemmer('english')

In [82]:
s.stem('graphics')

'graphic'

In [83]:
print(s.stem("imaging"))
print(s.stem("image"))

print(s.stem("imagination"))
print(s.stem("imagine"))

imag
imag
imagin
imagin


In [85]:
class Dog():
    def eat(self):
        print('i am eating')
        
class Cat():
    def eat(self):
        print('i am eating')
        
a = Dog()
a.eat()

i am eating


In [86]:
b = Cat()
b.eat()

i am eating


In [89]:
class Animal():
    def eat(self):
        print('i am eating')
    def walk(self):
        print('i am walking')
        
class Dog(Animal):
    def play(self):
        print('i am playing')
        
class Cat(Animal):
    def sleep(self):
        print('i am sleeping')

a = Dog()
a.eat()
a.play()
a.sleep()

i am eating
i am playing


AttributeError: 'Dog' object has no attribute 'sleep'

In [92]:

import nltk.stem
english_stemmer = nltk.stem.SnowballStemmer('english')

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

In [93]:
vectorizer = StemmedCountVectorizer(stop_words='english')
X = vectorizer.fit_transform(posts)
Y = vectorizer.transform([query])

In [96]:
print(vectorizer.get_feature_names())

['actual', 'capabl', 'contain', 'data', 'databas', 'imag', 'interest', 'learn', 'machin', 'perman', 'post', 'provid', 'safe', 'storag', 'store', 'stuff', 'toy']


In [94]:
for i in range(5):
    print(posts[i], dist(X[i].toarray(), Y.toarray()))

This is a toy post about machine learning. Actually, it contains not much interesting stuff. 1.414213562373095
Imaging databases provide storage capabilities. 0.8573732768944039
Most imaging databases safe images permanently. 0.6296288974669553
Imaging databases store data. 0.7653668647301795
Imaging databases store data. Imaging databases store data. Imaging databases store data. 0.7653668647301795


## TFIDF 詞頻矩陣

In [98]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(posts)
Y = vectorizer.transform([query])

In [102]:
X

<5x25 sparse matrix of type '<class 'numpy.float64'>'
	with 33 stored elements in Compressed Sparse Row format>

In [103]:
X.toarray()

array([[0.26726124, 0.26726124, 0.        , 0.26726124, 0.        ,
        0.        , 0.        , 0.        , 0.26726124, 0.26726124,
        0.26726124, 0.26726124, 0.26726124, 0.        , 0.26726124,
        0.26726124, 0.        , 0.26726124, 0.        , 0.        ,
        0.        , 0.        , 0.26726124, 0.26726124, 0.26726124],
       [0.        , 0.        , 0.52451722, 0.        , 0.        ,
        0.29550385, 0.        , 0.29550385, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.52451722, 0.        ,
        0.52451722, 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.26169047, 0.46449871, 0.26169047, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.46449871, 0.        ,
        0.        , 0.46449871, 0.        , 0.        , 0.46449871,
        0.        , 0.        , 0.        , 0.

In [104]:

import nltk.stem
english_stemmer = nltk.stem.SnowballStemmer('english')

class StemmedTfidfVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

In [105]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = StemmedTfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(posts)
Y = vectorizer.transform([query])

In [106]:
for i in range(5):
    print(posts[i], dist(X[i].toarray(), Y.toarray()))

This is a toy post about machine learning. Actually, it contains not much interesting stuff. 1.414213562373095
Imaging databases provide storage capabilities. 0.8573732768944039
Most imaging databases safe images permanently. 0.6296288974669553
Imaging databases store data. 0.7653668647301795
Imaging databases store data. Imaging databases store data. Imaging databases store data. 0.7653668647301795


## 中文詞頻矩陣

In [107]:
a = ['柯文哲為了大巨蛋一事找趙藤雄算帳', '柯P將不在大巨蛋舉辦世運會']

In [118]:
import jieba
jieba.load_userdict('userdict.txt')
corpus = []
for w in a:
    corpus.append(' '.join(jieba.cut(w)))

In [119]:
corpus

['柯文哲 為 了 大巨蛋 一事 找 趙藤雄 算帳', '柯P 將 不 在 大巨蛋 舉辦 世運會']

In [120]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

In [121]:
print(vectorizer.get_feature_names())

['一事', '世運會', '大巨蛋', '柯p', '柯文哲', '算帳', '舉辦', '趙藤雄']


In [122]:
from sklearn.metrics.pairwise import cosine_distances
cosine_distances(X)

array([[0.       , 0.7763932],
       [0.7763932, 0.       ]])

In [123]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(X)

array([[1.       , 0.2236068],
       [0.2236068, 1.       ]])

## 同義詞爬蟲

In [133]:
import requests
from bs4 import BeautifulSoup
res = requests.get('https://zh.wikipedia.org/wiki/%E6%9F%AF%E6%96%87%E5%93%B2')
soup = BeautifulSoup(res.text, 'lxml')
synonym = '/'.join([b.text for b in soup.select('.mw-parser-output p')[6].select('b')])

In [134]:
with open('synonym.txt' , 'w') as f:
    f.write(synonym)

## 建立同義詞詞頻矩陣

In [135]:
with open('synonym.txt' , 'r') as f:
    data = f.read()

In [148]:
words = data.split('/')
synonym_dic = {}
for w in words[1:]:
    synonym_dic[w.lower()] = words[0]
synonym_dic

{'柯p': '柯文哲', 'kp': '柯文哲'}

In [149]:
class SynonymCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(SynonymCountVectorizer, self).build_analyzer()
        return lambda doc: (synonym_dic.get(w, w) for w in analyzer(doc))

In [150]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = SynonymCountVectorizer()
X = vectorizer.fit_transform(corpus)

In [151]:
print(vectorizer.get_feature_names())

['一事', '世運會', '大巨蛋', '柯文哲', '算帳', '舉辦', '趙藤雄']


In [152]:
from sklearn.metrics.pairwise import cosine_distances
cosine_distances(X)

array([[0.       , 0.5527864],
       [0.5527864, 0.       ]])

## 加入停用詞

In [153]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = SynonymCountVectorizer(stop_words=['一事'])
X = vectorizer.fit_transform(corpus)

In [154]:
from sklearn.metrics.pairwise import cosine_distances
cosine_distances(X)

array([[0. , 0.5],
       [0.5, 0. ]])