## Edit Distance
- pip install editdistance

In [1]:
! pip install editdistance

Collecting editdistance
  Downloading editdistance-0.4-cp36-cp36m-win_amd64.whl
Installing collected packages: editdistance
Successfully installed editdistance-0.4


In [2]:
import editdistance
editdistance.eval('banana', 'bahama')

2

In [None]:
'banana' # n => h
'bahana' # n => m
'bahama'

In [3]:
import editdistance
editdistance.eval('machine', 'macinae')

2

In [None]:
'machine' #=> delete h
'macine'  #=> insert a
'macinae'

## Bag of words

In [8]:
?CountVectorizer

In [5]:
content = ['How to format my hard disk', 'Hard disk format problems']

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(content)
X

<2x7 sparse matrix of type '<class 'numpy.int64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [7]:
print(vectorizer.get_feature_names())
X.toarray()

['disk', 'format', 'hard', 'how', 'my', 'problems', 'to']


array([[1, 1, 1, 1, 1, 0, 1],
       [1, 1, 1, 0, 0, 1, 0]], dtype=int64)

In [9]:
X.toarray().transpose()

array([[1, 1],
       [1, 1],
       [1, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0]], dtype=int64)

## 讀取 toy 資料

In [17]:
import os
path = 'data/toy'
for f in os.listdir(path):
    print(open(os.path.join(path, f)).read())

This is a toy post about machine learning. Actually, it contains not much interesting stuff.
Imaging databases provide storage capabilities.
Most imaging databases safe images permanently.
Imaging databases store data.
Imaging databases store data. Imaging databases store data. Imaging databases store data.


In [20]:
posts = [open(os.path.join(path, f)).read() for f in os.listdir(path)]
posts

['This is a toy post about machine learning. Actually, it contains not much interesting stuff.',
 'Imaging databases provide storage capabilities.',
 'Most imaging databases safe images permanently.',
 'Imaging databases store data.',
 'Imaging databases store data. Imaging databases store data. Imaging databases store data.']

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X= vectorizer.fit_transform(posts)
X

<5x25 sparse matrix of type '<class 'numpy.int64'>'
	with 33 stored elements in Compressed Sparse Row format>

In [47]:
X.toarray()

array([[1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
        1, 1, 1],
       [0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0],
       [0, 0, 0, 0, 3, 3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,
        0, 0, 0]], dtype=int64)

In [23]:
print(vectorizer.get_feature_names())
X.toarray()

['about', 'actually', 'capabilities', 'contains', 'data', 'databases', 'images', 'imaging', 'interesting', 'is', 'it', 'learning', 'machine', 'most', 'much', 'not', 'permanently', 'post', 'provide', 'safe', 'storage', 'store', 'stuff', 'this', 'toy']


array([[1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
        1, 1, 1],
       [0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0],
       [0, 0, 0, 0, 3, 3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,
        0, 0, 0]], dtype=int64)

In [26]:
new_post = 'imaging database qoo'
new_post_vec = vectorizer.transform([new_post])
new_post_vec

<1x25 sparse matrix of type '<class 'numpy.int64'>'
	with 1 stored elements in Compressed Sparse Row format>

## Euclidean Distance

In [35]:
import scipy as sp
import numpy as np
a = np.array([1,1,0,0,1])
b = np.array([0,1,1,0,1])

# method 1
sum((a - b) ** 2) ** (1/2)

# method 2
#np.array([a,b])
sp.linalg.norm(np.array([a,b]))


2.4494897427831779

### 計算imaging database 與每篇文章的距離

In [56]:
def dist(v1, v2):
    delta = v1 - v2
    return sp.linalg.norm(delta.toarray())

In [57]:
import sys
best_doc = None
best_dist = 999
best_i = None
num_samples = len(posts)
for i in range(0, num_samples):
    post = posts[i]
    if post==new_post:
        continue
    post_vec = X.getrow(i)
    d = dist(post_vec, new_post_vec)
    print("=== Post %i with dist=%.2f: %s"%(i, d, post))
    if d<best_dist:
        best_dist = d
        best_i = i
print("Best post is %i with dist=%.2f"%(best_i, best_dist))


=== Post 0 with dist=3.87: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist=2.00: Imaging databases provide storage capabilities.
=== Post 2 with dist=2.24: Most imaging databases safe images permanently.
=== Post 3 with dist=1.73: Imaging databases store data.
=== Post 4 with dist=5.57: Imaging databases store data. Imaging databases store data. Imaging databases store data.
Best post is 3 with dist=1.73


### 使用sklearn 的 euclidean distance

In [58]:
from sklearn.metrics.pairwise import euclidean_distances
ed = euclidean_distances(new_post_vec, X)
ed

array([[ 3.87298335,  2.        ,  2.23606798,  1.73205081,  5.56776436]])

In [46]:
#pos = ed.argsort()[0][0]
pos = ed.argsort().flatten()[0]
posts[pos]
#posts[pos]

'Imaging databases store data.'

### 更改距離計算方式

In [64]:
import scipy as sp
import numpy as np
a = np.array([1,1,0,0,1])
b = np.array([0,1,1,0,1])
c = np.array([1,1,0,0,1,0,0,0,0,0,0,0])
d = np.array([1,1,0,0,1,1,1,1,1,1])
a
sp.linalg.norm([d])

2.8284271247461903

In [83]:
def dist(v1, v2):
    v1_normalized  = v1 / sp.linalg.norm(v1) 
    v2_normalized  = v2 / sp.linalg.norm(v2)
    delta = v1_normalized - v2_normalized
    return sp.linalg.norm(delta)

In [84]:
import sys
best_doc = None
best_dist = 999
best_i = None
num_samples = len(posts)
for i in range(0, num_samples):
    post = posts[i]
    if post==new_post:
        continue
    post_vec = X.getrow(i).toarray()
    d = dist(post_vec, new_post_vec.toarray())
    print("=== Post %i with dist=%.2f: %s"%(i, d, post))
    if d<best_dist:
        best_dist = d
        best_i = i
print("Best post is %i with dist=%.2f"%(best_i, best_dist))


=== Post 0 with dist=1.41: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist=1.05: Imaging databases provide storage capabilities.
=== Post 2 with dist=1.09: Most imaging databases safe images permanently.
=== Post 3 with dist=1.00: Imaging databases store data.
=== Post 4 with dist=1.00: Imaging databases store data. Imaging databases store data. Imaging databases store data.
Best post is 3 with dist=1.00
