-
Notifications
You must be signed in to change notification settings - Fork 846
/
Copy pathterm_similarity.py
187 lines (150 loc) · 6.56 KB
/
term_similarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 11 04:02:10 2016
@author: DIP
"""
import numpy as np
from scipy.stats import itemfreq
def vectorize_terms(terms):
terms = [term.lower() for term in terms]
terms = [np.array(list(term)) for term in terms]
terms = [np.array([ord(char) for char in term])
for term in terms]
return terms
def boc_term_vectors(word_list):
word_list = [word.lower() for word in word_list]
unique_chars = np.unique(
np.hstack([list(word)
for word in word_list]))
word_list_term_counts = [{char: count for char, count in itemfreq(list(word))}
for word in word_list]
boc_vectors = [np.array([int(word_term_counts.get(char, 0))
for char in unique_chars])
for word_term_counts in word_list_term_counts]
return list(unique_chars), boc_vectors
root = 'Believe'
term1 = 'beleive'
term2 = 'bargain'
term3 = 'Elephant'
terms = [root, term1, term2, term3]
vec_root, vec_term1, vec_term2, vec_term3 = vectorize_terms(terms)
print '''
root: {}
term1: {}
term2: {}
term3: {}
'''.format(vec_root, vec_term1, vec_term2, vec_term3)
features, (boc_root, boc_term1, boc_term2, boc_term3) = boc_term_vectors(terms)
print 'Features:', features
print '''
root: {}
term1: {}
term2: {}
term3: {}
'''.format(boc_root, boc_term1, boc_term2, boc_term3)
def hamming_distance(u, v, norm=False):
if u.shape != v.shape:
raise ValueError('The vectors must have equal lengths.')
return (u != v).sum() if not norm else (u != v).mean()
def manhattan_distance(u, v, norm=False):
if u.shape != v.shape:
raise ValueError('The vectors must have equal lengths.')
return abs(u - v).sum() if not norm else abs(u - v).mean()
def euclidean_distance(u,v):
if u.shape != v.shape:
raise ValueError('The vectors must have equal lengths.')
distance = np.sqrt(np.sum(np.square(u - v)))
return distance
import copy
import pandas as pd
def levenshtein_edit_distance(u, v):
# convert to lower case
u = u.lower()
v = v.lower()
# base cases
if u == v: return 0
elif len(u) == 0: return len(v)
elif len(v) == 0: return len(u)
# initialize edit distance matrix
edit_matrix = []
# initialize two distance matrices
du = [0] * (len(v) + 1)
dv = [0] * (len(v) + 1)
# du: the previous row of edit distances
for i in range(len(du)):
du[i] = i
# dv : the current row of edit distances
for i in range(len(u)):
dv[0] = i + 1
# compute cost as per algorithm
for j in range(len(v)):
cost = 0 if u[i] == v[j] else 1
dv[j + 1] = min(dv[j] + 1, du[j + 1] + 1, du[j] + cost)
# assign dv to du for next iteration
for j in range(len(du)):
du[j] = dv[j]
# copy dv to the edit matrix
edit_matrix.append(copy.copy(dv))
# compute the final edit distance and edit matrix
distance = dv[len(v)]
edit_matrix = np.array(edit_matrix)
edit_matrix = edit_matrix.T
edit_matrix = edit_matrix[1:,]
edit_matrix = pd.DataFrame(data=edit_matrix,
index=list(v),
columns=list(u))
return distance, edit_matrix
def cosine_distance(u, v):
distance = 1.0 - (np.dot(u, v) /
(np.sqrt(sum(np.square(u))) * np.sqrt(sum(np.square(v))))
)
return distance
# DEMOS!
# build the term vectors here
root_term = root
root_vector = vec_root
root_boc_vector = boc_root
terms = [term1, term2, term3]
vector_terms = [vec_term1, vec_term2, vec_term3]
boc_vector_terms = [boc_term1, boc_term2, boc_term3]
# HAMMING DISTANCE DEMO
for term, vector_term in zip(terms, vector_terms):
print 'Hamming distance between root: {} and term: {} is {}'.format(root_term,
term,
hamming_distance(root_vector, vector_term, norm=False))
for term, vector_term in zip(terms, vector_terms):
print 'Normalized Hamming distance between root: {} and term: {} is {}'.format(root_term,
term,
round(hamming_distance(root_vector, vector_term, norm=True), 2))
# MANHATTAN DISTANCE DEMO
for term, vector_term in zip(terms, vector_terms):
print 'Manhattan distance between root: {} and term: {} is {}'.format(root_term,
term,
manhattan_distance(root_vector, vector_term, norm=False))
for term, vector_term in zip(terms, vector_terms):
print 'Normalized Manhattan distance between root: {} and term: {} is {}'.format(root_term,
term,
round(manhattan_distance(root_vector, vector_term, norm=True),2))
# EUCLIDEAN DISTANCE DEMO
for term, vector_term in zip(terms, vector_terms):
print 'Euclidean distance between root: {} and term: {} is {}'.format(root_term,
term,
round(euclidean_distance(root_vector, vector_term),2))
# LEVENSHTEIN EDIT DISTANCE DEMO
for term in terms:
edit_d, edit_m = levenshtein_edit_distance(root_term, term)
print 'Computing distance between root: {} and term: {}'.format(root_term,
term)
print 'Levenshtein edit distance is {}'.format(edit_d)
print 'The complete edit distance matrix is depicted below'
print edit_m
print '-'*30
# COSINE DISTANCE\SIMILARITY DEMO
for term, boc_term in zip(terms, boc_vector_terms):
print 'Analyzing similarity between root: {} and term: {}'.format(root_term,
term)
distance = round(cosine_distance(root_boc_vector, boc_term),2)
similarity = 1 - distance
print 'Cosine distance is {}'.format(distance)
print 'Cosine similarity is {}'.format(similarity)
print '-'*40