Skip to content

Commit

Permalink
provide 3 different ways of using tfidf
Browse files Browse the repository at this point in the history
  • Loading branch information
GreatYYX committed Jul 18, 2018
1 parent c3bc309 commit 817915e
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 17 deletions.
2 changes: 1 addition & 1 deletion rltk/similarity/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from rltk.similarity.jaro import jaro_winkler_distance, jaro_winkler_similarity, jaro_distance
from rltk.similarity.jaccard import jaccard_index_similarity, jaccard_index_distance
from rltk.similarity.cosine import cosine_similarity, string_cosine_similarity
from rltk.similarity.tf_idf import tf_idf_similarity, compute_idf, compute_tf, tf_idf_similarity_by_dict
from rltk.similarity.tf_idf import tf_idf_similarity, compute_idf, compute_tf, tf_idf_cosine_similarity, TF_IDF
from rltk.similarity.lcs import longest_common_subsequence_distance, metric_longest_common_subsequence
from rltk.similarity.ngram import ngram_distance, ngram_similarity
from rltk.similarity.qgram import qgram_distance, qgram_similarity
Expand Down
57 changes: 41 additions & 16 deletions rltk/similarity/tf_idf.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,28 +61,28 @@ def tf_idf_similarity(bag1, bag2, df_corpus, doc_size, math_log=False):
return 0.0 if v_x_y == 0 else v_x_y / (math.sqrt(v_x_2) * math.sqrt(v_y_2))


def compute_tf(tokens):
    """
    Compute the normalized term frequency of every distinct token.

    Args:
        tokens (iterable): tokens of one document. Any iterable of hashable
            items is accepted, including one-shot iterators and generators.

    Returns:
        dict: {term: occurrences of term / total number of tokens, ...}.
            Empty when `tokens` is empty.
    """
    terms = collections.Counter(tokens)
    # Take the total from the counts instead of len(tokens): the result is
    # the same for lists, and iterables without len() keep working.
    total = float(sum(terms.values()))
    return {term: count / total for term, count in terms.items()}


# # plus 1
# def compute_idf(df, doc_size, math_log=False):
# return {k: float(doc_size) / v if math_log is False \
# else math.log(float(doc_size + 1) / (v + 1)) \
# for k, v in df.iteritems()}


def compute_idf(df_corpus, doc_size, math_log=False):
    """
    Compute the inverse document frequency of every term in the corpus.

    Args:
        df_corpus (dict): document frequencies, {term: number of documents
            containing the term, ...}
        doc_size (int): total number of documents in the corpus
        math_log (bool): if False return the raw ratio doc_size / df,
            otherwise return its natural logarithm

    Returns:
        dict: {term: idf, ...}

    Raises:
        ZeroDivisionError: if a document frequency is 0.
    """
    size = doc_size * 1.0  # force float division on Python 2 as well

    # Decide once instead of once per term. The identity test is kept so
    # that only a literal False selects the raw ratio, as before.
    if math_log is False:
        return {term: size / df for term, df in df_corpus.items()}
    return {term: math.log(size / df) for term, df in df_corpus.items()}


def tf_idf_cosine_similarity(tfidf_dict1, tfidf_dict2):
    """
    Cosine similarity of two tf-idf vectors stored as dictionaries.

    all terms of dict1 and dict2 should be in corpus

    Args:
        tfidf_dict1 (dict): {term: tf-idf weight, ...}
        tfidf_dict2 (dict): {term: tf-idf weight, ...}

    Returns:
        float: cosine of the angle between the two vectors; 0.0 when their
            dot product is zero (no shared term, or an empty dictionary).
    """
    # Dot product: only terms present in both dictionaries contribute.
    # Every shared term must be accumulated, not just the last one seen.
    v_x_y = sum(
        (weight * tfidf_dict2[term]
         for term, weight in tfidf_dict1.items()
         if term in tfidf_dict2),
        0.0,
    )

    # Squared Euclidean norms of both vectors.
    v_x_2 = sum((weight * weight for weight in tfidf_dict1.values()), 0.0)
    v_y_2 = sum((weight * weight for weight in tfidf_dict2.values()), 0.0)

    # cosine similarity
    return 0.0 if v_x_y == 0 else v_x_y / (math.sqrt(v_x_2) * math.sqrt(v_y_2))


class TF_IDF:
    """
    Corpus-level tf-idf index used to compare documents pairwise.

    Usage: call `add_document` for every document, call `pre_compute` once
    all documents have been added, then call `similarity` with two ids.

    Attributes:
        tf (dict): {doc_id: {term: term frequency, ...}, ...}
        df_corpus (dict): {term: number of documents containing the term}
        doc_size (int): number of distinct documents added
        idf (dict): {term: idf, ...}; empty until `pre_compute` is called
    """

    def __init__(self):
        self.tf = {}
        self.df_corpus = {}
        self.doc_size = 0
        # A dict from the start, so the attribute has one type for its whole
        # lifetime (pre_compute replaces it with the computed idf dict).
        self.idf = {}

    def add_document(self, doc_id, tokens):
        """
        Register a document and update the corpus statistics.

        Adding a `doc_id` that already exists replaces the earlier document
        instead of counting it twice.

        Args:
            doc_id (hashable): identifier later passed to `similarity`
            tokens (list): tokens of the document
        """
        # Compute first: if this raises, the index is left untouched.
        tf = compute_tf(tokens)

        if doc_id in self.tf:
            self._discard_document_frequencies(self.tf[doc_id])
        else:
            self.doc_size += 1

        self.tf[doc_id] = tf
        for term in tf:
            self.df_corpus[term] = self.df_corpus.get(term, 0) + 1

    def pre_compute(self, math_log=False):
        """
        Compute the idf of every term seen so far.

        Must be called after the last `add_document` and before `similarity`.

        Args:
            math_log (bool): forwarded to `compute_idf`
        """
        self.idf = compute_idf(self.df_corpus, self.doc_size, math_log)

    def similarity(self, id1, id2):
        """
        Cosine similarity between the tf-idf vectors of two documents.

        Args:
            id1 (hashable): id of the first document
            id2 (hashable): id of the second document

        Returns:
            float: result of `tf_idf_cosine_similarity`

        Raises:
            KeyError: if an id is unknown, or if a term has no idf because
                `pre_compute` was not called after the last `add_document`.
        """
        tfidf_x = self._tf_idf_vector(id1)
        tfidf_y = self._tf_idf_vector(id2)
        return tf_idf_cosine_similarity(tfidf_x, tfidf_y)

    def _tf_idf_vector(self, doc_id):
        """Return {term: tf * idf} for one stored document."""
        tf = self.tf[doc_id]
        try:
            return {term: weight * self.idf[term] for term, weight in tf.items()}
        except KeyError as err:
            raise KeyError(
                'no idf for term %r; call pre_compute() after adding documents'
                % (err.args[0],)
            )

    def _discard_document_frequencies(self, tf):
        """Undo the document-frequency contribution of a replaced document."""
        for term in tf:
            remaining = self.df_corpus[term] - 1
            if remaining:
                self.df_corpus[term] = remaining
            else:
                del self.df_corpus[term]

0 comments on commit 817915e

Please sign in to comment.