-
Notifications
You must be signed in to change notification settings - Fork 1
/
term_similarity.py
133 lines (108 loc) · 3.96 KB
/
term_similarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
from abc import ABC
from dataclasses import dataclass
from math import nan
from ir_axioms.axiom.base import Axiom
from ir_axioms.axiom.utils import strictly_greater, approximately_equal
from ir_axioms.model import Query, RankedDocument, IndexContext
from ir_axioms.modules.similarity import (
TermSimilarityMixin, WordNetSynonymSetTermSimilarityMixin,
FastTextWikiNewsTermSimilarityMixin
)
@dataclass(frozen=True)
class _STMC1(Axiom, TermSimilarityMixin, ABC):
    """
    Abstract base of the STMC1 axiom variants.

    The similarity measure itself is not defined here; it is expected
    to come from the ``TermSimilarityMixin`` implementation that a
    concrete subclass mixes in.
    """

    def preference(
            self,
            context: IndexContext,
            query: Query,
            document1: RankedDocument,
            document2: RankedDocument
    ):
        """
        Compare both documents by the average similarity
        between their terms and the query terms.

        The term sets of the two documents and of the query are looked up
        through ``context``, and the average similarity of each document's
        terms to the query terms is computed with ``average_similarity``.

        :return: The result of ``strictly_greater`` applied to
            the first document's and the second document's
            average similarity (in that order).
        """
        terms1 = context.term_set(document1)
        terms2 = context.term_set(document2)
        terms_query = context.term_set(query)

        similarity1 = self.average_similarity(terms1, terms_query)
        similarity2 = self.average_similarity(terms2, terms_query)
        return strictly_greater(similarity1, similarity2)
@dataclass(frozen=True)
class STMC1(_STMC1, WordNetSynonymSetTermSimilarityMixin):
    """
    STMC1 axiom combined with ``WordNetSynonymSetTermSimilarityMixin``.
    """

    # Identifier of this axiom variant, set at class level.
    name = "STMC1"
@dataclass(frozen=True)
class STMC1_fastText(_STMC1, FastTextWikiNewsTermSimilarityMixin):
    """
    STMC1 axiom combined with ``FastTextWikiNewsTermSimilarityMixin``.
    """

    # Identifier of this axiom variant, set at class level.
    name = "STMC1-fastText"
@dataclass(frozen=True)
class _STMC2(Axiom, TermSimilarityMixin, ABC):
    """
    Abstract base of the STMC2 axiom variants.

    The similarity measure itself is not defined here; it is expected
    to come from the ``TermSimilarityMixin`` implementation that a
    concrete subclass mixes in.
    """

    def preference(
            self,
            context: IndexContext,
            query: Query,
            document1: RankedDocument,
            document2: RankedDocument
    ):
        """
        Compare both documents using the most similar pair of
        a query term and a non-query term.

        Non-query terms are all terms of either document that do not occur
        in the query. From the query terms and the non-query terms,
        ``most_similar_pair`` picks one (query term, non-query term) pair.
        The first document is preferred if the ratio

            tf(non-query term, document2) / tf(query term, document1)

        is approximately equal (within a margin fraction of 0.2) to

            len(term set of document2) / len(term set of document1).

        Otherwise, the second document is preferred if the same holds
        with the roles of the two documents swapped.
        Document "length" here is the size of the term set
        returned by ``context.term_set``.

        Note that the selection of the most similar pair is
        non-deterministic if several pairs are equally most similar.

        :return: 1 if the first document is preferred,
            -1 if the second document is preferred,
            and 0 otherwise (including when no most similar pair exists).
        """
        terms1 = context.term_set(document1)
        terms2 = context.term_set(document2)
        terms_documents = terms1 | terms2
        terms_query = context.term_set(query)
        terms_non_query = terms_documents - terms_query

        best_pair = self.most_similar_pair(terms_query, terms_non_query)
        if best_pair is None:
            # No pair to compare, hence no preference.
            return 0
        query_term, non_query_term = best_pair

        def tf_ratio(
                numerator_document: RankedDocument,
                denominator_document: RankedDocument
        ):
            """
            Frequency of the non-query term in ``numerator_document``
            divided by the frequency of the query term
            in ``denominator_document``.
            """
            tf_query_term = context.term_frequency(
                denominator_document,
                query_term
            )
            tf_non_query_term = context.term_frequency(
                numerator_document,
                non_query_term
            )
            # The ratio is undefined without any query term occurrence.
            # NOTE(review): presumably ``approximately_equal`` never
            # matches NaN; confirm against its implementation.
            return (
                nan if tf_query_term <= 0
                else tf_non_query_term / tf_query_term
            )

        size1 = len(terms1)
        size2 = len(terms2)

        # The size guards also protect the divisions below from
        # dividing by zero (the ratio is only evaluated if the guard holds).
        if size1 > 0 and approximately_equal(
                size2 / size1,
                tf_ratio(document2, document1),
                margin_fraction=0.2
        ):
            return 1
        if size2 > 0 and approximately_equal(
                size1 / size2,
                tf_ratio(document1, document2),
                margin_fraction=0.2
        ):
            return -1
        return 0
@dataclass(frozen=True)
class STMC2(_STMC2, WordNetSynonymSetTermSimilarityMixin):
    """
    STMC2 axiom combined with ``WordNetSynonymSetTermSimilarityMixin``.
    """

    # Identifier of this axiom variant, set at class level.
    name = "STMC2"
@dataclass(frozen=True)
class STMC2_fastText(_STMC2, FastTextWikiNewsTermSimilarityMixin):
    """
    STMC2 axiom combined with ``FastTextWikiNewsTermSimilarityMixin``.
    """

    # Identifier of this axiom variant, set at class level.
    name = "STMC2-fastText"
# Shorthand names: shorter aliases bound to the very same
# fastText-based axiom classes (no new classes are created).
STMC1_f = STMC1_fastText
STMC2_f = STMC2_fastText