# Reconstructed from patch 31ac0159f910ae4e59f6aa0fe536fc2f3a82dc75
# ("Implement keyword ranking", Matthew Esposito, Fri, 24 Mar 2023)
# against backend/scopeBackend/keyword_ranker.py.
#
# Ranks sources so that those with higher primary & secondary keyword
# counts and matches are prioritized.


def rank(primary_kw, secondary_kws, queryset):
    """Order sources by how often the keywords occur in them.

    Matching is case-insensitive (both sides are upper-cased) and counts
    non-overlapping substring occurrences, as ``str.count`` does.

    Args:
        primary_kw: The primary keyword (a string). Its occurrence count
            is the most significant sort criterion.
        secondary_kws: An iterable of secondary keyword strings. A single
            string is treated as one keyword rather than being iterated
            character by character; ``None`` means no secondary keywords.
        queryset: An iterable of source texts. Each element must be a
            string, since it is upper-cased and searched with
            ``str.count``.

    Returns:
        A new list of ``(source, primary_count, secondary_count)`` tuples,
        sorted by ``primary_count`` descending, then by the cumulative
        ``secondary_count`` descending. Sources that tie on both counts
        keep their relative order from ``queryset``.

    Example:
        >>> rank("war", ["peace"], ["Peace", "Syrian war war"])
        [('Syrian war war', 2, 0), ('Peace', 0, 1)]
    """
    if secondary_kws is None:
        secondary_kws = []
    elif isinstance(secondary_kws, str):
        # Iterating a bare string would score every single character as
        # its own keyword and rapidly inflate the secondary count.
        secondary_kws = [secondary_kws]

    # Normalize the keywords once instead of once per source. Empty
    # keywords are dropped because str.count("") returns len(text) + 1,
    # which would reward long sources for matching nothing.
    primary = primary_kw.upper()
    secondaries = [kw.upper() for kw in secondary_kws if kw]

    ranked = []
    for source in queryset:
        text = source.upper()
        primary_count = text.count(primary) if primary else 0
        secondary_count = sum(text.count(kw) for kw in secondaries)
        ranked.append((source, primary_count, secondary_count))

    # One stable sort on a compound key is equivalent to sorting by the
    # secondary count first and then by the primary count.
    ranked.sort(key=lambda entry: (-entry[1], -entry[2]))
    return ranked