In [1]:
import pickle
import MeCab

In [2]:
def load_file(name):
    """Unpickle and return the object stored in ``wordtable_event/<name>.pickle``.

    Args:
        name: File stem as a string (no directory part, no ``.pickle`` suffix).

    Returns:
        Whatever object the pickle file contains.

    Raises:
        OSError: Propagated from ``open`` when the file cannot be read.
    """
    pickle_path = "wordtable_event/" + name + ".pickle"
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(pickle_path, "rb") as handle:
        loaded = pickle.load(handle)
    return loaded

In [3]:
# Load every pre-built lookup table from wordtable_event/ in one pass.
# The unpacking targets are listed in the same order as the file stems below.
#   word_table            -> its "data" entry maps a word to its id (see get_word_id)
#   page_*_id_table       -> map a word id to the ids scored for it (see set_score)
#   zenbun_table          -> maps an id to the text scanned by zenbun_search
(
    word_table,
    page_title_id_table,
    page_heading_id_table,
    page_desc_id_table,
    zenbun_table,
) = [
    load_file(table_name)
    for table_name in (
        "word_table",
        "page_title_id_table",
        "page_heading_id_table",
        "page_desc_id_table",
        "zenbun_table",
    )
]

In [4]:
def get_word_id(word):
    """Return the id registered for ``word``, or ``None`` when it is unknown.

    The lookup reads the module-level ``word_table`` and consults its
    ``"data"`` mapping.

    Args:
        word: Token to look up.

    Returns:
        ``word_table["data"][word]`` if the word has an entry, else ``None``.
    """
    # .get() already yields None for a missing key, so no membership test is needed.
    return word_table["data"].get(word)

In [5]:
# Tagger created with the "-Owakati" option; split_word() splits its parse()
# output on whitespace to obtain the individual tokens.
wakati = MeCab.Tagger("-Owakati")

# Single-character tokens that split_word() discards before the id lookup.
remove_words = {
    # brackets (half-width and full-width)
    "(", ")", "（", "）", "[", "]", "「", "」",
    # quotation marks
    "'", '"', "”", "’",
    # sentence punctuation
    "、", "。", ",", ".", ":", ";", "?", "!",
    # operators and other symbols
    "+", "-", "*", "/", "=", "＝", "$", "_",
}
def split_word(keyword):
    """Tokenize ``keyword`` and translate each kept token into its word id.

    The keyword is parsed by the module-level ``wakati`` tagger, the output is
    split on whitespace, and tokens listed in ``remove_words`` are dropped.

    Args:
        keyword: Search string to tokenize.

    Returns:
        list: One entry per kept token, in token order. Each entry is the
        result of ``get_word_id`` for that token, so unknown words appear
        as ``None``.
    """
    word_ids = []
    for token in wakati.parse(keyword).split():
        if token in remove_words:
            continue
        word_ids.append(get_word_id(token))
    return word_ids

In [6]:
def set_score(result, word_id, table, score):
    """Add ``score`` to every page that ``table`` lists under ``word_id``.

    Args:
        result: Dict mapping a page to its accumulated score; updated in place.
            Pages seen for the first time start from 0.
        word_id: Key to look up in ``table``.
        table: Mapping from word id to an iterable of pages.
        score: Amount added once for each listed page.

    Returns:
        None. Nothing happens when ``word_id`` is not a key of ``table``.
    """
    if word_id in table:
        for page in table[word_id]:
            result[page] = result.get(page, 0) + score
def zenbun_search(result, keyword, score):
    """Add ``score`` to every entry of ``zenbun_table`` whose value contains ``keyword``.

    Every entry of the module-level ``zenbun_table`` is checked with
    ``keyword in value``, so the cost grows with the size of the table.
    (The values are presumably each page's full text as a string, in which
    case this is a case-sensitive substring match. TODO confirm.)

    Args:
        result: Dict mapping an id to its accumulated score; updated in place.
            Ids seen for the first time start from 0.
        keyword: Value searched for inside each table entry.
        score: Amount added for each matching entry.

    Returns:
        None.
    """
    for page_id, zenbun in zenbun_table.items():
        if keyword not in zenbun:
            continue
        result[page_id] = result.get(page_id, 0) + score

In [7]:
def scored_search(keyword, title_score=30, heading_score=10, desc_score=1, zenbun_score=1):
    """Score every id that matches ``keyword`` in the index tables or the full text.

    The keyword is tokenized with ``split_word``. For each token with a known
    word id, the ids listed for it in the title, heading and description
    tables receive the corresponding weight. In addition, one full-text pass
    over the whole (untokenized) keyword is made per token.

    Args:
        keyword: Search string.
        title_score: Weight added per token hit in ``page_title_id_table``.
        heading_score: Weight added per token hit in ``page_heading_id_table``.
        desc_score: Weight added per token hit in ``page_desc_id_table``.
        zenbun_score: Weight added per full-text hit (applied once per token).

    Returns:
        dict: Maps each matching id (as stored in the tables) to its total
        score. Empty when the keyword yields no tokens.
    """
    result = {}
    for word_id in split_word(keyword):
        # Tokens missing from the word table come back as None and cannot hit any index table.
        if word_id is not None:
            set_score(result, word_id, page_title_id_table, title_score)
            set_score(result, word_id, page_heading_id_table, heading_score)
            set_score(result, word_id, page_desc_id_table, desc_score)
        # NOTE(review): this call does not depend on the current token, so the
        # full-text score is multiplied by the token count, and a keyword that
        # tokenizes to nothing never gets a full-text pass. Kept as-is to
        # preserve existing scores. Confirm whether a single pass outside the
        # loop was intended.
        zenbun_search(result, keyword, zenbun_score)
    return result

In [8]:
def get_event_id(page_id, event_index):
    """Combine a page id and an event's position on that page into a single id.

    The result is ``page_id * 1000 + event_index``. For non-negative integer
    inputs the ids stay unique only while ``event_index`` is below 1000.

    Args:
        page_id: Id of the page the event belongs to.
        event_index: Position of the event within the page's event list.

    Returns:
        The combined event id.
    """
    page_base = page_id * 1000
    return page_base + event_index

In [9]:
# Build event_dict: combined event id -> event record (title, start, end, owning page).
# Pages whose "event" entry has no "title" key contribute nothing.
event_dict = {}
with open("pages.pickle", "rb") as f:
    # NOTE: unpickling can execute arbitrary code; pages.pickle must be a trusted file.
    for page in pickle.load(f):
        page_id = page["id"]
        event = page["event"]
        if "title" in event:
            # "title", "start" and "end" are indexed in parallel: entry i of
            # each describes the same event.
            for i, title in enumerate(event["title"]):
                event_id = get_event_id(page_id, i)
                event_dict[event_id] = {
                    "title": title,
                    "start": event["start"][i],
                    "end": event["end"][i],
                    "page_id": page_id,
                    "page": page,
                }

In [10]:
def sort_score(scores, reverse=False):
    """Turn a score mapping into a list of records ordered by score.

    Args:
        scores: Dict mapping an id to its score.
        reverse: When False (the default, matching the historical behaviour)
            the lowest score comes first. Pass True to get the best-scoring
            record first.

    Returns:
        list[dict]: One ``{"event_id": id, "score": score}`` record per entry
        of ``scores``. The sort is stable, so records with equal scores keep
        the iteration order of ``scores`` in both directions.
    """
    # NOTE(review): ascending order puts the strongest match last, which is
    # unusual for search results. The default is left unchanged so existing
    # callers keep their ordering; consider passing reverse=True.
    score_array = [
        {"event_id": event_id, "score": score}
        for event_id, score in scores.items()
    ]
    return sorted(score_array, key=lambda entry: entry["score"], reverse=reverse)

In [11]:
def search(keyword):
    """Return the event records matching ``keyword``, ordered by score.

    Args:
        keyword: Search string.

    Returns:
        ``None`` when ``keyword`` is empty. Otherwise a list of ``event_dict``
        entries in the order produced by ``sort_score`` (lowest score first).

    Raises:
        KeyError: If a scored id has no entry in ``event_dict``.
    """
    if len(keyword) > 0:
        ranked = sort_score(scored_search(keyword))
        events = []
        for entry in ranked:
            events.append(event_dict[entry["event_id"]])
        return events
    return None
    

In [12]:
# Example query; as the cell's last expression, the returned list is shown as the cell output.
search("ITF")

[{'title': 'ITF.鑑賞会',
  'start': '2021-03-26 02:00',
  'end': '2021-03-26 04:00',
  'page_id': 32,
  'page': {'id': 32,
   'date': '2021-03-25T03:37:49',
   'date_gmt': '2021-03-24T18:37:49',
   'guid': {'rendered': 'https://www.stb.tsukuba.ac.jp/~shinkan-web/orgadmin/?page_id=32'},
   'modified': '2021-03-30T01:50:52',
   'modified_gmt': '2021-03-29T16:50:52',
   'slug': '%e3%83%86%e3%82%b9%e3%83%88%e3%82%af%e3%83%a9%e3%83%96',
   'status': 'publish',
   'type': 'page',
   'link': 'https://www.stb.tsukuba.ac.jp/~shinkan-web/orgadmin/%e3%83%86%e3%82%b9%e3%83%88%e3%82%af%e3%83%a9%e3%83%96/',
   'title': {'rendered': 'テストクラブ'},
   'content': {'rendered': '<h1>テストクラブの紹介です。</h1>\n<p>♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦♦</p>\n<p>こんにちは❗️❗️❗️❗️❗️❗️</p>\n<pre><span style="color: #ff0000;">テ</span><span style="color: #ff9900;">ス</span><span style="color: #339966;">ト</span><span style="color: #33cccc;">ク</span><span style="color: #800080;"><span style="color: #3366ff;">ラ</span>ブ</span>です