notebook1
## PART 1. Document retrieval

### 載入相關函式庫

In [1]:
from pathlib import Path
from functools import partial
import re
import numpy as np
import pandas as pd
import scipy
import json
import jieba

jieba.set_dictionary("dict.txt.big")

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

from pandarallel import pandarallel
# Adjust the number of workers if you want
pandarallel.initialize(progress_bar=True, verbose=0, nb_workers=5)

from utils import load_json, jsonl_dir_to_df, generate_evidence_to_wiki_pages_mapping
from tqdm import tqdm
tqdm.pandas() # for progress_apply


2023-06-01 23:30:38.277040: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### 整理中文停用詞

In [2]:
from TCSP import read_stopwords_list
stopwords = read_stopwords_list()

Data class for type hinting

In [3]:
from dataclasses import dataclass
from typing import Dict, List, Set, Tuple, Union
@dataclass
class Claim:
    """Type-hinting placeholder for a claim; `data` is annotated as a string."""
    data: str

@dataclass
class AnnotationID:
    """Type-hinting placeholder for the first element of an evidence tuple."""
    id: int

@dataclass
class EvidenceID:
    """Type-hinting placeholder for the second element of an evidence tuple."""
    id: int

@dataclass
class PageTitle:
    """Type-hinting placeholder for the third element of an evidence tuple.

    The metric helpers in this notebook read this position (`evidence[2]`)
    as the wikipedia page title.
    """
    title: str

@dataclass
class SentenceID:
    """Type-hinting placeholder for the fourth element of an evidence tuple."""
    id: int

@dataclass
class Evidence:
    """Type-hinting placeholder for the nested evidence structure.

    `data` is annotated as a list of evidence sets, each a list of
    (AnnotationID, EvidenceID, PageTitle, SentenceID) tuples.
    """
    data: List[List[Tuple[AnnotationID, EvidenceID, PageTitle, SentenceID]]]

In [4]:
def tokenize(text: str, stopwords: list) -> str:
    """Perform Chinese word segmentation with jieba and remove stopwords.

    Args:
        text (str): claim or wikipedia article
        stopwords (list): common words that contribute little to the meaning
            of a sentence. Any iterable of strings works; a ``set`` or
            ``frozenset`` is used as-is, which avoids the per-call conversion.

    Returns:
        str: word segments separated by space (e.g. "我 喜歡 吃 蘋果")
    """
    # Testing membership against a list costs O(len(stopwords)) for every
    # token; a set makes each lookup O(1) while producing the same output.
    if isinstance(stopwords, (set, frozenset)):
        stopword_set = stopwords
    else:
        stopword_set = set(stopwords)

    return " ".join(w for w in jieba.cut(text) if w not in stopword_set)


In [5]:
def save_doc(
    data: List[Dict[str, Union[int, Claim, Evidence]]],
    predictions: pd.Series,
    mode: str = "train",
    num_pred_doc: int = 5,
) -> None:
    """Write each record plus its predicted pages to a JSONL file.

    The output path is ``data/{mode}_doc{num_pred_doc}.jsonl``; the ``data``
    directory must already exist.

    Args:
        data: records to save, one JSON object per output line.
        predictions: iterable of page titles per record, aligned with
            ``data`` by position (read with ``.iloc[i]``).
        mode: prefix of the output file name.
        num_pred_doc: number embedded in the output file name; it does not
            limit how many predictions are written.

    Raises:
        IndexError: if ``predictions`` has fewer entries than ``data``.
    """
    with open(
        f"data/{mode}_doc{num_pred_doc}.jsonl",
        "w",
        encoding="utf8",
    ) as f:
        for i, d in enumerate(data):
            # Build a shallow copy so that saving does not silently add a
            # `predicted_pages` key to the caller's records.
            record = {**d, "predicted_pages": list(predictions.iloc[i])}
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

In [6]:
def calculate_precision(
    data: List[Dict[str, Union[int, Claim, Evidence]]],
    predictions: pd.Series,
) -> float:
    """Compute macro-averaged page-level precision.

    Records labelled "NOT ENOUGH INFO" are skipped. A record with no
    predicted pages contributes 0 but is still counted.

    Args:
        data: records with "label" and "evidence" keys; the third element of
            each evidence item (``evidence[2]``) is read as the page title.
        predictions: iterable of predicted page titles per record, aligned
            with ``data`` by position.

    Returns:
        float: mean per-record precision, or 0.0 when no record is scored.
    """
    precision = 0.0
    count = 0

    for i, d in enumerate(data):
        if d["label"] == "NOT ENOUGH INFO":
            continue

        # Ground-truth page titles across every evidence set of this record.
        gt_pages = {
            evidence[2]
            for evidence_set in d["evidence"]
            for evidence in evidence_set
        }

        # Accept any iterable of titles (e.g. lists reloaded from JSONL).
        predicted_pages = set(predictions.iloc[i])
        if predicted_pages:
            precision += len(predicted_pages & gt_pages) / len(predicted_pages)

        count += 1

    # Macro precision; guard against data made only of skipped records.
    return precision / count if count else 0.0

def calculate_recall(
    data: List[Dict[str, Union[int, Claim, Evidence]]],
    predictions: pd.Series,
) -> float:
    """Compute macro-averaged page-level recall.

    Records labelled "NOT ENOUGH INFO" are skipped. A scored record with no
    ground-truth pages contributes 0 but is still counted.

    Args:
        data: records with "label" and "evidence" keys; the third element of
            each evidence item (``evidence[2]``) is read as the page title.
        predictions: iterable of predicted page titles per record, aligned
            with ``data`` by position.

    Returns:
        float: mean per-record recall, or 0.0 when no record is scored.
    """
    recall = 0.0
    count = 0

    for i, d in enumerate(data):
        if d["label"] == "NOT ENOUGH INFO":
            continue

        # Ground-truth page titles across every evidence set of this record.
        gt_pages = {
            evidence[2]
            for evidence_set in d["evidence"]
            for evidence in evidence_set
        }

        # Accept any iterable of titles (e.g. lists reloaded from JSONL).
        predicted_pages = set(predictions.iloc[i])
        if gt_pages:  # avoid dividing by zero on empty evidence
            recall += len(predicted_pages & gt_pages) / len(gt_pages)
        count += 1

    # Macro recall; guard against data made only of skipped records.
    return recall / count if count else 0.0

### 取得wiki_pages

In [7]:
# The first run of this cell takes about 34 minutes on Google Colab.

wiki_path = "data/wiki-pages"
wiki_cache = "wiki"
target_column = "text"

wiki_cache_path = Path(f"data/{wiki_cache}.pkl")
if not wiki_cache_path.exists():
    # Cache miss: `wiki-pages.zip` must have been downloaded from the AICUP
    # website and extracted to `wiki_path`.
    # The per-file frames are combined into one, so the index is reset.
    wiki_pages = jsonl_dir_to_df(wiki_path).reset_index(drop=True)
    # Tokenize the raw text in parallel and keep it in `processed_text`.
    wiki_pages["processed_text"] = wiki_pages[target_column].parallel_apply(
        partial(tokenize, stopwords=stopwords)
    )
    # Persist the result so later runs can skip the tokenization.
    wiki_pages.to_pickle(wiki_cache_path, protocol=4)
else:
    # Cache hit. Only load pickle files you produced yourself.
    wiki_pages = pd.read_pickle(wiki_cache_path)

### 計算出mapping (下面在找出wiki_sentences時會用到)

In [8]:
# Load the evidence-to-wiki-pages mapping from its JSON cache, or build it
# from `wiki_pages` and cache it for the next run.
mapping_path = Path("data/mapping.json")
if mapping_path.exists():
    # Context managers close the file handles; the bare `open()` calls used
    # before were never closed, so the written cache was not guaranteed to be
    # flushed to disk.
    with mapping_path.open(encoding="utf8") as f:
        mapping = json.load(f)
else:
    mapping = generate_evidence_to_wiki_pages_mapping(wiki_pages)
    with mapping_path.open("w", encoding="utf8") as f:
        json.dump(mapping, f)

# 第一步：篩選不重要的wiki_pages (用文本長度)


### 設定要篩選出的最小文本長度 (長度低的文本通常不重要，我測試過設定成90效果不錯)

In [9]:
# Length threshold for a page's `processed_text` (measured with `.str.len()`);
# only pages strictly longer than this are kept.
min_wiki_length = 90

In [10]:
# Number of wiki pages before the length filter is applied.
len(wiki_pages)

1187751

In [11]:
# Keep only the pages whose tokenized text is longer than `min_wiki_length`.
wiki_pages = wiki_pages.loc[
    wiki_pages["processed_text"].str.len().gt(min_wiki_length)
]

In [12]:
# Number of wiki pages that remain after the length filter.
len(wiki_pages)

831630

# 第二步：整理出wiki_sentences 

### (若已經有產生檔案，則直接跳到下面讀取即可。)

In [13]:
# Build (or load from cache) one row per non-empty wiki sentence.
# NOTE(review): the cache file name hard-codes "length90"; if
# `min_wiki_length` changes, a stale cache will be loaded silently.
path = Path("data/wiki_sentence_all_length90_process.pkl")
if path.exists():
    wiki_sentences = pd.read_pickle(path)
else:
    # Iterate the `id` column directly instead of `wiki_pages.iloc[i]['id']`,
    # which builds a whole row Series per page. The comprehension also keeps
    # loop variables out of the notebook namespace (the previous loop rebound
    # the builtin `id` globally).
    wiki_sentences = pd.DataFrame(
        [
            {"id": page_id, "idx": page_pos, "text": sentence}
            for page_pos, page_id in enumerate(tqdm(wiki_pages["id"]))
            for sentence in mapping[page_id].values()
            if sentence != ""
        ]
    )
    # Drop the page table before tokenizing. This happens only on a cache
    # miss, so `wiki_pages` stays defined when the cache is loaded instead.
    del wiki_pages
    wiki_sentences["processed_text"] = wiki_sentences["text"].parallel_apply(
        partial(tokenize, stopwords=stopwords)
    )
    wiki_sentences.to_pickle(path)

In [14]:
# Number of sentence rows before the sentence-length filter.
len(wiki_sentences)

3830335

# 第三步：開始使用wiki_sentences搭配TF-IDF預測

### 首先從wiki_sentences中篩掉長度小於min_sentence_length的，跟上面的文本篩選概念一樣，只是變成每句篩選。

In [15]:
# Sentences whose `processed_text` is shorter than this are dropped.
min_sentence_length = 15
# NOTE(review): only echoed in the printed result summary; nothing else in
# this notebook reads it.
num_of_samples = 300
# Maximum number of distinct candidate pages gathered per claim.
topk = 12
# Passed straight through to TfidfVectorizer.
use_idf = True
sublinear_tf = True

In [16]:

# Keep only the sentences whose tokenized text has at least
# `min_sentence_length` characters.
wiki_sentences = wiki_sentences.loc[
    wiki_sentences["processed_text"].str.len().ge(min_sentence_length)
]

In [17]:
# Number of sentence rows that remain after the sentence-length filter.
len(wiki_sentences)

3482970

### 測試新方法，將訓練資料集納入字典庫

In [18]:
%%time
# Load the public training set; its claims are added to the retrieval corpus
# in the following cells.
DOC_DATA = load_json("data/public_train.jsonl")
# NOTE(review): `doc_path` is assigned here but never read in this notebook.
doc_path = f"data/train_doc5.jsonl"

# Disabled experiment: hold out 10% of the records as a dev split.
# TRAIN_GT, DEV_GT = train_test_split(
#     DOC_DATA,
#     test_size=0.1,
#     random_state=20,
#     shuffle=True
# )
# TRAIN_GT = pd.DataFrame(TRAIN_GT)
TRAIN_GT = pd.DataFrame(DOC_DATA)

CPU times: user 84.6 ms, sys: 0 ns, total: 84.6 ms
Wall time: 82.6 ms


## 將train json的資料內容整理並納入wiki_sentences中

In [19]:
# Turn each verifiable training claim into corpus rows, one per ground-truth
# page title, so they can be appended to `wiki_sentences`.
# NOTE(review): these claims are later scored against a corpus that contains
# them, so the training-set precision/recall is optimistic.
train_list = []
for row_label, record in TRAIN_GT.iterrows():
    if record["label"] != "NOT ENOUGH INFO":
        # dict.fromkeys drops duplicate titles while keeping first-seen order.
        page_titles = list(
            dict.fromkeys(
                item[2]
                for evidence_group in record["evidence"]
                for item in evidence_group
            )
        )
        processed_claim = tokenize(record["claim"], stopwords)
        train_list.extend(
            {
                "id": title,
                "idx": int(row_label),
                "text": record["claim"],
                "processed_text": processed_claim,
            }
            for title in page_titles
        )

Building prefix dict from /home/wx200010/AIcup2023/dict.txt.big ...
Loading model from cache /tmp/jieba.udad221628068222d71d4ed32de6bae18.cache
Loading model cost 0.478 seconds.
Prefix dict has been built successfully.


In [20]:
# Append the claim-derived rows to the sentence table. The index is not
# renumbered here, so labels repeat until the later `reset_index` cell.
# NOTE(review): re-running this cell appends the same rows again.
wiki_sentences = pd.concat([wiki_sentences , pd.DataFrame(train_list)])

In [21]:
# The `idx` column is not needed for retrieval; remove it.
wiki_sentences = wiki_sentences.drop(columns=["idx"])

In [22]:
# Discard the old index and renumber the rows from 0 (filtering and the
# concat above leave gaps and repeated labels).
wiki_sentences = wiki_sentences.reset_index(drop=True)

In [23]:
# Preview the final sentence table (columns: id, text, processed_text).
wiki_sentences

Unnamed: 0,id,text,processed_text
0,劉懷慎,劉懷慎 ， 彭城人 ， 東晉至南朝宋軍事人物 。,劉懷慎 彭城人 東晉 南朝 宋 軍事 人物
1,劉懷慎,任參鎮軍將軍事 、 振威將軍 、 彭城內史 。,任參鎮 軍 將軍 事 振威 將軍 彭 城內 史
2,劉懷慎,跟隨劉裕討伐南燕和盧循 ， 加封輔國將軍 。,跟隨 劉裕 討伐 南燕 盧循 加 封輔國 將軍
3,劉懷慎,義熙八年 （ 412年 ） ， 以輔國將軍身份兼任監北徐州諸軍事 ， 鎮守彭城 。,義熙 八年 412 以輔國 將軍 身份 兼任 監北 徐州 軍事 ...
4,劉懷慎,九年 （ 413年 ） ， 討平王靈秀 。,九年 413 討平 王靈秀
...,...,...,...
3492459,加拿大國家銀行,加拿大國家銀行是加拿大第六大商業銀行，是具有信用創作功能的金融機構。,加拿大 國家銀行 加拿大 第六 商業銀行 具有 信用 創作 功能 金融機構
3492460,臺中市文化資產處,臺中市文化資產處爲臺中市政府文化局所屬二級機關 ，是臺中市歷史文化資產保護的專責機關，它在臺...,臺中市 文化 資產 處 爲 臺中市 政府 文化局 所屬 二級 機關 臺中市 歷史 文化 ...
3492461,摩納哥,法國南部的摩納哥除了南部海岸線是靠地中海，其他三面皆被法國包圍。,法國 南部 摩納哥 南部 海岸線 地中海 三面 皆 法國 包圍
3492462,啓示錄,《啓示錄》有被收錄在新約聖經中。,啓示錄 收錄 新約 聖經


### 訓練TF-IDF

In [24]:
# Corpus for TF-IDF: one space-separated token string per row, in row order.
corpus = wiki_sentences['processed_text'].tolist()

In [25]:
# Fit the TF-IDF vectorizer on the sentence corpus.
# NOTE(review): `tokenize` already removes these stopwords, so passing them
# again as `stop_words` looks redundant - confirm.
# NOTE(review): `token_pattern` is left at the scikit-learn default, which
# keeps only tokens of two or more characters, so single-character Chinese
# tokens are presumably ignored - confirm this is intended.
vectorizer = TfidfVectorizer(
    use_idf = use_idf,
    sublinear_tf = sublinear_tf,
    stop_words = stopwords
)
# `X` has one row per corpus entry and is read as a global by
# `get_pred_docs_sklearn_sentence`.
X = vectorizer.fit_transform(corpus)

In [26]:
def get_pred_docs_sklearn_sentence(
    claim: str,
    tokenizing_method: callable,
    vectorizer: TfidfVectorizer,
    topk: int,
    num_candidates: int = 1000,
) -> set:
    """Retrieve the page titles that a claim most likely refers to.

    The claim is vectorized and compared by cosine similarity with every row
    of the global TF-IDF matrix ``X``. The page ids of the most similar rows
    of the global ``wiki_sentences`` frame are de-duplicated into at most
    ``topk`` candidates, and only candidates whose title is (partly)
    mentioned in the claim are returned.

    Args:
        claim: raw claim text.
        tokenizing_method: callable turning the claim into the
            space-separated token string expected by ``vectorizer``.
        vectorizer: fitted vectorizer whose ``transform`` output is
            comparable with ``X``.
        topk: stop collecting candidates once this many distinct page ids
            have been found.
        num_candidates: number of top-ranked rows inspected while collecting
            candidates (previously hard-coded to 1000).

    Returns:
        set: titles of the candidate pages mentioned in the claim.
    """
    tokens = tokenizing_method(claim)
    claim_vector = vectorizer.transform([tokens])

    # cosine_similarity returns shape (1, number of rows in X); take the row.
    similarity_scores = cosine_similarity(claim_vector, X)[0, :]

    # Row positions ordered from most to least similar.
    sorted_indices = similarity_scores.argsort()[::-1]
    top_indices = sorted_indices[:num_candidates]

    # Read the id column once instead of building a row Series per index, and
    # use a set so the de-duplication does not rescan the result list.
    results = []
    seen = set()
    for page_id in wiki_sentences["id"].iloc[top_indices]:
        if page_id in seen:
            continue
        seen.add(page_id)
        results.append(page_id)
        if len(results) == topk:
            break

    def mentioned(fragment: str) -> bool:
        # `"" in claim` is always True, so an empty fragment (e.g. from a
        # title that starts or ends with "·") must not count as a mention.
        return bool(fragment) and fragment in claim

    claim_without_spaces = claim.replace(" ", "")
    exact_matchs = set()
    # This follows the AICUP2023 baseline: check whether a candidate title is
    # mentioned in the claim, exactly or through one of its parts.
    for result in results:
        if (
            mentioned(result)
            or (result in claim_without_spaces)  # E.g., MS DOS -> MSDOS
            or mentioned(result.replace("·", ""))  # E.g., 湯姆·克魯斯 -> 湯姆克魯斯
            or mentioned(result.replace("-", ""))  # E.g., X-SAMPA -> XSAMPA
        ):
            exact_matchs.add(result)
        elif "·" in result:
            # E.g., 阿爾伯特·愛因斯坦 -> 愛因斯坦
            if any(mentioned(part) for part in result.split("·")):
                exact_matchs.add(result)
        elif "_(" in result:
            # E.g., 澎湖_(消歧義) -> 澎湖, 消歧義
            parts = result.split("_(")
            parts[1] = parts[1][:-1]  # drop the closing parenthesis
            parts = [part[:-1] if ")" in part else part for part in parts]
            if any(mentioned(part) for part in parts):
                exact_matchs.add(result)

    return exact_matchs

## 開始對訓練資料找出預測的pages

In [27]:
%%time
# Predict candidate pages for every claim of the public training set.
train = load_json("data/public_train.jsonl")
doc_path = "data/train_doc5.jsonl"

# One row per training record; the `predicted_pages` column holds the
# retrieval result for that row's claim, computed in parallel.
train_df = pd.DataFrame(train).assign(
    predicted_pages=lambda frame: frame["claim"].parallel_apply(
        partial(
            get_pred_docs_sklearn_sentence,
            tokenizing_method=partial(tokenize, stopwords=stopwords),
            vectorizer=vectorizer,
            topk=topk,
        )
    )
)


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2324), Label(value='0 / 2324'))), …

CPU times: user 18.4 s, sys: 3.07 s, total: 21.5 s
Wall time: 1h 16min 47s


### 印出precision與recall結果

In [28]:
# Score the training-set predictions and print them with the settings used.
precision = calculate_precision(train, train_df["predicted_pages"])
recall = calculate_recall(train, train_df["predicted_pages"])
dictionary = dict(
    topk=topk,
    num_of_samples=num_of_samples,
    min_sentence_length=min_sentence_length,
    precision=precision,
    recall=recall,
)
print(dictionary)

{'topk': 12, 'num_of_samples': 300, 'min_sentence_length': 15, 'precision': 0.6796583915128466, 'recall': 0.9049777296256171}


### 儲存train資料的執行結果

In [29]:
# Writes data/train_doc5.jsonl.
# NOTE(review): the "5" comes from save_doc's default `num_pred_doc`, while
# retrieval collects up to `topk` (12) candidates - the name may mislead.
save_doc(train, train_df["predicted_pages"], mode="train")

### 最後一步：對test資料集進行預測pages，並將結果寫入檔案

In [30]:
# Predict candidate pages for every test claim and write them to disk.
doc_path = "data/test_doc5.jsonl"
test = load_json("data/all_test_data.jsonl")

# Same retrieval settings as for the training set.
test_df = pd.DataFrame(test).assign(
    predicted_pages=lambda frame: frame["claim"].parallel_apply(
        partial(
            get_pred_docs_sklearn_sentence,
            tokenizing_method=partial(tokenize, stopwords=stopwords),
            vectorizer=vectorizer,
            topk=topk,
        )
    )
)
save_doc(test, test_df["predicted_pages"], mode="test")

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1808), Label(value='0 / 1808'))), …