# Obtain External Knowledge

1. Extract Keywords

2. Acquire External Knowledge:
  1. Use Baidu Translate API to translate all extracted key phrases into English as a fallback mechanism for retrieval.
  2. Retrieval Fallback Mechanism:
    1. First, search the Wikipedia API with the key phrases in the target language.
    2. If that search fails, retry the retrieval with the translated English phrases.
    Note: For Chinese, retrieved text may contain Traditional Chinese redirect or conversion notices. These lines need to be removed, and results in Traditional Chinese should be forcibly converted to Simplified Chinese.
  3. Extract the first 200 characters from the search results.

In [None]:
import pandas as pd
import time
from deep_translator import GoogleTranslator
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import requests
import random
import json
from hashlib import md5
from tqdm import tqdm
import spacy
import os

## Translate the Keywords - Baidu API

In [None]:
# Baidu Translate API settings.
# Both credentials are empty strings here and must be filled in before the
# translation requests can be signed successfully.
appid = ''
appkey = ''

# The request URL is the service endpoint followed by the translate path.
endpoint = 'http://api.fanyi.baidu.com'
path = '/api/trans/vip/translate'
url = f'{endpoint}{path}'

# Language pair: source language 'auto', target language English.
from_lang, to_lang = 'auto', 'en'

def make_md5(s, encoding='utf-8'):
    """Return the hexadecimal MD5 digest of *s* after encoding it.

    Args:
        s: Text to hash.
        encoding: Codec used to turn *s* into bytes before hashing.

    Returns:
        The 32-character lowercase hex digest.
    """
    hasher = md5()
    hasher.update(s.encode(encoding))
    return hasher.hexdigest()

def baidu_api(query, from_lang='auto', to_lang='en', timeout=10):
    """Translate *query* through the Baidu Translate API.

    Args:
        query: Text to translate. ``translate_keyphrases`` passes several
            phrases joined by newlines.
        from_lang: Source language code, sent as the ``from`` field.
        to_lang: Target language code, sent as the ``to`` field.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list holding the ``dst`` value of every entry in the response's
        ``trans_result``. When the response has no ``trans_result`` or the
        request raises, the failure is printed and ``[query]`` (the
        untranslated input as a single item) is returned instead.
    """
    salt = random.randint(32768, 65536)
    # The request signature is the MD5 of appid + query + salt + appkey.
    sign = make_md5(appid + query + str(salt) + appkey)
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    payload = {
        'appid': appid,
        'q': query,
        'from': from_lang,
        'to': to_lang,
        'salt': salt,
        'sign': sign,
    }
    try:
        # Without a timeout a stalled connection would block the whole
        # dataset run indefinitely; a timeout turns it into a handled failure.
        r = requests.post(url, params=payload, headers=headers, timeout=timeout)
        result = r.json()
        if 'trans_result' in result:
            return [item['dst'] for item in result['trans_result']]
        print(f"Translate failed: {result}")
        return [query]
    except Exception as e:
        # Deliberately broad: translation is best-effort, so any network,
        # decoding or response-shape error falls back to the original text.
        print(f"Request failed: {e}")
        return [query]

def translate_keyphrases(keywords):
    """Translate a list of key phrases in one batched ``baidu_api`` call.

    The phrases are joined with newlines so that a single request covers the
    whole list.

    Args:
        keywords: List of phrase strings; may be empty or ``None``.

    Returns:
        The list returned by ``baidu_api``. ``[]`` when *keywords* is empty.
        When the translation failed, the original phrases are returned, one
        per entry.
    """
    if not keywords:
        return []
    batch_query = '\n'.join(keywords)
    translated = baidu_api(batch_query)
    # baidu_api reports failure by echoing the whole query back as a single
    # item. For a multi-phrase batch that would collapse every keyword into
    # one newline-joined string, so hand back the untranslated phrases
    # individually instead.
    if len(keywords) > 1 and translated == [batch_query]:
        return list(keywords)
    return translated


### Apply on the Dataset

In [None]:
# Directory read by the translation pass (its *.jsonl files are processed).
input_dir = '../data/detect_val/extract_m2/'
# Directory receiving the same files with the added 'keywords_en' field.
output_dir = '../data/detect_val/m2_translated_keywords/'
# Create the output directory up front; an existing directory is not an error.
os.makedirs(name=output_dir, exist_ok=True)

def process_jsonl(input_path, output_path):
    """Copy a JSONL file, adding English keyword translations to each record.

    Every non-blank line of *input_path* is parsed as a JSON object. Records
    that have a ``keywords`` key gain a ``keywords_en`` key holding the result
    of ``translate_keyphrases``; if translation raises, the error is printed
    and ``keywords_en`` is set to ``[]``. Each record is written to
    *output_path* as one JSON line (non-ASCII characters kept as-is).

    Args:
        input_path: Path of the source ``.jsonl`` file.
        output_path: Path of the file to write; overwritten if it exists.
    """
    with open(input_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile:
        for line in tqdm(infile, desc=f"Processing {os.path.basename(input_path)}"):
            if not line.strip():
                # A blank line is not a JSON document; json.loads would raise
                # and abort the whole file, so skip it.
                continue
            data = json.loads(line)
            if 'keywords' in data:
                try:
                    data['keywords_en'] = translate_keyphrases(data['keywords'])
                except Exception as e:
                    print(f"Translate failed (file: {input_path}, row: {line}): {e}")
                    data['keywords_en'] = []
            json.dump(data, outfile, ensure_ascii=False)
            outfile.write('\n')


# Translate every .jsonl file found in input_dir, writing each result to a
# file of the same name in output_dir.
file_list = [entry for entry in os.listdir(input_dir) if entry.endswith('.jsonl')]
for filename in tqdm(file_list, desc="Processing Files"):
    process_jsonl(
        os.path.join(input_dir, filename),
        os.path.join(output_dir, filename),
    )

## Obtain the External Knowledge

In [None]:
# Identity mapping over the supported language codes.
# NOTE(review): presumably dataset language code -> Wikipedia subdomain; it is
# not referenced by the code visible in this notebook - confirm before removing.
lang_map = {
    code: code
    for code in (
        'ar', 'de', 'en', 'es', 'fi', 'fr', 'hi',
        'it', 'sv', 'zh', 'ca', 'fa', 'cs', 'eu',
    )
}

# Local proxy endpoints passed to requests for the Wikipedia calls.
# NOTE(review): requests picks a proxy by the URL scheme of the request, so
# the 'socks5' entry looks unused for https:// URLs - confirm.
proxies = dict(
    http="http://127.0.0.1:10809",
    https="http://127.0.0.1:10809",
    socks5="socks5://127.0.0.1:10808",
)

# Strip Chinese redirect / conversion notice lines from retrieved text
def clean_redirect_text(text):
    """Return *text* without the lines that contain a redirect marker.

    A line is dropped when it contains "重定向" or "轉換系統"; all other lines
    are kept in order and re-joined with newlines.
    """
    redirect_markers = ("重定向", "轉換系統")
    kept_rows = []
    for row in text.split("\n"):
        if any(marker in row for marker in redirect_markers):
            continue
        kept_rows.append(row)
    return "\n".join(kept_rows)


# Wikipedia API searching function
def fetch_wikipedia_summary(query, lang, proxies, timeout=30):
    """Fetch the plain-text intro extract of the Wikipedia page titled *query*.

    Args:
        query: Page title to look up (sent as the ``titles`` parameter).
        lang: Language code used as the Wikipedia subdomain.
        proxies: Proxy mapping forwarded to ``requests.get``.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        The extract of the first returned page that has one, passed through
        ``clean_redirect_text``. ``""`` when no page carries an extract or
        when the request fails for any reason (the error is printed).
    """
    url = f"https://{lang}.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "titles": query,
        "prop": "extracts",
        "exintro": True,
        "explaintext": True,
    }
    if lang == 'zh':
        # Request the zh-cn variant for Chinese pages.
        params["variant"] = "zh-cn"
    try:
        response = requests.get(url, params=params, proxies=proxies, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        pages = data.get("query", {}).get("pages", {})
        for page in pages.values():
            if "extract" in page:
                return clean_redirect_text(page["extract"])
        return ""
    except Exception as e:
        # A failure must look like "no result". Returning the error text
        # would be truthy, so callers would store it as a summary and the
        # all-empty check that triggers the English fallback would never fire.
        print(f"request fail: {e}")
        return ""


def fetch_summaries_for_keyphrases(phrases, lang):
    """Look up each phrase on Wikipedia and collect truncated summaries.

    Args:
        phrases: Iterable of page titles to query.
        lang: Language code forwarded to ``fetch_wikipedia_summary``.

    Returns:
        A list with one entry per phrase, in order: the first 200 characters
        of the fetched summary, or ``""`` when the lookup returned nothing.
    """
    snippets = []
    for term in phrases:
        text = fetch_wikipedia_summary(term, lang, proxies)
        snippets.append(text[:200] if text else "")
        # Pause one second after every request.
        time.sleep(1)
    return snippets

In [None]:
def process_jsonl(input_path, output_path):
    """Copy a JSONL file, adding a ``wikipedia_context`` field to each record.

    For every non-blank line, the record's ``keywords`` are looked up on the
    Wikipedia edition named by its ``lang`` field (``'en'`` when absent). If
    every lookup comes back empty, the ``keywords_en`` phrases are looked up
    on English Wikipedia instead. The collected summaries are joined with
    single spaces and stored under ``wikipedia_context``.

    Args:
        input_path: Path of the source ``.jsonl`` file.
        output_path: Path of the file to write; overwritten if it exists.
    """
    with open(input_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile:
        for line in tqdm(infile, desc=f"Processing {os.path.basename(input_path)}"):
            if not line.strip():
                # A blank line is not a JSON document; json.loads would raise
                # and abort the whole file, so skip it.
                continue
            data = json.loads(line)
            lang = data.get('lang', 'en')
            # `or []` also covers a key that is present but null, which
            # .get(key, []) would pass through as None and crash the loop.
            keyphrases = data.get('keywords') or []
            keyphrases_en = data.get('keywords_en') or []

            summaries = fetch_summaries_for_keyphrases(keyphrases, lang)

            # Fall back to the English phrases only when nothing was found
            # in the target language.
            if all(s == "" for s in summaries):
                summaries = fetch_summaries_for_keyphrases(keyphrases_en, 'en')

            data['wikipedia_context'] = " ".join(summaries)

            json.dump(data, outfile, ensure_ascii=False)
            outfile.write('\n')

In [None]:
# Read the translated-keyword files and write the Wikipedia-augmented copies.
input_dir = '../data/detect_val/m2_translated_keywords/'
output_dir = '../data/detect_val/exknowledge_m2/'
os.makedirs(output_dir, exist_ok=True)

# Process every .jsonl file, keeping the file name in the output directory.
jsonl_names = [entry for entry in os.listdir(input_dir) if entry.endswith('.jsonl')]
for filename in jsonl_names:
    process_jsonl(
        os.path.join(input_dir, filename),
        os.path.join(output_dir, filename),
    )