# Obtain External Knowledge - Google API

In [None]:
import openai
import os
import json
import requests
from tqdm import tqdm
import re

In [None]:
# Proxy mapping passed to `requests.get`: both the "http" and "https" schemes
# are routed through the same local proxy endpoint.
proxies = dict.fromkeys(("http", "https"), "http://127.0.0.1:10809")

In [None]:
# Google Custom Search credentials. They are read from the GOOGLE_API_KEY and
# GOOGLE_CSE_ID environment variables so secrets do not have to be written
# into the notebook; when a variable is unset the value falls back to the
# empty string.
API_KEY = os.environ.get('GOOGLE_API_KEY', '')
CSE_ID = os.environ.get('GOOGLE_CSE_ID', '')

def build_query(keywords):
    """Build a search query in which every keyword is a double-quoted phrase.

    Each element of ``keywords`` is wrapped in double quotes and the quoted
    terms are joined with single spaces. An empty iterable yields ``""``.
    """
    quoted_terms = []
    for term in keywords:
        quoted_terms.append(f'"{term}"')
    return " ".join(quoted_terms)

# Send a Google Custom Search API request.
def google_search(query, api_key, cse_id, num=3, timeout=30):
    """Query the Google Custom Search endpoint and return the decoded JSON.

    Args:
        query: Search string, sent as the ``q`` parameter.
        api_key: API key, sent as the ``key`` parameter.
        cse_id: Search engine ID, sent as the ``cx`` parameter.
        num: Number of results to request, sent as the ``num`` parameter.
        timeout: Seconds to wait on the connection/response before giving
            up. Without a timeout a stalled proxy or server would block the
            caller indefinitely.

    Returns:
        The parsed JSON response, or ``None`` when the request fails, the
        server answers with an HTTP error status, or the body cannot be
        decoded as JSON. The failure reason is printed.
    """
    url = "https://www.googleapis.com/customsearch/v1"
    params = {
        'q': query,
        'key': api_key,
        'cx': cse_id,
        'num': num,
    }
    try:
        # The request goes through the module-level `proxies` mapping.
        response = requests.get(
            url, params=params, proxies=proxies, timeout=timeout
        )
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
    except ValueError as e:
        # Depending on the installed `requests` version, an undecodable body
        # surfaces as a ValueError that is not a RequestException.
        print(f"Request failed: {e}")
    return None

def process_results(results):
    """Flatten a search response into one ``title: snippet`` summary string.

    Args:
        results: Decoded search response containing an ``items`` list, or a
            falsy value (e.g. ``None``) when the search failed.

    Returns:
        The ``"title: snippet"`` entries joined by ``" || "``. An item
        without a title or snippet gets ``'No title'`` / ``'No abstract'``
        instead of raising, so one incomplete item cannot abort a batch.
        When there are no items at all, a fixed failure message is returned.
    """
    items = results['items'] if results and 'items' in results else []
    entries = [
        f"{item.get('title', 'No title')}: {item.get('snippet', 'No abstract')}"
        for item in items
    ]
    # The message text is kept verbatim: it is stored in the output records.
    return " || ".join(entries) if entries else "Fail to retrirve related results."


def batch_search_from_files(input_folder, output_folder):
    """Add Google search context to every record of each ``.jsonl`` file.

    For every ``.jsonl`` file in ``input_folder`` a file with the same name
    is written to ``output_folder`` (created if missing). Each non-blank
    input line is parsed as a JSON object; its ``keywords`` are turned into
    a query with ``build_query``, searched with ``google_search``, and the
    flattened result from ``process_results`` is stored under the
    ``context_googlecse`` key. Records without keywords get a fixed
    placeholder string instead.

    Args:
        input_folder: Directory containing the source ``.jsonl`` files.
        output_folder: Directory that receives the enriched files.

    Raises:
        ValueError: If both folders resolve to the same directory, because
            opening the output file for writing would truncate the input
            before it is read.
    """
    if os.path.realpath(input_folder) == os.path.realpath(output_folder):
        raise ValueError("input_folder and output_folder must be different directories")

    os.makedirs(output_folder, exist_ok=True)

    for file_name in tqdm(os.listdir(input_folder)):
        if not file_name.endswith('.jsonl'):
            continue

        input_path = os.path.join(input_folder, file_name)
        output_path = os.path.join(output_folder, file_name)

        with open(input_path, 'r', encoding='utf-8') as infile, \
                open(output_path, 'w', encoding='utf-8') as outfile:
            for line in infile:
                # Blank lines carry no record and would make json.loads raise.
                if not line.strip():
                    continue

                data = json.loads(line)
                keywords = data.get('keywords', [])

                # A bare string would be iterated character by character by
                # build_query; treat it as a single keyword instead.
                if isinstance(keywords, str):
                    keywords = [keywords] if keywords.strip() else []

                if keywords:
                    query = build_query(keywords)
                    search_results = google_search(query, API_KEY, CSE_ID)
                    context_googlecse = process_results(search_results)
                else:
                    context_googlecse = "Cannot retrive keywords."

                data['context_googlecse'] = context_googlecse

                # Strip U+200F (right-to-left mark) from the serialized record.
                serialized = json.dumps(data, ensure_ascii=False).replace('\u200f', '')
                outfile.write(serialized + '\n')

In [None]:
# Run the batch search: read the .jsonl files under extract_m1 and write the
# enriched copies under exknowledge_m1.
input_folder, output_folder = (
    '../data/detect_val/extract_m1/',
    '../data/detect_val/exknowledge_m1/',
)

batch_search_from_files(input_folder=input_folder, output_folder=output_folder)

100%|██████████| 10/10 [14:46<00:00, 88.69s/it]
