# Key Phrases Extraction - OpenAI

In [None]:
import openai
import os
import json
import requests
from tqdm import tqdm
import re

In [None]:
# OpenAI API key used for the "Authorization: Bearer ..." header.
# A key pasted here takes precedence; when it is left empty the value of the
# OPENAI_API_KEY environment variable is used instead, so the secret does not
# have to be stored in the notebook.
api_key = ""
if not api_key:
    api_key = os.environ.get("OPENAI_API_KEY", "")

# Proxy endpoints handed to `requests` for both plain and TLS traffic.
proxies = {
    "http": "http://127.0.0.1:10809",
    "https": "http://127.0.0.1:10809",
}

In [None]:
# The .jsonl files to annotate are read from `input_folder`; a copy of each
# file, with keywords added, is written under `output_folder`.
input_folder, output_folder = '../data/val/val/', '../data/detect_val/extract_m1/'

# Make sure the destination exists before any file is written; an already
# existing directory is not an error.
os.makedirs(name=output_folder, exist_ok=True)

def _parse_keywords(response_text):
    """Convert the model's reply text into a list of keywords.

    Returns the decoded list when the reply is a JSON list, ``None`` when the
    reply is valid JSON of some other type (the caller treats that as an
    unusable reply), and otherwise a list built by splitting the text on
    commas.
    """
    try:
        parsed = json.loads(response_text)
    except json.JSONDecodeError:
        # Not JSON: the prompt asks for "[<keyword1>, <keyword2>, ...]", so
        # drop one pair of enclosing brackets before splitting on commas.
        text = response_text.strip()
        if text.startswith('[') and text.endswith(']'):
            text = text[1:-1]
        # Strip whitespace and surrounding quotes first, then discard empties,
        # so entries such as " " or "''" do not survive as empty keywords.
        cleaned = (piece.strip().strip('\'"').strip() for piece in text.split(','))
        return [kw for kw in cleaned if kw]

    return parsed if isinstance(parsed, list) else None


def extract_keyphrases(question, retries=3):
    """Ask the OpenAI chat completions endpoint for key phrases in *question*.

    Args:
        question: Text to extract key phrases / named entities from.
        retries: Maximum number of requests to attempt. A failed request, a
            response without the expected fields, or a reply that is JSON but
            not a list each consume one attempt.

    Returns:
        A list of keywords. An empty list is returned when no attempt yields
        a usable reply (including ``retries <= 0``); the function never
        returns ``None``.
    """
    import time  # local import: only needed for the back-off between attempts

    prompt = f'Extract key phrases specially named entities from the following text. Return the results as a list of keywords in this format: [<keyword1>, <keyword2>, ...]:\n{question}'

    for attempt in range(retries):
        try:
            response = requests.post(
                "https://api.openai.com/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": "gpt-3.5-turbo",
                    "messages": [
                        {"role": "system", "content": "You are an expert in extracting key phrases specially named entities from multilingual texts."},
                        {"role": "user", "content": prompt},
                    ],
                },
                proxies=proxies,
                # Without a timeout a stalled connection would block forever.
                timeout=60,
            )
            response.raise_for_status()
            result = response.json()
            response_text = result['choices'][0]['message']['content'].strip()
        except requests.exceptions.RequestException as e:
            print(f"Error during request: {e}")
        except (KeyError, IndexError, TypeError, AttributeError, ValueError) as e:
            # The body was not JSON or lacked choices[0].message.content.
            print(f"Unexpected response format: {e}")
        else:
            keywords = _parse_keywords(response_text)
            if keywords is not None:
                return keywords

        # Wait 1s, 2s, 4s, ... before the next attempt; no wait after the last.
        if attempt < retries - 1:
            time.sleep(2 ** attempt)

    return []


# Annotate every .jsonl file found in `input_folder`: each record that has a
# non-empty 'model_input' gets a 'keywords' entry, and all records are written
# one JSON object per line to a file of the same name under `output_folder`.
for file in tqdm(os.listdir(input_folder), desc="Processing Files"):
    if not file.endswith('.jsonl'):
        continue

    input_path = os.path.join(input_folder, file)
    output_path = os.path.join(output_folder, file)

    with open(input_path, 'r', encoding='utf-8') as infile, \
         open(output_path, 'w', encoding='utf-8') as outfile:

        # Read the file up front so the inner progress bar knows its total.
        # Blank lines are dropped because json.loads would raise on them.
        lines = [line for line in infile if line.strip()]
        for line in tqdm(lines, desc=f"Processing {file}", leave=False):
            record = json.loads(line)
            text = record.get('model_input', '')

            # Records without input text are written through unchanged.
            if text:
                record['keywords'] = extract_keyphrases(text)

            outfile.write(json.dumps(record, ensure_ascii=False) + '\n')

# Report the directory that was actually written to.
print(f"Keyword extraction completed. Extracted files saved in '{output_folder}'.")