In [1]:
!pip install aiohttp pandas tqdm




In [2]:
import os
import sys
import time

import pandas as pd
import requests
from tqdm import tqdm

# 🔧 Enable the progress bar in the console log: registers `progress_apply`
# on pandas objects and sends the bar to stdout.
tqdm.pandas(file=sys.stdout, desc="🔍")

# ====== API configuration ======
# SECURITY: the OpenRouter key used to be hard-coded on this line. A key that
# has been saved or shared inside a notebook must be treated as leaked and
# revoked. The key is now read from the OPENROUTER_API_KEY environment
# variable, so the secret never lives in the source.
api_key = os.environ.get("OPENROUTER_API_KEY", "")
if not api_key:
    # Warn instead of raising so the cell still loads; requests sent without
    # a key are expected to be rejected by the API.
    print("⚠️ OPENROUTER_API_KEY is not set - API requests will fail.")

# Headers sent with every chat-completion request.
headers = {
    "Authorization": f"Bearer {api_key}",
    "Content-Type": "application/json",
}

# ====== One-shot example prompt for LLaMA ======
# Question text of the worked example, assembled sentence by sentence.
example_description = " ".join([
    "I have code as shown below.",
    "In this code I have a factory which is returning the concrete instances.",
    "But every time I need to have a new implementation of the ICar interface, "
    "I have to change the CreateCar() method of the CarFactory.",
    "It seems like I am not supporting the Open Closed Principle of the SOLID principles.",
])

# Code snippet of the worked example; the trailing empty item yields the
# final newline.
example_code = "\n".join([
    "public interface ICar { void Created(); }",
    "public class BigCar : ICar { public void Created() {} }",
    "public class SmallCar : ICar { public void Created() {} }",
    "public class LuxaryCar : ICar { public void Created() {} }",
    "public class CarFactory {",
    "    public ICar CreateCar(int carType) {",
    "        switch (carType) {",
    "            case 0: return new BigCar();",
    "            case 1: return new SmallCar();",
    "            case 2: return new LuxaryCar();",
    "            default: break;",
    "        }",
    "        return null;",
    "    }",
    "}",
    "",
])

# Expected answer for the worked example, in the format the model must follow.
example_output = "KEY_PHRASES: " + ", ".join(
    ["Factory class", "Open Closed Principle", "SOLID principles"]
)

# ====== Request counter and timestamp ======
# Number of requests after which the per-row function pauses before sending more.
REQUEST_LIMIT = 2990
# Requests sent since the last reset, and the time of that reset.
requests_sent, last_reset_time = 0, time.time()

# ====== Per-row processing ======
# Length in seconds of the window that REQUEST_LIMIT applies to.
_RATE_WINDOW_SECONDS = 60
# Upper bound for one HTTP call so a stalled connection cannot block the run.
_REQUEST_TIMEOUT_SECONDS = 60


def _split_row(text):
    """Return ``(language, description, code)`` parsed from one input row."""
    lang_part, colon, remainder = text.partition(":")
    if colon:
        language = lang_part.strip()
        content = remainder.strip()
    else:
        language = "UNKNOWN"
        content = text.strip()

    # Only the segment directly after the first "<code>" marker is kept as
    # code; without a marker the whole content is the description.
    parts = content.split("<code>")
    description = parts[0].strip()
    code = parts[1].strip() if len(parts) > 1 else ""
    return language, description, code


def _build_messages(description, code):
    """Build the chat messages: system instruction plus the one-shot user prompt."""
    return [
        {"role": "system", "content": "You are a concise key-phrase extraction assistant. Extract only important technical phrases from description and code. Respond only with: KEY_PHRASES: <comma-separated list>."},
        {"role": "user", "content": f"""\
EXAMPLE:

DESCRIPTION:
{example_description}

CODE:
{example_code}

OUTPUT:
{example_output}

NEW TASK:

DESCRIPTION:
{description}

CODE:
{code}

OUTPUT:
KEY_PHRASES:"""}
    ]


def _reserve_request_slot():
    """Block until a request may be sent, then count it against the window."""
    global requests_sent, last_reset_time

    elapsed = time.time() - last_reset_time
    if elapsed >= _RATE_WINDOW_SECONDS:
        # The window has expired: start a fresh one even if the budget was
        # not used up, so the count never spans more than one window.
        requests_sent = 0
        last_reset_time = time.time()
    elif requests_sent >= REQUEST_LIMIT:
        sleep_time = _RATE_WINDOW_SECONDS - elapsed
        print(f"🕒 Đã gửi {REQUEST_LIMIT} requests. Đợi {round(sleep_time)}s...")
        time.sleep(sleep_time)
        requests_sent = 0
        last_reset_time = time.time()

    # Count the attempt up front: a request that fails still used a slot.
    requests_sent += 1


def extract_keywords_from_text(text):
    """Ask the LLM for key phrases of one row and return the enriched row text.

    Args:
        text: A row value, normally a string shaped like
            ``"<language>: <description> <code> <code>"``. Missing values
            (anything ``pd.isna`` reports as NA) are accepted.

    Returns:
        ``""`` for a missing value, otherwise the string
        ``"<language>: <keyphase> <phrases> <description> <description> <code> <code>"``.
        ``<phrases>`` is the model's list, ``"unknown"`` when the reply does
        not start with ``KEY_PHRASES:``, or ``"error: <reason>"`` when the
        request or the reply parsing failed.
    """
    if pd.isna(text):
        return ""

    language, description, code = _split_row(text)

    data = {
        "model": "meta-llama/llama-3-8b-instruct",
        "messages": _build_messages(description, code),
        "temperature": 0.0,
        "max_tokens": 64
    }

    _reserve_request_slot()

    try:
        response = requests.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers=headers,
            json=data,
            timeout=_REQUEST_TIMEOUT_SECONDS,
        )
        # Surface HTTP errors (bad key, rate limit, ...) with their real
        # status instead of a later, opaque KeyError on "choices".
        response.raise_for_status()
        keyphrase_text = response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        # Deliberate best-effort boundary: one failing row must not abort a
        # long batch run, so the failure is recorded in the row itself.
        keyphrases = f"error: {e}"
    else:
        if keyphrase_text.lower().startswith("key_phrases:"):
            keyphrases = keyphrase_text.split(":", 1)[1].strip()
        else:
            keyphrases = "unknown"

    return f"{language}: <keyphase> {keyphrases} <description> {description} <code> {code}"

# ====== INPUT FILE CONFIGURATION ======
number = 4  # ←🔢 SET THE FILE NUMBER HERE (e.g. 1, 2, ..., 12)
input_path = f"/kaggle/input/dataset-train-merged/train_merged_data_p{number}.csv"
output_path = f"/kaggle/working/TRAIN_KEYWORD_P{number}_preview.csv"

# ====== PROCESS THE FILE ======
df = pd.read_csv(input_path)

# Both columns are required: "text" is rewritten row by row, "title" is
# copied through unchanged to the output file.
if not {"text", "title"}.issubset(df.columns):
    print(f"⚠️ BỎ QUA FILE P{number} (thiếu cột 'text' hoặc 'title')")
else:
    print(f"📦 Bắt đầu xử lý {len(df)} dòng từ file P{number}")
    df["text"] = df["text"].progress_apply(extract_keywords_from_text)
    df_out = df[["text", "title"]]
    df_out.to_csv(output_path, index=False)
    print(f"✅ ĐÃ XỬ LÝ FILE P{number} -> {output_path}")


📦 Bắt đầu xử lý 13333 dòng từ file P4
🔍: 100%|██████████| 13333/13333 [1:31:51<00:00,  2.42it/s]
✅ ĐÃ XỬ LÝ FILE P4 -> /kaggle/working/TRAIN_KEYWORD_P4_preview.csv
