In [3]:
import modin.pandas as pd
import sqlite3
import uuid
import requests
import random
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from utils import create_database

DB_NAME = "aspect_sa.db"
API_URL = "http://localhost:5000/predict"
MAX_WORKERS = 4  # Số luồng gọi API song song

def create_connection(db_file=DB_NAME):
    return sqlite3.connect(db_file)

def get_all_ids(conn, table, columns):
    cur = conn.cursor()
    cur.execute(f"SELECT {', '.join(columns)} FROM {table}")
    rows = cur.fetchall()
    return [dict(zip(columns, row)) for row in rows]

def get_student_class_year_batch(conn):
    """Trả về dict: student_id -> (class_id, academic_year_id)"""
    cur = conn.cursor()
    cur.execute("""
        SELECT Student.id, Student.class_id, Class.academic_year_id
        FROM Student
        JOIN Class ON Student.class_id = Class.id
    """)
    return {row[0]: (row[1], row[2]) for row in cur.fetchall()}

def get_random_row(rows):
    return random.choice(rows)

def call_api(text, retry=3):
    for _ in range(retry):
        try:
            resp = requests.post(API_URL, json={"text": text}, timeout=10)
            if resp.status_code == 200:
                data = resp.json()
                return data.get("predictions", [])
        except Exception:
            time.sleep(0.5)
    return []

def get_id_map(conn, table, key_col):
    cur = conn.cursor()
    cur.execute(f"SELECT id, {key_col} FROM {table}")
    # chuẩn hoá về lowercase + trim để tra cứu chắc chắn
    return {row[1].strip().lower(): row[0] for row in cur.fetchall()}

def get_semester_map(conn):
    """Lấy map: academic_year_id -> [semester_name, ...]"""
    cur = conn.cursor()
    cur.execute("""
        SELECT ay.id, ay.start_year, s.name
        FROM AcademicYear ay
        JOIN Semester s ON s.name LIKE printf('%dHK%%', ay.start_year)
    """)
    sem_map = {}
    for ay_id, start_year, sem_name in cur.fetchall():
        sem_map.setdefault(ay_id, []).append(sem_name)
    return sem_map

def prepare_insert_tasks(texts, students, student_map, courses, semester_map):
    """Tạo danh sách task (text, student_id, course_id, class_id, academic_year_id, semester_name)"""
    tasks = []
    for text in texts:
        stu = get_random_row(students)
        sid = stu["id"]
        class_id, academic_year_id = student_map[sid]
        semester_names = semester_map.get(academic_year_id)
        if not semester_names:
            continue
        semester_name = random.choice(semester_names)
        course_id = get_random_row(courses)["id"]
        tasks.append({
            "text": text,
            "student_id": sid,
            "class_id": class_id,
            "academic_year_id": academic_year_id,
            "semester_name": semester_name,
            "course_id": course_id
        })
    return tasks

def api_worker(task):
    text = task["text"]
    predictions = call_api(text)
    result = []
    for pred in predictions:
        aspect = pred.get("aspect")
        sentiment = pred.get("sentiment")
        # print("API trả về sentiment:", sentiment)
        if not aspect or not sentiment:
            continue
        result.append({
            "text": text,
            "aspect": aspect,
            "sentiment": sentiment,
            "semester_name": task["semester_name"],
            "course_id": task["course_id"],
            "academic_year_id": task["academic_year_id"],
            "class_id": task["class_id"],
            "student_id": task["student_id"]
        })
    return result

def main():
    # Kết nối DB (main thread)
    conn = create_connection(DB_NAME)
    create_database(DB_NAME)  # Tạo DB nếu chưa có
    # Chuẩn bị lookup table
    students = get_all_ids(conn, "Student", ["id"])
    courses = get_all_ids(conn, "Course", ["id"])
    student_map = get_student_class_year_batch(conn)
    aspect_map = get_id_map(conn, "Aspect", "name")
    sentiment_map = get_id_map(conn, "Sentiment", "name")
    semester_map = get_semester_map(conn)

    # Đọc dữ liệu text
    data = pd.read_excel("data_20k.xlsx")
    texts = data["text"].tolist()

    # Chuẩn bị danh sách task (random student, course, semester,...)
    tasks = prepare_insert_tasks(texts, students, student_map, courses, semester_map)
    print(f"Tổng số task cần xử lý: {len(tasks)}")

    # Chạy song song gọi API và xử lý kết quả với tqdm
    all_predictions = []
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [executor.submit(api_worker, task) for task in tasks]
        for future in tqdm(as_completed(futures), total=len(futures), desc="Gọi API"):
            try:
                predictions = future.result()
                all_predictions.extend(predictions)
            except Exception as exc:
                print(f"Task lỗi: {exc}")

    print(f"Tổng số predictions cần insert: {len(all_predictions)}")

    # Chuẩn bị id lookup cho semester
    semester_id_map = get_id_map(conn, "Semester", "name")

    # Insert batch vào DB với tqdm
    batch_size = 500
    inserted_count = 0
    for i in tqdm(range(0, len(all_predictions), batch_size), desc="Insert DB"):
        batch = all_predictions[i:i+batch_size]
        to_insert = []
        for pred in batch:
            asp_id = aspect_map.get(pred["aspect"])
            sen_id = sentiment_map.get(pred["sentiment"])
            print(sen_id)
            sem_id = semester_id_map.get(pred["semester_name"])
            if asp_id is None or sen_id is None or sem_id is None:
                continue
            to_insert.append((
                str(uuid.uuid4()), pred["text"], asp_id, sen_id, sem_id,
                pred["course_id"], pred["academic_year_id"], pred["class_id"], pred["student_id"]
            ))
        conn.executemany("""
            INSERT INTO Sentence
              (id, text, aspect_id, sentiment_id, semester_id, course_id, academic_year_id, class_id, student_id)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, to_insert)
        conn.commit()
        inserted_count += len(to_insert)
        tqdm.write(f"Đã insert {inserted_count}/{len(all_predictions)} predictions...")

    conn.close()
    print(f"Hoàn thành! Đã insert {inserted_count} record vào bảng Sentence.")

if __name__ == "__main__":
    main()



Tổng số task cần xử lý: 20172


Gọi API: 100%|██████████| 20172/20172 [19:01<00:00, 17.67it/s]


Tổng số predictions cần insert: 22575


Insert DB:   7%|▋         | 3/46 [00:00<00:01, 25.88it/s]

2
2
0
2
2
2
1
2
0
0
0
2
0
1
1
1
1
2
2
2
2
2
2
2
2
0
2
1
1
0
2
0
1
2
0
0
1
2
1
2
0
1
2
2
1
0
0
0
2
2
2
2
0
2
0
2
2
2
2
2
2
1
2
0
0
2
2
0
0
0
0
1
2
0
2
2
2
1
2
0
0
0
0
0
2
2
0
0
0
0
0
1
2
1
2
2
0
0
0
2
2
2
2
0
2
0
2
0
0
0
1
2
0
1
2
2
1
1
2
2
2
0
0
0
0
2
2
1
2
1
1
2
2
1
0
0
2
2
0
0
2
2
0
1
0
1
2
2
0
2
2
2
2
0
1
0
1
0
0
2
1
1
2
2
0
2
2
2
0
0
2
2
1
1
2
2
2
2
2
0
2
0
0
0
2
0
1
2
2
1
2
0
1
0
0
1
2
0
0
0
0
0
0
0
2
2
2
2
2
0
0
0
0
0
0
2
1
2
2
2
2
2
2
2
2
2
1
0
2
2
0
0
1
0
2
2
2
2
0
2
2
1
2
2
1
0
0
2
1
1
2
0
2
0
2
0
0
2
2
0
0
0
0
2
1
2
0
1
0
0
2
2
2
1
1
0
2
0
1
1
2
0
0
2
0
0
0
2
0
0
2
2
1
2
2
1
0
1
2
1
2
2
2
2
0
0
1
0
0
2
2
0
0
0
2
2
1
0
2
0
2
0
2
2
2
2
2
2
1
2
2
2
0
1
2
2
2
2
2
0
2
2
0
1
0
2
2
0
2
0
2
1
1
2
2
2
2
1
0
2
2
2
2
2
2
2
2
2
2
2
0
0
2
2
1
1
1
2
2
2
0
2
2
2
0
2
0
2
2
1
0
0
1
0
0
2
2
2
0
2
2
1
0
1
2
2
2
2
2
0
2
0
0
0
2
2
1
1
1
2
0
1
2
1
2
0
0
0
2
2
2
2
1
0
0
2
1
2
2
0
2
2
0
2
0
0
2
2
2
2
0
0
1
1
2
2
2
0
2
2
2
2
1
2
0
0
0
2
2
0
0
0
2
2
2
1
0
2
2
2
2
0
1
2
2
1
2
2
2
0
1
0
2
1
2
0
1
2
1
2


Insert DB:  20%|█▉        | 9/46 [00:00<00:01, 25.68it/s]

Đã insert 3000/22575 predictions...
2
0
0
2
1
2
0
0
2
0
1
2
0
0
0
0
2
0
2
0
2
0
0
2
2
2
2
0
2
0
2
0
2
2
2
2
2
0
2
2
0
2
0
0
2
0
1
1
0
0
2
2
2
0
2
0
0
0
0
0
0
2
2
0
2
2
2
2
0
0
2
0
0
2
2
2
0
0
2
2
2
2
2
2
2
2
0
0
2
2
1
0
0
2
2
2
2
0
2
2
1
2
2
2
1
0
2
0
0
2
0
2
2
2
2
0
0
2
0
0
2
2
0
0
0
2
2
0
0
0
2
2
0
0
2
0
2
2
1
0
0
2
2
0
0
2
0
1
2
2
2
1
1
0
2
0
2
2
2
0
0
0
0
0
0
2
2
1
2
0
2
0
0
0
2
0
2
0
2
0
0
0
0
2
0
0
0
1
2
0
0
2
0
0
2
2
1
2
2
2
2
0
0
2
2
0
0
0
2
2
2
1
2
0
2
0
0
2
1
2
2
2
1
2
0
1
0
2
2
2
2
0
1
2
2
0
2
2
0
2
2
0
2
0
0
0
2
1
2
2
0
1
2
2
2
2
0
2
2
2
0
2
0
2
0
2
2
2
0
2
2
2
2
0
1
2
2
1
1
0
2
2
0
1
0
0
0
2
2
2
2
0
2
2
0
2
0
1
1
0
2
0
1
2
0
0
0
2
0
0
1
2
2
1
2
2
2
0
1
2
0
0
0
2
0
2
2
0
0
0
2
2
0
2
2
2
2
2
2
2
2
0
2
0
2
2
2
2
2
2
1
0
2
2
2
1
0
0
0
1
2
1
2
2
2
0
0
2
2
2
0
0
0
0
1
2
2
0
0
0
0
0
2
0
2
2
0
0
0
2
1
1
0
1
2
1
0
2
0
0
1
2
2
1
1
2
0
2
2
2
1
1
1
0
0
1
2
2
0
2
2
2
2
2
1
0
2
0
2
1
2
0
0
2
2
0
2
2
2
2
2
0
2
1
0
2
2
2
2
2
0
2
2
2
2
2
2
2
2
2
2
2
2
2
2
0
0
2
0
2
0
0
2
2
2
0
2
2
2
2
0
0


Insert DB:  33%|███▎      | 15/46 [00:00<00:01, 24.77it/s]

Đã insert 5500/22575 predictions...
1
0
1
2
0
2
0
2
2
0
1
2
2
2
2
2
0
2
2
2
2
0
0
2
2
0
1
2
2
2
2
0
2
2
2
1
0
2
1
0
0
2
0
0
1
2
0
2
2
2
0
2
2
0
0
2
0
2
2
1
0
0
2
2
0
2
0
2
1
0
0
0
1
2
0
1
0
2
0
2
2
2
2
2
0
0
1
1
0
2
0
0
2
2
2
2
0
0
0
2
0
2
0
2
2
0
0
0
0
0
2
0
2
2
0
1
0
0
2
2
2
2
2
2
0
2
2
2
1
2
2
2
0
1
0
1
0
2
1
2
2
2
2
2
2
2
2
2
0
2
2
1
1
1
2
1
2
2
2
0
1
0
1
2
2
2
2
2
0
2
2
1
0
1
2
0
1
2
2
1
1
0
2
2
2
0
2
1
0
0
1
2
0
2
2
1
2
2
2
0
0
2
0
2
2
2
2
2
2
2
2
2
1
2
2
0
0
1
2
1
2
2
0
2
0
2
2
2
2
2
2
1
2
2
1
2
0
0
2
0
1
2
2
0
0
2
2
2
2
0
2
0
1
1
0
1
1
0
2
2
2
2
0
1
1
0
2
0
0
0
2
2
2
2
2
2
2
2
2
2
2
1
0
2
1
2
0
2
2
2
1
0
0
0
0
0
0
2
0
0
2
1
0
1
2
1
0
1
2
1
1
2
2
2
0
2
2
0
0
0
2
2
0
2
2
0
1
2
2
1
2
2
2
0
1
1
0
0
0
1
0
0
0
2
0
0
2
2
0
2
2
2
0
2
2
2
2
2
2
2
0
0
1
2
2
2
0
2
2
2
0
1
0
0
0
1
0
2
2
1
2
1
2
2
2
1
2
0
0
2
0
2
2
1
2
2
1
2
2
2
0
2
2
0
0
2
2
2
2
0
2
2
0
0
1
2
0
0
2
0
2
2
0
0
1
2
2
2
2
2
2
2
2
2
2
0
2
2
1
2
2
1
2
0
2
2
2
0
0
2
2
2
0
1
0
1
0
1
2
0
2
1
2
0
0
0
1
0
0
2
2
2
2
1
2
0
2
0
0
1
2
0


Insert DB:  46%|████▌     | 21/46 [00:00<00:01, 24.40it/s]

1
2
0
2
0
1
2
2
0
1
0
0
2
0
2
2
0
2
2
2
2
0
2
2
2
0
2
0
0
2
2
0
2
2
2
2
2
0
0
2
2
2
2
0
2
2
2
2
1
2
2
2
0
0
2
2
0
2
2
2
0
1
0
1
2
2
2
2
2
2
1
0
0
0
1
0
0
0
0
0
0
2
2
2
2
2
0
0
2
1
1
2
2
0
0
0
1
1
2
0
0
0
0
0
2
2
2
0
1
0
1
0
2
2
1
2
2
2
2
1
0
2
2
0
2
2
2
2
2
2
0
2
2
2
0
1
2
2
1
2
2
0
0
2
2
2
2
0
2
2
2
0
1
2
2
2
0
2
0
1
2
2
2
0
0
2
2
0
1
2
2
2
2
2
2
2
2
2
0
0
2
1
2
2
0
2
2
2
0
0
0
2
0
2
2
2
0
0
2
0
0
2
0
2
2
2
0
1
1
2
0
1
2
1
2
2
2
2
2
2
2
2
1
2
2
1
2
2
0
1
0
0
2
0
2
0
2
2
2
2
0
0
0
0
2
2
0
2
2
0
0
0
1
2
2
2
2
1
1
0
1
2
2
2
2
0
2
0
1
1
0
2
1
2
0
2
2
1
0
2
2
2
0
0
1
0
0
2
2
0
2
0
2
2
1
0
1
0
2
2
0
0
2
0
2
2
0
1
2
2
2
2
0
0
2
1
2
0
1
0
0
2
2
1
2
2
2
2
2
2
2
2
0
2
0
1
0
0
0
2
2
1
2
2
2
2
2
0
2
0
0
2
2
2
2
0
1
2
1
0
1
0
2
0
0
0
0
2
0
2
2
1
0
2
1
0
0
0
0
2
0
2
2
2
0
2
2
2
1
2
1
0
0
1
0
0
0
2
2
2
2
0
2
0
2
2
0
2
2
2
0
2
2
0
0
2
0
0
0
0
0
0
2
2
2
0
0
1
0
0
2
0
0
2
2
0
2
2
1
2
1
2
2
2
0
2
0
0
0
2
2
2
0
1
2
2
0
0
0
1
1
2
1
2
2
0
2
0
2
2
2
2
0
1
0
2
0
0
1
0
1
0
0
1
1
0
1
2
2
2
2
2
2
0
2
1
2
0
0
0


Insert DB:  52%|█████▏    | 24/46 [00:01<00:00, 24.06it/s]

Đã insert 11000/22575 predictions...
0
2
1
2
0
2
2
2
0
1
2
0
0
2
0
0
1
0
2
2
2
2
2
2
2
2
1
0
2
2
1
2
1
2
1
0
2
0
1
1
2
0
2
2
0
1
1
2
2
2
1
1
2
2
2
0
2
0
0
2
2
2
2
0
0
2
0
0
1
0
2
2
2
1
0
2
2
2
2
2
2
1
2
2
2
2
1
2
2
2
0
2
1
2
1
2
0
1
2
2
2
0
2
1
2
1
2
1
0
0
1
1
2
2
2
0
0
0
0
0
2
2
1
2
2
2
2
2
0
1
1
2
2
2
2
2
0
0
2
2
0
2
2
2
2
2
1
2
2
0
2
2
0
2
2
2
2
2
2
1
2
1
2
0
2
2
2
2
2
0
2
1
0
2
2
2
2
2
1
2
1
2
2
2
2
0
1
2
2
2
1
1
2
2
2
2
2
0
2
2
2
1
2
2
1
0
0
0
2
1
2
1
0
2
2
1
2
2
2
2
2
2
2
2
0
1
2
1
2
2
0
2
2
2
2
1
0
1
1
0
2
1
0
2
0
2
1
0
2
0
0
1
0
2
2
2
0
2
2
2
2
2
2
2
1
1
2
2
0
2
0
1
0
0
2
1
0
2
0
0
0
0
0
1
1
2
1
2
0
1
1
1
0
2
2
2
0
2
1
0
2
2
0
1
2
2
2
2
2
2
2
2
2
1
2
2
2
0
1
0
1
1
0
2
2
0
0
2
2
1
1
0
2
2
2
0
2
1
2
1
0
2
1
0
2
1
1
2
2
0
2
2
2
2
2
2
1
0
0
2
1
2
2
2
0
0
2
0
2
1
0
2
0
1
2
0
2
0
0
0
0
0
0
0
2
2
2
2
2
2
0
1
1
2
1
0
0
0
2
2
2
2
2
2
2
0
0
1
1
2
2
0
2
0
0
0
2
0
2
1
2
1
2
2
0
0
0
1
2
0
0
0
0
0
2
2
2
0
2
1
0
1
0
2
2
2
1
0
1
1
2
1
0
2
1
1
0
2
0
2
0
2
1
0
2
0
0
1
2
2
2
2
2
2
2
1
2
2
0
1
1
2

Insert DB:  65%|██████▌   | 30/46 [00:01<00:00, 24.16it/s]

Đã insert 13500/22575 predictions...
0
2
2
2
1
2
2
2
0
0
2
2
2
0
2
0
1
2
2
2
2
0
2
0
2
0
0
2
2
0
1
2
2
1
2
1
0
2
2
2
0
2
2
2
0
2
2
2
1
1
2
0
2
0
2
2
2
2
2
0
0
2
2
2
2
2
1
0
1
0
2
2
0
0
2
0
2
2
0
2
1
1
2
2
2
2
1
1
2
0
2
2
2
0
1
0
2
2
1
1
2
1
0
0
0
0
2
2
0
2
2
0
0
2
2
2
2
2
2
2
1
0
1
0
1
2
2
0
0
2
0
2
2
2
0
2
1
2
1
2
2
0
2
2
0
2
1
2
1
0
2
2
2
0
2
0
2
2
2
0
1
2
2
2
2
0
0
0
0
2
1
0
0
2
2
0
0
0
0
0
0
0
0
1
2
0
0
0
2
0
2
2
0
0
2
2
0
2
2
2
1
1
2
2
2
2
2
2
0
0
1
2
2
2
1
0
2
2
1
0
2
0
0
0
2
2
0
0
0
0
0
0
0
2
2
2
2
1
0
1
1
2
0
2
2
2
0
1
0
2
0
0
1
0
2
2
1
1
2
2
1
2
2
1
2
0
2
2
2
2
0
1
1
1
2
2
2
2
0
2
0
0
2
1
1
1
2
1
2
2
2
2
1
2
0
0
0
0
0
0
2
2
0
2
2
2
0
2
0
2
2
2
2
2
1
2
2
1
2
2
2
0
2
0
2
0
2
0
0
1
2
0
0
1
1
2
2
2
0
2
0
2
2
2
2
2
0
1
0
2
1
1
2
2
0
2
2
2
2
0
0
2
2
1
2
2
2
1
2
2
0
0
0
2
0
2
0
2
1
0
1
1
2
0
2
2
2
1
1
2
2
0
2
2
2
0
0
2
2
1
1
2
2
1
2
2
2
1
2
1
2
0
0
2
0
1
1
2
2
2
0
1
2
1
2
2
2
1
2
2
2
0
0
0
2
2
2
0
0
0
1
0
2
2
1
0
0
2
0
1
0
2
2
2
2
0
0
0
0
2
2
0
2
1
1
1
2
0
2
2
2
2
1
2
2
2
0
1
1
2
1
2

Insert DB:  78%|███████▊  | 36/46 [00:01<00:00, 23.74it/s]

Đã insert 16000/22575 predictions...
1
2
0
0
2
2
1
0
0
1
2
2
2
2
0
1
0
0
1
2
2
2
2
1
2
2
0
2
0
0
0
2
1
1
2
2
2
1
2
0
2
1
2
1
2
2
2
0
0
2
2
2
0
2
0
0
0
0
1
2
2
0
2
2
0
1
0
2
0
2
2
2
2
2
2
2
2
2
1
2
2
0
1
1
2
2
0
0
1
2
1
2
2
2
0
2
2
0
0
2
2
2
2
2
2
1
1
0
0
2
0
0
0
2
1
2
2
1
1
1
2
0
2
0
1
2
0
2
1
0
0
1
2
2
2
2
1
0
1
2
1
2
2
2
2
0
1
2
2
1
2
2
1
0
0
1
0
2
0
2
0
0
1
0
0
0
2
2
1
2
1
0
2
2
2
2
1
1
0
0
2
1
2
2
2
2
2
0
2
1
0
2
0
1
2
2
1
0
2
2
2
2
0
2
1
2
1
1
2
2
2
1
2
1
2
2
2
2
2
0
0
2
2
2
2
2
0
0
2
0
1
0
2
1
0
0
0
1
2
0
2
0
2
1
0
2
2
2
0
2
2
2
1
1
1
2
2
2
2
2
2
2
2
0
0
1
0
2
0
2
1
2
2
0
2
2
2
2
2
0
2
0
1
2
0
0
1
2
2
1
0
0
1
0
1
2
2
2
0
0
0
0
0
1
0
2
0
2
2
0
1
2
0
2
0
2
2
2
2
2
2
2
2
1
1
0
1
0
2
2
0
1
2
0
0
1
2
1
0
1
1
2
2
0
0
1
0
2
2
1
2
2
1
1
1
2
2
0
1
1
1
2
2
0
2
2
0
0
2
1
0
0
2
2
1
2
2
0
2
1
2
1
1
2
0
0
0
0
0
0
1
2
2
2
2
2
2
2
1
0
2
2
0
2
0
2
2
0
0
0
2
2
0
0
2
2
0
2
1
2
2
2
0
0
0
2
0
1
2
0
2
2
0
2
0
0
2
1
2
1
1
1
1
1
2
2
0
2
2
0
2
2
0
2
2
2
0
0
1
2
0
2
2
2
2
0
0
2
2
2
0
2
2
1
0
0
0
1
2
0
0
0

Insert DB:  85%|████████▍ | 39/46 [00:01<00:00, 23.26it/s]

Đã insert 18500/22575 predictions...
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
1
1
2
2
2
1
0
2
2
2
2
2
1
1
0
1
1
1
1
1
2
2
2
0
2
2
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
0
1
0
1
0
0
1
0
0
0
0
0
2
1
2
2
2
1
1
2
1
2
2
2
2
2
2
1
1
0
0
1
0
2
2
2
2
2
2
2
2
2
2
2
1
1
2
2
2
1
2
1
2
2
2
2
1
2
2
2
2
2
2
2
1
1
1
1
1
1
0
0
0
0
2
0
0
0
1
1
0
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
2
1
1
1
1
1
1
0
1
1
0
1
1
1
1
0
0
0
1
0
1
0
0
0
1
0
0
1
1
2
1
2
1
2
2
1
1
2
0
1
2
1
2
2
2
2
2
2
2
1
1
1
0
1
1
1
1
1
1
1
1
0
0
0
0
0
2
2
2
2
1
2
2
2
1
1
2
2
1
2
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
1
2
2
2
0
0
0
0
0
1
0
0
0
0
0
0
0
0
2
2
2
2
2
2
2
2
2
2
1
2
1
2
2
2
2
1
0
2
2
2
2
2
1
1
2
1
2
2
1
2
1
1
1
2
2
2
1
1
1
1
0
1
1
2
2
2
2
2
2
2
2
2
2
2
2
2
1
2
2
2
1
1
1
1
1
1
1
1
1
1
1
1
1
0
1
1
1
1
1
1
2
1
2
2
2
2
2
1
1
1
1
2
1
2
1
1
2
2
1
1
2
1
1
1
1
2
1
1
1
1
1
2
1
0
1
1
2
2
0
0
1
0
0
1
1
1
2
1
1
1
2
1
1
2
2
2
1
0
0
1
0
0
0
0
0
1
1
1
1
1
1
1
1
1
2
1
1
2
2
1
2
1
1
1
1
1
1
0
1
1
0
1
1
0
0
0
0
1
1
1
2
1
1
1
1
1
1
1
2
2
2
1
1
1
2
1
1
2
2
2
2
2
2
0
0
2
0
0
0
0

Insert DB: 100%|██████████| 46/46 [00:01<00:00, 24.23it/s]


Đã insert 21000/22575 predictions...
1
1
1
1
1
1
2
1
1
1
2
1
1
2
1
2
1
2
1
2
2
2
2
1
2
1
0
0
0
0
0
1
2
1
1
2
2
2
2
2
1
1
1
2
1
2
1
2
1
2
2
2
2
2
2
2
2
2
1
1
2
1
2
1
2
2
2
0
2
0
1
0
1
0
1
1
1
1
2
2
2
2
2
2
2
2
1
2
2
1
1
1
2
1
2
1
2
1
2
2
1
2
2
1
2
2
2
1
1
2
2
2
2
2
1
1
1
1
1
1
1
1
1
1
2
1
1
0
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
2
1
0
0
2
0
1
0
1
1
1
1
0
1
1
1
1
1
1
2
2
1
2
1
2
2
2
2
1
1
1
1
2
0
0
0
0
0
0
0
0
2
2
1
1
2
2
1
2
1
1
1
1
1
1
1
2
1
2
1
1
1
2
2
2
2
2
2
1
1
2
0
0
1
1
1
1
1
0
0
1
0
1
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
1
1
0
0
0
0
0
0
1
1
1
1
1
1
2
1
2
2
2
2
2
2
2
1
2
2
2
2
2
0
0
0
1
0
1
0
0
0
0
0
0
0
1
1
2
2
2
2
1
2
2
2
2
2
1
0
0
0
1
0
1
1
1
0
0
1
0
1
0
1
1
1
2
1
1
2
0
1
1
1
1
1
1
2
1
2
2
1
2
2
2
2
1
1
0
2
1
0
1
2
0
2
1
2
1
1
2
2
2
2
1
2
2
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
2
2
1
0
0
0
0
0
0
0
1
1
1
1
1
2
1
1
2
1
2
1
2
2
2
2
2
2
0
2
2
0
2
2
2
2
2
2
1
1
0
1
1
1
2
2
2
2
1
2
2
0
0
0
0
0
0
1
0
1
1
0
0
0
1
1
2
0
2
1
2
1
1
2
1
1
1
2
1
1
2
1
2
2
2
1
0
1
0
1
2
1
1
1
1
2
1
1
2
2

In [4]:
import modin.pandas as pd

# Đọc dữ liệu
df = pd.read_excel("data_20k.xlsx")

# Bỏ cột "label"
df = df.drop(columns=["label"])

# Lấy cột text
texts = df['text']

# Danh sách các kích thước file cần tạo
sizes = [100, 1000, 2500, 5000, 10000, 15000]

for size in sizes:
    # Lấy ra size câu đầu tiên
    subset = texts.iloc[:size]
    # Ghi ra file txt, mỗi câu trên một dòng
    filename = f"texts_{size}.txt"
    with open(filename, "w", encoding="utf-8") as f:
        for line in subset:
            f.write(str(line).strip() + "\n")

    print(f"Đã tạo file {filename} với {size} câu.")



Đã tạo file texts_100.txt với 100 câu.
Đã tạo file texts_1000.txt với 1000 câu.
Đã tạo file texts_2500.txt với 2500 câu.
Đã tạo file texts_5000.txt với 5000 câu.
Đã tạo file texts_10000.txt với 10000 câu.
Đã tạo file texts_15000.txt với 15000 câu.


In [11]:
# !pip install "modin[all]"
!pip install openpyxl

