In [None]:
import os
import re
import json
import pathlib
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
from langchain_experimental.text_splitter import SemanticChunker
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from multilingual_pdf2text.models.document_model.document import Document as PDFDocument
from multilingual_pdf2text.pdf2text import PDF2Text

# Load environment variables (a local .env file, if present, is read by load_dotenv)
load_dotenv()

# os.environ only accepts strings, so assigning the result of a missing API_KEY
# lookup would fail with an opaque TypeError. Fall back to an already exported
# OPENAI_API_KEY and otherwise stop with an explicit message.
api_key = os.getenv("API_KEY") or os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "No OpenAI key found: define API_KEY (or OPENAI_API_KEY) in the environment or in a .env file."
    )
os.environ["OPENAI_API_KEY"] = api_key

**Data Preparation Functions**

In [None]:
# Class names for the classification task.
# labelling() picks an entry from this list by its position, so the order matters.
labels = [
    "Dao_tao",
    "Hoc_tap_ren_luyen",
    "Khen_thuong_ky_luat",
    "Tot_nghiep",
    "KTX",
    "Khac",
]

def load_stop_words():
    """Read the stop-word list from ``lib/vietnamese-stopwords.txt``.

    Returns:
        A list with one entry per line of the file, without line terminators.
    """
    stopword_file = "lib/vietnamese-stopwords.txt"
    with open(stopword_file, mode="r", encoding="utf-8") as handle:
        return handle.read().splitlines()

def preprocess(text):
    """Normalise raw text before chunking.

    Steps, in order:
      1. Split on whitespace; tokens that are entirely upper-case are kept,
         every other token is lower-cased.
      2. Re-join the tokens with single spaces.
      3. Drop every character that is not a word character, whitespace,
         or one of ``, ? . -``.
      4. Trim leading and trailing whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    normalized = " ".join(
        token if token.isupper() else token.lower()
        for token in text.split()
    )

    # Stripping symbols can expose whitespace at the edges (e.g. a leading "#"),
    # hence the final strip().
    cleaned = re.sub(r'[^\w\s,?.-]', '', normalized)
    return cleaned.strip()

def convert_pdf_to_markdown_with_llm(pdf_path, output_path=None):
    """Extract the text of a PDF and have the LLM rewrite it as Markdown.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: Optional path of the Markdown file to write. Missing parent
            directories are created. Nothing is written when this is falsy.

    Returns:
        The Markdown text returned by the model.
    """
    # Initialize document object with Vietnamese language setting
    pdf_document = PDFDocument(document_path=pdf_path, language="vie")
    pdf2text = PDF2Text(document=pdf_document)

    # Concatenate the "text" field of every item yielded by the extractor
    extracted_text = "".join(item["text"] for item in pdf2text.extract())

    # Load prompt template for PDF to Markdown conversion
    with open("prompt/pdf_to_markdown.txt", "r", encoding="utf-8") as f:
        template = f.read()

    # Set up the LLM chain; temperature 0 keeps the conversion as repeatable as possible
    prompt = ChatPromptTemplate.from_template(template)
    model = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    # Create processing chain: the invoked value is bound to the template's
    # `pdf_content` variable, sent to the model, and reduced to a plain string
    chain = (
        {"pdf_content": RunnablePassthrough()}
        | prompt
        | model
        | StrOutputParser()
    )

    # Process the extracted text through the LLM chain
    markdown_text = chain.invoke(extracted_text)

    # Save to file if output path is provided
    if output_path:
        destination = pathlib.Path(output_path)
        # A fresh checkout may not have the output folder yet; create it rather
        # than losing an already paid-for LLM response to FileNotFoundError.
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(markdown_text, encoding="utf-8")
        print(f"Markdown saved to {output_path}")

    return markdown_text

def chunking(text):
    """Split ``text`` into chunks with LangChain's ``SemanticChunker``.

    The splitter is built on ``OpenAIEmbeddings`` and configured with the
    "percentile" breakpoint type, a threshold amount of 95 and a minimum chunk
    size of 100. A fixed-size ``MarkdownTextSplitter`` (chunk_size=200,
    chunk_overlap=20) was the earlier approach and is no longer used.

    Args:
        text: The full document text.

    Returns:
        The result of ``create_documents([text])``.
    """
    embeddings = OpenAIEmbeddings()
    splitter = SemanticChunker(
        embeddings,
        breakpoint_threshold_type="percentile",
        breakpoint_threshold_amount=95,
        min_chunk_size=100,
    )
    return splitter.create_documents([text])

def labelling():
    """Interactively ask which class the current document belongs to.

    Shows the numbered entries of the module-level ``labels`` list and reads an
    index from standard input. Invalid answers (non-numeric, negative, or past
    the end of the list) are rejected and the question is asked again, so one
    typo does not abort the whole labelling run.

    Returns:
        The chosen label prefixed with ``"__label__"``.
    """
    options = ", ".join(f"{position}: {name}" for position, name in enumerate(labels))
    while True:
        choice = input(f"Select label index ({options}): ")
        try:
            index = int(choice)
        except ValueError:
            index = -1
        # Negative numbers are rejected on purpose: Python would otherwise
        # silently index from the end of the list and assign the wrong label.
        if 0 <= index < len(labels):
            return "__label__" + labels[index]
        print(f"Invalid choice {choice!r}; enter a number between 0 and {len(labels) - 1}.")

**PREPARE CSV DATA**

In [None]:
# Build one labelled CSV per source PDF: convert -> clean -> chunk -> label.

# Both the Markdown and the CSV outputs are written here; create it up front so
# a fresh checkout does not fail after the first LLM call.
os.makedirs("result", exist_ok=True)

# Only PDFs are valid inputs; stray files in the folder (e.g. .DS_Store) would
# otherwise be sent through the PDF extractor. Sorting makes the order in which
# documents are presented for labelling repeatable.
pdf_files = sorted(name for name in os.listdir("content") if name.lower().endswith(".pdf"))

for file in tqdm(pdf_files):
    print()  # start a fresh line after the progress bar
    stem = os.path.splitext(file)[0]
    content = convert_pdf_to_markdown_with_llm(os.path.join("content", file), os.path.join("result", stem + ".md"))
    content = preprocess(content)
    chunks = chunking(content)
    # One label is requested per document and applied to all of its chunks
    label = labelling()
    data = [{"text": chunk.page_content, "label": label} for chunk in chunks]
    df = pd.DataFrame(data)
    path = os.path.join("result", stem + ".csv")
    df.to_csv(path, index=False, encoding="utf-8-sig", sep=";")

**PARAPHRASE DATA FOR CLASSIFICATION MODEL**

In [None]:
# Paraphrase every row of one sheet with the LLM to enlarge the classification dataset.

# The sheet name also determines the output file name, so switching to another
# class cannot overwrite the augmented file of a different class.
SHEET_NAME = "KTX"
df = pd.read_excel("classed_dataset.xlsx", sheet_name=SHEET_NAME)

with open("prompt/augment.txt", "r", encoding="utf-8") as f:
    template = f.read()

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.5)
prompt = ChatPromptTemplate.from_template(template)

chain = (
    prompt
    | llm
    | StrOutputParser()
)

augmented_data = []

# The first row's label is used as the category for every prompt
# (presumably each sheet holds a single class - TODO confirm).
category = df['label'].iloc[0]

num_variations = int(input("Number of paraphrases to generate per text: "))
if num_variations < 1:
    raise ValueError("num_variations must be a positive integer")

for index, row in tqdm(df.iterrows(), total=len(df)):
    text = row['text']

    # Keep the original sample unconditionally: a failed LLM call or an
    # unparsable reply must never make the augmented set smaller than the input.
    augmented_data.append({
        "text": text,
        "label": row['label']
    })

    chain_input = {
        "category": category,
        "num_variations": num_variations,
        "text": text
    }

    try:
        result = chain.invoke(chain_input)

        # Chat models sometimes wrap JSON in a Markdown code fence; remove it before parsing.
        payload = re.sub(r'^```(?:json)?\s*|\s*```$', '', result.strip())
        augmented_texts = json.loads(payload)

        # Expected reply shape: a list whose first element has a "variants" list.
        # Build the rows first so a malformed reply adds nothing partial.
        variant_rows = [
            {"text": variant, "label": row['label']}
            for variant in augmented_texts[0]["variants"]
        ]
    except Exception as e:
        # Best effort: report the failing row and carry on with the next one.
        print(f"Error processing text at index {index}: {e}")
        continue

    augmented_data.extend(variant_rows)

augmented_df = pd.DataFrame(augmented_data)
augmented_df.to_excel(f"augmented_dataset_{SHEET_NAME}.xlsx", index=False)
print(f"Original dataset size: {len(df)}, Augmented dataset size: {len(augmented_df)}")

**CONVERT EXCEL FILE TO CSV FILE**

In [None]:
# Export the workbook as a semicolon-separated CSV, encoded as utf-8-sig and
# without the index column.
df = pd.read_excel("dataset.xlsx")
df.to_csv(
    "dataset.csv",
    sep=";",
    index=False,
    encoding="utf-8-sig",
)