In [1]:
# Install the Python dependencies used by this notebook.
# NOTE(review): groq is installed again in the next cell, and pymupdf / pandas / openai /
# chromadb / langchain are not imported by the import cell visible here — confirm they are needed.
!pip install pymupdf pdfplumber pandas openai chromadb langchain sentence-transformers groq



In [2]:
!pip install pytesseract
!pip install pillow
!pip install pdf2image
!apt-get install -y poppler-utils
!apt-get install tesseract-ocr
!pip install faiss-cpu
!pip install groq

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 29 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.6 [186 kB]
Fetched 186 kB in 0s (1,803 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 1

In [3]:
import os
import csv
import pdfplumber
import re
import json
import faiss
import pytesseract
import numpy as np
from sentence_transformers import SentenceTransformer
from pdf2image import convert_from_path
from PIL import Image
from groq import Groq

# Use a GROQ_API_KEY that is already present in the environment (e.g. injected from a
# secret store). The literal below is only a placeholder fallback; never paste a real
# key into a notebook that will be shared or committed.
os.environ.setdefault("GROQ_API_KEY", "API_KEY")

# Initialize the client
groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])


In [4]:
# Extract page text and tables; tables 0 and 1 are merged because that table was
# separated by a page break in this document.
def extract_text_and_tables_from_pdf(pdf_path):
    """Extract text, whole tables and complete table rows from a PDF via pdfplumber.

    Args:
        pdf_path: Path handed to ``pdfplumber.open``.

    Returns:
        A tuple ``(text, full_tables, table_rows)``:
        * ``text`` - every non-empty page text, each followed by a newline.
        * ``full_tables`` - one string per table, rows joined by newlines and cells
          by ``" | "`` (falsy cells rendered as empty strings). When more than one
          table exists, the first two are merged into a single entry.
        * ``table_rows`` - one ``" | "``-joined string per row that has no ``None``
          or empty-string cell.
    """
    page_texts = []
    full_tables = []
    table_rows = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            extracted = page.extract_text()
            if extracted:
                page_texts.append(extracted + "\n")

            for table in page.extract_tables():
                rendered_rows = []
                for row in table:
                    rendered_rows.append(" | ".join(str(cell) if cell else "" for cell in row))
                full_tables.append("\n".join(rendered_rows))

                # Row-level chunks only keep rows in which every cell is populated.
                table_rows.extend(
                    " | ".join(str(cell) for cell in row)
                    for row in table
                    if not (None in row or "" in row)
                )

    # Hard-coded for this document: glue the first two extracted tables back together.
    if len(full_tables) > 1:
        full_tables[:2] = [full_tables[0] + "\n" + full_tables[1]]

    return "".join(page_texts), full_tables, table_rows

pdf_path = "/content/drive/MyDrive/RAG_TEST/RoboSweepX200.pdf"
pdf_text, pdf_full_tables, pdf_table_rows = extract_text_and_tables_from_pdf(pdf_path)

# Quick sanity preview of the extraction: first 1000 characters, counts, first table, first rows.
print("\nExtracted Text (Preview):\n", pdf_text[:1000])

print("\nNumber of Full Tables Extracted:", len(pdf_full_tables))
print("Number of Table Rows Extracted:", len(pdf_table_rows))

if len(pdf_full_tables) > 0:
    print("\n🔹 First (Merged) Table:\n", pdf_full_tables[0])

for row_number, row_text in enumerate(pdf_table_rows[:5], start=1):
    print(f"\n🔹 Row {row_number}: {row_text}")


Extracted Text (Preview):
 RoboSweep X200
Manual Pengguna & Dokumentasi Teknis
Robot Pembersih Otomatis
Daftar Isi
1. Pendahuluan
1.1 Selamat Datang ke Dunia RoboSweep
1.2 Tentang Manual Ini
1.3 Ikhtisar Produk
1.4 Fitur Utama dan Keunggulan
1.5 Sejarah Singkat Teknologi Pembersih Otomatis
2. Informasi Keselamatan
2.1 Petunjuk Keselamatan Penting
2.2 Simbol Peringatan
2.3 Peringatan Penggunaan Baterai
2.4 Keselamatan Anak dan Hewan Peliharaan
3. Pengenalan Produk
3.1 Isi Kemasan
3.2 Komponen Robot
3.2.1 Tampak Atas
3.2.2 Tampak Bawah
3.2.3 Tampak Samping
3.3 Panel Kontrol dan Indikator
3.4 Stasiun Pengisian Daya
3.5 Spesifikasi Teknis
4. Memulai
4.1 Membuka Kemasan
4.2 Memasang Komponen
4.3 Pengisian Daya Pertama
4.4 Mengunduh Aplikasi Mobile
4.5 Koneksi WiFi
4.6 Pengaturan Awal
5. Pengoperasian Dasar
5.1 Menyalakan dan Mematikan
5.2 Memulai Siklus Pembersihan
5.3 Mode Pembersihan
5.3.1 Mode Otomatis
5.3.2 Mode Spot
5.3.3 Mode Tepi
5.3.4 Mode Terjadwal
5.4 Mengatur Kekuatan Isap
5.5 P

In [5]:
# Chunking starts from the table of contents ("Daftar Isi"): its entries become the separators.
# The pattern lazily captures everything after the "Daftar Isi" line up to the end of the
# line on which the first "15." occurs.
toc_match = re.search(r'Daftar Isi\n(.*?15\..*?)\n', pdf_text, re.DOTALL)
if toc_match:
    toc_content = toc_match.group(1).strip()
else:
    toc_content = ""
print(toc_content)

1. Pendahuluan
1.1 Selamat Datang ke Dunia RoboSweep
1.2 Tentang Manual Ini
1.3 Ikhtisar Produk
1.4 Fitur Utama dan Keunggulan
1.5 Sejarah Singkat Teknologi Pembersih Otomatis
2. Informasi Keselamatan
2.1 Petunjuk Keselamatan Penting
2.2 Simbol Peringatan
2.3 Peringatan Penggunaan Baterai
2.4 Keselamatan Anak dan Hewan Peliharaan
3. Pengenalan Produk
3.1 Isi Kemasan
3.2 Komponen Robot
3.2.1 Tampak Atas
3.2.2 Tampak Bawah
3.2.3 Tampak Samping
3.3 Panel Kontrol dan Indikator
3.4 Stasiun Pengisian Daya
3.5 Spesifikasi Teknis
4. Memulai
4.1 Membuka Kemasan
4.2 Memasang Komponen
4.3 Pengisian Daya Pertama
4.4 Mengunduh Aplikasi Mobile
4.5 Koneksi WiFi
4.6 Pengaturan Awal
5. Pengoperasian Dasar
5.1 Menyalakan dan Mematikan
5.2 Memulai Siklus Pembersihan
5.3 Mode Pembersihan
5.3.1 Mode Otomatis
5.3.2 Mode Spot
5.3.3 Mode Tepi
5.3.4 Mode Terjadwal
5.4 Mengatur Kekuatan Isap
5.5 Penghentian Otomatis dan Manual
6. Fitur Lanjutan
6.1 Pemetaan Ruangan
6.2 Pengenalan Area Terlarang
6.3 Pengaturan J

In [6]:
# With the section titles known, the text is split per section into two kinds of chunk:
# the raw section text and an LLM-generated summary of it.
content = pdf_text  # the chunking code below reads the document through this name
def smart_truncate(text, max_length):
    """
    Split text into pieces of at most ``max_length`` characters, preferring sentence ends.

    Each piece ends at the last full stop found within the first ``max_length``
    characters; when that window holds no full stop, the text is hard-cut at exactly
    ``max_length`` characters. Pieces are stripped of surrounding whitespace, and
    pieces that become empty after stripping are dropped.

    Args:
        text: The string to split.
        max_length: Maximum number of characters per piece (must be >= 1).

    Returns:
        A list of strings. Text that already fits is returned unchanged as ``[text]``.

    Raises:
        ValueError: If ``max_length`` is smaller than 1 (no progress could be made).
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")

    if len(text) <= max_length:
        return [text]

    chunks = []
    while len(text) > max_length:
        # Search only inside the allowed window so a piece can never exceed max_length.
        last_stop = text.rfind(".", 0, max_length)
        # Keep the full stop with its sentence; without one, cut at the limit itself.
        split_at = last_stop + 1 if last_stop != -1 else max_length

        piece = text[:split_at].strip()
        if piece:
            chunks.append(piece)
        text = text[split_at:].strip()

    if text:
        chunks.append(text)

    return chunks

def groq_summarize_chunk(chunk_text, model="llama3-8b-8192", max_length=512, token_limit=6000):
    """
    Summarize a chunk into key points with the Groq chat API.

    The text is first split by ``smart_truncate(chunk_text, token_limit)``; every
    resulting part is sent as its own request and the stripped replies are joined
    with newlines.

    Args:
        chunk_text: Text to summarize.
        model: Groq model name passed to the chat completion call.
        max_length: Accepted for compatibility; not used by this function.
        token_limit: Size limit forwarded to ``smart_truncate`` (which measures
            characters, not tokens).

    Returns:
        The summaries of all parts, joined by ``"\\n"``.
    """
    def _summarize_part(part):
        # Indonesian prompt asking for exactly four formal bullet points.
        prompt = f"""
        Anda adalah pakar dalam merangkum teks secara ringkas dan jelas. Berikut adalah teks yang perlu dirangkum:

        {part}

        Silakan buat 4 poin utama dari teks ini dalam bahasa Indonesia yang formal. Pastikan setiap poin jelas dan mencerminkan informasi penting dari teks. Jangan menambahkan apapun selain 4 point
        """
        completion = groq_client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2,
        )
        return completion.choices[0].message.content.strip()

    return "\n".join([_summarize_part(part) for part in smart_truncate(chunk_text, token_limit)])

# Section headings come from the TOC. The pattern only matches lines that follow a newline
# and look like "N Title" or "N.M Title"; lines such as "1. Pendahuluan" or "3.2.1 Tampak Atas"
# do not match it.
section_titles = re.findall(r'\n(\d+(?:\.\d+)?\s[^\n]+)', toc_content)

full_chunks = []
summarized_chunks = []

overlap_size = 100
max_token_limit = 6000  # forwarded to smart_truncate, which measures characters

# NOTE: every section triggers at least one Groq request, so this loop is slow.
for i in range(len(section_titles)):
    current_title = section_titles[i].strip()
    next_title = section_titles[i + 1].strip() if i + 1 < len(section_titles) else None

    # Find the second occurrence of the section title (the first one is the TOC entry);
    # fall back to the only occurrence when the title appears once.
    matches = list(re.finditer(rf'\n{re.escape(current_title)}', content))
    if len(matches) > 1:
        start_idx = matches[1].start()
    elif matches:
        start_idx = matches[0].start()
    else:
        start_idx = None

    # Find where the next section starts; without a second occurrence of the next
    # title the chunk runs to the end of the document.
    if next_title:
        next_matches = list(re.finditer(rf'\n{re.escape(next_title)}', content))
        end_idx = next_matches[1].start() if len(next_matches) > 1 else len(content)
    else:
        end_idx = len(content)

    # Add overlap for full chunks only
    start_idx_full = max(0, start_idx - overlap_size) if start_idx is not None else None
    end_idx_full = min(len(content), end_idx + overlap_size)

    # Store full chunk with overlap
    if start_idx_full is not None:
        full_chunk_text = content[start_idx_full:end_idx_full].strip()
        full_chunks.append({"title": current_title, "text": full_chunk_text})

    # Store summarized chunk without overlap, using smart truncation
    if start_idx is not None:
        summary_text = content[start_idx:end_idx].strip()
        summary_result = groq_summarize_chunk(summary_text, token_limit=max_token_limit)
        summarized_chunks.append({"title": current_title, "summary": summary_result})

# Preview up to the first 5 summaries; slicing avoids an IndexError when fewer exist.
for preview_chunk in summarized_chunks[:5]:
    print(f"\n=== {preview_chunk['title']} ===\n")
    print("📌 Summary:")
    print(preview_chunk['summary'])




KeyboardInterrupt: 

In [1]:
# Fine-tuning variant of the section chunking: same boundaries, no overlap, no summaries.
content = pdf_text

section_titles = re.findall(r'\n(\d+(?:\.\d+)?\s[^\n]+)', toc_content)

full_chunks_no_overlap = []

# Pair each title with its successor (None after the last one).
for raw_title, raw_next_title in zip(section_titles, section_titles[1:] + [None]):
    current_title = raw_title.strip()
    next_title = raw_next_title.strip() if raw_next_title is not None else None

    title_positions = [m.start() for m in re.finditer(rf'\n{re.escape(current_title)}', content)]
    if not title_positions:
        # Title never found in the text: nothing to store for this section.
        continue
    # Prefer the second hit (the first is the TOC entry); otherwise use the only hit.
    start_idx = title_positions[1] if len(title_positions) > 1 else title_positions[0]

    # Default to the end of the document unless the next title has a second hit.
    end_idx = len(content)
    if next_title:
        next_positions = [m.start() for m in re.finditer(rf'\n{re.escape(next_title)}', content)]
        if len(next_positions) > 1:
            end_idx = next_positions[1]

    full_chunks_no_overlap.append({
        "title": current_title,
        "text": content[start_idx:end_idx].strip(),
    })

print(full_chunks_no_overlap)

NameError: name 'pdf_text' is not defined

In [None]:
# OCR path for images; the author notes it is not used because the images were of poor quality.
def extract_images_with_ocr(pdf_path):
    """Render the PDF pages to images at 300 dpi and OCR each one.

    Args:
        pdf_path: Path handed to ``convert_from_path``.

    Returns:
        The OCR text of all pages, joined by newlines.
    """
    page_images = convert_from_path(pdf_path, dpi=300)
    return "\n".join([pytesseract.image_to_string(page_image) for page_image in page_images])

# Run OCR over the page images rendered from the PDF
image_text = extract_images_with_ocr("/content/drive/MyDrive/RAG_TEST/RoboSweepX200.pdf")
# Show the extracted text
print("\nExtracted Text from Images:", image_text)



Extracted Text from Images: RoboSweep X200

Manual Pengguna & Dokumentasi Teknis

Robot Pembersih Otomatis

Daftar Isi

1. Pendahuluan

1.1 Selamat Datang ke Dunia RoboSweep

1.2 Tentang Manual Ini

1.3 Ikhtisar Produk

1.4 Fitur Utama dan Keunggulan

1.5 Sejarah Singkat Teknologi Pembersih Otomatis

2. Informasi Keselamatan

2.1 Petunjuk Keselamatan Penting

2.2 Simbol Peringatan

2.3 Peringatan Penggunaan Baterai

2.4 Keselamatan Anak dan Hewan Peliharaan

3. Pengenalan Produk

3.1 Isi Kemasan
3.2 Komponen Robot
3.2.1 Tampak Atas
3.2.2 Tampak Bawah
3.2.3 Tampak Samping
3.3 Panel Kontrol dan Indikator
3.4 Stasiun Pengisian Daya
3.5 Spesifikasi Teknis

4. Memulai

4.1 Membuka Kemasan
4.2 Memasang Komponen
4.3 Pengisian Daya Pertama

4.4 Mengunduh Aplikasi Mobile
4.5 Koneksi WiFi
4.6 Pengaturan Awal

5. Pengoperasian Dasar

5.1 Menyalakan dan Mematikan

5.2 Memulai Siklus Pembersihan

5.3 Mode Pembersihan

5.3.1 Mode Otomatis

5.3.2 Mode Spot

5.3.3 Mode Tepi

5.3.4 Mode Terjadwal

5.

Convert all chunks to JSON

In [None]:
# Convert all generated chunks to JSON (full chunks, summarized chunks, CSV, tables, table rows)
def generate_chunk_id(index):
    """Return ``index`` as a zero-padded ID of at least three digits (1 -> "001")."""
    return format(index, "03d")

def flatten_row(row):
    """Flatten a mapping into a single ``"key: value | key: value"`` string."""
    return " | ".join(f"{key}: {value}" for key, value in row.items())

def chunk_csv_file(input_csv_file, chunk_size=10):
    """Read a CSV file and group its data rows into chunks of flattened strings.

    The first row is treated as the header; every following row is paired with the
    header and rendered by ``flatten_row``.

    Args:
        input_csv_file: Path of the UTF-8 encoded CSV file.
        chunk_size: Maximum number of data rows per chunk (must be >= 1).

    Returns:
        A list of chunks, each a list of flattened row strings. An empty file
        (no header row) yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # newline='' lets the csv module handle line endings itself, which keeps
    # newlines embedded in quoted fields intact.
    with open(input_csv_file, mode='r', encoding='utf-8', newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        header = next(csvreader, None)  # None instead of StopIteration on an empty file
        if header is None:
            return []
        rows = list(csvreader)

    return [
        [flatten_row(dict(zip(header, row))) for row in rows[start:start + chunk_size]]
        for start in range(0, len(rows), chunk_size)
    ]

def save_chunks_with_ids_to_json(full_chunks, summarized_chunks, pdf_full_tables, pdf_table_rows, input_csv_file=None, chunk_size=10, output_file="YOSIAMD.json"):
    """Serialize every chunk type into one JSON file with sequential IDs.

    Records are written in this order: full text chunks, summarized chunks, full
    tables, table rows and (when ``input_csv_file`` is given) CSV chunks. Each
    record has the keys ``id``, ``type`` and ``content``.

    Args:
        full_chunks: Items that are either dicts with ``title``/``text`` or ready strings.
        summarized_chunks: Items that are either dicts with ``title``/``summary`` or ready strings.
        pdf_full_tables: Table strings, stored as ``table_full``.
        pdf_table_rows: Row strings, stored as ``table_row``.
        input_csv_file: Optional CSV path chunked through ``chunk_csv_file``.
        chunk_size: Rows per CSV chunk.
        output_file: Destination JSON path.
    """
    records = []

    def _append(chunk_type, content):
        # IDs follow insertion order, starting at 1.
        records.append({
            "id": generate_chunk_id(len(records) + 1),
            "type": chunk_type,
            "content": content
        })

    for chunk in full_chunks:
        if isinstance(chunk, dict):
            _append("text_full", f"{chunk['title']} {chunk['text']}".strip())
        else:
            _append("text_full", chunk)

    for chunk in summarized_chunks:
        if isinstance(chunk, dict):
            _append("text_summarized", f"{chunk['title']} {chunk['summary']}".strip())
        else:
            _append("text_summarized", chunk)

    for table in pdf_full_tables:
        _append("table_full", table)

    for row in pdf_table_rows:
        _append("table_row", row)

    if input_csv_file:
        for csv_chunk in chunk_csv_file(input_csv_file, chunk_size):
            _append("csv", csv_chunk)

    # Image chunks are intentionally not serialized (the OCR output is not used).

    # Save the structured data to a JSON file
    with open(output_file, 'w', encoding='utf-8') as json_file:
        json.dump({"chunks": records}, json_file, indent=4, ensure_ascii=False)

    print(f"Chunks with IDs successfully saved to {output_file}")

# Example usage: write every chunk type, plus the CSV data in groups of 10 rows,
# to the default output file (YOSIAMD.json).
input_csv_file = '/content/drive/MyDrive/RAG_TEST/extended_robot_vacuum_data.csv'
save_chunks_with_ids_to_json(full_chunks, summarized_chunks, pdf_full_tables, pdf_table_rows, input_csv_file, chunk_size=10)

Chunks with IDs successfully saved to YOSIAMD.json


EMBEDDING + STORING IN THE VECTOR DATABASE (FAISS)

In [None]:
# Load the JSON file containing chunked data.
# It was written as UTF-8 with ensure_ascii=False, so read it back explicitly as UTF-8
# instead of relying on the platform default encoding.
with open('YOSIAMD.json', 'r', encoding='utf-8') as file:
    chunked_data = json.load(file)

# Separate the data by type
text_full_chunks = []
text_summarized_chunks = []
csv_chunks = []
table_full_chunks = []
table_row_chunks = []

# Iterate through the chunks and categorize by type
for chunk in chunked_data["chunks"]:
    content = chunk["content"]
    title = chunk.get("title", "")  # Get title if it exists

    # Newline clean-up: string content has "\n" replaced by spaces; list content of
    # non-CSV chunks is cleaned cell by cell; CSV chunk lists are passed through unchanged.
    if isinstance(content, list) and chunk["type"] != "csv":
        cleaned_content = [" ".join(str(cell).replace("\n", " ").strip() for cell in row) for row in content]
    elif isinstance(content, str):
        cleaned_content = content.replace("\n", " ").strip()
    else:
        cleaned_content = content

    combined_text = f"{title} {cleaned_content}".strip() if title else cleaned_content

    if chunk["type"] == "text_full":
        text_full_chunks.append(combined_text)
    elif chunk["type"] == "text_summarized":
        text_summarized_chunks.append(combined_text)
    elif chunk["type"] == "csv":
        csv_chunks.append(cleaned_content)
    elif chunk["type"] == "table_full":
        table_full_chunks.append(cleaned_content)
    elif chunk["type"] == "table_row":
        table_row_chunks.append(cleaned_content)

# Display the first three categorized (cleaned) chunks of each type
print(f"Text Full Chunks: {text_full_chunks[:3]}")
print(f"Text Summarized Chunks: {text_summarized_chunks[:3]}")
print(f"CSV Chunks: {csv_chunks[:3]}")
print(f"Table Full Chunks: {table_full_chunks[:3]}")
print(f"Table Row Chunks: {table_row_chunks[:3]}")


Text Full Chunks: ['1.1 Selamat Datang ke Dunia RoboSweep n Sertifikasi 13.9 Spesifikasi Teknis Lengkap 14. Indeks Istilah 15. Catatan Pengguna 1. Pendahuluan 1.1 Selamat Datang ke Dunia RoboSweep Terima kasih telah memilih RoboSweep X200, robot pembersih otomatis generasi terbaru dari PT TechnoClean Indonesia. Dengan desain kompak berbentuk lingkaran dan teknologi pembersihan canggih, RoboSweep X200 dirancang untuk mengubah cara Anda membersihkan rumah. Kombinasi sempurna antara kecerdasan buatan, sensor presisi tinggi, dan sistem isap yang kuat memungkinkan robot ini membersihkan setiap sudut rumah Anda dengan efisiensi yang belum pernah ada sebelumnya. RoboSweep X200 adalah hasil dari lima tahun penelitian intensif dan umpan balik pengguna, yang membuatnya menjadi asisten pembersihan yang paling andal dan intuitif di pasaran saat ini. Kami yakin bahwa RoboSweep X200 akan mengubah rutinitas pembersihan Anda menjadi pengalaman yang menyenangkan dan efisien. 1.2 Tentang Manual Ini Manu

In [None]:
# Initialize Sentence Transformer
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Create the FAISS index with the dimension reported by the model itself, so the index
# and the encoder cannot drift apart if the model is swapped
# (all-MiniLM-L6-v2 reports 384, the value that used to be hard-coded here).
embedding_dimension = model.get_sentence_embedding_dimension()
index = faiss.IndexFlatL2(embedding_dimension)

# Function to embed text
def embed_text(text):
    """Generate a normalized embedding with the module-level Sentence-Transformers model.

    Args:
        text: Either a string, or a chunk dict. For a dict the embedded text is
            ``"<title> <body>"`` where the body is the ``text`` value or, when that
            is missing or empty, the ``summary`` value (summarized chunks only carry
            ``summary``, so without this fallback only their title was embedded).

    Returns:
        The embedding reshaped to a single row ``(1, dim)``, or ``None`` when the
        input is not a string / is blank after conversion.
    """
    if isinstance(text, dict):  # Handle dict type
        body = text.get('text') or text.get('summary') or ''
        text = f"{text.get('title', '')} {body}".strip()
    if not isinstance(text, str) or not text.strip():
        print(f"invalid or empty text chunk: {text}")
        return None
    embedding = model.encode([text], normalize_embeddings=True)
    return embedding.reshape(1, -1)  # single row, one column per embedding dimension

# Function to process and store embeddings
def store_embeddings(full_chunks, summarized_chunks, pdf_full_tables, pdf_table_rows, csv_chunks):
    """Embed every chunk, add it to the module-level FAISS index and save the metadata.

    The metadata list is rebuilt from scratch on every call and is looked up by FAISS
    row number at query time, so the index is emptied first. Without that, re-running
    the cell would append a second copy of the vectors while the metadata restarts at
    001, and row ``i`` of the index would no longer correspond to ``metadata[i]``.

    Args:
        full_chunks: Full text chunks (strings or dicts accepted by ``embed_text``).
        summarized_chunks: Summarized chunks (strings or dicts accepted by ``embed_text``).
        pdf_full_tables: Table strings.
        pdf_table_rows: Table row strings.
        csv_chunks: Iterable of CSV chunks; each chunk's rows are joined with spaces.

    Returns:
        The metadata list (``id``, ``type``, ``content`` per stored embedding), which is
        also written to ``metadata.json``.
    """
    metadata = []

    # Start from an empty index so FAISS rows and metadata positions stay aligned.
    index.reset()

    def process_chunk(data, data_type):
        """Embed one chunk; on success add it to the index and record its metadata."""
        embedding = embed_text(data)
        if embedding is None or embedding.shape[1] != embedding_dimension:
            print(f"Invalid embedding for {data_type}: {str(data)[:30]}...")
            return

        # Store in FAISS
        index.add(embedding)

        # Store metadata; IDs are 1-based and follow the insertion order
        metadata.append({
            "id": f"{len(metadata) + 1:03d}",
            "type": data_type,
            "content": data
        })

    #text full chunks
    for text in full_chunks:
        process_chunk(text, "text_full")

    #summarized text chunks
    for text in summarized_chunks:
        process_chunk(text, "text_summarized")

    #full tables chunks
    for table in pdf_full_tables:
        process_chunk(table, "table_full")

    #table rows chunks
    for row in pdf_table_rows:
        process_chunk(row, "table_row")

    #CSV chunks
    for csv_data in csv_chunks:
        flattened_csv = " ".join([str(row) for row in csv_data])
        process_chunk(flattened_csv, "csv")

    print(f"✅ Stored {len(metadata)} embeddings in FAISS.")

    # Save metadata
    with open("metadata.json", "w", encoding='utf-8') as f:
        json.dump(metadata, f, indent=4, ensure_ascii=False)
        print("✅ Metadata saved to metadata.json.")

    return metadata


# Embed the in-memory chunk lists, then persist the FAISS index to disk.
# NOTE(review): full_chunks / summarized_chunks come from the chunking cell and csv_chunks
# from the categorization cell — all of them must have been run in this kernel first.
metadata = store_embeddings(full_chunks, summarized_chunks, pdf_full_tables, pdf_table_rows, csv_chunks)

faiss.write_index(index, "faiss_index.index")
print("✅ FAISS index saved.")

✅ Stored 329 embeddings in FAISS.
✅ Metadata saved to metadata.json.
✅ FAISS index saved.


SEARCHING THE FAISS INDEX

In [None]:
import faiss
import numpy as np
import json

def query_faiss(query, index, metadata, k=5):
    """Search the FAISS index for ``query`` and resolve the hits through ``metadata``.

    Args:
        query: Query string, embedded with ``embed_text``.
        index: FAISS index to search.
        metadata: List of metadata dicts; position ``i`` describes index row ``i``.
        k: Number of nearest neighbours to request.

    Returns:
        ``(results, matched_content)`` where ``results`` is a list of
        ``(match_index, match_distance, match_content)`` tuples and
        ``matched_content`` the list of content strings (each capped at 300
        characters). Both lists are empty when the index is empty or the query
        cannot be embedded (e.g. a blank string).
    """
    if index.ntotal == 0:
        print(" FAISS index is empty!")
        return [], []

    query_embedding = embed_text(query)
    if query_embedding is None:
        # embed_text rejects blank / non-string queries; there is nothing to search for.
        return [], []
    query_embedding = query_embedding.reshape(1, -1)

    distances, indices = index.search(query_embedding, k)

    results = []
    matched_content = []

    for match_index, match_distance in zip(indices[0], distances[0]):
        # FAISS pads with -1 when fewer than k neighbours exist.
        if match_index == -1:
            continue

        # Ensure the hit maps onto a metadata entry
        if 0 <= match_index < len(metadata):
            match_metadata = metadata[match_index]
            print(f"match_metadata at index {match_index}: {match_metadata}")

            if isinstance(match_metadata, dict):
                content = match_metadata.get("content", {})
                if isinstance(content, dict):
                    # Full chunks store their body under "text", summarized chunks
                    # under "summary"; use whichever is present.
                    body = content.get('text') or content.get('summary') or ''
                    match_content = f"{content.get('title', '')} {body}".strip()
                else:
                    match_content = str(content)
            else:
                print(f"Unexpected metadata format at index {match_index}: {match_metadata}")
                match_content = "No content found."
        else:
            match_content = "No content found."

        # Cap the snippet length so the downstream prompt stays small
        match_content = str(match_content)[:300]

        # Store results
        results.append((match_index, match_distance, match_content))
        matched_content.append(match_content)

    return results, matched_content

# The query to answer
query = "Bagaimana cara mengatur jadwal pembersihan otomatis melalui aplikasi?"

# metadata.json is written as UTF-8 with ensure_ascii=False, so read it back as UTF-8
# instead of relying on the platform default encoding.
with open("metadata.json", "r", encoding="utf-8") as f:
    metadata = json.load(f)
    if not isinstance(metadata, list):
        raise TypeError("Metadata JSON must be a list of dictionaries!")

index = faiss.read_index("faiss_index.index")

results, matched = query_faiss(query, index, metadata, k=5)

# Use the LLM to summarize the top matches
def groq_summarize_faiss_results(top_5_matches,question, model="llama3-8b-8192", max_length=512):
    """
    Summarizes the top matches from FAISS using Groq API.

    The matches are taken from the ``top_5_matches`` argument (not from a notebook
    global) and numbered in order, so any number of matches is accepted; with exactly
    five matches the prompt is identical to the previous fixed five-item layout.

    Args:
        top_5_matches: Sequence of matched content strings, best match first.
        question: The user question the summary must answer.
        model: Groq model name passed to the chat completion call.
        max_length: Accepted for compatibility; not used by this function.

    Returns:
        The text content of the model's reply.
    """
    numbered_matches = "\n".join(
        f"    {position}. {match}" for position, match in enumerate(top_5_matches, start=1)
    )

    # Create a structured prompt for summarizing the matches
    prompt = f"""
    this is the question : {question}
    You are an expert summarizer. use formal bahasa indonesia with Given the following information, summarize the content in a concise and coherent manner. make sure the question and the summary make sense:

{numbered_matches}

    Please return a brief summary of these pieces of information in clear and concise text, it is imperative that you return BAHASA INDONESIA. you can throw away some not relevant information too.
    """
    response = groq_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
    )

    summary = response.choices[0].message.content
    return summary

# Answer: summarize the retrieved matches for the query with the LLM.
# NOTE(review): the result is assigned but not printed or displayed in this cell.
summary = groq_summarize_faiss_results(matched,query)


🔍 Debug: match_metadata at index 135: {'id': '136', 'type': 'text_summarized', 'content': {'title': '10.6 Pengoptimalan Masa Pakai Baterai', 'summary': 'Berikut adalah 4 poin utama dari teks tersebut dalam bahasa Indonesia yang formal:\n\n1. Untuk memastikan baterai RoboSweep X200 tetap dalam kondisi terbaik, hindari pengisian daya berlebihan dengan mencabut adaptor jika tidak diperlukan, serta jangan biarkan robot terkena suhu ekstrem.\n2. Bersihkan terminal pengisian daya secara berkala untuk memastikan koneksi optimal dan mempertahankan performa pembersihan yang maksimal.\n3. Jika RoboSweep X200 mengalami kendala selama penggunaan, beberapa solusi untuk masalah umum yang mungkin terjadi dapat ditemukan.\n4. Dengan mengikuti panduan pengoptimalan masa pakai baterai dan pemecahan masalah, RoboSweep X200 dapat tetap optimal untuk penggunaan jangka panjang dan memberikan performa pembersihan yang maksimal.'}}
🔍 Debug: match_metadata at index 106: {'id': '107', 'type': 'text_summarized',

Dataset for fine-tuning LLM models

In [None]:
import json
import csv

def format_text_chunks(text_chunks):
    """Converts text chunks into instruction-response format.

    The first line of each chunk's text becomes the subject of the instruction and
    the remaining lines become the response.

    Args:
        text_chunks: Iterable of dicts. The text is read from ``"text"`` (in-memory
            section chunks) or, when that is missing or empty, from ``"content"``
            (chunks loaded back from the saved JSON, which only carry ``content``).
            Without this fallback every chunk loaded from JSON was silently skipped.

    Returns:
        A list of ``{"instruction", "input", "response"}`` dicts; chunks whose text
        is blank are skipped.
    """
    formatted_data = []

    for chunk in text_chunks:
        text = (chunk.get("text") or chunk.get("content") or "").strip()
        if not text:
            continue

        # Heading line -> instruction subject, everything after it -> response.
        first_line, _, remaining_text = text.partition("\n")
        formatted_data.append({
            "instruction": f"Apa yang dijelaskan tentang '{first_line}'?",
            "input": "",
            "response": remaining_text.strip()
        })

    return formatted_data

def format_table_data(table):
    """Build one instruction-response record from a table given as a list of row strings.

    The rows are joined with newlines to form the response; the instruction is a fixed
    Indonesian prompt.
    """
    record = {
        "instruction": "Jelaskan informasi dari tabel berikut untuk konteks tabel ini ada di buku manual RoboSweepX200.",
        "input": "",
    }
    record["response"] = "\n".join(table)
    return record

def format_csv_data(csv_chunks):
    """Turn CSV chunks into instruction-response records numbered as pages from 1.

    A chunk that is a list has its rows joined with newlines; any other chunk is
    converted with ``str``.
    """
    def _as_response(csv_chunk):
        if isinstance(csv_chunk, list):
            return "\n".join(csv_chunk)
        return str(csv_chunk)

    return [
        {
            "instruction": f"Tabel berikut merupakan tabel data dari RoboSweep yang ada. Ini adalah halaman {page}.",
            "input": "",
            "response": _as_response(csv_chunk)
        }
        for page, csv_chunk in enumerate(csv_chunks, start=1)
    ]

def format_finetune_data(json_file, output_file, full_chunks_no_overlap):
    """Build the fine-tuning dataset and write it to ``output_file`` as JSON.

    Records are produced in this order: the in-memory ``full_chunks_no_overlap``
    (used instead of the stored ``text_full`` chunks), then the ``text_summarized``,
    ``table_full`` and ``csv`` chunks loaded from ``json_file``.

    Args:
        json_file: Path of the chunk JSON (a dict with a ``"chunks"`` list).
        output_file: Destination path for the instruction-response records.
        full_chunks_no_overlap: Section chunks passed to ``format_text_chunks``.
    """
    with open(json_file, "r", encoding="utf-8") as file:
        data = json.load(file)

    formatted_data = []

    print("🔍 Debug: Checking full_chunks_no_overlap content...")
    if full_chunks_no_overlap:
        print("Processing full_chunks_no_overlap, total:", len(full_chunks_no_overlap))
    else:
        print("full_chunks_no_overlap is empty!")

    for section_chunk in full_chunks_no_overlap:
        formatted_data += format_text_chunks([section_chunk])

    # Three passes over the stored chunks keep the output grouped by chunk type.
    for stored_chunk in data["chunks"]:
        if stored_chunk["type"] != "text_summarized":
            continue
        formatted_data += format_text_chunks([stored_chunk])

    for stored_chunk in data["chunks"]:
        if stored_chunk["type"] != "table_full":
            continue
        table_lines = stored_chunk["content"].split("\n")
        formatted_data.append(format_table_data(table_lines))

    csv_chunks = []
    for stored_chunk in data["chunks"]:
        if stored_chunk["type"] == "csv":
            csv_chunks.append(stored_chunk["content"])
    formatted_data += format_csv_data(csv_chunks)

    print("Final formatted data sample:", formatted_data[:3] if formatted_data else "No data formatted!")

    # Save to JSON
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(formatted_data, f, indent=4, ensure_ascii=False)

    print(f"Fine-tuning data saved to {output_file}")

# Build the fine-tuning dataset from the saved chunk JSON plus the in-memory no-overlap chunks.
json_file = "YOSIAMD.json"
output_file = "YosiaMatthewD.json"

format_finetune_data(json_file, output_file, full_chunks_no_overlap)


🔍 Debug: Checking full_chunks_no_overlap content...
✅ Processing full_chunks_no_overlap, total: 79
📄 Final formatted data sample: [{'instruction': "Apa yang dijelaskan tentang '1.1 Selamat Datang ke Dunia RoboSweep'?", 'input': '', 'response': 'Terima kasih telah memilih RoboSweep X200, robot pembersih otomatis generasi terbaru dari\nPT TechnoClean Indonesia. Dengan desain kompak berbentuk lingkaran dan teknologi\npembersihan canggih, RoboSweep X200 dirancang untuk mengubah cara Anda membersihkan\nrumah. Kombinasi sempurna antara kecerdasan buatan, sensor presisi tinggi, dan sistem isap\nyang kuat memungkinkan robot ini membersihkan setiap sudut rumah Anda dengan efisiensi\nyang belum pernah ada sebelumnya.\nRoboSweep X200 adalah hasil dari lima tahun penelitian intensif dan umpan balik pengguna,\nyang membuatnya menjadi asisten pembersihan yang paling andal dan intuitif di pasaran saat\nini. Kami yakin bahwa RoboSweep X200 akan mengubah rutinitas pembersihan Anda menjadi\npengalaman y

Deploying to Hugging Face

In [None]:
# Clone the Hugging Face Space repository, then list the working directory.
# NOTE(review): the recorded listing shows app.py / README.md / requirements.txt already in the
# working directory, so the clone appears to land inside an existing checkout — confirm this is intended.
!git clone https://huggingface.co/spaces/sayaakunikan/my_rag_test
!ls

Cloning into 'my_rag_test'...
remote: Enumerating objects: 9, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 9 (delta 1), reused 1 (delta 0), pack-reused 4 (from 1)[K
Unpacking objects: 100% (9/9), 3.26 KiB | 1.63 MiB/s, done.
app.py	faiss_index.index  metadata.json  my_rag_test  README.md  requirements.txt


In [None]:
# Stage, commit and push the working directory.
# NOTE(review): the recorded output shows `git add .` staging the nested my_rag_test clone as an
# embedded repository (create mode 160000). Run these commands from the intended repository, or
# drop the nested clone from the index with `git rm --cached my_rag_test` as git's hint suggests.
!git add .
!git commit -m "Update RAG implementation"
!git push

[33mhint: You've added another git repository inside your current repository.[m
[33mhint: Clones of the outer repository will not contain the contents of[m
[33mhint: the embedded repository and will not know how to obtain it.[m
[33mhint: If you meant to add a submodule, use:[m
[33mhint: [m
[33mhint: 	git submodule add <url> my_rag_test[m
[33mhint: [m
[33mhint: If you added this path by mistake, you can remove it from the[m
[33mhint: index with:[m
[33mhint: [m
[33mhint: 	git rm --cached my_rag_test[m
[33mhint: [m
[33mhint: See "git help submodule" for more information.[m
[main ea69c59] Update RAG implementation
 1 file changed, 1 insertion(+)
 create mode 160000 my_rag_test
Enumerating objects: 9, done.
Counting objects: 100% (9/9), done.
Delta compression using up to 2 threads
Compressing objects: 100% (7/7), done.
Writing objects: 100% (7/7), 519.93 KiB | 8.25 MiB/s, done.
Total 7 (delta 1), reused 0 (delta 0), pack-reused 0
To https://huggingface.co/spaces/s