<a href="https://colab.research.google.com/github/ubaidillahfaris/LLM/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ---------------------------
# Install dependencies
# ---------------------------
!pip install transformers beautifulsoup4 requests datasets

# ---------------------------
# Import
# ---------------------------
import torch
import json
import requests
from bs4 import BeautifulSoup
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset

# ---------------------------
# Device setup
# ---------------------------
# Run on the GPU when PyTorch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# ---------------------------
# Database handling
# ---------------------------
# In-memory mapping of query string -> answer string.
db = {}

def load_db():
    """Load the query/answer cache from ``local_db.json`` into ``db``.

    The module-level ``db`` is rebound to the file's content. It falls back
    to an empty dict when the file is missing or unreadable, is not valid
    JSON, or does not contain a JSON object (the rest of the script indexes
    and iterates ``db`` as a dict).
    """
    global db
    try:
        with open("local_db.json", "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        loaded = {}
    db = loaded if isinstance(loaded, dict) else {}

def save_db():
    """Write the current contents of ``db`` to ``local_db.json`` as JSON."""
    serialized = json.dumps(db)
    with open("local_db.json", "w") as out_file:
        out_file.write(serialized)

def show_db():
    """Print every stored query and its answer as a 1-based numbered list."""
    position = 0
    for question in db:
        position += 1
        reply = db[question]
        print(f"{position}. Query: {question}\n   Answer: {reply}\n")

load_db()
# Example seed data: written on every run, replacing any stored answer
# for these two queries, then persisted straight away.
db.update(
    {
        "cara install laravel": "1. Install PHP & Composer\n2. composer create-project laravel/laravel nama_project\n3. Jalankan server",
        "apa itu laravel": "Laravel adalah framework PHP untuk web development",
    }
)
save_db()

# ---------------------------
# Web search (dummy scraping)
# ---------------------------
def search_web(query):
    """Fetch a simple answer for ``query`` from a Google results page (dummy).

    The text of the first three non-empty ``<p>`` elements of the response
    is joined with single spaces.

    Args:
        query: Free-text search string. It is passed through ``params`` so
            that requests URL-encodes characters such as ``&``, ``#``, ``?``
            and quotes instead of letting them corrupt the URL.

    Returns:
        The extracted text, or a placeholder message (naming the query) when
        the request fails, the server answers with an HTTP error status, or
        no paragraph text is found.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        resp = requests.get(
            "https://www.google.com/search",
            params={"q": query},
            headers=headers,
            timeout=5,
        )
        # Treat 4xx/5xx (e.g. rate limiting) as a failed fetch rather than
        # scraping the error page as if it were an answer.
        resp.raise_for_status()
    except requests.RequestException:
        return f"Gagal fetch jawaban untuk '{query}'"

    soup = BeautifulSoup(resp.text, "html.parser")
    snippets = []
    for paragraph in soup.find_all("p"):
        content = paragraph.get_text().strip()
        if content:
            snippets.append(content)
        if len(snippets) == 3:
            break
    text = " ".join(snippets)
    return text if text else f"Tidak menemukan jawaban jelas untuk '{query}'"

# ---------------------------
# Retrieval method
# ---------------------------
def retrieve_answer(query):
    """Return an answer for ``query``, using the local DB before the web.

    Lookup order:
      1. the exact query string;
      2. a normalized form (surrounding whitespace and trailing ``?``/``!``/
         ``.`` removed, case-folded), so "Apa itu laravel?" matches a stored
         "apa itu laravel";
      3. ``search_web(query)``.

    A web result is stored under the original query and persisted with
    ``save_db()`` only when it is a real answer. The placeholder messages
    that ``search_web`` returns on failure are never cached, and placeholder
    values already present in the DB are treated as misses, so a transient
    failure does not become a permanent "answer".

    Args:
        query: The user's question.

    Returns:
        The cached or freshly fetched answer text; on failure, the
        placeholder message from ``search_web``.
    """
    # Must stay in sync with the failure strings produced by search_web().
    placeholders = {
        f"Tidak menemukan jawaban jelas untuk '{query}'",
        f"Gagal fetch jawaban untuk '{query}'",
    }
    normalized = query.strip().rstrip("?!. ").casefold()

    for key in (query, normalized):
        cached = db.get(key)
        if cached is not None and cached not in placeholders:
            return cached

    answer_text = search_web(query)
    if answer_text not in placeholders:
        db[query] = answer_text
        save_db()
    return answer_text

# ---------------------------
# Load GPT-2
# ---------------------------
# Load the pretrained GPT-2 tokenizer and model; the model is moved to the
# selected device right away.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

# Use the end-of-sequence token as the padding token on both the
# tokenizer and the model config.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

# ---------------------------
# Fine-tuning dataset (optional)
# ---------------------------
file_path = "/content/sample_data/tuningData.json"
try:
    # Only loading and shaping the dataset is guarded. Training errors must
    # surface instead of being misreported as "dataset not found".
    with open(file_path, "r", encoding="utf-8") as f:
        tuning_data = json.load(f)

    # Each record is expected to provide "prompt" and "target" strings.
    texts = [item["prompt"] + " " + item["target"] for item in tuning_data]
    if not texts:
        raise ValueError("dataset kosong")
except FileNotFoundError:
    print("Fine-tune dataset tidak ditemukan, lewati step ini.")
except (OSError, ValueError, KeyError, TypeError) as exc:
    # Unreadable file, invalid JSON, wrong record shape, or no records.
    print(f"Fine-tune dataset tidak valid ({exc!r}), lewati step ini.")
else:
    dataset = Dataset.from_list([{"text": t} for t in texts])

    # Tokenize function: pad/truncate every example to 128 tokens.
    def tokenize(batch):
        return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

    tokenized_dataset = dataset.map(tokenize, batched=True)

    # mlm=False selects causal language modeling rather than masked LM.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    training_args = TrainingArguments(
        output_dir="./fine_tuned_gpt2",
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=2,
        save_steps=50,
        save_total_limit=2,
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator
    )

    trainer.train()
    model.save_pretrained("./fine_tuned_gpt2")
    tokenizer.save_pretrained("./fine_tuned_gpt2")

# ---------------------------
# Generate response
# ---------------------------
def generate_response(query, max_length=150):
    """Generate a one-line answer for ``query`` with GPT-2.

    The retrieved answer (local DB or web) is embedded in a prompt that ends
    with "Answer:", and the model continues it greedily.

    Args:
        query: The user's question.
        max_length: Upper bound on the total token count (prompt plus
            generated tokens), as before.

    Returns:
        The first non-empty line of the model's continuation, without the
        prompt text. When the prompt already uses up ``max_length`` or the
        model produces no text, the retrieved answer is returned instead.
    """
    info = retrieve_answer(query)
    prompt = f"Question: {query}\nAnswer: {info}\n\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    prompt_len = inputs["input_ids"].shape[1]

    # Tokens left for generation under the caller's total-length budget.
    budget = max_length - prompt_len
    if budget <= 0:
        # No room to generate; asking generate() for a max_length at or
        # below the prompt length warns or errors.
        return info

    # The model may still be in training mode after the optional
    # fine-tuning step; eval() disables dropout so greedy decoding is
    # actually deterministic.
    model.eval()
    outputs = model.generate(
        **inputs,
        max_new_tokens=budget,
        do_sample=False,  # deterministic output
        pad_token_id=tokenizer.eos_token_id,  # explicit: avoids the warning
    )

    # Decode only the newly generated tokens so the prompt (including its
    # trailing "Answer:") is not echoed back to the caller.
    completion = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    for line in completion.splitlines():
        stripped = line.strip()
        if stripped:
            return stripped
    return info

# ---------------------------
# Test
# ---------------------------
# Run two sample questions, print a separator after each, then dump the DB.
for sample_query in ("Bagaimana install laravel?", "Apa itu laravel?"):
    print(generate_response(sample_query))
    print("-----")
show_db()




Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Step,Training Loss


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Answer:
-----
Answer: Apa itu laravel jelas t
-----
1. Query: cara install laravel
   Answer: 1. Install PHP & Composer
2. composer create-project laravel/laravel nama_project
3. Jalankan server

2. Query: apa itu laravel
   Answer: Laravel adalah framework PHP untuk web development

3. Query: Bagaimana install laravel?
   Answer: Tidak menemukan jawaban jelas untuk 'Bagaimana install laravel?'

4. Query: Apa itu laravel?
   Answer: Tidak menemukan jawaban jelas untuk 'Apa itu laravel?'

