# 🎯 Interactive QA Review & Auto Fine-Tuning

Notebook ini untuk:
1. **Scrape** data Laravel dari internet
2. **Generate** QA pairs otomatis
3. **Review & Approve** dengan checkbox form
4. **Auto fine-tune** model dengan approved data

---

In [None]:
# Clone or update repository
# On the first run the project is cloned into /content/LLM; on later runs the
# existing checkout is reused. In both cases the checkout is then refreshed below.
import os

if not os.path.exists('/content/LLM'):
    print("üì• Cloning repository...")
    !git clone https://github.com/ubaidillahfaris/LLM.git /content/LLM
    print("‚úÖ Repository cloned!")
else:
    print("üì• Updating repository...")

# Change to project directory and pull latest changes
# NOTE: `git restore .` throws away uncommitted modifications to tracked files in
# the checkout before pulling -- presumably so the pull cannot be blocked by local
# edits. Anything changed by hand under /content/LLM is lost when this cell runs.
%cd /content/LLM
!git restore .
!git pull origin claude/work-in-progress-01E3sNLAhJUX4bm9z7RmBcYr

# The working directory is now the repository, which the relative paths
# (./src, ./data/raw/..., ./models/...) used by later cells rely on.
print("\n‚úÖ Repository updated!")
print(f"üìÅ Current directory: {os.getcwd()}")

## Step 0: Clone/Update Repository (Google Colab)

# Standard-library modules used by the rest of the notebook
import json
import os
import sys

# Put the project's ./src directory (relative to the working directory) at the
# front of the import path, unless it is already listed there.
if not sys.path.count('./src'):
    sys.path.insert(0, './src')

print("üìÅ Current directory: {}".format(os.getcwd()))
print("‚úÖ Added ./src to Python path")

# Project modules, resolved through the ./src entry added above
from web_scraper import LaravelDataScraper
from qa_generator import QAGenerator
from auto_finetune import AutoFineTuner
from model_utils import ModelManager

print("‚úÖ All modules imported!")

In [None]:
# Install dependencies
# Installs the widget toolkit plus the two HTTP/HTML packages; `-q` keeps pip quiet.
!pip install -q ipywidgets beautifulsoup4 requests

# Enable widgets
# `display` renders objects in the cell output and `widgets` is the namespace the
# review form (QAReviewInterface) builds its controls from.
from IPython.display import display
import ipywidgets as widgets

In [None]:
# Build the scraper and list the Laravel documentation topics to fetch
scraper = LaravelDataScraper()
topics = [
    'eloquent',
    'routing',
    'middleware',
    'validation',
    'authentication',
]

print("üîç Scraping Laravel documentation...")
print("‚è∞ This may take 1-2 minutes\n")

scraped_data = scraper.scrape_laravel_docs(topics)
print("\n‚úÖ Scraped {} sections".format(len(scraped_data)))

# Persist the raw scrape (path is relative to the working directory)
scraper.save_scraped_data('./data/raw/scraped_laravel_content.json')

# Preview the first scraped section when anything was returned
if scraped_data:
    print("\nüìù Sample scraped content:")
    sample = scraped_data[0]
    print("Title: {}".format(sample.get('title', 'N/A')))
    print("Content: {}...".format(sample.get('content', '')[:200]))

## Step 2: Scrape Data dari Internet

In [None]:
# Create the generator that turns scraped sections into QA pairs
qa_gen = QAGenerator()

print("ü§ñ Generating QA pairs from scraped content...\n")
generated_qa = qa_gen.batch_generate_from_scraped_data(scraped_data)

# Report how many pairs were produced, then the generator's own statistics
print("\n‚úÖ Generated {} QA pairs".format(len(generated_qa)))
print("\nüìä Stats: {}".format(qa_gen.get_stats()))

# Export the pairs for the review step (path is relative to the working directory)
qa_gen.export_review_data('./data/raw/qa_pending_review.json')
print("\nüíæ Saved to: ./data/raw/qa_pending_review.json")

## Step 3: Generate QA Pairs Otomatis

In [None]:
# Initialize QA generator
qa_gen = QAGenerator()

# Generate QA pairs
print("ü§ñ Generating QA pairs from scraped content...\n")

generated_qa = qa_gen.batch_generate_from_scraped_data(scraped_data)

print(f"\n‚úÖ Generated {len(generated_qa)} QA pairs")
print(f"\nüìä Stats: {qa_gen.get_stats()}")

# Save for review
# `project_root` is not assigned by any cell of this notebook, so using it directly
# raised NameError. The notebook has already changed into the repository directory,
# so the working directory is used as the project root when the name is missing.
project_root = globals().get('project_root') or os.getcwd()
review_path = os.path.join(project_root, 'data', 'raw', 'qa_pending_review.json')
qa_gen.export_review_data(review_path)

print(f"\nüíæ Saved to: {review_path}")

## Step 4: Review & Approve QA Pairs (Interactive Form)

Review QA pairs satu per satu dengan checkbox interface.

In [None]:
# Interactive QA Review Interface
class QAReviewInterface:
    """Widget form for reviewing generated QA pairs one at a time.

    The pending pairs are fetched once from the generator when the interface is
    created. The Approve / Reject / Skip buttons walk through that snapshot and
    forward approvals, rejections and edits to the generator.
    """

    def __init__(self, qa_generator):
        """Snapshot the pending QA pairs and build the widgets.

        Args:
            qa_generator: Object providing ``get_pending_qa()``, ``edit_qa()``,
                ``mark_as_approved()`` and ``mark_as_rejected()``.
        """
        self.qa_gen = qa_generator
        self.pending_qa = qa_generator.get_pending_qa()
        self.current_index = 0

        # Create widgets
        self.create_widgets()

        # Load the first pair into the edit boxes. Without this they start empty,
        # so approving the first item looked like an edit and replaced its
        # question and answer with empty strings.
        self.update_display()

    @staticmethod
    def _escape(value):
        """Return ``value`` as text that is safe to embed in HTML."""
        # Local import keeps the class self-contained (no new file-level import).
        from html import escape

        return escape(str(value))

    def create_widgets(self):
        """Create the progress label, QA panel, edit boxes, buttons and status label."""
        # Progress
        self.progress_label = widgets.HTML(
            value=f"<h3>Review Progress: 0 / {len(self.pending_qa)}</h3>"
        )

        # QA Display
        self.qa_display = widgets.HTML(
            value=self.get_qa_html(0),
            layout=widgets.Layout(border='2px solid #4CAF50', padding='15px', margin='10px 0')
        )

        # Edit fields
        self.question_edit = widgets.Textarea(
            placeholder='Edit question...',
            layout=widgets.Layout(width='100%', height='80px')
        )

        self.answer_edit = widgets.Textarea(
            placeholder='Edit answer...',
            layout=widgets.Layout(width='100%', height='150px')
        )

        # Buttons
        self.approve_btn = widgets.Button(
            description='‚úÖ Approve',
            button_style='success',
            layout=widgets.Layout(width='150px')
        )
        self.approve_btn.on_click(self.on_approve)

        self.reject_btn = widgets.Button(
            description='‚ùå Reject',
            button_style='danger',
            layout=widgets.Layout(width='150px')
        )
        self.reject_btn.on_click(self.on_reject)

        self.skip_btn = widgets.Button(
            description='‚è≠Ô∏è Skip',
            button_style='warning',
            layout=widgets.Layout(width='150px')
        )
        self.skip_btn.on_click(self.on_skip)

        # Status
        self.status_label = widgets.HTML(value="")

    def get_qa_html(self, index):
        """Return the HTML shown for the pending pair at ``index``.

        Returns a completion message when ``index`` is past the last pending pair.
        The answer is cut to 500 characters (with a trailing ``...``) for display.
        All values are HTML-escaped: they come from scraped pages and may contain
        markup that would otherwise be rendered instead of shown.
        """
        if index >= len(self.pending_qa):
            return "<h3>üéâ All QA pairs reviewed!</h3>"

        qa = self.pending_qa[index]

        # Truncate before escaping so an entity such as "&amp;" is never cut in half.
        answer = qa['answer']
        if len(answer) > 500:
            answer = answer[:500] + '...'

        qa_id = self._escape(qa['id'])
        source = self._escape(qa.get('source', 'N/A'))
        topic = self._escape(qa.get('topic', 'N/A'))
        question = self._escape(qa['question'])
        answer = self._escape(answer)

        markup = f"""
        <div style="font-family: Arial, sans-serif;">
            <p><strong>ID:</strong> {qa_id}</p>
            <p><strong>Source:</strong> {source} | <strong>Topic:</strong> {topic}</p>
            <hr>
            <h4 style="color: #2196F3;">‚ùì Question:</h4>
            <p style="font-size: 16px; padding: 10px; background: #E3F2FD; border-radius: 5px;">
                {question}
            </p>
            <h4 style="color: #4CAF50;">üí° Answer:</h4>
            <p style="font-size: 14px; padding: 10px; background: #E8F5E9; border-radius: 5px;">
                {answer}
            </p>
        </div>
        """

        return markup

    def update_display(self):
        """Refresh the progress label, the QA panel and the edit boxes."""
        total = len(self.pending_qa)
        reviewed = min(self.current_index, total)  # never show more than the total
        self.progress_label.value = f"<h3>Review Progress: {reviewed} / {total}</h3>"
        self.qa_display.value = self.get_qa_html(self.current_index)

        if self.current_index < total:
            qa = self.pending_qa[self.current_index]
            self.question_edit.value = qa['question']
            self.answer_edit.value = qa['answer']
        else:
            # Nothing left to review: do not leave the last pair's text behind.
            self.question_edit.value = ''
            self.answer_edit.value = ''

    def on_approve(self, btn):
        """Approve the current pair, saving any edits made in the text boxes first."""
        if self.current_index >= len(self.pending_qa):
            return

        qa = self.pending_qa[self.current_index]

        # Check if edited
        if self.question_edit.value != qa['question'] or self.answer_edit.value != qa['answer']:
            self.qa_gen.edit_qa(
                qa['id'],
                new_question=self.question_edit.value,
                new_answer=self.answer_edit.value
            )

        # Approve
        self.qa_gen.mark_as_approved(qa['id'])

        self.status_label.value = f"<p style='color: green;'>‚úÖ Approved: {self._escape(qa['id'])}</p>"
        self.current_index += 1
        self.update_display()

    def on_reject(self, btn):
        """Reject the current pair with the reason "User rejected"."""
        if self.current_index >= len(self.pending_qa):
            return

        qa = self.pending_qa[self.current_index]
        self.qa_gen.mark_as_rejected(qa['id'], reason="User rejected")

        self.status_label.value = f"<p style='color: red;'>‚ùå Rejected: {self._escape(qa['id'])}</p>"
        self.current_index += 1
        self.update_display()

    def on_skip(self, btn):
        """Move to the next pair without approving or rejecting the current one."""
        # Same guard as approve/reject: skipping past the end used to keep
        # incrementing the index, so the progress label could exceed the total.
        if self.current_index >= len(self.pending_qa):
            return

        self.current_index += 1
        self.status_label.value = "<p style='color: orange;'>‚è≠Ô∏è Skipped</p>"
        self.update_display()

    def display(self):
        """Return the assembled form as a single vertical box widget."""
        # Layout
        return widgets.VBox([
            self.progress_label,
            self.qa_display,
            widgets.HTML("<h4>Edit if needed:</h4>"),
            widgets.HTML("<p>Question:</p>"),
            self.question_edit,
            widgets.HTML("<p>Answer:</p>"),
            self.answer_edit,
            widgets.HBox([self.approve_btn, self.reject_btn, self.skip_btn]),
            self.status_label
        ])

# Build the review form around the current generator and render it in the output
review_interface = QAReviewInterface(qa_gen)
display(review_interface.display())

# Usage hints, printed one per line below the form
print(
    "\nüìù Review QA pairs above using the interactive form!",
    "   - ‚úÖ Approve: Add to training dataset",
    "   - ‚ùå Reject: Discard this QA pair",
    "   - ‚è≠Ô∏è Skip: Review later",
    "\nYou can edit the question/answer before approving!",
    sep="\n",
)

## Step 5: View Review Statistics

In [None]:
# Write only the approved QA pairs to the training dataset file
# (path is relative to the working directory)
training_data = qa_gen.save_to_training_dataset(
    approved_only=True,
    filepath='./data/raw/approved_qa.json',
)

print("\n‚úÖ Saved {} approved QA pairs".format(len(training_data)))
print("üìÅ Location: ./data/raw/approved_qa.json")

## Step 6: Save Approved QA for Training

In [None]:
# Save approved QA to training dataset
# `project_root` is not assigned by any cell of this notebook, so using it directly
# raised NameError. The notebook has already changed into the repository directory,
# so the working directory is used as the project root when the name is missing.
project_root = globals().get('project_root') or os.getcwd()
approved_dataset_path = os.path.join(project_root, 'data', 'raw', 'approved_qa.json')

training_data = qa_gen.save_to_training_dataset(
    filepath=approved_dataset_path,
    approved_only=True
)

print(f"\n‚úÖ Saved {len(training_data)} approved QA pairs")
print(f"üìÅ Location: {approved_dataset_path}")

# Initialize auto fine-tuner (using relative paths)
# `model_manager` is created by the "Load model" cell, which comes after this one.
# When the notebook is run top to bottom it does not exist yet, so the tuner is only
# built here if the model is already loaded; otherwise say what to run instead of
# failing with NameError.
if 'model_manager' in globals():
    auto_tuner = AutoFineTuner(
        model_manager=model_manager,
        base_dataset_path='./data/raw/laravel_qa_dataset.json',
        approved_qa_path='./data/raw/approved_qa.json',
        training_output_dir='./models/auto_finetuned'
    )
else:
    print("Model not loaded yet: run the 'Load model' cell, then the 'Initialize auto fine-tuner' cell.")

# Get approved QA
approved_qa = qa_gen.get_approved_qa()

if not approved_qa:
    print("‚ö†Ô∏è  No approved QA pairs yet!")
    print("   Go back to Step 4 and approve some QA pairs first.")
else:
    print(f"‚úÖ Ready to fine-tune with {len(approved_qa)} approved QA pairs")
    print("\n‚ö†Ô∏è  Training will start when you run the next cell!")

In [None]:
# Load the base model that will be fine-tuned
import torch

print("üì¶ Loading model for fine-tuning...")

# Use the GPU when torch reports one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model_manager = ModelManager(model_name="gpt2", device=device)
model_manager.load_model()

print("‚úÖ Model loaded on {}".format(device))

In [None]:
# Initialize auto fine-tuner
# Neither name below is guaranteed to exist: `project_root` is not assigned by any
# cell of this notebook, and `approved_dataset_path` is only set by the
# "Save approved QA" cell. Fall back to the working directory (the notebook has
# already changed into the repository) and to the same approved_qa.json location
# that cell uses, so this cell no longer fails with NameError.
project_root = globals().get('project_root') or os.getcwd()
approved_dataset_path = globals().get('approved_dataset_path') or os.path.join(
    project_root, 'data', 'raw', 'approved_qa.json'
)

auto_tuner = AutoFineTuner(
    model_manager=model_manager,
    base_dataset_path=os.path.join(project_root, 'data', 'raw', 'laravel_qa_dataset.json'),
    approved_qa_path=approved_dataset_path,
    training_output_dir=os.path.join(project_root, 'models', 'auto_finetuned')
)

# Get approved QA
approved_qa = qa_gen.get_approved_qa()

if not approved_qa:
    print("‚ö†Ô∏è  No approved QA pairs yet!")
    print("   Go back to Step 4 and approve some QA pairs first.")
else:
    print(f"‚úÖ Ready to fine-tune with {len(approved_qa)} approved QA pairs")
    print("\n‚ö†Ô∏è  Training will start when you run the next cell!")

In [None]:
# Load the fine-tuned weights from the output directory (relative path)
print("üì¶ Loading fine-tuned model...")

finetuned_model = ModelManager(
    model_name="gpt2",
    device=device,
    model_path='./models/auto_finetuned',
)
finetuned_model.load_model(from_pretrained=True)

print("‚úÖ Fine-tuned model loaded!")


def test_model(question):
    """Send one question to the fine-tuned model and print the question and reply."""
    response = finetuned_model.generate_response(
        prompt="Question: {}\nAnswer:".format(question),
        max_new_tokens=200,
        temperature=0.7,
    )
    print("\nQ: {}".format(question))
    print("A: {}\n".format(response))


# Sample questions (in Indonesian) used as a smoke test
test_questions = [
    "Bagaimana cara install Laravel?",
    "Apa itu Eloquent ORM?",
    "Bagaimana cara membuat middleware?",
]

print("üß™ Testing fine-tuned model:\n" + 50 * "=")
for q in test_questions:
    test_model(q)

## Step 8: Test Fine-Tuned Model

In [None]:
# Load fine-tuned model
print("üì¶ Loading fine-tuned model...")

# `project_root` is not assigned by any cell of this notebook, so using it directly
# raised NameError. The notebook has already changed into the repository directory,
# so the working directory is used as the project root when the name is missing.
project_root = globals().get('project_root') or os.getcwd()

finetuned_model = ModelManager(
    model_name="gpt2",
    model_path=os.path.join(project_root, 'models', 'auto_finetuned'),
    device=device
)
finetuned_model.load_model(from_pretrained=True)

print("‚úÖ Fine-tuned model loaded!")

# Test it
def test_model(question):
    """Send one question to the fine-tuned model and print the question and reply.

    Args:
        question: Question text; it is placed in a "Question: ...\\nAnswer:" prompt.
    """
    prompt = f"Question: {question}\nAnswer:"
    response = finetuned_model.generate_response(
        prompt=prompt,
        max_new_tokens=200,
        temperature=0.7
    )
    print(f"\nQ: {question}")
    print(f"A: {response}\n")

# Sample questions (in Indonesian) used as a smoke test
test_questions = [
    "Bagaimana cara install Laravel?",
    "Apa itu Eloquent ORM?",
    "Bagaimana cara membuat middleware?"
]

print("üß™ Testing fine-tuned model:\n" + "="*50)
for q in test_questions:
    test_model(q)

## 🎉 Success!

### What You Just Did:

1. ✅ Scraped Laravel content dari official docs
2. ✅ Generated QA pairs otomatis
3. ✅ Reviewed & approved dengan interactive form
4. ✅ Auto fine-tuned model dengan approved data
5. ✅ Tested fine-tuned model

### Next Steps:

- **Add more sources**: Scrape dari StackOverflow, Medium, Laracasts
- **Continuous learning**: Run this notebook regularly untuk keep training
- **Deploy**: Deploy fine-tuned model ke production
- **API**: Build API endpoint untuk serve model

---

**Happy Learning! 🚀**