In [None]:
!pip install transformers datasets peft accelerate bitsandbytes

## 1. Fine-tuning Mistral-7B-Instruct on Chillies IELTS dataset - LoraConfig

### 1.1 Fine-tuning Mistral-7B-Instruct on Chillies IELTS dataset

In [None]:
# —————————————————————————————
# Fine-tuning Mistral-7B-Instruct on Chillies IELTS dataset
# —————————————————————————————

import inspect
import json
import re

import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
)

# --- 1. Load dataset
# Load the IELTS Writing Task 2 evaluation dataset by its repository id.
ds = load_dataset("chillies/IELTS-writing-task-2-evaluation")

# Keep handles on the two splits used below: "train" and "test".
train_ds, test_ds = ds["train"], ds["test"]

# --- 2. Preprocess: tạo `completion` JSON từ field `evaluation`

def parse_evaluation_to_json(eva: str):
    """
    Parse a free-text evaluation into a dict of band scores plus feedback.

    Each score is searched for as ``<Label>: [<number>]`` and stored as a float
    under a short key. A key is omitted when its label is not found:

        "Task Achievement"               -> "TaskResponse"
        "Coherence and Cohesion"         -> "Coherence"
        "Lexical Resource"               -> "Lexical"
        "Grammatical Range and Accuracy" -> "Grammar"
        "Overall Band Score"             -> "Overall"

    "Feedback" is always present: the stripped text that follows the first
    "Feedback and Additional Comments:" marker, or "" when the marker is absent.

    Args:
        eva: Raw evaluation text.

    Returns:
        dict with the keys above, inserted in the order listed.
    """
    # Shared tail of every score pattern: colon, optional whitespace, then a
    # bracketed integer or decimal (e.g. "[7]", "[6.5]", "[.5]").
    bracketed_number = r":\s*\[([0-9]*\.?[0-9]+)\]"

    # (output key, label as written in the evaluation text); order fixes key order.
    criteria = (
        ("TaskResponse", "Task Achievement"),
        ("Coherence", "Coherence and Cohesion"),
        ("Lexical", "Lexical Resource"),
        ("Grammar", "Grammatical Range and Accuracy"),
        ("Overall", "Overall Band Score"),
    )

    scores = {}
    for key, label in criteria:
        found = re.search(label + bracketed_number, eva)
        if found:
            scores[key] = float(found.group(1))

    # Everything after the first marker is the feedback; `marker` is empty
    # when the text does not contain it.
    _, marker, remainder = eva.partition("Feedback and Additional Comments:")
    scores["Feedback"] = remainder.strip() if marker else ""
    return scores

def make_completion(example):
    """Return the training target: the parsed evaluation serialized as JSON text.

    Reads ``example["evaluation"]``, parses it with ``parse_evaluation_to_json``
    and dumps the resulting dict; ``ensure_ascii=False`` keeps non-ASCII
    characters as-is instead of ``\\u`` escapes.
    """
    return json.dumps(
        parse_evaluation_to_json(example["evaluation"]),
        ensure_ascii=False,
    )

# tạo completion và full text (prompt + essay + completion)
def make_full_prompt(example):
    """Build the model input: topic and essay under section headers.

    Reads ``example["prompt"]`` (the topic) and ``example["essay"]`` and ends
    with an open "### JSON Response:" header for the completion to follow.
    """
    template = "### Topic:\n{topic}\n\n### Essay:\n{essay}\n\n### JSON Response:\n"
    return template.format(topic=example["prompt"], essay=example["essay"])

def make_full_text(example):
    """Return one training string: the prompt immediately followed by its JSON completion."""
    prompt_part = make_full_prompt(example)
    completion_part = make_completion(example)
    return prompt_part + completion_part

# Apply the mapping: replace every original column with a single "text" column
# holding prompt + completion.
def _as_text_record(ex):
    """Wrap the full training string of one example in a {"text": ...} record."""
    return {"text": make_full_text(ex)}

train_ds = train_ds.map(_as_text_record, remove_columns=train_ds.column_names)
test_ds = test_ds.map(_as_text_record, remove_columns=test_ds.column_names)

# --- 3. Tokenizer & model
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse EOS as the pad token so padding="max_length" has a token to pad with
# (original note: set pad = eos to avoid errors).
tokenizer.pad_token = tokenizer.eos_token

# Pass the 4-bit settings through BitsAndBytesConfig: the bare
# `load_in_4bit=True` keyword on from_pretrained is deprecated in favour of
# `quantization_config`. The compute dtype is set to match `torch_dtype` so
# the dequantized matmuls run in bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# --- 4. LoRA config
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # adapt the attention query/value projections
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
# Wrap the quantized base model with the LoRA adapter described above.
model = get_peft_model(model, lora_config)

# --- 5. Tokenize dataset
def tokenize_fn(ex):
    """Tokenize one example and attach causal-LM labels.

    Args:
        ex: Record with a "text" field (prompt followed by the JSON completion).

    Returns:
        The tokenizer output (input_ids, attention_mask) padded/truncated to
        1024 tokens, plus "labels": a copy of input_ids with padded positions
        set to -100 so they are ignored by the loss.
    """
    # Append EOS so the model is trained to stop after the JSON answer.
    # NOTE(review): assumes the tokenizer does not already append EOS itself.
    enc = tokenizer(
        ex["text"] + tokenizer.eos_token,
        truncation=True,
        padding="max_length",
        max_length=1024,
    )
    # Without a "labels" field the model returns no loss and Trainer.train()
    # fails. pad_token == eos_token here, so padding is masked through the
    # attention mask rather than by token id (which would also hide the real EOS).
    enc["labels"] = [
        token_id if keep else -100
        for token_id, keep in zip(enc["input_ids"], enc["attention_mask"])
    ]
    return enc

train_ds = train_ds.map(tokenize_fn, batched=False)
test_ds = test_ds.map(tokenize_fn, batched=False)

# Expose the tensor columns the Trainer needs, including the labels.
train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# --- 6. TrainingArguments & Trainer
# The install cell does not pin transformers, and two keywords used here were
# renamed in newer releases (`evaluation_strategy` -> `eval_strategy`,
# Trainer's `tokenizer` -> `processing_class`). Pick whichever name the
# installed version accepts so the cell runs on both.
_args_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_key = "eval_strategy" if "eval_strategy" in _args_params else "evaluation_strategy"
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

args = TrainingArguments(
    output_dir="./mistral_ielts_ft",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    save_strategy="epoch",  # must match the evaluation strategy for load_best_model_at_end
    logging_steps=50,
    fp16=True,
    save_total_limit=2,
    load_best_model_at_end=True,
    **{_eval_key: "epoch"},
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    **{_tokenizer_key: tokenizer},
)

# --- 7. Train
trainer.train()

# --- 8. Save model & tokenizer
model.save_pretrained("./mistral_ielts_ft")
tokenizer.save_pretrained("./mistral_ielts_ft")


### 1.2 Tích hợp đánh giá MAE / R² sau train

In [None]:
from transformers import pipeline
from sklearn.metrics import mean_absolute_error, r2_score
import re, json
import numpy as np

pipe = pipeline("text-generation", model="./mistral_ielts_ft", tokenizer="./mistral_ielts_ft")


def _extract_overall(generated: str):
    """Return the predicted "Overall" band as a float, or None if unreadable.

    Tries a full JSON object first. If that fails (for example because the
    generation was cut off by max_new_tokens before the closing brace), falls
    back to reading the `"Overall": <number>` pair directly.
    """
    found = re.search(r"\{.*\}", generated, re.S)
    if found:
        try:
            return float(json.loads(found.group(0)).get("Overall"))
        except (ValueError, TypeError, AttributeError):
            # Invalid JSON, a non-dict payload, or a missing/non-numeric value.
            pass
    found = re.search(r'"Overall"\s*:\s*"?([0-9]*\.?[0-9]+)', generated)
    return float(found.group(1)) if found else None


preds = []
trues = []
n_unparsed = 0

# Use the original test split (before mapping/tokenization) so the raw
# prompt, essay and evaluation fields are still available.
orig_test = ds["test"]

for ex in orig_test:
    true_overall = parse_evaluation_to_json(ex["evaluation"]).get("Overall")
    if true_overall is None:
        continue
    # Same prompt layout as used for training.
    prompt = make_full_prompt(ex)
    # Greedy decoding keeps the metric reproducible; return_full_text=False
    # drops the prompt so only the model's continuation is parsed.
    out = pipe(
        prompt,
        max_new_tokens=200,
        do_sample=False,
        return_full_text=False,
    )[0]["generated_text"]
    pred = _extract_overall(out)
    if pred is None:
        n_unparsed += 1
        continue
    preds.append(pred)
    trues.append(true_overall)

# Compute MAE and R² over the examples with a parsable prediction.
preds = np.array(preds, dtype=float)
trues = np.array(trues, dtype=float)
print(f"Scored {len(preds)} examples; {n_unparsed} predictions could not be parsed.")
if len(preds) == 0:
    print("No parsable predictions; MAE / R² not computed.")
else:
    print("MAE:", mean_absolute_error(trues, preds))
    print("R²:", r2_score(trues, preds))


## 2. Fine-tuning Mistral-7B-Instruct-v0.3 with Unsloth

In [None]:
!pip install unsloth[cu118] transformers datasets accelerate bitsandbytes peft -U
# [cu118] cho CUDA 11.8 (Colab T4)
# [cu121] cho CUDA 12.1 (A100)
# Unsloth hỗ trợ PyTorch 2.2+


### 2.1 Fine-tuning Mistral-7B-Instruct-v0.3 with Unsloth

In [None]:
# ================================================
# Fine-tuning Mistral-7B-Instruct-v0.3 with Unsloth
# Dataset: chillies/IELTS-writing-task-2-evaluation
# ================================================

!pip install unsloth[cu118] transformers datasets accelerate bitsandbytes peft -q -U

from unsloth import FastLanguageModel
from datasets import load_dataset
from transformers import TrainingArguments, Trainer
import torch, re, json

# 1️⃣ Load dataset
ds = load_dataset("chillies/IELTS-writing-task-2-evaluation")
# Train on a seeded random subset of at most 8000 examples; min() keeps
# select() from raising if the split has fewer rows than that.
train_shuffled = ds["train"].shuffle(seed=42)
train_ds = train_shuffled.select(range(min(8000, len(train_shuffled))))
eval_ds = ds["test"]

# 2️⃣ Load model & tokenizer with Unsloth
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

# Loader settings: 4-bit weights (QLoRA-style), 1024-token context,
# gradient checkpointing on, bfloat16 requested.
load_settings = dict(
    model_name=model_name,
    max_seq_length=1024,
    load_in_4bit=True,
    use_gradient_checkpointing=True,
    dtype=torch.bfloat16,
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_settings)

# Reuse EOS as the pad token so fixed-length padding has a token to use.
tokenizer.pad_token = tokenizer.eos_token

# 3️⃣ Add LoRA adapter
lora_settings = dict(
    r=8,                                   # adapter rank
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],   # attention query/value projections
    bias="none",
    use_rslora=True,                       # rank-stabilized LoRA scaling
)
model = FastLanguageModel.get_peft_model(model, **lora_settings)

# 4️⃣ Preprocess
def preprocess(ex):
    """Format one example as prompt + evaluation and tokenize it with labels.

    Args:
        ex: Record with "prompt" (topic), "essay" and "evaluation" fields.

    Returns:
        The tokenizer output (input_ids, attention_mask) padded/truncated to
        1024 tokens, plus "labels": a copy of input_ids with padded positions
        set to -100 so they are ignored by the loss.
    """
    # The target here is the raw evaluation text, not a JSON rendering of it.
    text = f"### Topic:\n{ex['prompt']}\n\n### Essay:\n{ex['essay']}\n\n### JSON Response:\n{ex['evaluation']}"
    # Append EOS so the model is trained to stop after the evaluation.
    # NOTE(review): assumes the tokenizer does not already append EOS itself.
    enc = tokenizer(
        text + tokenizer.eos_token,
        truncation=True,
        padding="max_length",
        max_length=1024,
    )
    # Without a "labels" field the model returns no loss and Trainer.train()
    # fails. pad_token == eos_token here, so padding is masked through the
    # attention mask rather than by token id (which would also hide the real EOS).
    enc["labels"] = [
        token_id if keep else -100
        for token_id, keep in zip(enc["input_ids"], enc["attention_mask"])
    ]
    return enc

train_ds = train_ds.map(preprocess)
eval_ds = eval_ds.map(preprocess)
# Expose the tensor columns the Trainer needs, including the labels.
train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
eval_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# 5️⃣ TrainingArguments (tuned for Colab Pro)
import inspect

# Match the mixed-precision mode to the hardware instead of hard-coding fp16:
# the model is requested in bfloat16, so train in bf16 where the GPU supports
# it and fall back to fp16 otherwise (e.g. on a T4).
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

# The install line upgrades transformers without pinning, and two keywords
# used here were renamed in newer releases (`evaluation_strategy` ->
# `eval_strategy`, Trainer's `tokenizer` -> `processing_class`). Pick
# whichever name the installed version accepts.
_args_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_key = "eval_strategy" if "eval_strategy" in _args_params else "evaluation_strategy"
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

args = TrainingArguments(
    output_dir="./mistral_unsloth_ft",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,   # keep evaluation memory in line with training
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=2e-4,
    warmup_ratio=0.1,
    logging_steps=50,
    save_strategy="epoch",          # must match the evaluation strategy for load_best_model_at_end
    fp16=not use_bf16,
    bf16=use_bf16,
    save_total_limit=2,
    load_best_model_at_end=True,
    **{_eval_key: "epoch"},
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    **{_tokenizer_key: tokenizer},
)

trainer.train()

# Save the trained adapter and the tokenizer next to the checkpoints.
model.save_pretrained("./mistral_unsloth_ft")
tokenizer.save_pretrained("./mistral_unsloth_ft")


### 2.2 Tích hợp đánh giá MAE / R² sau train

In [None]:
from transformers import pipeline
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np

pipe = pipeline("text-generation", model="./mistral_unsloth_ft", tokenizer="./mistral_unsloth_ft")


def _predicted_overall(generated: str):
    """Return the predicted overall band as a float, or None if unreadable.

    Accepts, in order: a JSON object with an "Overall" key, the raw
    "Overall Band Score: [x]" form found in the evaluation text this section
    trains on, and a bare `"Overall": x` pair from truncated JSON.
    """
    found = re.search(r"\{.*\}", generated, re.S)
    if found:
        try:
            return float(json.loads(found.group(0)).get("Overall"))
        except (ValueError, TypeError, AttributeError):
            # Invalid JSON, a non-dict payload, or a missing/non-numeric value.
            pass
    for pattern in (
        r"Overall Band Score:\s*\[([0-9]*\.?[0-9]+)\]",
        r'"Overall"\s*:\s*"?([0-9]*\.?[0-9]+)',
    ):
        found = re.search(pattern, generated)
        if found:
            return float(found.group(1))
    return None


# Iterate the raw test split: `eval_ds` was formatted to expose only tensor
# columns, so its rows no longer carry 'prompt' / 'essay' / 'evaluation'.
raw_eval = ds["test"]
raw_eval = raw_eval.select(range(min(100, len(raw_eval))))

y_pred, y_true = [], []
n_skipped = 0
for ex in raw_eval:
    truth = re.search(r"Overall Band Score:\s*\[([\d\.]+)\]", ex["evaluation"])
    try:
        true_overall = float(truth.group(1)) if truth else None
    except ValueError:
        true_overall = None
    if true_overall is None:
        n_skipped += 1
        continue
    text = f"### Topic:\n{ex['prompt']}\n\n### Essay:\n{ex['essay']}\n\n### JSON Response:\n"
    # Greedy decoding keeps the metric reproducible; return_full_text=False
    # drops the prompt so only the model's continuation is parsed.
    out = pipe(
        text,
        max_new_tokens=200,
        do_sample=False,
        return_full_text=False,
    )[0]["generated_text"]
    pred = _predicted_overall(out)
    if pred is None:
        n_skipped += 1
        continue
    # Append as a pair so predictions and references can never get out of step.
    y_pred.append(pred)
    y_true.append(true_overall)

y_pred = np.array(y_pred, dtype=float)
y_true = np.array(y_true, dtype=float)

print(f"Scored {len(y_pred)} examples; skipped {n_skipped}.")
if len(y_pred) == 0:
    print("No parsable predictions; MAE / R² not computed.")
else:
    print("MAE:", mean_absolute_error(y_true, y_pred))
    print("R²:", r2_score(y_true, y_pred))
