In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%%capture
import os
# Choose the install recipe based on the runtime: the check looks for any
# environment variable whose name contains "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    # No Colab marker found: a plain install of unsloth is used.
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # The packages below are installed with --no-deps and partly pinned;
    # NOTE(review): presumably to avoid disturbing Colab's preinstalled stack — confirm.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.19.1 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth

## Embedding Technique

#### `1. TF-IDF`

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Two short Vietnamese sentences used as a toy example.
doc1 = "Tôi tên tuyền"
doc2 = "tôi là tuyền"

# Vectorize both sentences with TF-IDF (the vocabulary is fitted on just these two).
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([doc1, doc2])

# Cosine similarity between the first row and the second row of the matrix.
first_row, second_row = tfidf_matrix[:1], tfidf_matrix[1:]
similarity = cosine_similarity(first_row, second_row)
print("Cosine similarity:", similarity[0, 0])

## Model

In [None]:
%%capture
from unsloth import FastLanguageModel
import torch

# Loading options forwarded to Unsloth's from_pretrained.
max_seq_length = 2048
dtype = None          # NOTE(review): presumably lets Unsloth pick the dtype — confirm
load_in_4bit = True

# Collect the keyword arguments first so the load call itself stays short.
load_kwargs = dict(
    model_name="tyanfarm/llama3-8b-hotels-information-mixed-5epochs-finetuned",  # model folder
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

# Prepare the loaded model for generation before it is used below.
FastLanguageModel.for_inference(model)

In [None]:
# Prompt template with three positional `{}` slots, filled in this order:
# instruction, input, response. The same template is used both for
# generation (response slot left empty) and for formatting dataset rows.
chat_prompt = """
### Instruction:
{}

### Input:
{}

### Response:
{}"""

In [None]:
def generate_response(model, tokenizer, query: str,
                      temperature: float = 0.01,
                      top_k: int = 10,
                      top_p: float = 0.95,
                      max_new_tokens: int = 512,
                      device: str = "cuda") -> str:
    """Generate the model's answer to a single query.

    The query is placed in the "Input" slot of ``chat_prompt`` (instruction
    and response slots are left empty), the prompt is tokenized and moved to
    ``device``, and ``model.generate`` produces the continuation.

    Args:
        model: Model exposing a ``generate`` method.
        tokenizer: Tokenizer matching ``model``.
        query: User question inserted into the prompt.
        temperature: Forwarded to ``model.generate``.
        top_k: Forwarded to ``model.generate``.
        top_p: Forwarded to ``model.generate``.
        max_new_tokens: Upper bound on the number of generated tokens.
        device: Device the tokenized prompt is moved to.

    Returns:
        The generated answer with special tokens removed and surrounding
        whitespace stripped.
    """
    prompt = chat_prompt.format(
        "",        # instruction
        query,     # input
        "",        # response (left empty so the model completes it)
    )
    inputs = tokenizer([prompt], return_tensors="pt").to(device)

    # NOTE(review): in Hugging Face transformers, temperature/top_k/top_p only
    # influence decoding when sampling is enabled (do_sample=True here or in
    # the model's generation config) — confirm which applies to this model.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
    )

    # The returned sequence starts with the prompt tokens. Slice them off and
    # decode only the continuation instead of string-splitting on the
    # "### Response:" marker and a hard-coded EOS literal: this keeps working
    # when the tokenizer uses a different EOS/special token or when the
    # generated text happens to repeat the marker.
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = outputs[0][prompt_length:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)

    return response.strip()


# Example usage: ask one hotel question and print the model's answer.
sample_query = "Có loại phòng Executive Deluxe ở Ramana Saigon không?"
result = generate_response(model, tokenizer, sample_query)
print(result)

## Dataset

In [None]:
# End-of-sequence token appended to every formatted example.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Render a batch of question/answer rows into prompt texts.

    Each pair from ``examples["question"]`` and ``examples["answer"]`` fills
    the input and response slots of ``chat_prompt`` (the instruction slot is
    empty), and ``EOS_TOKEN`` is appended to the result.

    Returns:
        A dict with a single key ``"text"`` holding the list of rendered strings.
    """
    instruction = ""
    texts = [
        chat_prompt.format(instruction, question, answer) + EOS_TOKEN
        for question, answer in zip(examples["question"], examples["answer"])
    ]
    return {"text": texts}

In [None]:
from datasets import load_dataset, concatenate_datasets
import pandas as pd     

# Alternative source kept for reference:
# dataset = load_dataset("tyanfarm/hotel-addresses-questions-answers", split = "train")

# Load the train split and add the rendered "text" column in one chain.
dataset = load_dataset(
    "tyanfarm/hotels-questions-answers-mixed",
    split="train",
).map(formatting_prompts_func, batched=True)

# Show one formatted row as a sanity check.
dataset[2]

In [None]:
# Number of rows in the formatted dataset.
len(dataset)

## Evaluation

In [None]:
from tqdm import tqdm
import torch.nn.functional as F
import numpy as np

# Number of examples to evaluate. Use the full dataset: the previous
# `len(dataset) - 1` combined with the `[:k]` slices silently dropped the
# last example. Lower `k` manually for a quick partial run.
k = len(dataset)

questions = dataset['question'][:k]
references = dataset['answer'][:k]

# One generated answer per question, in the same order as `references`.
# `questions` is already limited to `k` items, so no second slice is needed.
generated_ans = [
    generate_response(model, tokenizer, question)
    for question in tqdm(questions, desc="Generating answers")
]

assert len(references) == len(generated_ans), (
    f"length mismatch: {len(references)} references vs "
    f"{len(generated_ans)} generated answers"
)

def pair_tfidf_cosine(a: str, b: str, **tfidf_kwargs) -> float:
    """Return the TF-IDF cosine similarity between two texts.

    A fresh ``TfidfVectorizer`` is fitted on just these two texts, so the
    score depends only on the pair itself.

    Args:
        a: First text; ``None`` or empty is treated as ``""``.
        b: Second text; ``None`` or empty is treated as ``""``.
        **tfidf_kwargs: Forwarded to ``TfidfVectorizer`` (default
            ``ngram_range=(1, 1)``).

    Returns:
        The cosine similarity as a float, or 0.0 when neither text yields
        any token.

    Raises:
        ValueError: Re-raised from scikit-learn for any reason other than
            an empty vocabulary (e.g. invalid ``tfidf_kwargs``).
    """
    a = a or ""   # guard against null/None
    b = b or ""
    tfidf = TfidfVectorizer(**tfidf_kwargs)
    try:
        X = tfidf.fit_transform([a, b])   # fitted ONLY on these two texts
    except ValueError as err:
        # scikit-learn raises "empty vocabulary; ..." when no token survives
        # tokenization in either text; treat that as "nothing in common"
        # instead of aborting the whole evaluation run.
        if "empty vocabulary" not in str(err):
            raise
        return 0.0
    return float(cosine_similarity(X[0:1], X[1:2])[0, 0])

# Score each (prediction, reference) pair independently; no k×k matrix is built.
tfidf_similarities = np.array(
    list(map(pair_tfidf_cosine, generated_ans, references))
)


In [None]:
similarities = tfidf_similarities

# Split the scores into three buckets:
#   low  -> below 0.70
#   mid  -> from 0.70 to 0.85, both ends included
#   high -> above 0.85
low_mask = similarities < 0.70
mid_mask = np.logical_and(similarities >= 0.70, similarities <= 0.85)
high_mask = similarities > 0.85

In [None]:
from typing import Sequence
import numpy as np
from datasets import Dataset

def make_bucket_ds(
    mask: Sequence[bool],
    questions: Sequence[str],
    similarities: Sequence[float],
    predicted: Sequence[str],
    references: Sequence[str],
) -> Dataset:
    """Build a ``Dataset`` containing only the rows selected by ``mask``.

    All inputs are converted to NumPy arrays and must have the same length
    as ``mask``. The result has the columns ``Question``, ``Predicted``,
    ``Reference`` and ``Similarity`` (rounded to 4 decimals).
    """
    # Normalize types so boolean indexing works on every column.
    selector = np.asarray(mask, dtype=bool)
    question_arr = np.asarray(questions, dtype=object)
    predicted_arr = np.asarray(predicted, dtype=object)
    reference_arr = np.asarray(references, dtype=object)
    score_arr = np.asarray(similarities, dtype=float)

    assert (
        len(question_arr)
        == len(score_arr)
        == len(predicted_arr)
        == len(reference_arr)
        == selector.shape[0]
    ), "Các mảng phải cùng độ dài với mask."

    selected_scores = np.round(score_arr[selector], 4)
    columns = {
        "Question": question_arr[selector].tolist(),
        "Predicted": predicted_arr[selector].tolist(),
        "Reference": reference_arr[selector].tolist(),
        "Similarity": selected_scores.tolist(),
    }
    return Dataset.from_dict(columns)


In [None]:
# Build one dataset per similarity bucket from the same evaluation arrays.
ds_low, ds_mid, ds_high = (
    make_bucket_ds(bucket_mask, questions, similarities, generated_ans, references)
    for bucket_mask in (low_mask, mid_mask, high_mask)
)

# Save the three datasets to disk (each holds Question, Predicted,
# Reference and Similarity columns).
for bucket_ds, bucket_path in (
    (ds_low, "buckets/low_0.00_0.70"),     # [0.0, 0.70)
    (ds_mid, "buckets/mid_0.70_0.85"),     # [0.70, 0.85]
    (ds_high, "buckets/high_0.85_1.00"),   # (0.85, 1.0]
):
    bucket_ds.save_to_disk(bucket_path)

In [None]:
# Report how many rows landed in each bucket, emitted as one block of text.
summary_lines = [
    "Summary:",
    f"  LOW  [0.00, 0.70): {len(ds_low)} rows",
    f"  MID  [0.70, 0.85]: {len(ds_mid)} rows",
    f"  HIGH (0.85, 1.00]: {len(ds_high)} rows",
]
print("\n".join(summary_lines))

In [None]:
from huggingface_hub import login

# Never hard-code the Hugging Face token in the notebook. Read it from the
# HF_TOKEN environment variable; when it is unset or empty, pass None so
# `login` falls back to its interactive prompt.
import os

login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
from huggingface_hub import HfApi, login

# Hub repository that receives the evaluation buckets.
dataset_name = "tyanfarm/llama3-8b-hotels-information-mixed-5epochs-finetuned-evaluation"

# Create the dataset repo; exist_ok=True makes re-running this cell safe
# when the repo is already there.
api = HfApi()
api.create_repo(
    repo_id=dataset_name,
    repo_type="dataset",
    exist_ok=True,
)

In [None]:
# Upload each bucket as its own split; empty buckets are reported and skipped.
hub_splits = (
    ("low", ds_low),
    ("mid", ds_mid),
    ("high", ds_high),
)
for split_name, ds in hub_splits:
    if len(ds) == 0:
        print(f"⚠️ Skip {split_name}, no records.")
        continue
    ds.push_to_hub(dataset_name, split=split_name)   # only push non-empty buckets

In [None]:
# NOTE(review): the notification cell that follows only imports `requests`;
# nothing visible in this notebook imports flask or python-telegram-bot —
# confirm this install is still needed.
!pip install flask python-telegram-bot

In [None]:
import requests

# Telegram credentials are read from the environment instead of being
# hard-coded in the notebook. Both default to an empty string when unset,
# matching the previous placeholder values.
import os

TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN", "")
CHAT_ID = os.environ.get("TELEGRAM_CHAT_ID", "")


def send_message(text: str, timeout: float = 10.0):
    """Send ``text`` to the configured Telegram chat via the Bot API.

    The message is posted to the ``sendMessage`` endpoint of the bot
    identified by ``TOKEN`` and addressed to ``CHAT_ID``, using the
    ``MarkdownV2`` parse mode, so ``text`` must already be escaped for it.

    Args:
        text: Message body.
        timeout: Seconds to wait for the HTTP request. Without a timeout,
            ``requests`` can block indefinitely and stall the notebook.

    Returns:
        The decoded JSON body of the HTTP response.
    """
    url = f"https://api.telegram.org/bot{TOKEN}/sendMessage"
    payload = {
        "chat_id": CHAT_ID,
        "text": text,
        "parse_mode": "MarkdownV2"  # optional: 'MarkdownV2' or 'HTML'
    }
    response = requests.post(url, json=payload, timeout=timeout)
    return response.json()

# Notification body sent with the MarkdownV2 parse mode. The backslash before
# "!" is written as "\\" so the string holds a literal backslash without
# relying on an invalid escape sequence ("\!"), which newer Python versions
# warn about. The resulting value is unchanged.
text = """
Evaluate done \\!
```kaggle-finetuning-evaluation
tyanfarm/llama3-8b-hotels-information-mixed-5epochs-finetuned-evaluation
```
"""

# Send the notification; as the last expression of the cell, the JSON reply
# returned by send_message is displayed as the cell output.
send_message(text)