In [None]:
%%capture
import torch
major_version, minor_version = torch.cuda.get_device_capability()
# Must install separately since Colab has torch 2.2.1, which breaks packages
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

In [None]:
from unsloth import FastLanguageModel
import torch
# Loading configuration shared by the model-loading call below and, later, by the trainer.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# This list is informational only: nothing in this cell reads it, and the model
# actually loaded below ("unsloth/codellama-7b-bnb-4bit") is not one of its entries.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit", # New Google 6 trillion tokens model 2.5x faster!
    "unsloth/gemma-2b-bnb-4bit",
] # More models at https://huggingface.co/unsloth

# Load the base model and its tokenizer; both names are reused by every later cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/codellama-7b-bnb-4bit", # Choose ANY! eg mistralai/Mistral-7B-Instruct-v0.2
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


config.json:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.6
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/3.87G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/539 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

In [None]:
# Rebind `model` to the PEFT/LoRA-wrapped version of the base model loaded above.
# The adapters target the attention projections (q/k/v/o) and the MLP projections
# (gate/up/down) listed in `target_modules`.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.6 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Read the train/validation splits from JSON.
# Note that the two splits come from different directories: training data is read from
# `good_title_under`, validation data from `good_title_datas`.
# NOTE(review): presumably `good_title_under` is an under-sampled training set — confirm.
df_train = pd.read_json("../datasets/good_title_under/train.json")
df_valid = pd.read_json("../datasets/good_title_datas/valid.json")


# Convert the pandas frames into Hugging Face `Dataset` objects.
train_dataset = Dataset.from_pandas(df_train)
valid_dataset = Dataset.from_pandas(df_valid)

# Alpaca-style template with three positional slots: instruction, input, response.
# The same text is used again at inference time with an empty response slot.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence token appended to every training example by the formatting function.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    """Render a batch of examples into full training prompts.

    Args:
        examples: batched mapping with the columns 'tags', 'desc', 'code' and
            'title', each holding one value per example.

    Returns:
        A dict with a single key "text" whose value is the list of rendered
        prompts: the Alpaca template filled with the fixed instruction, the
        tags/code/description input block and the title as the response,
        followed by the EOS token.
    """
    instruction = "Below is related tags, a code snippet and a description. Based on this information, please write a suitable title for this question."

    def render(tags, desc, code, title):
        # Section order inside the input block: tags, then code, then description.
        input_text = (f"### Related tags:\n{tags}\n\n"
                      f"### Code snippet:\n{code}\n\n"
                      f"### Description:\n{desc}\n\n")
        return alpaca_prompt.format(instruction, input_text, title) + EOS_TOKEN

    rows = zip(examples['tags'], examples['desc'], examples['code'], examples['title'])
    return {"text": [render(*row) for row in rows]}

# Add the rendered "text" column to both splits (batched, so the function receives columns).
train_dataset = train_dataset.map(formatting_prompts_func, batched=True)
valid_dataset = valid_dataset.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/158292 [00:00<?, ? examples/s]

Map:   0%|          | 0/19789 [00:00<?, ? examples/s]

<a name="Train"></a>
### Train the model
Now let's use Hugging Face TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but for a full run you can set `num_train_epochs=1` and remove the `max_steps` argument. We also support TRL's `DPOTrainer`!

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Query bf16 support once; exactly one of fp16/bf16 is enabled depending on the GPU.
bf16_supported = torch.cuda.is_bf16_supported()

# Optimisation settings: effective batch size is 2 x 4 = 8, capped at 60 steps.
training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    max_steps=60,
    warmup_steps=5,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    optim="adamw_8bit",
    fp16=not bf16_supported,
    bf16=bf16_supported,
    logging_steps=1,
)

# Supervised fine-tuning on the pre-rendered "text" column of both splits.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Packing can make training 5x faster for short sequences.
    args=training_args,
)

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/158292 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/19789 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
#@title Show current memory stats
# Baseline taken before training; the final-stats cell subtracts `start_gpu_memory`
# from the peak to report the memory attributable to training.
gpu_stats = torch.cuda.get_device_properties(0)
reserved_bytes = torch.cuda.max_memory_reserved()
start_gpu_memory = round(reserved_bytes / 1024 ** 3, 3)    # GiB reserved so far
max_memory = round(gpu_stats.total_memory / 1024 ** 3, 3)  # GiB available on the device
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
4.121 GB of memory reserved.


In [None]:
# Run fine-tuning. The result is kept because the final-stats cell reads
# `trainer_stats.metrics['train_runtime']`.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 158,292 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 120
 "-____-"     Number of trainable parameters = 39,976,960


Step,Training Loss
1,1.0653
2,1.0954
3,1.0102
4,1.1388
5,1.116
6,1.0796
7,1.1092
8,1.2953
9,1.1315
10,1.2776


In [None]:
#@title Show final memory and time stats
# Peak reserved memory after training, in GiB, and the share of it added since the
# baseline recorded in `start_gpu_memory`.
peak_reserved_bytes = torch.cuda.max_memory_reserved()
used_memory = round(peak_reserved_bytes / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# Both figures expressed as a percentage of the device's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

train_runtime = trainer_stats.metrics['train_runtime']
print(f"{train_runtime} seconds used for training.")
print(f"{round(train_runtime/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

476.7837 seconds used for training.
7.95 minutes used for training.
Peak reserved memory = 5.77 GB.
Peak reserved memory for training = 1.649 GB.
Peak reserved memory % of max memory = 39.124 %.
Peak reserved memory for training % of max memory = 11.181 %.


<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

In [None]:
# alpaca_prompt = Copied from above
# Smoke test: generate a title for one hand-written C# example. The prompt is built with
# the same instruction and the same section order (tags, code, description) as the
# training formatter, with the response slot left empty for the model to fill.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Below is related tags, a code snippet and a description. Based on this information, please write a suitable title for this question.", # instruction
        (f"### Related tags:\nc# forearch\n\n"
                    '### Code snippet:\nint index = 0; foreach (var websitePage in websitePages) { if(index == 0) classAttributePart = " class="first""; sb.AppendLine(String.Format("<li" + classAttributePart + ">" + "<a href="{0}">{1}</a></li>", websitePage.GetFileName(), websitePage.Title)); index++; }\n\n'
                    '### Description:\nI often find myself doing the following indexcounter messiness in a foreach loop to find out if I am on the first element or not Is there a more elegant way to do this in C something along the lines of ifthisforeachPass 1 etc\n\n'), # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

# Generate at most 64 new tokens; the decoded string contains the prompt followed by the
# generated title (special tokens are not stripped here).
outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nBelow is related tags, a code snippet and a description. Based on this information, please write a suitable title for this question.\n\n### Input:\n### Related tags:\nc# forearch\n\n### Code snippet:\nint index = 0; foreach (var websitePage in websitePages) { if(index == 0) classAttributePart = " class="first""; sb.AppendLine(String.Format("<li" + classAttributePart + ">" + "<a href="{0}">{1}</a></li>", websitePage.GetFileName(), websitePage.Title)); index++; }\n\n### Description:\nI often find myself doing the following indexcounter messiness in a foreach loop to find out if I am on the first element or not Is there a more elegant way to do this in C something along the lines of ifthisforeachPass 1 etc\n\n\n\n### Response:\nC#: Is there a better way to find out if I am on the first element in a foreach 

In [None]:
# Upload the trained adapters to the Hugging Face Hub. Both arguments are placeholders:
# replace "your_name/file_name" with the target repo id and supply a real access token.
# NOTE(review): avoid committing a real token in the notebook — read it from an
# environment variable or a secrets store instead.
model.push_to_hub("your_name/file_name", token = "your_key") # Online saving

README.md:   0%|          | 0.00/579 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/160M [00:00<?, ?B/s]

Saved model to https://huggingface.co/ItsmeDat/lora_model_under2


To evaluate the LoRA adapters we just saved, first install the evaluation dependencies, then reload the adapters from the Hub for inference:

In [None]:
!pip install sentence-transformers tensorflow tensorflow-hub rouge

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/227.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/227.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cac

In [None]:
from unsloth import FastLanguageModel
import torch

# Same loading configuration as the training section, repeated so this evaluation
# section can run without the earlier cells.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Rebinds `model` and `tokenizer` to the fine-tuned checkpoint. "your_name/file_name" is a
# placeholder for the Hub repo the adapters were pushed to.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "your_name/file_name", # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model)# Enable native 2x faster inference

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


adapter_config.json:   0%|          | 0.00/734 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.6
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/3.87G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/539 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/160M [00:00<?, ?B/s]

Unsloth 2024.6 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
from sentence_transformers import SentenceTransformer, util as sbert_util
import tensorflow_hub as hub
import tensorflow as tf
from rouge import Rouge
from io import open
import nltk
import torch
from tqdm import tqdm
import os
import numpy as np
import pandas as pd
from openpyxl import load_workbook, Workbook
import re

# Shared ROUGE scorer; `get_rouge_score` below reads this module-level instance.
rouge = Rouge()

# Fetch the NLTK 'punkt' tokenizer data.
# NOTE(review): no code visible in this cell calls nltk directly — confirm it is still needed.
nltk.download('punkt')

# SBERT
def get_sbert_score(references, hypotheses):
    """Mean SBERT cosine similarity between aligned reference/hypothesis pairs.

    Args:
        references: list of reference sentences.
        hypotheses: list of generated sentences; item i is compared with
            references[i].

    Returns:
        The average cosine similarity over the aligned pairs, as a Python float.
    """
    encoder = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    reference_embeddings = encoder.encode(references, convert_to_tensor=True)
    hypothesis_embeddings = encoder.encode(hypotheses, convert_to_tensor=True)

    # Only the i-th reference vs. i-th hypothesis similarities are needed. Computing them
    # row-wise avoids materialising the full N x N similarity matrix just to read its
    # diagonal, which costs O(N^2) time and memory on large test sets.
    # Slicing to the shorter side mirrors what the diagonal of a rectangular matrix gives.
    pair_count = min(reference_embeddings.shape[0], hypothesis_embeddings.shape[0])
    pair_scores = torch.nn.functional.cosine_similarity(
        reference_embeddings[:pair_count],
        hypothesis_embeddings[:pair_count],
        dim=1,
    )
    return pair_scores.mean().item()

# ROUGE
def get_rouge_score(references, hypotheses):
    """Average ROUGE-1 / ROUGE-2 / ROUGE-L recall over aligned sentence pairs.

    Args:
        references: list of reference sentences.
        hypotheses: list of generated sentences; item i is scored against
            references[i].

    Returns:
        A tuple (rouge_1, rouge_2, rouge_l) holding the mean recall of each
        metric over the scored pairs. Pairs where either side is not a string
        or is blank after stripping are skipped and do not count towards the
        mean. If no pair can be scored, every value is NaN.
    """
    rouge1_recalls = []
    rouge2_recalls = []
    rougel_recalls = []
    for ref, hyp in zip(references, hypotheses):
        if not (isinstance(hyp, str) and isinstance(ref, str)):
            continue
        hyp_text = hyp.strip()
        ref_text = ref.strip()
        # Both sides must be non-empty: the scorer raises on an empty hypothesis or
        # reference, so a pair with only one blank side has to be skipped as well.
        if hyp_text == '' or ref_text == '':
            continue
        pair_scores = rouge.get_scores(hyp_text, ref_text)[0]
        rouge1_recalls.append(pair_scores["rouge-1"]['r'])
        rouge2_recalls.append(pair_scores["rouge-2"]['r'])
        rougel_recalls.append(pair_scores["rouge-l"]['r'])

    if not rouge1_recalls:
        # Nothing was scored; return NaN explicitly instead of taking the mean of an
        # empty array, which yields the same value but emits a RuntimeWarning.
        return float("nan"), float("nan"), float("nan")

    rouge_1 = np.mean(np.array(rouge1_recalls))
    rouge_2 = np.mean(np.array(rouge2_recalls))
    rouge_l = np.mean(np.array(rougel_recalls))
    return rouge_1, rouge_2, rouge_l

def getAllMetric(preds, golden):
    """Compute ROUGE and SBERT scores from a prediction CSV and a gold CSV.

    Args:
        preds: path to a CSV file with a 'pred_title' column.
        golden: path to a CSV file with a 'gold_title' column; row i is the
            reference for row i of `preds`.

    Returns:
        A dict with the keys "ROUGE_1", "ROUGE_2", "ROUGE_L" and "SBERT".

    Raises:
        AssertionError: if the two files do not contain the same number of rows.
    """
    pline = pd.read_csv(preds)['pred_title'].tolist()
    tline = pd.read_csv(golden)['gold_title'].tolist()

    assert len(tline) == len(pline)

    # Both scorers take (references, hypotheses): gold titles first, predictions second.
    # Passing them the other way round would make the reported recall relative to the
    # prediction instead of the gold title.
    rouge_1, rouge_2, rouge_l = get_rouge_score(tline, pline)
    sbert_score = get_sbert_score(tline, pline)

    ret_scores = {
        "ROUGE_1": rouge_1,
        "ROUGE_2": rouge_2,
        "ROUGE_L": rouge_l,
        "SBERT": sbert_score
    }
    return ret_scores

# Per-language evaluation: generate a title for every test example, store predictions and
# gold titles as CSV files, score them, and append one row per language to an Excel sheet.
langs = ['python', 'javascript', 'php', 'c#']

# Prompt template with three positional slots (instruction, input, response). It is
# loop-invariant, so it is defined once here rather than on every language iteration.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

instruction = "Below is related tags, a code snippet and a description. Based on this information, please write a suitable title for this question."

batch_size = 2  # You can adjust this batch size based on your GPU memory capacity
results_dir = './results/'
excel_file_path = './results/evaluation.xlsx'
# Captures the first line that follows the response header in the decoded text.
pattern = r'### Response:\n(.+)'

# Index of the first test example to generate for the FIRST language in `langs`; every
# other language always starts at 0.
# NOTE(review): a value > 0 presumably resumes an interrupted run — confirm.
first_lang_start_index = 0

# The CSV/Excel writes below fail if the output directory does not exist yet.
os.makedirs(results_dir, exist_ok=True)

# Decoder-only models continue from the last token of each row, so batched prompts must
# be padded on the left; right padding would make generation start after pad tokens.
tokenizer.padding_side = "left"

for lang_pos, lang in enumerate(langs):
    print(lang)
    df_test = pd.read_json(f"../datasets/good_title_datas/{lang}/test.json")
    golds = df_test['title'].tolist()

    # Build the prompts with the same section order as the training data
    # (tags, code, description) and an empty response slot.
    inputs = []
    for example in df_test.to_dict(orient='records'):
        input_text = (f"### Related tags:\n{example['tags']}\n\n"
                      f"### Code snippet:\n{example['code']}\n\n"
                      f"### Description:\n{example['desc']}\n\n")
        inputs.append(alpaca_prompt.format(instruction, input_text, ''))

    # Paths of the files that store this language's results.
    pred_file_path = os.path.join(results_dir, f"{lang}.pred.csv")
    gold_file_path = os.path.join(results_dir, f"{lang}.gold.csv")

    findex = first_lang_start_index if lang_pos == 0 else 0
    if findex == 0:
        # Starting from scratch: drop files left by a previous run, otherwise their rows
        # would be mixed with the new ones and counted again in the metrics.
        for stale_path in (pred_file_path, gold_file_path):
            if os.path.exists(stale_path):
                os.remove(stale_path)

    # tqdm derives the number of batches from the range itself.
    for i in tqdm(range(findex, len(inputs), batch_size)):
        batch_inputs = inputs[i:i + batch_size]
        with torch.no_grad():
            tokenized_inputs = tokenizer(batch_inputs, return_tensors="pt", padding=True, truncation=True).to("cuda")

            # Generate outputs for the batch
            outputs = model.generate(**tokenized_inputs, max_new_tokens=64, use_cache=True)

        # Decode the generated outputs (prompt + completion) back to text.
        batch_pred_titles = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        # Keep only examples for which a response line was found, and keep the gold
        # title of the same example so both files stay aligned row by row.
        pred_titles = []
        gold_titles = []
        for pred, gold in zip(batch_pred_titles, golds[i:i + batch_size]):
            matches = re.findall(pattern, pred)
            if matches:
                pred_titles.append(matches[0].strip())
                gold_titles.append(gold)

        # Append this batch to the CSV files. Appending keeps results on disk after every
        # batch without re-reading and rewriting the whole file each time; the header is
        # written only when the file is first created.
        for csv_path, column, values in (
            (pred_file_path, "pred_title", pred_titles),
            (gold_file_path, "gold_title", gold_titles),
        ):
            pd.DataFrame(values, columns=[column]).to_csv(
                csv_path,
                mode='a',
                header=not os.path.exists(csv_path),
                index=False,
            )

        torch.cuda.empty_cache()

    # Compare predicted titles with the gold titles and compute metrics
    metrics_dict = getAllMetric(pred_file_path, gold_file_path)

    # Check if the Excel file already exists
    if not os.path.exists(excel_file_path):
        # Create a new workbook and sheet
        book = Workbook()
        sheet = book.active
        sheet.title = "Instruction fine-tuned no rag"
        # Define fieldnames
        fieldnames = ["Language", "Rouge-1", "Rouge-2", "Rouge-l", "SBERT"]
        # Append fieldnames to the sheet
        sheet.append(fieldnames)
    else:
        # Load the existing workbook
        book = load_workbook(excel_file_path)
        sheet = book.active

    # Prepare data to append
    data = [
        lang,
        f"{metrics_dict['ROUGE_1']}",
        f"{metrics_dict['ROUGE_2']}",
        f"{metrics_dict['ROUGE_L']}",
        f"{metrics_dict['SBERT']}",
    ]

    # Append data to the sheet
    sheet.append(data)

    # Save the workbook
    book.save(excel_file_path)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


java
[{'rouge-1': {'r': 0.5, 'p': 0.6363636363636364, 'f': 0.5599999950720002}, 'rouge-2': {'r': 0.3076923076923077, 'p': 0.4, 'f': 0.34782608204158794}, 'rouge-l': {'r': 0.5, 'p': 0.6363636363636364, 'f': 0.5599999950720002}}]
Java 8 extract all keys from matching values in a Map
Java 8 - Get all keys from a Map which matched with the objects
