In [1]:
%%capture
!pip install --upgrade unsloth

In [2]:
# Reset the environment (clear all variables)
%reset -f


In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time
from pprint import pprint
import os
for dirname, _, filenames in os.walk('/kaggle/input/multi-lingual-sentiment-analysis'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

from unsloth import FastLanguageModel
import torch


/kaggle/input/multi-lingual-sentiment-analysis/sample_submission.csv
/kaggle/input/multi-lingual-sentiment-analysis/train.csv
/kaggle/input/multi-lingual-sentiment-analysis/test.csv
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [4]:
# model_path = "/kaggle/input/llama-3.1/transformers/8b-instruct/2"
max_seq_length = 2048
dtype = None
load_in_4bit = True 

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/kaggle/input/llama-3.1/transformers/8b-instruct/2",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit
)

==((====))==  Unsloth 2025.2.12: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

/kaggle/input/llama-3.1/transformers/8b-instruct/2 does not have a padding token! Will use pad_token = <|finetune_right_pad_id|>.


## Dataset formatting

In [5]:
train = pd.read_csv('/kaggle/input/multi-lingual-sentiment-analysis/train.csv')
train.head()

Unnamed: 0,ID,sentence,label,language
0,1,কর্মীদের ভাল আচরণ এবং খাবারের পাশাপাশি পানীয় ...,Positive,bn
1,2,ગોદરેજ સેન્ટ્રલ એસીમાં તેના કન્ડેન્સર પર 2 વર્...,Positive,gu
2,3,"கதைக்களம் பிடித்திருந்தது, அனைத்து நடிகர்களும்...",Positive,ta
3,4,ਵੌਇਸ-ਓਵਰ ਬਹੁਤ ਵਧੀਆ ਸੀ ਅਤੇ ਕਹਾਣੀ ਦੀ ਸੀਮਾ ਵਿੱਚ ਇ...,Positive,pa
4,5,जुथानि थाखाय जायगा गैया। गुबुन मुवा सोग्रा जाय...,Negative,bd


In [6]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

train_df, test_df = train_test_split(train, test_size=0.2, random_state=37)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

data = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

print(data)


DatasetDict({
    train: Dataset({
        features: ['ID', 'sentence', 'label', 'language', '__index_level_0__'],
        num_rows: 800
    })
    test: Dataset({
        features: ['ID', 'sentence', 'label', 'language', '__index_level_0__'],
        num_rows: 200
    })
})


In [7]:
ds = data['train']
print(ds)

Dataset({
    features: ['ID', 'sentence', 'label', 'language', '__index_level_0__'],
    num_rows: 800
})


In [8]:
ds[0]

{'ID': 401,
 'sentence': 'அதிக நீடித்த தன்மைக்காக ஹை இம்பாக்ட் ஃபைபர் கொண்டு தயாரிக்கப்பட்டிருக்கிறது.',
 'label': 'Positive',
 'language': 'ta',
 '__index_level_0__': 400}

In [9]:
def preprocess(sample):
    return {
        "conversations": [
            {"from": "user", "value": sample["sentence"]},
            {"from": "assistant", "value": sample["label"]}
        ]
    }

processed_ds = ds.map(preprocess, remove_columns=['ID','sentence','label'])
pprint(processed_ds[0])

def formatting_prompts_func(examples):
    chats = examples["conversations"]
    texts = [tokenizer.apply_chat_template(chat, tokenize = False, add_generation_prompt = False) for chat in chats]

    return { "text" : texts, }

from unsloth.chat_templates import standardize_sharegpt
processed_ds = standardize_sharegpt(processed_ds)
processed_ds = processed_ds.map(formatting_prompts_func, batched = True,)

processed_ds[0]['text']


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

{'__index_level_0__': 400,
 'conversations': [{'from': 'user',
                    'value': 'அதிக நீடித்த தன்மைக்காக ஹை இம்பாக்ட் ஃபைபர் '
                             'கொண்டு தயாரிக்கப்பட்டிருக்கிறது.'},
                   {'from': 'assistant', 'value': 'Positive'}],
 'language': 'ta'}


Standardizing format:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nஅதிக நீடித்த தன்மைக்காக ஹை இம்பாக்ட் ஃபைபர் கொண்டு தயாரிக்கப்பட்டிருக்கிறது.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nPositive<|eot_id|>'

## Train

In [10]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 32, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 13,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.2.12 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [11]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = processed_ds,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 2,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 25,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 5,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 34,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Applying chat template to train dataset (num_proc=2):   0%|          | 0/800 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/800 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/800 [00:00<?, ? examples/s]

In [12]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

tokenizer.decode(trainer.train_dataset[7]["input_ids"])


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

'<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nछोटे और फोल्डेबल लेंस को एक जगह से दूसरी जगह कैरी किया जा सकता है।<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nPositive<|eot_id|>'

In [13]:
trainer_stats = trainer.train()
trainer_stats

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 800 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 2
\        /    Total batch size = 4 | Total steps = 25
 "-____-"     Number of trainable parameters = 83,886,080


Step,Training Loss
5,8.4359
10,3.6968
15,0.2468
20,0.215
25,0.0809


TrainOutput(global_step=25, training_loss=2.5350935506820678, metrics={'train_runtime': 116.9373, 'train_samples_per_second': 0.855, 'train_steps_per_second': 0.214, 'total_flos': 1123842161885184.0, 'train_loss': 2.5350935506820678})

In [14]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)
FastLanguageModel.for_inference(model);

## metric

In [15]:
# labels = []
# sentences = test_df['sentence'].tolist()
# len(sentences)

In [16]:
# sen = sentences[0]
# messages = [
#         {"role": "user", "content": f"{sen}"},
#     ]
# inputs = tokenizer.apply_chat_template(
#     messages,
#     tokenize = True,
#     add_generation_prompt = True,
#     return_tensors = "pt",
# ).to("cuda")

# outputs = model.generate(input_ids = inputs, max_new_tokens = 32, use_cache = True,
#                          temperature = 0.1, min_p = 0.1)

# output_text = tokenizer.batch_decode(outputs)[0]
# pprint(output_text)


In [17]:
# import re
# match = re.search(r'<\|start_header_id\|>assistant<\|end_header_id\|>(.*?)<\|eot_id\|>', output_text, re.DOTALL)
# pprint(match)
# # if match:
# #     user_response = match.group(1).strip()
# #     labels.append(user_response)
# # else:
# #     print("No response")

In [18]:
# from tqdm import tqdm
# for sen in tqdm(sentences):
#     messages = [
#         {"role": "user", "content": f"{sen}"},
#     ]
#     inputs = tokenizer.apply_chat_template(
#         messages,
#         tokenize = True,
#         add_generation_prompt = True,
#         return_tensors = "pt",
#     ).to("cuda")
    
#     outputs = model.generate(input_ids = inputs, max_new_tokens = 32, use_cache = True,
#                              temperature = 0.1, min_p = 0.1)
    
#     output_text = tokenizer.batch_decode(outputs)[0]
    
#     import re
#     match = re.search(r'<\|start_header_id\|>assistant<\|end_header_id\|>(.*?)<\|eot_id\|>', output_text, re.DOTALL)
    
#     if match:
#         user_response = match.group(1).strip()
#         labels.append(user_response)
#     else:
#         print("No response")

In [19]:
# test_labels = test_df['label'].tolist()
# pred_labels = labels

# from sklearn.metrics import f1_score

# test_labels_numeric = [1 if label == 'Positive' else 0 for label in test_labels]
# pred_labels_numeric = [1 if label == 'Positive' else 0 for label in pred_labels]

# f1 = f1_score(test_labels_numeric, pred_labels_numeric)

# print("F1 Score:", f1)
# # F1 Score: 0.9

## Test

In [20]:
labels = []
sentences = pd.read_csv("/kaggle/input/multi-lingual-sentiment-analysis/test.csv")['sentence'].tolist()
print(len(sentences))

from tqdm import tqdm
for sen in tqdm(sentences):
    messages = [
        {"role": "user", "content": f"{sen}"},
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True,
        return_tensors = "pt",
    ).to("cuda")
    
    outputs = model.generate(input_ids = inputs, max_new_tokens = 32, use_cache = True,
                             temperature = 0.1, min_p = 0.1)
    
    output_text = tokenizer.batch_decode(outputs)[0]
    
    import re
    match = re.search(r'<\|start_header_id\|>assistant<\|end_header_id\|>(.*?)<\|eot_id\|>', output_text, re.DOTALL)
    
    if match:
        user_response = match.group(1).strip()
        labels.append(user_response)
    else:
        print("No response")

labels[:5]

100


  0%|          | 0/100 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 100/100 [00:52<00:00,  1.91it/s]


['Positive', 'Positive', 'Positive', 'Positive', 'Negative']

In [21]:
submission = pd.read_csv("/kaggle/input/multi-lingual-sentiment-analysis/sample_submission.csv")
submission['label'] = labels
submission.head()

Unnamed: 0,ID,label
0,1,Positive
1,2,Positive
2,3,Positive
3,4,Positive
4,5,Negative


In [22]:
submission.to_csv("submission.csv",index=False)
!ls -l

total 72
---------- 1 root root 58661 Feb 16 18:24 __notebook__.ipynb
drwxr-xr-x 3 root root  4096 Feb 16 18:23 outputs
-rw-r--r-- 1 root root  1201 Feb 16 18:24 submission.csv
drwxr-xr-x 3 root root  4096 Feb 16 18:19 unsloth_compiled_cache
