In [1]:
!pip install accelerate bitsandbytes transformers

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import json
import os
from pathlib import Path

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

In [3]:
# Checkpoint used for the tokenizer and the classification model below.
model_name = "meta-llama/Llama-3.2-1B"

# SECURITY: never commit access tokens to a notebook. The token is read from
# the HF_TOKEN environment variable; any token that was previously hardcoded
# here must be treated as leaked and revoked in the Hugging Face settings.
HF_TOKEN = os.environ.get("HF_TOKEN")

from huggingface_hub import login
# With no token available, `login` falls back to its interactive prompt.
login(token=HF_TOKEN)

In [4]:
# Load the tokenizer and make sure a padding token exists; when none is
# defined, the end-of-sequence token is reused for padding.
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is not None:
    print("Padding token already defined.")
elif tokenizer.eos_token is None:
    # Nothing sensible to fall back to, so fail loudly.
    raise ValueError("Both pad_token and eos_token are None. Set a padding token.")
else:
    tokenizer.pad_token = tokenizer.eos_token
    print(f"Padding token set to {tokenizer.pad_token}.")

Padding token set to <|end_of_text|>.


In [5]:
!pip install datasets

Defaulting to user installation because normal site-packages is not writeable


In [6]:
from datasets import Dataset
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Directory that holds train.json / valid.json / test.json. It defaults to the
# original location but can be overridden with the MELD_DATA_DIR environment
# variable, so the notebook is not tied to one machine's absolute path.
DATA_DIR = Path(os.environ.get("MELD_DATA_DIR", "/home/iiitd/LLM/dataset/meld"))

train_data = pd.read_json(DATA_DIR / "train.json")
valid_data = pd.read_json(DATA_DIR / "valid.json")
test_data = pd.read_json(DATA_DIR / "test.json")

In [7]:
def aplicator(index: int, df: pd.DataFrame, window: int = 5) -> str:
    """Build a prompt from the rows preceding ``index`` plus the row at ``index``.

    The prompt starts with a fixed instruction line, followed by one
    ``"<input> <target>"`` line for each of the up-to-``window`` rows directly
    before ``index``, and ends with the ``input`` of the row at ``index``
    followed by a single space (its target is left out).

    Args:
        index: Positional (0-based) row number of the row to build a prompt for.
        df: Frame with ``"input"`` and ``"target"`` columns.
        window: Maximum number of preceding rows to include as context.

    Returns:
        The assembled prompt string.
    """
    # Both the context window and the current row are addressed positionally,
    # so the result does not depend on the frame having a default RangeIndex.
    context = df.iloc[max(0, index - window):index]
    lines = ["Find the next sentiment of the given sequence:- "]
    lines.extend(
        f"{utterance} {sentiment}"
        for utterance, sentiment in zip(context["input"], context["target"])
    )
    # Trailing space: the prompt ends where the label would follow.
    lines.append(f"{df['input'].iloc[index]} ")
    return "\n".join(lines)

In [8]:
def null_aplicator(index: int, df: pd.DataFrame):
    """Return the ``aplicator`` prompt for every fifth row, ``None`` otherwise.

    A prompt is produced only when ``index + 1`` is a multiple of five
    (i.e. indices 4, 9, 14, ...); all other indices yield ``None``.
    """
    is_fifth_row = (index + 1) % 5 == 0
    return aplicator(index, df) if is_fifth_row else None

In [9]:
# Add a "comb" column to every split: the context prompt built by `aplicator`
# for each row, with the row label (`row.name`) used as the row number.
for _split in (train_data, valid_data, test_data):
    _split["comb"] = _split.apply(
        lambda row: aplicator(int(row.name), _split),
        axis=1,
    )

In [10]:
# Add an "ncomb" column to every split: the `null_aplicator` result for each
# row, i.e. a prompt on every fifth row and None elsewhere.
for _split in (train_data, valid_data, test_data):
    _split["ncomb"] = _split.apply(
        lambda row: null_aplicator(int(row.name), _split),
        axis=1,
    )

In [11]:
# Fit the label encoder on the training targets only, then use that same
# mapping to add an integer "label" column to every split.
label_encoder = LabelEncoder()
label_encoder.fit(train_data['target'])

for _split in (train_data, valid_data, test_data):
    _split['label'] = label_encoder.transform(_split['target'])

# Wrap each pandas frame in a Hugging Face Dataset.
train_dataset, valid_dataset, test_dataset = [
    Dataset.from_pandas(_frame) for _frame in (train_data, valid_data, test_data)
]

def preprocess_function(examples):
    """Tokenize the ``comb`` prompts of a batch and carry the labels along.

    Args:
        examples: Batch mapping passed in by ``Dataset.map(..., batched=True)``;
            its ``'comb'`` and ``'label'`` entries are read.

    Returns:
        The tokenizer output with an added ``'label'`` entry.
    """
    # No padding and no tensor conversion at this stage: the Trainer is given a
    # DataCollatorWithPadding, which pads each training batch to its own longest
    # sequence. Padding here would instead stretch every example to the longest
    # sequence of the whole map batch and waste compute on pad tokens.
    tokenized_output = tokenizer(
        examples['comb'],
        truncation=True,
    )
    tokenized_output['label'] = examples['label']
    return tokenized_output

print("Tokenizing datasets...")
# Run the batched tokenization over the three splits, in train/valid/test order.
tokenized_train_dataset, tokenized_valid_dataset, tokenized_test_dataset = [
    _split.map(preprocess_function, batched=True)
    for _split in (train_dataset, valid_dataset, test_dataset)
]
print("Datasets tokenized successfully.")

# Expose only the model inputs and the label, returned as torch tensors.
for _tokenized in (tokenized_train_dataset, tokenized_valid_dataset, tokenized_test_dataset):
    _tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

Tokenizing datasets...


Map:   0%|          | 0/9989 [00:00<?, ? examples/s]

Map:   0%|          | 0/1109 [00:00<?, ? examples/s]

Map:   0%|          | 0/2610 [00:00<?, ? examples/s]

Datasets tokenized successfully.


In [12]:
# Stack all three splits and count the distinct target values across them
# (missing values, if any, are counted as their own value).
df = pd.concat([train_data, test_data, valid_data])
num_classes = df['target'].nunique(dropna=False)

In [13]:
from transformers import LlamaForSequenceClassification

# Load the checkpoint with a sequence-classification head sized to the number
# of target classes. The head (`score.weight`) is not part of the checkpoint and
# is freshly initialised, as the load warning below reports.
model = LlamaForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)

# Give the model config the same padding token id as the tokenizer.
# NOTE(review): presumably required so the classifier can handle padded batches — confirm.
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
def preprocess_function(examples):
    """Tokenize the raw ``input`` text of a batch and carry the labels along.

    NOTE(review): this redefines ``preprocess_function`` from an earlier cell,
    which tokenized the ``comb`` context prompts. From this cell on the
    datasets are tokenized from the plain ``input`` column, so the context
    prompts are not what the model is trained on — confirm this is intended.

    Args:
        examples: Batch mapping passed in by ``Dataset.map(..., batched=True)``;
            its ``'input'`` and ``'label'`` entries are read.

    Returns:
        The tokenizer output with an added ``'label'`` entry.
    """
    # No padding and no tensor conversion at this stage: the Trainer is given a
    # DataCollatorWithPadding, which pads each training batch to its own longest
    # sequence. Padding here would instead stretch every example to the longest
    # sequence of the whole map batch and waste compute on pad tokens.
    tokenized_output = tokenizer(
        examples['input'],
        truncation=True,
    )
    tokenized_output['label'] = examples['label']
    return tokenized_output

print("Tokenizing datasets...")
# Re-run the batched tokenization over the three splits (train/valid/test order),
# replacing the previously tokenized datasets.
tokenized_train_dataset, tokenized_valid_dataset, tokenized_test_dataset = [
    _split.map(preprocess_function, batched=True)
    for _split in (train_dataset, valid_dataset, test_dataset)
]
print("Datasets tokenized successfully.")

# Expose only the model inputs and the label, returned as torch tensors.
for _tokenized in (tokenized_train_dataset, tokenized_valid_dataset, tokenized_test_dataset):
    _tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

Tokenizing datasets...


Map:   0%|          | 0/9989 [00:00<?, ? examples/s]

Map:   0%|          | 0/1109 [00:00<?, ? examples/s]

Map:   0%|          | 0/2610 [00:00<?, ? examples/s]

Datasets tokenized successfully.


In [15]:
# Echo the model so the notebook renders its module tree.
model

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
   

In [16]:
from peft import PromptTuningConfig, get_peft_model, TaskType

# Number of virtual tokens used by the prompt-tuning configuration.
NUM_VIRTUAL_TOKENS = 20

# Prompt-tuning configuration for a sequence-classification task.
prompt_config = PromptTuningConfig(
    num_virtual_tokens=NUM_VIRTUAL_TOKENS,
    task_type=TaskType.SEQ_CLS,
)

# Wrap the classification model with the prompt-tuning configuration.
peft_model = get_peft_model(model, prompt_config)

In [17]:
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Collator passed to the Trainer below; it is built from the same tokenizer
# that was used to preprocess the datasets.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(eval_pred):
    """Compute accuracy and weighted/macro F1 from an evaluation prediction.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax over the last axis of ``logits``.

    Returns:
        Dict with the keys ``'accuracy'``, ``'f1_weighted'`` and ``'f1_macro'``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    metrics = {'accuracy': accuracy_score(labels, predicted_classes)}
    for averaging in ('weighted', 'macro'):
        metrics[f'f1_{averaging}'] = f1_score(labels, predicted_classes, average=averaging)
    return metrics


# Training configuration: evaluate once per epoch, 3 epochs, batch size 16 for
# both training and evaluation; checkpoints go to ./Prompt-Tuning, logs to ./logs.
training_args = TrainingArguments(
    output_dir='./Prompt-Tuning',
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # The deprecated `no_cuda=False` flag was dropped; it only restated the
    # default, so a GPU is still used when one is available.
    logging_dir='./logs',
)

# Gather everything the Trainer needs: the PEFT-wrapped model, the training
# arguments, the tokenized train/validation splits, the collator and the
# metric function.
_trainer_kwargs = {
    "model": peft_model,
    "args": training_args,
    "train_dataset": tokenized_train_dataset,
    "eval_dataset": tokenized_valid_dataset,
    "data_collator": data_collator,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**_trainer_kwargs)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [18]:
!pip install wandb

import wandb

wandb.login(key="363c88e3d300dc7d04a703b840d8538e57c617c4")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting wandb
  Downloading wandb-0.18.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.0/16.0 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting sentry-sdk>=2.0.0
  Downloading sentry_sdk-2.17.0-py2.py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.5/314.5 KB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting gitpython!=3.1.29,>=1.0.0
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 KB[0m [31m2.7 

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/iiitd/.netrc


True

In [19]:
# Run the training loop configured on `trainer`, bracketed by start/finish
# messages; the per-epoch metrics table is rendered by the Trainer itself.
print("Training started....")
trainer.train()
print("Training complete.")

Training started....




Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted,F1 Macro
1,No log,1.843674,0.246168,0.203297,0.092815
2,1.603100,1.780713,0.247971,0.204399,0.093758
3,1.603100,1.734487,0.400361,0.304829,0.12918




Training complete.


In [None]:
# Evaluate the trained model on the held-out test split and keep the metrics dict.
test_results = trainer.evaluate(eval_dataset=tokenized_test_dataset)

In [21]:
# Print every test metric as "name : value", one per line.
for metric_name, metric_value in test_results.items():
    print(f'{metric_name} : {metric_value}')

eval_loss : 1.5948773622512817
eval_accuracy : 0.42298850574712643
eval_f1_weighted : 0.322548035719598
eval_f1_macro : 0.12018087486292772
eval_runtime : 41.9542
eval_samples_per_second : 62.211
eval_steps_per_second : 1.955
epoch : 3.0


In [23]:
# Save the PEFT-wrapped model that the Trainer actually optimised. Saving the
# bare `model` would write the base network instead of the trained
# prompt-tuning adapter, so the learned virtual tokens would be lost.
peft_model.save_pretrained('./Prompt-Tuning/model')
# Keep the tokenizer (including the pad-token assignment) next to the adapter.
tokenizer.save_pretrained('./Prompt-Tuning/tokenizer')

('./Prompt-Tuning/tokenizer/tokenizer_config.json',
 './Prompt-Tuning/tokenizer/special_tokens_map.json',
 './Prompt-Tuning/tokenizer/tokenizer.json')