In [None]:
# Install necessary libraries
!pip install transformers datasets huggingface_hub accelerate

import os
import torch
from huggingface_hub import login
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# Disable WandB logging by setting environment variables
os.environ["WANDB_DISABLED"] = "true"
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"  # Disable advisory warnings for transformers

# Authenticate Hugging Face account (use your token here)
os.environ["HF_HOME"] = "/root/.cache/huggingface"  # Set Hugging Face cache location
token = "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"  # Replace with your Hugging Face token
login(token)

# Check if CUDA (GPU) is available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the Llama model and tokenizer
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

# Add a padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))  # Resize embeddings to account for new token

# Check total model parameters
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params / 1e6:.2f} million")

# Load the SST-2 dataset for binary classification
dataset = load_dataset("glue", "sst2")

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["sentence"], padding="max_length", truncation=True, max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Set the dataset format for PyTorch
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Split dataset into training and testing subsets
train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(500))  # First 500 samples for training
test_dataset = tokenized_datasets["validation"].shuffle(seed=42).select(range(100))  # First 100 samples for testing

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  
    evaluation_strategy="epoch",  
    learning_rate=2e-5,  
    per_device_train_batch_size=1,  # Small batch size to avoid OOM errors
    per_device_eval_batch_size=1,   
    gradient_accumulation_steps=8,  # Accumulate gradients for larger effective batch size
    num_train_epochs=3,  # Reduce epochs for quicker training
    weight_decay=0.01,  
    seed=1,
    logging_dir="./logs",  
    logging_strategy="epoch",  
    run_name="llama3_finetuning",  # Custom name for logging
    fp16=True  # Enable mixed precision training
)

# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)

# Fine-tune the model
print("Starting training...")
trainer.train()

# Evaluate the model
print("Evaluating the model...")
results = trainer.evaluate()

# Print evaluation results
print("Evaluation Results:")
print(results)

# Save the fine-tuned model
model_dir = "./fine-tuned-llama"
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)
print(f"Model and tokenizer saved to {model_dir}")

In [None]:
# Predictions
!pip install transformers datasets huggingface_hub accelerate

import os
import torch
from huggingface_hub import login
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import pandas as pd
from datasets import Dataset, load_dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer,
    EarlyStoppingCallback,
    DataCollatorWithPadding)


import numpy as np

import random


# Disable WandB logging by setting environment variables
os.environ["WANDB_DISABLED"] = "true"
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"  # Disable advisory warnings for transformers

# Authenticate Hugging Face account (use your token here)
os.environ["HF_HOME"] = "/root/.cache/huggingface"  # Set Hugging Face cache location
token = "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"  # Replace with your Hugging Face token
login(token)

# Check if CUDA (GPU) is available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the Llama model and tokenizer
model_name = "/kaggle/input/llama-sst-fine-tuned-model/fine-tuned-llama" #load the pre-trained model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

# Add a padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))  # Resize embeddings to account for new token

# Check total model parameters
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params / 1e6:.2f} million")




In [None]:
# Load the SST-2 dataset for binary classification
dataset = load_dataset("glue", "sst2", split='train')
dataset

In [None]:
dataset=dataset.train_test_split(.2)
# dataset['test']['label']

In [None]:
def predict(input_text):
    """
    Predicts the sentiment label for a given text input.

    Args:
        input_text (str): The text to predict the sentiment for.

    Returns:
        float: The predicted probability of the text being positive sentiment.
    """
    inputs = tokenizer(input_text, return_tensors="pt").to("cuda")  # Convert to PyTorch tensors and move to GPU (if available)
    with torch.no_grad():
        outputs = model(**inputs).logits  # Get the model's output logits
    y_prob = torch.sigmoid(outputs).tolist()[0]  # Apply sigmoid activation and convert to list
    return np.round(y_prob, 5)  # Round the predicted probability to 5 decimal places
input

In [None]:
df_test = pd.DataFrame(dataset['test'])
df_test['prediction'] = df_test['sentence'].map(predict)
df_test['y_pred'] = df_test['prediction'].apply(lambda x: np.argmax(x, axis=0)) 

In [None]:

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["sentence"], padding="max_length", truncation=True, max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Set the dataset format for PyTorch
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Split dataset into training and testing subsets
train_dataset = tokenized_datasets["train"] #.shuffle(seed=42).select(range(500))  # First 500 samples for training
test_dataset = tokenized_datasets["test"] #.shuffle(seed=42).select(range(100))  # First 100 samples for testing

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Calculate metrics for multiclass setting
accuracy = accuracy_score(df_test['label'], df_test['y_pred'])
precision = precision_score(df_test['label'], df_test['y_pred'], average='weighted')
recall = recall_score(df_test['label'], df_test['y_pred'], average='weighted')
f1 = f1_score(df_test['label'], df_test['y_pred'], average='weighted')

# Print metrics
print(f"Model Metrics on Test Data:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


In [1]:
# Pre training
# !pip install transformers datasets huggingface_hub accelerate

import os
import torch
from huggingface_hub import login
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np  # Ensure numpy is imported

# Disable WandB logging by setting environment variables
os.environ["WANDB_DISABLED"] = "true"
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"  # Disable advisory warnings for transformers

# Authenticate Hugging Face account
os.environ["HF_HOME"] = "/root/.cache/huggingface"  # Set Hugging Face cache location
token = "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"  # Replace with your Hugging Face token
login(token)

# Check if CUDA (GPU) is available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the Llama model and tokenizer
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

# Add a padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))  # Resize embeddings to account for new token

# Check total model parameters
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params / 1e6:.2f} million")



The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful
Using device: cuda


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total parameters: 1235.82 million


In [2]:
# Load the SST-2 dataset for binary classification
dataset = load_dataset("glue", "sst2", split='train')
dataset = dataset.train_test_split(test_size=0.2)  # Split dataset into train/test

# Define the prediction function
def predict(input_text):
    """
    Predicts the sentiment label for a given text input.

    Args:
        input_text (str): The text to predict the sentiment for.

    Returns:
        float: The predicted probability of the text being positive sentiment.
    """
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(device)  # Use defined device
    with torch.no_grad():
        outputs = model(**inputs).logits  # Get the model's output logits
    y_prob = torch.softmax(outputs, dim=-1).tolist()[0]  # Use softmax for multi-class classification
    return np.round(y_prob, 5)  # Round the predicted probabilities to 5 decimal places

# Predict for the test set
df_test = pd.DataFrame(dataset['test'])
df_test['prediction'] = df_test['sentence'].map(predict)
df_test['y_pred'] = df_test['prediction'].apply(lambda x: np.argmax(x))  # Extract the predicted class
df_test['label'] = dataset['test']['label']  # Add ground truth labels

# Calculate metrics for the test set
accuracy = accuracy_score(df_test['label'], df_test['y_pred'])
precision = precision_score(df_test['label'], df_test['y_pred'], average='weighted')
recall = recall_score(df_test['label'], df_test['y_pred'], average='weighted')
f1 = f1_score(df_test['label'], df_test['y_pred'], average='weighted')

# Print metrics
print(f"Model Metrics on Test Data:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Display sample predictions
print(df_test.head())


Model Metrics on Test Data:
Accuracy: 0.4352
Precision: 0.7542
Recall: 0.4352
F1 Score: 0.2640
                                            sentence  label    idx  \
0  maintain both a level of sophisticated intrigu...      1  22735   
1                                   unsophisticated       0  44001   
2             mr. scorsese 's bravery and integrity       1  28822   
3                           spectacularly beautiful       1  45791   
4  , parker exposes the limitations of his skill ...      0  54951   

           prediction  y_pred  
0  [0.98533, 0.01467]       0  
1  [0.98213, 0.01787]       0  
2    [0.9626, 0.0374]       0  
3  [0.99514, 0.00486]       0  
4  [0.92242, 0.07758]       0  


In [3]:
# Post training
# !pip install transformers datasets huggingface_hub accelerate

import os
import torch
from huggingface_hub import login
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np  # Ensure numpy is imported

# Disable WandB logging by setting environment variables
os.environ["WANDB_DISABLED"] = "true"
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"  # Disable advisory warnings for transformers

# Authenticate Hugging Face account
os.environ["HF_HOME"] = "/root/.cache/huggingface"  # Set Hugging Face cache location
token = "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"  # Replace with your Hugging Face token
login(token)

# Check if CUDA (GPU) is available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the Llama model and tokenizer
model_name = "/kaggle/input/llama-sst-fine-tuned-model/fine-tuned-llama"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

# Add a padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))  # Resize embeddings to account for new token

# Check total model parameters
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params / 1e6:.2f} million")



The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful
Using device: cuda
Total parameters: 1235.82 million


In [6]:
# Load the SST-2 dataset for binary classification
dataset = load_dataset("glue", "sst2", split='train')
dataset = dataset.train_test_split(test_size=0.2)  # Split dataset into train/test

# Define the prediction function
def predict(input_text):
    """
    Predicts the sentiment label for a given text input.

    Args:
        input_text (str): The text to predict the sentiment for.

    Returns:
        float: The predicted probability of the text being positive sentiment.
    """
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(device)  # Use defined device
    with torch.no_grad():
        outputs = model(**inputs).logits  # Get the model's output logits
    y_prob = torch.softmax(outputs, dim=-1).tolist()[0]  # Use softmax for multi-class classification
    return np.round(y_prob, 5)  # Round the predicted probabilities to 5 decimal places

# Predict for the test set
df_test = pd.DataFrame(dataset['test'])
df_test['prediction'] = df_test['sentence'].map(predict)
df_test['y_pred'] = df_test['prediction'].apply(lambda x: np.argmax(x))  # Extract the predicted class
df_test['label'] = dataset['test']['label']  # Add ground truth labels

# Calculate metrics for the test set
accuracy = accuracy_score(df_test['label'], df_test['y_pred'])
precision = precision_score(df_test['label'], df_test['y_pred'], average='weighted')
recall = recall_score(df_test['label'], df_test['y_pred'], average='weighted')
f1 = f1_score(df_test['label'], df_test['y_pred'], average='weighted')

# Print metrics
print(f"Model Metrics on Test Data:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Display sample predictions
print(df_test.head())


Model Metrics on Test Data:
Accuracy: 0.8850
Precision: 0.8862
Recall: 0.8850
F1 Score: 0.8852
                                            sentence  label    idx  \
0  rivals the top japanese animations of recent v...      1  37334   
1  fondly remembered in the endlessly challenging...      1  40739   
2  a glorious spectacle like those d.w. griffith ...      1  44751   
3  begins to drag two-thirds through , when the m...      0  44536   
4      take as many drugs as the film 's characters       0  35696   

         prediction  y_pred  
0  [1e-05, 0.99999]       1  
1        [0.0, 1.0]       1  
2        [0.0, 1.0]       1  
3        [1.0, 0.0]       0  
4  [0.99997, 3e-05]       0  
