In [None]:
# @title Installation
# Installs the fine-tuning stack used by the rest of the notebook.
# The order of the pip commands and the --no-deps flags is deliberate; keep it as is.
import re
# Keep only the leading run of digits/dots from the installed torch version string
# (e.g. "2.8.0" out of a longer version such as one carrying a local build suffix).
import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
# Choose the xformers pin from the detected torch version:
# 0.0.32.post2 for torch 2.8.0, 0.0.29.post3 for anything else.
xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
# Install these packages without letting pip resolve their own dependencies.
!pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
# Regular install (dependencies resolved) with explicit version bounds for datasets and huggingface_hub.
!pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
!pip install --no-deps unsloth
# Exact version pins for transformers and trl.
# NOTE(review): trl is installed twice (unpinned above, pinned here); the last
# install presumably decides which version ends up in the environment.
!pip install transformers==4.55.4
!pip install --no-deps trl==0.22.2

In [None]:
# @title Importing all needed libraries
from unsloth import FastLanguageModel
from transformers import AutoModelForSequenceClassification
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from transformers import TrainingArguments, Trainer
from unsloth import is_bfloat16_supported

In [None]:
# @title Initializing main part
# IMDb is a binary sentiment task (negative / positive), hence num_labels=2.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="distilbert/distilbert-base-uncased",
    auto_model=AutoModelForSequenceClassification,
    max_seq_length=512,  # Max sequence length for DistilBert
    dtype=None,  # let the loader pick the dtype
    num_labels=2,
    full_finetuning=True,
    # Full fine-tuning and 4-bit (QLoRA-style) loading are mutually exclusive
    # training modes; requesting both is contradictory, so 4-bit is switched off
    # explicitly to match full_finetuning=True.
    load_in_4bit=False,
)

# Add LoRA adapters
# NOTE(review): full_finetuning=True is requested above, so Unsloth is expected
# to treat this call as a no-op - confirm. If LoRA is what is actually wanted,
# load with full_finetuning=False instead and make sure the classification head
# stays trainable.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Low-rank adaptation rank
    target_modules=["q_lin", "v_lin"],  # For DistilBert
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing=True,
)


In [None]:
# @title Supporting Functions and Dataset

# Load simple dataset
# The IMDb train split is stored ordered by label, so a leading slice such as
# "train[:1000]" contains reviews of a single class only and the classifier
# never sees the other one. Shuffle first (seeded, so the subset is reproducible)
# and then keep 1000 examples.
dataset = load_dataset("imdb", split="train").shuffle(seed=3407).select(range(1000))

# Format the dataset correctly
def format_data(example):
    """Return the review text plus a `labels` field copied from `label`.

    The original `label` column is left in place by this function.
    """
    return {"text": example["text"], "labels": example["label"]}

dataset = dataset.map(format_data)

# Define metrics
def compute_metrics(eval_pred):
    """Compute accuracy from a `(logits, labels)` evaluation pair.

    The predicted class of each row is the index of its largest logit along
    the last axis. Returns a dict with a single "accuracy" entry.
    """
    logits, labels = eval_pred
    predicted_classes = logits.argmax(axis=-1)
    accuracy = accuracy_score(labels, predicted_classes)
    return {"accuracy": accuracy}

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of reviews read from `examples["text"]`.

    Every sequence is truncated and padded to exactly 512 tokens.
    """
    tokenizer_options = dict(padding="max_length", truncation=True, max_length=512)
    return tokenizer(examples["text"], **tokenizer_options)

# Tokenize in batches, then drop the raw text and the original `label` column
# (the `labels` column is kept).
tokenized_dataset = dataset.map(tokenize_function, batched=True).remove_columns(
    ["text", "label"]
)


In [5]:
# @title Config Trainer
# Decide the mixed-precision mode once: bf16 when supported, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    # Bookkeeping
    output_dir="outputs",
    report_to="none",
    seed=3407,
    logging_steps=1,
    # Batching and schedule
    num_train_epochs=1,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=1,
    warmup_steps=5,
    lr_scheduler_type="linear",
    # Optimizer
    optim="adamw_8bit",
    learning_rate=5e-5,
    weight_decay=0.01,
    # Precision
    bf16=use_bf16,
    fp16=not use_bf16,
    # NOTE(review): no eval strategy and no eval dataset are configured in this
    # cell, so this setting presumably never triggers an evaluation - confirm.
    eval_steps=0.10,
)

# Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    compute_metrics=compute_metrics,
)



In [None]:
# @title Starting training
# Train the model
# Runs training on the `trainer` configured in the previous cell and keeps
# whatever `trainer.train()` returns in `trainer_stats` for later inspection.
trainer_stats = trainer.train()


In [None]:
# @title Quick Test
from transformers import pipeline

sentence1 = """
This movie was not that great at all.


"""

# This cell runs right after training, when the model may still be in training
# mode (dropout active). Switch to evaluation mode so the prediction below is
# deterministic.
model.eval()

# Wrap the fine-tuned model and its tokenizer in a sentiment-analysis pipeline.
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Last expression of the cell: the pipeline result is displayed as cell output.
classifier(sentence1)

In [None]:
# @title Deep Testing (Evaluating)
from datasets import load_dataset
import torch

# Load the test split of the IMDb dataset
test_dataset = load_dataset("imdb", split="test")

# Apply exactly the same preprocessing as for training by reusing
# `format_data` and `tokenize_function` from the "Supporting Functions and
# Dataset" cell. They are deliberately not re-defined here: a second copy
# would silently shadow the first and the two could drift apart.
test_dataset = test_dataset.map(format_data)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = tokenized_test_dataset.remove_columns(["text", "label"])

# Set up a new Trainer for evaluation
eval_trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./evaluation_results",
        report_to="none",
    ),
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
)

# Run the evaluation and print the results
eval_results = eval_trainer.evaluate()
print(f"Evaluation results: {eval_results}")


In [None]:
# @title Test by reviews
# Create a text review to test
positive_review = "This movie was an absolute masterpiece! The acting, direction, and story were perfect."
negative_review = "I was incredibly disappointed by this movie. The plot was boring and the ending made no sense."


def predict_sentiment(review):
    """Classify a single review with the fine-tuned model.

    Args:
        review: Raw review text.

    Returns:
        A `(predicted_class, confidence)` tuple: the index of the largest
        logit and the largest softmax probability over the logits.
    """
    # Tokenize the input text
    inputs = tokenizer(review, return_tensors="pt", padding=True, truncation=True)

    # Send the inputs to wherever the model already lives instead of moving
    # the model itself, so the model is never relocated by this cell.
    model_device = next(model.parameters()).device
    inputs = {name: tensor.to(model_device) for name, tensor in inputs.items()}

    # Inference only: no gradients needed
    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_class = torch.argmax(logits, dim=-1).item()
    confidence = logits.softmax(dim=-1).max().item()
    return predicted_class, confidence


# Ensure the model is in evaluation mode
model.eval()

# Interpret the results
# The logits correspond to the two labels: 0 (negative) and 1 (positive)
for position, review in enumerate((positive_review, negative_review)):
    prediction, confidence = predict_sentiment(review)
    # A blank line separates each report from the previous one.
    separator = "\n" if position else ""
    print(f"{separator}Review: '{review}'")
    print(f"Prediction: {'Positive' if prediction == 1 else 'Negative'}")
    print(f"Confidence: {confidence}")


In [None]:
# @title Save local
# Write the model and the tokenizer into the same local directory.
local_dir = "sentim_movies"
model.save_pretrained(local_dir)
tokenizer.save_pretrained(local_dir)

In [None]:
# @title login hf
# Calls huggingface_hub's notebook login helper.
# NOTE(review): presumably required so that the push_to_hub calls in the next
# cell are authenticated - confirm the account has write access to the target repo.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# @title Save to huggingface
# Upload the model and the tokenizer to the same Hub repository.
hub_repo_id = "aired/sentim_movies"
model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)