In [121]:
from datasets import load_dataset

# Load the "imdb" dataset; the result is a DatasetDict with train, test and
# unsupervised splits (see the summary printed below).
dataset = load_dataset("imdb")
# Bare last expression so the notebook displays the split summary.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [122]:
# Plain-text print of the same DatasetDict summary shown by the previous cell.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [123]:
# Inspect the first training example: a dict holding the review 'text' and its 'label'.
dataset['train'][0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [124]:
def clean_text(sample):
    """Trim leading and trailing whitespace from a sample's "text" entry.

    Args:
        sample: Mapping with a string stored under the "text" key.

    Returns:
        The same ``sample`` object, with its "text" entry replaced by the
        stripped string (the mapping is modified in place).
    """
    trimmed = sample["text"].strip()
    sample["text"] = trimmed
    return sample
# Run clean_text over the dataset and rebind the result to `dataset`.
# Note: this happens before the "unsupervised" split is dropped in the next
# cell, so that split is processed here as well.
dataset=dataset.map(clean_text)

In [125]:
# Drop the "unsupervised" split; only the train and test splits are used below.
# Passing a default keeps the cell safe to re-run: once the key is gone a second
# pop returns None instead of raising KeyError.
dataset.pop("unsupervised", None)

Dataset({
    features: ['text', 'label'],
    num_rows: 50000
})

In [126]:
import collections

# Count how many training examples carry each label value (class-balance check).
labels = dataset['train']['label']
collections.Counter(labels)

Counter({0: 12500, 1: 12500})

In [127]:
from transformers import AutoTokenizer
# Tokenizer for the same 'distilbert-base-uncased' checkpoint the model is loaded from below.
tokenizer=AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [128]:
MAX_LEN = 256  # sequence length every review is truncated / padded to


def tokenization_batch(batch):
    """Tokenize a batch of reviews into fixed-length sequences.

    Args:
        batch: Mapping whose "text" entry holds the raw review strings.

    Returns:
        The result of calling the notebook-level ``tokenizer`` on those texts
        with truncation enabled and padding to exactly ``MAX_LEN`` tokens.
    """
    texts = batch["text"]
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_LEN,
    }
    return tokenizer(texts, **encoding_options)

In [129]:
# Tokenize the dataset in batches (batched=True hands tokenization_batch many
# rows at once) and drop the raw "text" column from the result.
tokenized_dataset=dataset.map(
    tokenization_batch,
    batched=True,
    remove_columns=["text"]
)

In [130]:
# Have indexing return torch tensors for the three listed columns
# (the sample printed in the next cell shows the effect).
tokenized_dataset.set_format(
    type="torch",
    columns=["input_ids","attention_mask","label"]
)

In [131]:
# Sanity check: the first tokenized training example, now returned as tensors.
tokenized_dataset["train"][0]


{'label': tensor(0),
 'input_ids': tensor([  101,  1045, 12524,  1045,  2572,  8025,  1011,  3756,  2013,  2026,
          2678,  3573,  2138,  1997,  2035,  1996,  6704,  2008,  5129,  2009,
          2043,  2009,  2001,  2034,  2207,  1999,  3476,  1012,  1045,  2036,
          2657,  2008,  2012,  2034,  2009,  2001,  8243,  2011,  1057,  1012,
          1055,  1012,  8205,  2065,  2009,  2412,  2699,  2000,  4607,  2023,
          2406,  1010,  3568,  2108,  1037,  5470,  1997,  3152,  2641,  1000,
          6801,  1000,  1045,  2428,  2018,  2000,  2156,  2023,  2005,  2870,
          1012,  1026,  7987,  1013,  1028,  1026,  7987,  1013,  1028,  1996,
          5436,  2003,  8857,  2105,  1037,  2402,  4467,  3689,  3076,  2315,
         14229,  2040,  4122,  2000,  4553,  2673,  2016,  2064,  2055,  2166,
          1012,  1999,  3327,  2016,  4122,  2000,  3579,  2014,  3086,  2015,
          2000,  2437,  2070,  4066,  1997,  4516,  2006,  2054,  1996,  2779,
         25430, 14

In [132]:
from torch.utils.data import DataLoader
# Number of examples per batch; shared by the train and test DataLoaders below.
BATCH_SIZE=16

In [133]:
# Training loader: shuffle=True, so batches are drawn in a new order each pass.
train_loader=DataLoader(
    tokenized_dataset["train"],
    batch_size=BATCH_SIZE,
    shuffle=True
)

# Test loader: shuffle=False keeps the dataset order for evaluation.
test_loader=DataLoader(
    tokenized_dataset["test"],
    batch_size=BATCH_SIZE,
    shuffle=False
)

In [134]:
# Pull a single batch from the training loader and print the shape of each entry.
batch=next(iter(train_loader))
for key,value in batch.items():
    print(key,value.shape)

label torch.Size([16])
input_ids torch.Size([16, 256])
attention_mask torch.Size([16, 256])


In [135]:
import torch
from transformers import AutoModelForSequenceClassification

In [136]:
# Load DistilBERT with a sequence-classification head producing num_labels=2 logits.
# The head weights are newly initialized (see the warning below), so the model
# has to be fine-tuned before its predictions mean anything.
model=AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [137]:
# Use CUDA when torch reports it as available, otherwise fall back to the CPU,
# then move the model onto that device.
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using Device: {device}")

Using Device: cuda


In [138]:
# Smoke test before training: push one batch through the model on the target device.
batch=next(iter(train_loader))
batch={k:v.to(device) for k,v in batch.items()}
# Passing labels makes the model return a loss alongside the logits.
outputs=model(
    input_ids=batch["input_ids"],
    attention_mask=batch["attention_mask"],
    labels=batch["label"]
)

# Expect a scalar loss and logits of shape (batch size, number of labels).
print(outputs.loss,outputs.logits.shape)

tensor(0.6773, device='cuda:0', grad_fn=<NllLossBackward0>) torch.Size([16, 2])


In [139]:
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
# Base learning rate handed to AdamW; the scheduler created below adjusts it per step.
LEARNING_RATE=2e-5
optimizer=AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
)

In [140]:
# Number of passes over the training set.
EPOCHS=3
# The training loop steps the scheduler once per batch, so the total number of
# scheduler steps is batches-per-epoch times epochs.
total_steps=len(train_loader)*EPOCHS

In [141]:
# Quick check of the type and value of total_steps before building the scheduler.
print(type(total_steps),total_steps)

<class 'int'> 4689


In [142]:
# Learning-rate schedule with warmup: the first 10% of all steps are warmup steps,
# and the schedule spans total_steps steps overall.
scheduler=get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1*total_steps),
    num_training_steps=total_steps
)
print("Total training steps:", total_steps)


Total training steps: 4689


In [None]:
from tqdm import tqdm

In [144]:
def _run_training_epoch(model, train_loader, optimizer, scheduler, device, max_grad_norm):
    """Run one optimisation pass over ``train_loader`` and return the mean batch loss."""
    model.train()
    running_loss = 0.0

    for batch in tqdm(train_loader, desc="Training"):
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["label"],
        )
        loss = outputs.loss
        running_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()

        # Cap the global gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        optimizer.step()
        # The scheduler advances once per batch, not once per epoch.
        scheduler.step()

    return running_loss / len(train_loader)


def _evaluate(model, eval_loader, device):
    """Return ``(mean batch loss, accuracy)`` of ``model`` over ``eval_loader`` without gradients."""
    model.eval()
    running_loss = 0.0
    correct_preds = 0
    total_preds = 0

    with torch.no_grad():
        for batch in tqdm(eval_loader, desc="Validation"):
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                labels=batch["label"],
            )
            running_loss += outputs.loss.item()

            # Predicted class = index of the largest logit in each row.
            predictions = torch.argmax(outputs.logits, dim=1)
            correct_preds += (predictions == batch["label"]).sum().item()
            total_preds += batch["label"].size(0)

    return running_loss / len(eval_loader), correct_preds / total_preds


def train_model(model, train_loader, test_loader, optimizer, scheduler, device, epochs,
                save_path="best_model.pt", max_grad_norm=1.0):
    """Fine-tune ``model`` and checkpoint the weights with the best evaluation accuracy.

    Each epoch runs one training pass over ``train_loader`` followed by one
    evaluation pass over ``test_loader``. Whenever the evaluation accuracy
    beats the best seen so far, ``model.state_dict()`` is written to
    ``save_path``; the first epoch therefore always produces a checkpoint.

    Note that ``test_loader`` drives checkpoint selection, so the accuracy
    reported on it is not an unbiased estimate for the saved model.

    Relies on ``torch`` and ``tqdm`` being available in the notebook namespace.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments, returning an object exposing
            ``.loss`` and ``.logits``.
        train_loader: Sized iterable of batches (dicts of tensors with the
            keys "input_ids", "attention_mask" and "label").
        test_loader: Sized iterable of evaluation batches, same layout.
        optimizer: Optimizer over ``model``'s parameters.
        scheduler: Learning-rate scheduler, stepped once per training batch.
        device: Device every batch tensor is moved to.
        epochs: Number of train + evaluate rounds.
        save_path: Destination of the best checkpoint.
        max_grad_norm: Gradient-norm clipping threshold.

    Returns:
        A list with one dict per epoch holding "epoch" (1-based),
        "train_loss", "val_loss" and "val_accuracy".

    Raises:
        ValueError: If ``epochs`` is positive and either loader is empty.
    """
    # Fail fast: an empty loader would otherwise surface as a ZeroDivisionError,
    # and for the evaluation loader only after a full training epoch.
    if epochs > 0:
        for name, loader in (("train_loader", train_loader), ("test_loader", test_loader)):
            if len(loader) == 0:
                raise ValueError(f"{name} is empty; nothing to train or evaluate on")

    # Start below any reachable accuracy so that even a 0.0 first epoch is saved.
    best_accuracy = float("-inf")
    history = []

    for epoch in range(epochs):
        print(f"\nEpoch {epoch + 1}/{epochs}")

        # ---- TRAINING ----
        avg_train_loss = _run_training_epoch(
            model, train_loader, optimizer, scheduler, device, max_grad_norm
        )
        print(f"Average training loss: {avg_train_loss:.4f}")

        # ---- VALIDATION ----
        avg_eval_loss, accuracy = _evaluate(model, test_loader, device)
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            torch.save(model.state_dict(), save_path)
            print("✅ Best model saved")

        print(f"Validation loss: {avg_eval_loss:.4f}")
        print(f"Validation accuracy: {accuracy:.4f}")

        history.append({
            "epoch": epoch + 1,
            "train_loss": avg_train_loss,
            "val_loss": avg_eval_loss,
            "val_accuracy": accuracy,
        })

    return history


In [145]:
# Fine-tune for EPOCHS epochs. The loader built from the test split is passed as
# the evaluation set, so the accuracy printed per epoch is also the one
# train_model uses to decide which checkpoint to keep.
train_model(
    model=model,
    train_loader=train_loader,
    test_loader=test_loader,
    optimizer=optimizer,
    scheduler=scheduler,
    device=device,
    epochs=EPOCHS
)


Epoch 1/3


Training: 100%|██████████| 1563/1563 [08:17<00:00,  3.14it/s]


Average training loss: 0.3255


Validation: 100%|██████████| 1563/1563 [02:28<00:00, 10.54it/s]


✅ Best model saved
Validation loss: 0.2268
Validation accuracy: 0.9084

Epoch 2/3


Training: 100%|██████████| 1563/1563 [08:19<00:00,  3.13it/s]


Average training loss: 0.1791


Validation: 100%|██████████| 1563/1563 [02:28<00:00, 10.54it/s]


✅ Best model saved
Validation loss: 0.2652
Validation accuracy: 0.9144

Epoch 3/3


Training: 100%|██████████| 1563/1563 [08:19<00:00,  3.13it/s]


Average training loss: 0.1040


Validation: 100%|██████████| 1563/1563 [02:28<00:00, 10.54it/s]

Validation loss: 0.3454
Validation accuracy: 0.9134





In [147]:
from transformers import AutoModelForSequenceClassification
# Build a fresh model with the same architecture; the fine-tuned weights are
# loaded into it from the checkpoint in the next cell.
model=AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [149]:
import torch

# Restore the checkpoint written by train_model. weights_only=True limits
# unpickling to tensors and plain containers, which is all a state_dict holds,
# so a tampered file cannot execute arbitrary code on load. It also avoids the
# warning torch.load emits when the argument is left at its implicit default.
model.load_state_dict(
    torch.load('best_model.pt', map_location=device, weights_only=True)
)
model.to(device)
# Switch to inference mode (disables dropout); as the last expression this also
# displays the module tree.
model.eval()

  model.load_state_dict(torch.load('best_model.pt',map_location=device))


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [152]:
text = "A very well made movie. Loved every minute of it. It was a roller coaster ride the acting was good the direction was good and story everything was perfect ill give it a solid 8.5/10"

# Truncate to the same MAX_LEN used when tokenizing the training data, rather
# than repeating the literal, so inference cannot drift from the training setup.
inputs = tokenizer(
    text,
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=MAX_LEN
)

# The encoded tensors must live on the same device as the model.
inputs = {k: v.to(device) for k, v in inputs.items()}

# Inference only: no gradients needed.
with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
# Index of the largest logit is the predicted class for the single input row.
prediction = torch.argmax(logits, dim=1).item()

# Class index 1 is reported as Positive, anything else as Negative.
print("Prediction:", "Positive" if prediction == 1 else "Negative")


Prediction: Positive


In [153]:
# Write the tokenizer files into the local "tokenizer" directory
# (the created paths are listed in the output below).
tokenizer.save_pretrained("tokenizer")

('tokenizer\\tokenizer_config.json',
 'tokenizer\\special_tokens_map.json',
 'tokenizer\\vocab.txt',
 'tokenizer\\added_tokens.json',
 'tokenizer\\tokenizer.json')