# Importing the requirements that is load_dataset from the dataset and the AutoTokenizer and AutoModelForSequenceClassification 

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer , AutoModelForSequenceClassification

## I will load the imdb dataset which contains the reviews and labels about the review as it is negative or positive

In [3]:
dataset = load_dataset("imdb")

Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [7]:
dataset["train"]["text"][0]

'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, ev

In [9]:
dataset["train"]["label"][0]

0

In [12]:
import pandas as pd
df = pd.DataFrame(dataset["train"])
df.sample(5)

Unnamed: 0,text,label
11327,Every once in a while I will rent an action/ad...,0
16314,"First off, this movie was a wild ride the whol...",1
10478,"At the beginning of the film, you might double...",0
20163,"Saw this my last day at the festival, and was ...",1
239,"There is no way to describe how really, really...",0


In [19]:
df["label"].nunique()

2

In [23]:
dataset.shape

{'train': (25000, 2), 'test': (25000, 2), 'unsupervised': (50000, 2)}

### I will use the bertweet-base-sentiment-analysis which we will further tune on our imdb dataset 

In [22]:
checkpoint = "finiteautomata/bertweet-base-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2 , ignore_mismatched_sizes = True)

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at finiteautomata/bertweet-base-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Tokenizing the reviews further to use it in our model 

I will only train on the 1000 rows of the dataset because whole dataset will take long time as I am training it on my laptop , I don't have any gpu

In [68]:
def tokenize(example):
    return tokenizer(
        example["text"],               
        truncation=True,              
        padding=False,               
        return_attention_mask=True,  
        return_token_type_ids=False   
    )


train_data = dataset["train"].select(range(1000)).map(tokenize, batched=True)
test_data = dataset["test"].select(range(1000)).map(tokenize, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Here I am using DadaCollatorWithPadding  so that length of the small reviews should be padded according to the larget review in the batch 

In [69]:
from transformers import DataCollatorWithPadding

collator = DataCollatorWithPadding(tokenizer , padding=True , return_tensors="pt")


No we will decide the parameters and their values  to tune our model 

In [70]:
from transformers import TrainingArguments , Trainer

train_arg = TrainingArguments(
    output_dir = "./result",
    evaluation_strategy = "epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs = 2,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    load_best_model_at_end = True,
    weight_decay = 0.01,
    logging_dir = "./log",
    logging_steps =10,
    metric_for_best_model = "eval_loss",
)


In [71]:
trainer = Trainer(
    model=model,
    args=train_arg,
    train_dataset=train_data,
    eval_dataset=test_data,
    tokenizer=tokenizer,
    data_collator=collator,  # Use DataCollatorWithPadding
)


In [72]:
trainer.train()

  0%|          | 0/126 [00:00<?, ?it/s]

{'loss': 0.0023, 'grad_norm': 0.024123752489686012, 'learning_rate': 1.8412698412698415e-05, 'epoch': 0.16}
{'loss': 0.0012, 'grad_norm': 0.014975331723690033, 'learning_rate': 1.6825396825396828e-05, 'epoch': 0.32}
{'loss': 0.0007, 'grad_norm': 0.01013997383415699, 'learning_rate': 1.523809523809524e-05, 'epoch': 0.48}
{'loss': 0.0005, 'grad_norm': 0.007157672196626663, 'learning_rate': 1.3650793650793652e-05, 'epoch': 0.63}
{'loss': 0.0004, 'grad_norm': 0.0061019789427518845, 'learning_rate': 1.2063492063492064e-05, 'epoch': 0.79}
{'loss': 0.0003, 'grad_norm': 0.005015259142965078, 'learning_rate': 1.0476190476190477e-05, 'epoch': 0.95}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.00021320370433386415, 'eval_runtime': 122.704, 'eval_samples_per_second': 8.15, 'eval_steps_per_second': 0.513, 'epoch': 1.0}
{'loss': 0.0003, 'grad_norm': 0.0046694315969944, 'learning_rate': 8.888888888888888e-06, 'epoch': 1.11}
{'loss': 0.0002, 'grad_norm': 0.004160549957305193, 'learning_rate': 7.301587301587301e-06, 'epoch': 1.27}
{'loss': 0.0002, 'grad_norm': 0.004182990174740553, 'learning_rate': 5.7142857142857145e-06, 'epoch': 1.43}
{'loss': 0.0002, 'grad_norm': 0.003618058981373906, 'learning_rate': 4.126984126984127e-06, 'epoch': 1.59}
{'loss': 0.0002, 'grad_norm': 0.0038140094839036465, 'learning_rate': 2.53968253968254e-06, 'epoch': 1.75}
{'loss': 0.0002, 'grad_norm': 0.003393556922674179, 'learning_rate': 9.523809523809525e-07, 'epoch': 1.9}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.00014690835087094456, 'eval_runtime': 122.5268, 'eval_samples_per_second': 8.161, 'eval_steps_per_second': 0.514, 'epoch': 2.0}
{'train_runtime': 1059.0607, 'train_samples_per_second': 1.888, 'train_steps_per_second': 0.119, 'train_loss': 0.0005403225724067953, 'epoch': 2.0}


TrainOutput(global_step=126, training_loss=0.0005403225724067953, metrics={'train_runtime': 1059.0607, 'train_samples_per_second': 1.888, 'train_steps_per_second': 0.119, 'total_flos': 131555527680000.0, 'train_loss': 0.0005403225724067953, 'epoch': 2.0})

In [73]:
trainer.evaluate()

  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.00014690835087094456,
 'eval_runtime': 120.9809,
 'eval_samples_per_second': 8.266,
 'eval_steps_per_second': 0.521,
 'epoch': 2.0}

In [74]:
import numpy as np
predictions = trainer.predict(test_data)
preds = np.argmax(predictions.predictions, axis=-1)



  0%|          | 0/63 [00:00<?, ?it/s]

In [78]:
from sklearn.metrics import accuracy_score

accuracy_score(test_data["label"] , preds)

1.0

In [92]:
input = tokenizer("Very Great" , truncation=True , return_tensors="pt")

output = model(**input)

In [93]:
import torch
logits = output.logits

prob = torch.nn.functional.softmax(logits , dim = -1)


In [94]:
torch.argmax(prob , dim = -1)

tensor([0])