In [None]:
!pip install datasets
!pip install transformers

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments

import numpy as np
from datasets import load_metric

from transformers import Trainer



**Explore the IMDb data**

In [None]:
# Load the "imdb" dataset by name through `datasets.load_dataset`.
raw_datasets = load_dataset("imdb")

In [None]:
# Inspect which keys the loaded dataset object exposes.
raw_datasets.keys()

In [None]:
# Display the 'train' and 'test' entries together (shown as a tuple).
raw_datasets['train'], raw_datasets['test']

In [None]:
# First value of the 'text' column in the training split.
raw_datasets['train']['text'][0]

In [None]:
# First value of the 'label' column in the training split.
raw_datasets['train']['label'][0]

**Tokenize and create datasets**

In [None]:
# Tokenizer for the "bert-base-cased" checkpoint, the same checkpoint name the
# model cells in this notebook load.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
# Quick check: tokenize one sentence and display what the tokenizer returns.
tokenizer("Hello, this is one sentence!")

In [None]:
def tokenize_function(data):
    """Tokenize the ``"text"`` entry of ``data`` with the notebook's tokenizer.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``; whatever it returns is passed back unchanged.
    """
    texts = data["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

In [None]:
# Run tokenize_function over the dataset; batched=True makes `map` hand it
# batches of examples rather than one example at a time.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

In [None]:
# Display the tokenized dataset object.
tokenized_datasets


In [None]:
# Label of the first training example, read from the tokenized dataset.
tokenized_datasets['train']['label'][0]

In [None]:
# First 10 entries of 'input_ids' for the first training example.
tokenized_datasets["train"]['input_ids'][0][:10]

In [None]:
# Keep handles on the complete tokenized splits...
full_train_dataset = tokenized_datasets["train"]
full_eval_dataset = tokenized_datasets["test"]

# ...and derive 1000-example subsets from them (shuffled with a fixed seed)
# so that training and evaluation finish quickly.
small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(1000))
small_eval_dataset = full_eval_dataset.shuffle(seed=42).select(range(1000))


**Create training pipeline using pretrained model**

In [None]:
# Sequence-classification model built from the "bert-base-cased" checkpoint,
# configured for two labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=2,
)

# "test_trainer" is passed positionally (the output directory in the
# transformers API); every other training setting keeps its default.
training_args = TrainingArguments("test_trainer")

# Trainer wired to the small subsets for both training and evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

**Train (fine-tune) the model on the IMDb data**

In [None]:
# Run the training loop of the trainer configured above.
trainer.train()

In [None]:
# Accuracy metric object; compute_metrics below calls its `compute` method.
# NOTE(review): `load_metric` may not be available in newer `datasets`
# releases — confirm against the installed version.
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score a ``(logits, labels)`` pair with the module-level ``metric``.

    The predicted class for each row is the argmax of the logits along the
    last axis; those predictions and the reference labels are handed to
    ``metric.compute`` and its result is returned as-is.
    """
    scores, gold_labels = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    result = metric.compute(predictions=predicted_classes, references=gold_labels)
    return result

**Evaluate model**

In [None]:
# Rebuild the trainer around the same model and arguments, this time passing
# compute_metrics so that evaluation has a metric function to call.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)
# Evaluate on small_eval_dataset; the returned value is shown as the cell output.
trainer.evaluate()

**Exercise**

Train (fine-tune) a pretrained BERT model on the Quora dataset.

In [None]:
!wget https://github.com/ravi-ilango/acm-dec-2020-nlp/blob/main/lab2_2/quora.csv?raw=true -O quora.csv


In [None]:
# Load the CSV once and carve out a held-out test split. Pointing both 'train'
# and 'test' at the same file would make every evaluation score a measurement
# on the training data. The result still exposes 'train' and 'test' keys; the
# fixed seed keeps the split reproducible.
custom_dataset = (
    load_dataset('csv', data_files='quora.csv', split='train')
    .train_test_split(test_size=0.2, seed=42)
)

In [None]:
# Display the dataset object loaded from the CSV file.
custom_dataset

**Create tokenized dataset**

In [None]:
# TODO (exercise): add the code that creates the tokenized dataset from custom_dataset.


**Create small training and eval datasets**

In [None]:
# TODO (exercise): add the code that creates the small train and eval datasets.

**Download model and setup pipeline**

In [None]:
# Start again from the "bert-base-cased" checkpoint with two labels, replacing
# the `model` bound earlier in the notebook.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

# "test_trainer" is the only argument passed to TrainingArguments.
training_args = TrainingArguments("test_trainer")

# TODO (exercise): add the code that creates the trainer pipeline.
# NOTE(review): this cell does not assign `trainer`; until that code is added,
# later cells keep using whatever `trainer` was bound to before.


**Train model**

In [None]:
# Run training with whatever `trainer` currently refers to.
# NOTE(review): if the exercise cell above has not defined a new trainer, this
# name still points at the trainer created earlier in the notebook.
trainer.train()

**Evaluate model**

In [None]:
# Build a trainer that has compute_metrics attached, then evaluate with it.
# NOTE(review): small_train_dataset / small_eval_dataset must have been rebound
# by the exercise cells; otherwise they still hold the IMDb subsets created
# earlier in the notebook.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)
# The value returned by evaluate() is shown as the cell output.
trainer.evaluate()

**Try predictions**

In [None]:
# Move the model to the CPU and switch it to evaluation mode before predicting.
model.to('cpu')
model.eval()

def predict(model, sentence):
    """Classify a single sentence as ``'Sincere'`` or ``'Insincere'``.

    Args:
        model: Sequence-classification model. It is invoked as
            ``model(**inputs)`` and must return an object with a ``logits``
            attribute.
        sentence: Text to classify. It is tokenized with the notebook-level
            ``tokenizer``.

    Returns:
        ``'Insincere'`` when the highest-scoring class index is 1,
        otherwise ``'Sincere'``.
    """
    import torch  # local import: the notebook's import cell does not bring in torch

    # truncation=True keeps over-long inputs within the tokenizer's maximum
    # length instead of handing the model a sequence it cannot process.
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True)

    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        output = model(**inputs)

    logits = output.logits.detach().cpu().numpy()

    # One sentence in, so take the argmax of the single row.
    pred = np.argmax(logits, axis=1)[0]
    return 'Insincere' if pred == 1 else 'Sincere'

In [None]:
# First example question to classify.
sentence = "What is your favorite person in history?"

In [None]:
# Run predict on the current `sentence`; the returned label is the cell output.
predict(model, sentence)

In [None]:
# Second example question to classify (rebinds `sentence`).
sentence = "Why do people Indian girls go crazy about marrying Shri. Rahul Gandhiji?"

In [None]:
# Run predict on the current `sentence`; the returned label is the cell output.
predict(model, sentence)