In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import pipeline

# Pre-trained models

First, I use pre-trained models available from Hugging Face to classify my dataset of IMDB reviews.

In [None]:
# Using DistilBERT (small and fast).
# The checkpoint is named explicitly (it is the one the task default resolved to) so the
# notebook keeps loading the same model even if the library's default for this task changes.
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [None]:
# Second pre-trained classifier: SiEBERT, picked from the Hugging Face hub because it
# looks convenient for this task
classifier2 = pipeline(
    task="sentiment-analysis",
    model="siebert/sentiment-roberta-large-english",
)

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [None]:
# Load the review CSV into a pandas DataFrame
data = pd.read_csv(filepath_or_buffer="/content/movie.csv")

In [None]:
# Show one raw review (row label 17) to get a feel for the text
print(data["text"].loc[17])

The main problem with "Power" is that it features way too may pointless characters and subplots that add absolutely nothing to the movie whatsoever. It gets boring after awhile, sitting around waiting through scenes that don't connect to find something that drives the movie forward. You could probably pass it all off as character development, but all of them are either recycled from earlier scenes in the movie, or are just simply to flat and uninteresting. Lumet never gives enough time to let any of the supporting cast blossom. He should have cut a few of the characters (hackman, the wife) and concentrated harder on others (Billings). It could have been a great, hard political thriller instead of a jumbled mess that loses any message in a sea of bad writing and acting, a fact that amazed me considering the cast. Even Gene Hackman performance wasn't up to par. Denzel Washington is the only real actor of note here. Gere and the others have all done much better performances elsewhere. <br

In [None]:
# Pre-trained models accept at most 512 tokens, so each review is capped at a fixed number of words
def trunctext(text, max_words=400):
  """Return the first `max_words` words of `text` joined by single spaces.

  Parameters:
    text: a sequence of word tokens, or a plain string. A string is split on
      whitespace first, so calling the function again on its own output does
      not explode the text into space-separated characters.
    max_words: maximum number of words to keep (default 400).

  Returns:
    A single string containing at most `max_words` words.
  """
  if isinstance(text, str):
    text = text.split()
  return " ".join(text[:max_words])

In [None]:
from nltk.tokenize import RegexpTokenizer

# Split every review into word tokens; the \w+ pattern keeps only runs of word characters
tokenizer = RegexpTokenizer(r'\w+')
data['text'] = data['text'].map(tokenizer.tokenize)
data['text'].head()

0    [I, grew, up, b, 1965, watching, and, loving, ...
1    [When, I, put, this, movie, in, my, DVD, playe...
2    [Why, do, people, who, do, not, know, what, a,...
3    [Even, though, I, have, great, interest, in, B...
4    [Im, a, die, hard, Dads, Army, fan, and, nothi...
Name: text, dtype: object

In [None]:
# Cap each tokenized review with trunctext and turn it back into a single string
data['text'] = data['text'].map(trunctext)

In [None]:
# What the data looks like after tokenizing and truncating: the first ten reviews
for i in range(10):
  print(data["text"].loc[i])

I grew up b 1965 watching and loving the Thunderbirds All my mates at school watched We played Thunderbirds before school during lunch and after school We all wanted to be Virgil or Scott No one wanted to be Alan Counting down from 5 became an art form I took my children to see the movie hoping they would get a glimpse of what I loved as a child How bitterly disappointing The only high point was the snappy theme tune Not that it could compare with the original score of the Thunderbirds Thankfully early Saturday mornings one television channel still plays reruns of the series Gerry Anderson and his wife created Jonatha Frakes should hand in his directors chair his version was completely hopeless A waste of film Utter rubbish A CGI remake may be acceptable but replacing marionettes with Homo sapiens subsp sapiens was a huge error of judgment
When I put this movie in my DVD player and sat down with a coke and some chips I had some expectations I was hoping that this movie would contain so

In [None]:
# Classify the first 100 reviews with the SiEBERT pipeline (classifier2).
# Each call returns a list whose first element is a dict carrying the 'label',
# so `sent` ends up as a list of such one-element lists.
# truncation=True asks the pipeline's tokenizer to clip inputs to the model's maximum
# length: the 400-word cap applied earlier counts words, not model tokens, so a review
# can still come out longer than the model accepts and would otherwise fail at inference.
sent = []
for i in range(0, 100):
  sent.append(classifier2(data.loc[i, "text"], truncation=True))

In [None]:
from sklearn.metrics import accuracy_score

# The sklearn metrics compare 1s and 0s, while the classifier answers with string labels:
# encode "POSITIVE" as 1 and any other label as 0.
y_preds = [1 if result[0]['label'] == "POSITIVE" else 0 for result in sent]

In [None]:
#for distilbert pre-trained model
# NOTE(review): y_preds is derived from `sent`, which the prediction cell fills using
# classifier2 (SiEBERT). To score DistilBERT, regenerate `sent` with `classifier` first.
from sklearn.metrics import precision_score,f1_score
# Take exactly as many labels as there are predictions. A fixed [:1000] slice makes the
# sklearn metrics raise on inconsistent sample counts whenever the prediction loop
# covered a different number of rows (it currently covers 100).
y_true = data["label"][:len(y_preds)]
print(accuracy_score(y_true,y_preds))
print(precision_score(y_true,y_preds))
print(f1_score(y_true,y_preds))

0.875
0.927360774818402
0.8597081930415263


In [None]:
# SiEBERT pre-trained model: compare the first 100 labels with the predictions
y_true = data["label"].iloc[:100]
for metric in (accuracy_score, precision_score, f1_score):
  print(metric(y_true, y_preds))

0.93
0.9183673469387755
0.9278350515463918


# Fine-tuning using torch

We will fine-tune the models on our own dataset, hoping to get better predictions on our data.

In [None]:
# 100 rows take about 10 minutes, so the run is limited to 1000 rows.
# The rows are randomly organised, so simply keeping the first 1000 is fine.
data = data.iloc[:1000]

In [None]:
# Wrap the pandas DataFrame in a `datasets.Dataset` so the training code can consume it
from datasets import Dataset

dataset = Dataset.from_pandas(df=data)

In [None]:
# Models only take numbers, so the text has to be tokenized first
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased-finetuned-sst-2-english")


def tokenize_function(examples):
    """Encode the 'text' field of a batch, truncated and padded to the maximum length.

    Reads the module-level name `tokenizer` at call time, so it uses whichever
    tokenizer that name is bound to when the function runs.
    """
    reviews = examples['text']
    return tokenizer(reviews, truncation=True, padding='max_length')


tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoTokenizer
# NOTE(review): this rebinds the global `tokenizer` that tokenize_function reads, so the
# map below re-encodes the dataset with the SiEBERT tokenizer and overwrites the
# DistilBERT-encoded `tokenized_datasets` from the previous cell. Run only the tokenizer
# cell that matches the model being fine-tuned.
tokenizer  = AutoTokenizer.from_pretrained("siebert/sentiment-roberta-large-english")
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Hold out 10% of the rows for evaluation. The seed is fixed so that re-running the
# notebook yields the same split and the reported metrics stay comparable between runs.
train_test = tokenized_datasets.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test['train']
eval_dataset = train_test['test']

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset, load_metric

# Metric callback handed to the Trainer
def compute_metrics(pred):
    """Return accuracy, F1, precision and recall for one evaluation pass.

    `pred` must expose `label_ids` (the true labels) and `predictions`; the
    predicted class is taken as the argmax over the last axis of `predictions`.
    Precision, recall and F1 use binary averaging.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    prec, rec, f_score, _support = precision_recall_fscore_support(
        y_true, y_hat, average='binary'
    )
    return dict(
        accuracy=accuracy_score(y_true, y_hat),
        f1=f_score,
        precision=prec,
        recall=rec,
    )

In [None]:

from transformers import TrainingArguments

# Hyper-parameters for the fine-tuning run: three epochs with small batches,
# evaluating once per epoch and writing outputs under ./results
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    learning_rate=4e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy='epoch',
)



In [None]:
import torch

# Ask PyTorch to release its cached, currently unused GPU memory before the model
# cells below load a checkpoint
torch.cuda.empty_cache()

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the DistilBERT sequence-classification checkpoint (loading only; nothing is compiled here)
model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased-finetuned-sst-2-english")

In [None]:
# NOTE(review): rebinds `model` to the SiEBERT checkpoint, replacing the DistilBERT model
# loaded in the previous cell. When the notebook is run top to bottom, this is the only
# model that reaches the Trainer.
model = AutoModelForSequenceClassification.from_pretrained("siebert/sentiment-roberta-large-english")

In [None]:
from transformers import Trainer

# Bundle the model, hyper-parameters, metric callback and both data splits into one Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
# Fine-tune DistilBERT
# NOTE(review): `trainer` wraps whatever `model` was bound when the Trainer cell ran. Run top
# to bottom, that is the SiEBERT checkpoint, so this label only holds when the SiEBERT
# tokenizer and model cells were skipped before this run.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.777415,0.86,0.840909,0.902439,0.787234
2,No log,0.924688,0.85,0.842105,0.833333,0.851064
3,0.271400,0.986554,0.85,0.842105,0.833333,0.851064


TrainOutput(global_step=675, training_loss=0.20634730621620462, metrics={'train_runtime': 159.7596, 'train_samples_per_second': 16.9, 'train_steps_per_second': 4.225, 'total_flos': 357661976371200.0, 'train_loss': 0.20634730621620462, 'epoch': 3.0})

In [None]:
# Score the fine-tuned model on the eval_dataset given to the Trainer
trainer.evaluate()

{'eval_loss': 0.9865542650222778,
 'eval_accuracy': 0.85,
 'eval_f1': 0.8421052631578947,
 'eval_precision': 0.8333333333333334,
 'eval_recall': 0.851063829787234,
 'eval_runtime': 1.8795,
 'eval_samples_per_second': 53.206,
 'eval_steps_per_second': 13.301,
 'epoch': 3.0}

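We got an accuracy of 0.85, so fine-tuning DistilBERT on 1000 rows of data did not improve on the pre-trained model (0.875) at all! The validation loss also rises every epoch (0.78 → 0.92 → 0.99) while the training loss is low, which suggests the model is overfitting the small training set.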

In [None]:
# Fine-tune SiEBERT
# NOTE(review): this calls train() on the same `trainer` object as the DistilBERT cell. For this
# label to hold, the SiEBERT tokenizer, model and Trainer cells must have been re-run first.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.622445,0.6,0.75,0.6,1.0
2,No log,0.384785,0.92,0.929825,0.981481,0.883333
3,0.359800,0.280045,0.95,0.957265,0.982456,0.933333


TrainOutput(global_step=675, training_loss=0.3194350857204861, metrics={'train_runtime': 996.9584, 'train_samples_per_second': 2.708, 'train_steps_per_second': 0.677, 'total_flos': 2516214680985600.0, 'train_loss': 0.3194350857204861, 'epoch': 3.0})

In [None]:
# Score the fine-tuned model on the eval_dataset given to the Trainer
trainer.evaluate()

{'eval_loss': 0.28004467487335205,
 'eval_accuracy': 0.95,
 'eval_f1': 0.9572649572649572,
 'eval_precision': 0.9824561403508771,
 'eval_recall': 0.9333333333333333,
 'eval_runtime': 10.948,
 'eval_samples_per_second': 9.134,
 'eval_steps_per_second': 2.284,
 'epoch': 3.0}

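An improvement in accuracy from 0.93 to 0.95! Fine-tuning improved performance here for the SiEBERT model. Note that the two numbers are measured on different rows (the first 100 reviews for the pre-trained model, a random 10% hold-out for the fine-tuned one), so the gain is indicative rather than exact.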

In [None]:
# Persist the fine-tuned model. Transformers models are written with `save_pretrained`;
# they have no `save` method, so the previous `model.save(...)` call raised AttributeError.
# The directory is created from Python rather than relying on IPython's shell automagic.
import os

os.makedirs('saved_model', exist_ok=True)
model.save_pretrained('saved_model/my_model')