# Sentiment Classification on IMDb

In this notebook we're training a binary text classifier. 

## Toy Example

In [1]:
import pandas as pd

from simpletransformers.classification import ClassificationModel

In [3]:
train_data = [
    ["This is fun!!", 1],
    ["This week I really had a good time :-)", 1],
    ["True happiness is one of the key goals in life", 1],
    ["This is horrible", 0],
    ["The play deserves only bad critics because the actors are so bad", 0],
    ["The worst experience in my whole life. I'm not coming back again.", 0],
]

train_df = pd.DataFrame(train_data, columns=["text", "labels"])

eval_data = [["This is fun!!", 1], ["This is horrible", 0]]

eval_df = pd.DataFrame(eval_data, columns=["text", "labels"])

In [12]:
# configuration
args = {
    "output_dir": "outputs/",
    "cache_dir": "cache_dir/",
    "fp16": False,
    "fp16_opt_level": "O1",
    "max_seq_length": 128,
    "train_batch_size": 32,
    "gradient_accumulation_steps": 1,
    "eval_batch_size": 8,
    "num_train_epochs": 10,
    "weight_decay": 0,
    "learning_rate": 4e-4,
    "adam_epsilon": 1e-8,
    "warmup_ratio": 0.06,
    "warmup_steps": 0,
    "max_grad_norm": 1.0,
    "logging_steps": 50,
    "save_steps": 2000,
    "overwrite_output_dir": True,
    "reprocess_input_data": False,
    "evaluate_during_training": False,
    # "process_count": cpu_count() - 2 if cpu_count() > 2 else 1,
    "n_gpu": 1,
    #"wandb_project": "nlp-exercises",
}

In [13]:
# Create a ClassificationModel
model = ClassificationModel("roberta", "roberta-base", args=args)

# Train the model
model.train_model(train_df)

Features loaded from cache at cache_dir/cached_train_roberta_128_2_6


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1.0, style=ProgressStyle(descript…

Running loss: 0.730047


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1.0, style=ProgressStyle(descript…

Running loss: 0.690679


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1.0, style=ProgressStyle(descript…

Running loss: 0.560007


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1.0, style=ProgressStyle(descript…

Running loss: 0.554207


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1.0, style=ProgressStyle(descript…

Running loss: 0.326359


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1.0, style=ProgressStyle(descript…

Running loss: 1.079962


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1.0, style=ProgressStyle(descript…

Running loss: 0.031869


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1.0, style=ProgressStyle(descript…

Running loss: 0.019121


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1.0, style=ProgressStyle(descript…

Running loss: 0.003598


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=1.0, style=ProgressStyle(descript…

Running loss: 0.001642

Training of roberta model complete. Saved to outputs/.


In [15]:
# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(eval_df)

Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


{'mcc': 1.0, 'tp': 1, 'tn': 1, 'fp': 0, 'fn': 0, 'eval_loss': 0.0008248090744018555}


In [14]:
predictions, raw_outputs = model.predict(["I'm having such a good time :-)"])
print(predictions)

Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


[1]


## Training a Sentiment Classifier for Movie Reviews

In [2]:
train = pd.read_csv("../datasets/imdb/train.csv")
train.head(10)

Unnamed: 0,text,label
0,I had already heard of Ali G in Madonna's musi...,0
1,Most of Kieslowski's films seem like puzzles t...,1
2,"This is not the video nastie, but only because...",0
3,During 1933 this film had many cuts taken from...,1
4,This latter-day Fulci schlocker is a totally a...,0
5,"Billy Wilder is co-credited for the story, and...",1
6,This film deserves a 10 for its brilliant port...,1
7,"In this ""critically acclaimed psychological th...",1
8,"""Fanfan la tulipe"" is still Gerard Philippe's ...",1
9,I got interested in this movie because somebod...,0


In [3]:
# configuration
args = {
    "output_dir": "outputs/",
    "cache_dir": "cache_dir/",
    "fp16": False,
    "fp16_opt_level": "O1",
    "max_seq_length": 128,
    "train_batch_size": 64,
    "gradient_accumulation_steps": 1,
    "eval_batch_size": 8,
    "num_train_epochs": 10,
    "weight_decay": 0,
    "learning_rate": 4e-5,
    "adam_epsilon": 1e-8,
    "warmup_ratio": 0.06,
    "warmup_steps": 0,
    "max_grad_norm": 1.0,
    "logging_steps": 50,
    "save_steps": 2000,
    "overwrite_output_dir": True,
    "reprocess_input_data": False,
    "evaluate_during_training": False,
    # "process_count": cpu_count() - 2 if cpu_count() > 2 else 1,
    "n_gpu": 1,
    "wandb_project": "nlp-exercises",
}

In [4]:
# Create a ClassificationModel
model = ClassificationModel("distilbert", "distilbert-base-uncased", args=args)

# Train the model
model.train_model(train)

Converting to features started. Cache is not used.




HBox(children=(FloatProgress(value=0.0, max=37500.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=586.0, style=ProgressStyle(descri…

Running loss: 0.643520



Running loss: 0.527306


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=586.0, style=ProgressStyle(descri…

Running loss: 0.189582


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=586.0, style=ProgressStyle(descri…

Running loss: 0.136082


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=586.0, style=ProgressStyle(descri…

Running loss: 0.040707


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=586.0, style=ProgressStyle(descri…

Running loss: 0.028142


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=586.0, style=ProgressStyle(descri…

Running loss: 0.017000


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=586.0, style=ProgressStyle(descri…

Running loss: 0.135445


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=586.0, style=ProgressStyle(descri…

Running loss: 0.084338


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=586.0, style=ProgressStyle(descri…

Running loss: 0.000375


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=586.0, style=ProgressStyle(descri…

Running loss: 0.050116

Training of distilbert model complete. Saved to outputs/.


In [22]:
# load the test set
test = pd.read_csv("../datasets/imdb/test.csv")

In [26]:
# Evaluate the model
from sklearn.metrics import f1_score

result, model_outputs, wrong_predictions = model.eval_model(test, f1=f1_score)



Features loaded from cache at cache_dir/cached_dev_distilbert_128_2_12500


HBox(children=(FloatProgress(value=0.0, max=1563.0), HTML(value='')))


{'mcc': 0.7585958182473415, 'tp': 5566, 'tn': 5424, 'fp': 826, 'fn': 684, 'eval_loss': 0.7797861061370571, 'f1': 0.8805568739123557}


In [5]:
model.predict(["This is a great movie!! Seems it's still possible to make a good movie with good actors"])

Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




(array([1]), array([[-3.5713227,  2.7546487]], dtype=float32))

## A Quick Demo

In [None]:
def load_model(
    model_architecture: str,
    directory: str = "outputs/",
    use_cuda: bool = False,
    **kwargs
):
    """Loads a pre-trained model"""
    model = ClassificationModel(
        model_architecture, directory, use_cuda=use_cuda, args=kwargs
    )
    return model

In [None]:
model = load_model("bert")

In [6]:
from IPython.core.magic import register_cell_magic

@register_cell_magic
def classify_reviews(line, text):
    """Prints predictions of a Text Classifier"""
    predictions, raw_outputs = model.predict([text])
    return predictions[0]

In [7]:
%%classify_reviews
What a superb performance! Spectacular story, great playing!

Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




1

In [9]:
%%classify_reviews
A complete nightmare. Worst restaurant in town :-(

Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




0

In [8]:
%%classify_reviews
COVID-19 is gonna kill us all!

Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




1