# Sentiment Classification on IMDb

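In this notebook we train a binary text classifier (positive vs. negative sentiment) with `simpletransformers`: first on a handful of toy sentences, then on IMDb movie reviews.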

## Toy Example

In [None]:
import pandas as pd

from simpletransformers.classification import ClassificationModel

In [None]:
# Toy corpus: three positive sentences (label 1) followed by three negative
# ones (label 0). Each row is a [text, label] pair.
train_data = [
    [sentence, 1]
    for sentence in (
        "This is fun!!",
        "This week I really had a good time :-)",
        "True happiness is one of the key goals in life",
    )
] + [
    [sentence, 0]
    for sentence in (
        "This is horrible",
        "The play deserves only bad critics because the actors are so bad",
        "The worst experience in my whole life. I'm not coming back again.",
    )
]
train_df = pd.DataFrame(data=train_data, columns=["text", "labels"])

# Tiny evaluation set: one positive and one negative sentence.
eval_data = [
    ["This is fun!!", 1],
    ["This is horrible", 0],
]
eval_df = pd.DataFrame(data=eval_data, columns=["text", "labels"])

In [None]:
# Training configuration for the toy run, passed to ClassificationModel as `args`.
args = dict(
    # Where artefacts are written / cached.
    output_dir="outputs/",
    cache_dir="cache_dir/",
    # Mixed precision is switched off.
    fp16=False,
    fp16_opt_level="O1",
    # Sequence length, batch sizes and number of epochs.
    max_seq_length=128,
    train_batch_size=32,
    gradient_accumulation_steps=1,
    eval_batch_size=8,
    num_train_epochs=1,
    # Optimiser and learning-rate schedule.
    weight_decay=0,
    learning_rate=4e-5,
    adam_epsilon=1e-8,
    warmup_ratio=0.06,
    warmup_steps=0,
    max_grad_norm=1.0,
    # Logging, checkpointing and re-run behaviour.
    logging_steps=50,
    save_steps=2000,
    overwrite_output_dir=True,
    reprocess_input_data=False,
    evaluate_during_training=False,
    # "process_count" is not set here; optional override:
    #   cpu_count() - 2 if cpu_count() > 2 else 1
    n_gpu=1,
)

In [None]:
# Build a RoBERTa-based classifier from the "roberta-base" checkpoint,
# on CPU, using the configuration dict defined above.
model = ClassificationModel(
    "roberta",
    "roberta-base",
    use_cuda=False,
    args=args,
)

# Fine-tune it on the toy training frame.
model.train_model(train_df)

In [None]:
# Evaluate the model on the toy evaluation frame and unpack the three values
# that eval_model returns.
# NOTE(review): both eval_df sentences also appear in train_df, so this is a
# smoke test of the pipeline rather than a measure of generalisation.
result, model_outputs, wrong_predictions = model.eval_model(eval_df)

In [None]:
# Try the toy model on a single sentence that is not part of the training rows.
sample_texts = ["I'm having such a good time :-)"]
predictions, raw_outputs = model.predict(sample_texts)
print(predictions)

## Training a Sentiment Classifier for Movie Reviews

In [None]:
# Read the IMDb training split from the local datasets folder and preview
# its first ten rows.
# NOTE(review): this frame is later passed straight to train_model, so it
# presumably has "text" and "labels" columns like the toy frames; confirm
# against the CSV.
train = pd.read_csv("../datasets/imdb/train.csv")
train.head(10)

In [None]:
# Training configuration for the IMDb run, passed to ClassificationModel as `args`.
args = dict(
    # Where artefacts are written / cached.
    output_dir="outputs/",
    cache_dir="cache_dir/",
    # Mixed precision is switched off.
    fp16=False,
    fp16_opt_level="O1",
    # Sequence length, batch sizes and number of epochs.
    max_seq_length=128,
    train_batch_size=32,
    gradient_accumulation_steps=1,
    eval_batch_size=8,
    num_train_epochs=10,
    # Optimiser and learning-rate schedule.
    weight_decay=0,
    learning_rate=4e-5,
    adam_epsilon=1e-8,
    warmup_ratio=0.06,
    warmup_steps=0,
    max_grad_norm=1.0,
    # Logging, checkpointing and re-run behaviour.
    logging_steps=50,
    save_steps=2000,
    overwrite_output_dir=True,
    reprocess_input_data=False,
    evaluate_during_training=False,
    # "process_count" is not set here; optional override:
    #   cpu_count() - 2 if cpu_count() > 2 else 1
    n_gpu=1,
    # Weights & Biases project name.
    wandb_project="test-master",
)

In [None]:
# Create a ClassificationModel from the cased BERT base checkpoint, on CPU.
# This notebook trains a binary (two-class) sentiment classifier, so the
# classification head is sized for two labels; a larger value would add
# output classes that never occur in the data.
model = ClassificationModel(
    "bert", "bert-base-cased", use_cuda=False, num_labels=2, args=args
)

# Train the model on the IMDb training frame.
model.train_model(train)

In [None]:
# Load the held-out test set. It must come from a different file than the
# training split, otherwise the evaluation is run on data the model was
# fitted to.
# NOTE(review): assumes test.csv sits next to train.csv in ../datasets/imdb/
# with the same columns — confirm the file name.
test = pd.read_csv("../datasets/imdb/test.csv")

In [None]:
# Evaluate the fine-tuned model on the `test` frame loaded above and unpack
# the three values that eval_model returns.
result, model_outputs, wrong_predictions = model.eval_model(test)

In [None]:
# Classify a single hand-written review; the call is the cell's last
# expression, so its return value is shown as the cell output.
model.predict(
    [
        "This is a great movie!! Seems it's still possible to make a good movie with good actors"
    ]
)

## A Quick Demo

In [None]:
def load_model(
    model_architecture: str,
    directory: str = "outputs/",
    use_cuda: bool = False,
    **kwargs
):
    """Build a ``ClassificationModel`` from weights stored in a directory.

    Args:
        model_architecture: Model type string, passed as the first positional
            argument to ``ClassificationModel`` (e.g. ``"bert"``).
        directory: Passed as the second positional argument, in the slot where
            a model name would otherwise go. Defaults to ``"outputs/"``.
        use_cuda: Forwarded unchanged as ``use_cuda``.
        **kwargs: Collected into a dict and forwarded as ``args``.

    Returns:
        The constructed ``ClassificationModel`` instance.
    """
    return ClassificationModel(
        model_architecture,
        directory,
        use_cuda=use_cuda,
        args=kwargs,
    )

In [None]:
# Rebind `model` to a BERT classifier loaded via load_model's defaults:
# the "outputs/" directory, with use_cuda=False.
model = load_model("bert")

In [None]:
from IPython.core.magic import register_cell_magic

@register_cell_magic
def classify_reviews(line, text):
    """Cell magic that classifies the cell body with the global ``model``.

    Args:
        line: Text following the magic name on the ``%%`` line; not used.
        text: The cell body, sent to ``model.predict`` as a single document.

    Returns:
        The first (and only) entry of the predictions returned by
        ``model.predict``.
    """
    labels, _ = model.predict([text])
    return labels[0]

In [None]:
%%classify_reviews
What a superb performance! Spectacular story, great playing!