# News Classification

In this notebook we're training a multiclass text classifier. 

## Toy Example

In [None]:
import pandas as pd

from simpletransformers.classification import ClassificationModel

In [None]:
# Toy corpus: three sentences for each of three topics
# (0 = food/cooking, 1 = sports, 2 = NLP), stored as [text, label] rows.
train_data = [
    [sentence, label]
    for label, sentences in enumerate(
        (
            (
                "Pizza and pasta are Italian food",
                "Before start cooking find a good recipe",
                "Cooking is one of my hobbies",
            ),
            (
                "I like football",
                "I hate tennis",
                "This year the Olympic Games are held in Tokyo",
            ),
            (
                "Natural Language Processing deals with talking machines",
                "Textual entailment and semantic similarity are NLP tasks",
                "NLU stands for natural language understanding",
            ),
        )
    )
    for sentence in sentences
]
train_df = pd.DataFrame(data=train_data, columns=["text", "labels"])

# One evaluation row per topic. Note that each sentence also appears in
# train_data, so this only checks that the pipeline runs end to end.
eval_data = [
    ["NLU stands for natural language understanding", 2],
    ["I hate tennis", 1],
    ["Cooking is one of my hobbies", 0],
]
eval_df = pd.DataFrame(data=eval_data, columns=["text", "labels"])

In [None]:
# Training configuration handed to ClassificationModel via `args=`.
args = dict(
    # paths
    output_dir="outputs/",
    cache_dir="cache_dir/",
    # precision
    fp16=False,
    fp16_opt_level="O1",
    # sequence length and batching
    max_seq_length=128,
    train_batch_size=32,
    gradient_accumulation_steps=1,
    eval_batch_size=8,
    # optimisation
    num_train_epochs=1,
    weight_decay=0,
    learning_rate=4e-5,
    adam_epsilon=1e-8,
    warmup_ratio=0.06,
    warmup_steps=0,
    max_grad_norm=1.0,
    # logging and checkpointing
    logging_steps=50,
    save_steps=2000,
    overwrite_output_dir=True,
    reprocess_input_data=False,
    evaluate_during_training=False,
    # "process_count": cpu_count() - 2 if cpu_count() > 2 else 1,
    # hardware and experiment tracking
    n_gpu=1,
    wandb_project="test-master",
)

In [None]:
# Build a three-class classifier from the cased BERT checkpoint (CUDA
# disabled) and train it on the toy frame.
model = ClassificationModel(
    model_type="bert",
    model_name="bert-base-cased",
    num_labels=3,
    args=args,
    use_cuda=False,
)
model.train_model(train_df=train_df)

In [None]:
# Run the toy model over the evaluation frame and print the first of the
# three returned values (`result`).
result, model_outputs, wrong_predictions = model.eval_model(eval_df=eval_df)
print(result)

In [None]:
# Classify a single sentence that is not part of the toy training data.
predictions, raw_outputs = model.predict(
    to_predict=["This class is about natural language"],
)
print(predictions)

## Training a Text Classifier for News


In [None]:
# Read the AG News training split; the file is parsed without a header
# row, so the three columns are named explicitly afterwards.
train = pd.read_csv("../datasets/agnews/train.csv", header=None)
train.columns = ["labels", "headline", "text"]
train.head(n=10)

AG News is a collection of news articles, each assigned to one of 4 distinct categories:

- 1: World
- 2: Sports
- 3: Business
- 4: Sci/Tech

In [None]:
# First rows whose label is 1
train.loc[train["labels"].eq(1)].head()

In [None]:
# First rows whose label is 2
train.loc[train["labels"].eq(2)].head()

In [None]:
# First rows whose label is 3
train.loc[train["labels"].eq(3)].head()

In [None]:
# First rows whose label is 4
train.loc[train["labels"].eq(4)].head()

In [None]:
# Histogram of the label column, to see how rows are spread across classes
train[["labels"]].hist()

In [None]:
# Create a ClassificationModel with one output per AG News category
model = ClassificationModel(
    "bert", "bert-base-cased", use_cuda=False, num_labels=4, args=args
)

# Train the model.
# The labels in `train` are coded 1-4, but a model built with num_labels=4
# works with class ids 0-3, so label 4 would be out of range for the loss.
# Shift the labels on a copy: `train` itself keeps its original 1-4 coding.
# Predictions from this model are consequently 0-3
# (0: World, 1: Sports, 2: Business, 3: Sci/Tech).
model.train_model(train.assign(labels=train["labels"] - 1))

In [None]:
# Training configuration with ten epochs.
# NOTE(review): this dict is assigned after the ClassificationModel in the
# previous code cell was constructed with `args=args`, so on a top-to-bottom
# run it is only picked up by models created after this cell.
args = dict(
    # paths
    output_dir="outputs/",
    cache_dir="cache_dir/",
    # precision
    fp16=False,
    fp16_opt_level="O1",
    # sequence length and batching
    max_seq_length=128,
    train_batch_size=32,
    gradient_accumulation_steps=1,
    eval_batch_size=8,
    # optimisation
    num_train_epochs=10,
    weight_decay=0,
    learning_rate=4e-5,
    adam_epsilon=1e-8,
    warmup_ratio=0.06,
    warmup_steps=0,
    max_grad_norm=1.0,
    # logging and checkpointing
    logging_steps=50,
    save_steps=2000,
    overwrite_output_dir=True,
    reprocess_input_data=False,
    evaluate_during_training=False,
    # "process_count": cpu_count() - 2 if cpu_count() > 2 else 1,
    # hardware and experiment tracking
    n_gpu=1,
    wandb_project="test-master",
)

In [None]:
# Read the AG News test split (parsed without a header row) and give its
# three columns the same names as the training frame.
test = pd.read_csv("../datasets/agnews/test.csv", header=None)
test.columns = ["labels", "headline", "text"]

In [None]:
# Evaluate the model.
# The 4-label model predicts class ids 0-3, while the AG News files code
# their classes as 1-4 (test.csv is assumed to follow the same coding as
# train.csv). Shift the gold labels on a copy so they are compared in the
# model's label space; `test` itself is left untouched.
result, model_outputs, wrong_predictions = model.eval_model(
    test.assign(labels=test["labels"] - 1)
)

In [None]:
# Try the news model on a single headline; the returned value is shown as
# the cell output.
model.predict(
    to_predict=["Brazil recalls diplomats, officials from Argentina."],
)

## A Quick Demo

In [None]:
def load_model(
    model_architecture: str,
    directory: str = "outputs/",
    use_cuda: bool = False,
    **kwargs
):
    """Build a ``ClassificationModel`` from a model directory.

    Args:
        model_architecture: Model type, passed as the first argument of
            ``ClassificationModel`` (e.g. ``"bert"``).
        directory: Passed as the second argument (model name or path);
            defaults to ``"outputs/"``.
        use_cuda: Forwarded unchanged as ``use_cuda``.
        **kwargs: Collected into a dict and handed over as ``args``.

    Returns:
        The constructed ``ClassificationModel``.
    """
    return ClassificationModel(
        model_type=model_architecture,
        model_name=directory,
        args=kwargs,
        use_cuda=use_cuda,
    )

In [None]:
# Rebind `model` to a BERT classifier built from the default directory.
model = load_model(model_architecture="bert")

In [None]:
from IPython.core.magic import register_cell_magic


@register_cell_magic
def classify_news(line, text):
    """Cell magic that classifies the cell body with the notebook's ``model``.

    Args:
        line: Text on the ``%%classify_news`` line itself; not used.
        text: Body of the cell, passed to ``model.predict`` as a single
            document.

    Returns:
        The first element of the predictions returned by ``model.predict``,
        which the notebook then shows as the cell output.
    """
    labels, _ = model.predict([text])
    return labels[0]

In [None]:
%%classify_news
Venezuelan President has urged families to have six children for the good of the country.