# **Multiclass Text Classification (News Topic Classification)**

This notebook demonstrates multiclass text classification with Hugging Face Transformers: DistilBERT is fine-tuned on the AG News dataset to sort news articles into four topics (World, Sports, Business, Sci/Tech).

# **Step 1: Install & Import Dependencies**

In [None]:
pip install transformers datasets evaluate



In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
import evaluate
import numpy as np

# **Step 2: Quick Pipeline Demo (before training)**

In [None]:
# Quick tour of the `pipeline` API using an off-the-shelf model.
# NOTE: this checkpoint predicts sentiment (POSITIVE / NEGATIVE), not news topics;
# it only illustrates the API before we fine-tune our own topic classifier.
# The checkpoint and revision are pinned explicitly (they are the ones the library
# would otherwise pick as its default) so the demo does not change if the default does.
demo_pipeline = pipeline(
    task='sentiment-analysis',
    model='distilbert/distilbert-base-uncased-finetuned-sst-2-english',
    revision='714eb0f',
)

text = 'Apple unveils the new iphone with advanced camera features'

result = demo_pipeline(text)

print(result)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9914082884788513}]


# **Step 3: Load AG News Dataset (Multiclass)**

In [None]:
# Load the AG News corpus by its Hub identifier and print the split summary
# (split names, columns and row counts).
dataset = load_dataset(path='ag_news')
print(dataset)

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})


In [None]:
# Peek at one training record (row 10): a dict with the article text and its integer label.
train_split = dataset['train']
print(train_split[10])

{'text': "Oil and Economy Cloud Stocks' Outlook  NEW YORK (Reuters) - Soaring crude prices plus worries  about the economy and the outlook for earnings are expected to  hang over the stock market next week during the depth of the  summer doldrums.", 'label': 2}


In [None]:
# Labels of the first 10 training rows. Slicing the rows first and then picking the
# column reads only those 10 rows; indexing the whole 'label' column and slicing
# afterwards may load all 120,000 labels (depending on the `datasets` version).
print(dataset['train'][:10]['label'])

[2, 2, 2, 2, 2, 2, 2, 2, 2, 2]


In [None]:
# Distinct label ids present in the training split.
# Dataset.unique() returns the unique values of a column as a list, so the full
# label column does not have to be turned into a Python list just to deduplicate it.
set(dataset['train'].unique('label'))

{0, 1, 2, 3}

In [None]:
# Column schema of the training split; 'label' is a ClassLabel carrying the class names.
train_features = dataset['train'].features
print(train_features)

{'text': Value('string'), 'label': ClassLabel(names=['World', 'Sports', 'Business', 'Sci/Tech'])}


In [None]:
# Column schema of the test split, for comparison with the training split.
test_features = dataset['test'].features
print(test_features)

{'text': Value('string'), 'label': ClassLabel(names=['World', 'Sports', 'Business', 'Sci/Tech'])}


In [None]:
# Read the human-readable class names off the ClassLabel feature;
# the list index of each name is its integer label id.
label_feature = dataset['train'].features['label']
label_names = label_feature.names
print(label_names)

['World', 'Sports', 'Business', 'Sci/Tech']


In [None]:
# Class names, indexed by integer label id.
label_names = dataset["train"].features["label"].names

# Show the first 10 texts with decoded labels.
# Slice the rows once and read both columns from that slice, instead of indexing
# the full 'text' and 'label' columns just to keep 10 entries of each.
first_rows = dataset["train"][:10]
for text, label_id in zip(first_rows["text"], first_rows["label"]):
    print(f"Label: {label_names[label_id]} | Text: {text}\n")


Label: Business | Text: Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.

Label: Business | Text: Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market.

Label: Business | Text: Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums.

Label: Business | Text: Iraq Halts Oil Exports from Main Southern Pipeline (Reuters) Reuters - Authorities have halted oil export\flows from the main pipeline in southern Iraq after\intelligence showed a rebel militia could strike\infrastructure, an oil official said on Saturday.

# **Step 4: Tokenization**

In [None]:
# Checkpoint that supplies the tokenizer. Named `model_name` (not `model`) so the
# string is not confused with the model object that is bound to `model` during fine-tuning.
model_name = 'distilbert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenizer_function(example):
  """Tokenize a batch of examples, truncating each text to at most 512 tokens.

  Args:
    example: a batch from `Dataset.map(..., batched=True)`; its 'text' entry is
      passed to the tokenizer.

  Returns:
    The tokenizer output for the batch.

  No padding is applied here. Padding every example to 512 tokens makes each
  stored example, and each training batch, as long as the longest allowed
  input. When the tokenizer is handed to `Trainer`, its default collator
  pads each batch to the longest sequence in that batch instead.
  """
  return tokenizer(example['text'], truncation=True, max_length=512)

tokenized = dataset.map(tokenizer_function, batched=True)

# Fixed-seed subsets (2,000 train / 1,000 test rows) to keep fine-tuning quick.
small_train = tokenized['train'].shuffle(seed=42).select(range(2000))
small_test = tokenized['test'].shuffle(seed=42).select(range(1000))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

# **Step 5: Fine-tuning using Hugging Face Trainer**

In [19]:
model_name = 'distilbert-base-uncased'

# Store the class names in the model config so that predictions from the trained,
# saved or pushed model read 'World' / 'Sports' / 'Business' / 'Sci/Tech'
# rather than generic 'LABEL_0' ... 'LABEL_3'.
class_names = dataset['train'].features['label'].names
id2label = dict(enumerate(class_names))
label2id = {name: idx for idx, name in id2label.items()}

# The classification head is newly initialised on top of the pretrained encoder;
# its size follows the number of classes in the dataset.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(class_names),
    id2label=id2label,
    label2id=label2id,
)

args = TrainingArguments(
    output_dir = './multiclass_cls_result',
    eval_strategy = 'epoch',  # run evaluation at the end of every epoch
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 2,
    weight_decay = 0.01,
    logging_dir = './multiclass_cls_logs'
)

# Metric objects, loaded once and reused by compute_metrics on every evaluation pass.
accuracy = evaluate.load('accuracy')
precision = evaluate.load('precision')
recall = evaluate.load('recall')
# Must be bound to `f1`: compute_metrics calls `f1.compute(...)`. Binding it to any
# other name makes the first evaluation fail with a NameError.
f1 = evaluate.load('f1')


def compute_metrics(eval_pred):
  """Turn the Trainer's evaluation output into a dict of scalar metrics.

  Args:
    eval_pred: a (logits, labels) pair; the predicted class of each example is
      the argmax of its logits over the last axis.

  Returns:
    A dict with 'accuracy' plus macro-averaged 'precision', 'recall' and 'f1'.
  """
  logits, labels = eval_pred
  preds = np.argmax(logits, axis=-1)
  return {
      'accuracy': accuracy.compute(predictions=preds, references=labels)['accuracy'],
      # Macro averaging weights all classes equally.
      'precision': precision.compute(predictions=preds, references=labels, average='macro')['precision'],
      'recall': recall.compute(predictions=preds, references=labels, average='macro')['recall'],
      'f1': f1.compute(predictions=preds, references=labels, average='macro')['f1'],
  }

trainer = Trainer(
    model = model,
    args = args,
    train_dataset = small_train,
    eval_dataset = small_test,
    # `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
    # Passing the tokenizer here lets the Trainer pad batches with it and save it
    # alongside the model.
    processing_class = tokenizer,
    compute_metrics = compute_metrics
)

# Fine-tune; evaluation runs according to `eval_strategy` in the training arguments.
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

# **Step 6: Evaluate Model**

In [None]:
# Evaluate on the Trainer's eval dataset, keep the metrics dict in `result`, and print it.
print(result := trainer.evaluate())

# **Step 7: Save Model Locally**

In [None]:
# Local directory that receives the fine-tuned model and the tokenizer files
# (the directory name was previously truncated to '...classificatio').
save_dir = './multiclass-text-classification'
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)
print('Saved model to',save_dir)

# **Step 8: Load the Saved Model and Tokenizer**

In [None]:
# Rebuild the model and the tokenizer from the files written to `save_dir`;
# from here on `model` and `tokenizer` refer to the reloaded objects.
model = AutoModelForSequenceClassification.from_pretrained(save_dir)
tokenizer = AutoTokenizer.from_pretrained(save_dir)

In [None]:
# Prediction with the reloaded model.
# 'text-classification' is the generic task name; 'sentiment-analysis' is an alias
# for the same pipeline and is misleading for a news-topic classifier.
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer)

text = 'Apple unveils the new iphone with advanced camera features'

result = classifier(text)

print(result)

# **Step 9: Push to Hugging Face Hub**

In [None]:
from huggingface_hub import notebook_login

# Interactive login; the access token is entered in the widget, never hard-coded here.
notebook_login()

# The first positional argument of Trainer.push_to_hub is the commit message, not the
# repository name, so passing the name there would only set the commit message.
# The target repository is taken from the training arguments instead.
trainer.args.hub_model_id = 'multiclass-text-classification-swapnil-12'
trainer.push_to_hub()