<a href="https://colab.research.google.com/github/vishnudas08/Small-AI-Tool-for-Text-Classification/blob/main/Text_Classification_using_Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import transformers
import torch
import torch.nn as nn
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd

In [None]:
# Read the training and test CSVs from the Colab working directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train.csv', '/content/test.csv')
)
train_df.columns
# Last expression of the cell: number of rows per class label.
train_df['Class Index'].value_counts()

In [None]:
# The rarest class limits how many rows can be drawn per class without
# replacement.
class_counts = train_df['Class Index'].value_counts()
min_samples_per_class = class_counts.min()
print(min_samples_per_class)
# Never take more than 807 rows per class.
samples_per_class = min(min_samples_per_class, 807)

In [None]:
from sklearn.utils import resample

# Build a class-balanced subset: draw `samples_per_class` rows (without
# replacement) from every class.
#
# GroupBy.sample is used instead of groupby(...).apply(resample): applying a
# function to the grouping column is deprecated since pandas 2.2 and newer
# pandas versions exclude the grouping column from `apply`, which would drop
# 'Class Index' from the result.  GroupBy.sample keeps every column.
# The draw is still reproducible (random_state=42), but the exact rows differ
# from the ones the previous sklearn-based implementation selected.
train_df_reduced = (train_df
                    .groupby('Class Index')
                    .sample(n=int(samples_per_class), random_state=42)
                    .reset_index(drop=True))
train_df_reduced

In [None]:
# Shift the 1-based class ids of the CSV to 0-based targets; the labels are
# later fed to nn.CrossEntropyLoss, which expects class indices starting at 0.
# The mapping is deliberately NOT called `dict`: that would shadow the builtin
# for the rest of the notebook session.
# NOTE: re-running this cell on already-converted labels maps unknown keys
# (e.g. 0) to NaN, so run it only once per freshly built `train_df_reduced`.
class_index_map = {1: 0, 2: 1, 3: 2, 4: 3}
train_df_reduced['Class Index'] = train_df_reduced['Class Index'].map(class_index_map)

train_df_reduced['Class Index'].value_counts()

In [None]:
# Only the 'Description' text is used as model input, so discard 'Title'.
train_df_reduced = train_df_reduced.drop(columns=['Title'])

In [None]:
# Notebook display of the reduced training frame.
train_df_reduced

In [None]:
# Print the reduced training frame as plain text.
print(train_df_reduced)

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reduced data; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train_df_reduced["Description"],
    train_df_reduced['Class Index'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Sanity check: the training texts and training labels must have equal length.
len(X_train), len(y_train)

In [None]:
# Number of distinct target labels present in the reduced training frame.
num_classes=  train_df_reduced['Class Index'].nunique()

num_classes

In [None]:
# Restrict the transformers library's logging to error messages.
from transformers import logging
logging.set_verbosity_error()

from transformers import BertModel, BertTokenizer
from torch.utils.data import DataLoader
# Tokenizer belonging to the 'bert-base-uncased' checkpoint; used by
# tokenizer_fn() and predict() in later cells.
tokenizer= BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def tokenizer_fn(text, label, max_length=512):
    """Tokenize text with the module-level ``tokenizer`` and tensorize labels.

    Args:
        text: A string or list of strings handed to ``tokenizer``.
        label: A label or sequence of labels convertible by ``torch.tensor``.
        max_length: Length every sequence is padded / truncated to.
            Defaults to 512, the value that was previously hard-coded.

    Returns:
        A tuple ``(input_ids, attention_mask, labels)`` where ``labels`` is a
        ``torch.long`` tensor.
    """
    inputs = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )
    # return_tensors='pt' already yields tensors; wrapping them in
    # torch.tensor() again only makes a copy and emits a UserWarning.
    labels = torch.tensor(label, dtype=torch.long)
    return inputs['input_ids'], inputs['attention_mask'], labels
# Smoke test: one sentence paired with exactly one label (1 == "Sports" in
# label_map).  Passing four labels for a single sentence, as before, produced
# inputs and labels of mismatched length.
print(tokenizer_fn(['manu bhaker won the two olympic gold medals'], [1]))

In [None]:
# Encode both splits into (input_ids, attention_mask, labels) tensors.
train_input_ids, train_attention_mask, train_labels = tokenizer_fn(
    X_train.tolist(),
    y_train.tolist(),
)
test_inputs_ids, test_attention_mask, test_labels = tokenizer_fn(
    X_test.tolist(),
    y_test.tolist(),
)

In [None]:
# Bundle the encoded tensors so that each dataset item is an
# (input_ids, attention_mask, label) triple.
train_dataset = torch.utils.data.TensorDataset(
    train_input_ids,
    train_attention_mask,
    train_labels,
)
val_dataset = torch.utils.data.TensorDataset(
    test_inputs_ids,
    test_attention_mask,
    test_labels,
)

In [None]:
# Mini-batches of 128: training data is reshuffled every epoch, validation
# data keeps its order.
train_dataLoader = DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
)
val_dataLoader = DataLoader(
    val_dataset,
    batch_size=128,
    shuffle=False,
)

In [None]:
# Restrict the transformers library's logging to error messages.
from transformers import logging
logging.set_verbosity_error()
from transformers import BertModel

# Load the pretrained encoder once to inspect its hidden size.
# NOTE(review): within this notebook `bert` is only used by the print below;
# textClassifier loads its own copy of the checkpoint -- confirm this extra
# load is still wanted.
bert = BertModel.from_pretrained('bert-base-uncased', output_loading_info=False)
print(bert.config.hidden_size)  # This prints 768 without rendering widgets


In [None]:
#bert= BertModel.from_pretrained('bert-base-uncased')
#bert.config.hidden_size

In [None]:
# Show which label ids actually occur in the training targets.
unique_train_labels = train_labels.unique()
print("Train labels:", unique_train_labels)

In [None]:
class textClassifier(nn.Module):
    """Frozen BERT encoder followed by a small trainable classification head.

    Args:
        num_classes: Number of output logits (one per class).
        pretrained_name: Checkpoint name passed to
            ``BertModel.from_pretrained``. Defaults to ``'bert-base-uncased'``.
        hidden_dim: Width of the head's hidden layer. Defaults to 256.
        dropout: Dropout probability applied inside the head. Defaults to 0.2.
    """

    def __init__(self, num_classes, pretrained_name='bert-base-uncased',
                 hidden_dim=256, dropout=0.2):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_name)

        # Freeze the encoder: only the classification head receives gradients.
        for param in self.bert.parameters():
            param.requires_grad = False

        self.classifier = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes),
        )

    def forward(self, input_ids, attention_mask):
        """Return the head's output for a batch of tokenized inputs.

        Args:
            input_ids: Token ids as produced by the tokenizer.
            attention_mask: Attention mask as produced by the tokenizer.

        Returns:
            The classifier output computed from the hidden state at sequence
            position 0 of BERT's last layer.
        """
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state is used as the sentence embedding.
        sentence_embedding = bert_output.last_hidden_state[:, 0, :]
        return self.classifier(sentence_embedding)

In [None]:
# Size the output layer from the data (`num_classes` is derived from the label
# column in an earlier cell) instead of hard-coding 4.
model = textClassifier(num_classes=num_classes).to(device)
# Only hand the optimizer the parameters that are actually trainable; the BERT
# encoder inside the model is frozen (requires_grad=False).
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.Adam(trainable_params, lr=0.001)
criterion = nn.CrossEntropyLoss()
print(model)

In [None]:
# Train the classification head for a fixed number of epochs.
epochs = 2
for epoch in range(epochs):
    # NOTE(review): model.train() also switches the frozen BERT encoder to
    # training mode (its dropout layers become active) -- confirm intended.
    model.train()
    total_train_loss = 0

    for batch, (input_ids, attention_mask, labels) in enumerate(train_dataLoader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        # No .squeeze(): the logits are already (batch, num_classes), and
        # squeezing would drop the batch dimension for a 1-sample batch,
        # breaking the loss computation.
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        print(f"Batch : {batch} epoch {epoch}, loss {loss.item():0.2f}")
        total_train_loss += loss.item()

    # Report the mean batch loss once per epoch (inside the epoch loop, so
    # every epoch is summarized rather than only the last one).
    avg_train_loss = total_train_loss / len(train_dataLoader)
    print(f" epoch {epoch +1}/{epochs}, Training loss : {avg_train_loss}")

In [None]:
# Evaluate on the held-out split: mean batch loss and overall accuracy.
model.eval()
total_val_loss = 0
correct_predicted = 0

with torch.no_grad():
    for input_ids, attention_mask, labels in val_dataLoader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # No .squeeze(): keeps the (batch, num_classes) shape that both the
        # loss and argmax(dim=1) rely on, even for a 1-sample batch.
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        total_val_loss += loss.item()
        pred = torch.argmax(outputs, dim=1)
        # Accumulate across batches; plain assignment would keep only the
        # last batch's hits while dividing by the full dataset size below.
        correct_predicted += (pred == labels).sum().item()
avg_val_loss = total_val_loss / len(val_dataLoader)
# Python float (not a device tensor) thanks to .item() above.
val_accuracy = correct_predicted / len(val_dataset)
print(f"validation loss{avg_val_loss},  val accuracy {val_accuracy}")

In [None]:
# Human-readable category name for each 0-based class id.
label_map = {
    class_id: class_name
    for class_id, class_name in enumerate(
        ("World", "Sports", "Business", "Sci/Tech")
    )
}

In [None]:
import warnings
# Install a blanket 'ignore' filter for all Python warnings from here on.
# NOTE(review): this also hides deprecation warnings from the libraries used
# in this notebook -- confirm that is intended.
warnings.filterwarnings('ignore')


In [None]:
# Sample multi-paragraph passage; it is passed to predict() in a later cell.
para="""The 1936 Berlin Olympics stands as one of the most iconic moments in Indian sports history. Under the captaincy of Dhyan Chand, the Indian men’s hockey team delivered a masterclass in skill and dominance, defeating Germany 8–1 in the final — all in front of a massive crowd, including Adolf Hitler himself.

Held in Nazi Germany, the Games were used as a propaganda tool by Hitler to showcase Aryan supremacy. However, India’s resounding victory disrupted that narrative. Dhyan Chand, often called the "Wizard of Hockey," scored three goals in the final, mesmerizing the crowd with his dribbling and control.

Legend has it that Hitler was so impressed by Dhyan Chand’s performance that he offered him a high-ranking position in the German army — which Dhyan Chand politely declined.

This match wasn't just a sporting win — it was a symbolic moment of pride for a colonized India, proving its excellence on a global stage during one of the most politically charged Olympics in history."""
para

In [None]:
def predict(text):
    """Classify ``text`` and return its category name from ``label_map``.

    The text is tokenized with the module-level ``tokenizer`` (padded and
    truncated to 128 tokens), run through the module-level ``model`` in eval
    mode without gradient tracking, and the highest-scoring class id is
    translated to its name.
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding='max_length',
        truncation=True,
        max_length=128,
    )
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(ids, mask).float()

    # .item() requires a single prediction, i.e. exactly one input sequence.
    class_id = torch.argmax(logits, dim=1).item()
    return label_map[class_id]

# Example invocations; the short single-sentence examples are disabled and
# only the long sample passage `para` is classified.
#print(predict("manu bhaker won the two olympic gold medals"))
#print(predict("vishnu has discover tech solution for coolent in data centers"))
print(predict(para))