In [1]:
import pandas as pd

In [2]:
# Read the raw news-aggregator CSV into a DataFrame; the bare name on the
# last line makes the frame the cell's displayed output.
df = pd.read_csv(filepath_or_buffer="uci-news-aggregator.csv")
df

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027
...,...,...,...,...,...,...,...,...
422414,422933,Surgeons to remove 4-year-old's rib to rebuild...,http://www.cbs3springfield.com/story/26378648/...,WSHM-TV,m,dpcLMoJD69UYMXMxaoEFnWql9YjQM,www.cbs3springfield.com,1409229190251
422415,422934,Boy to have surgery on esophagus after battery...,http://www.wlwt.com/news/boy-to-have-surgery-o...,WLWT Cincinnati,m,dpcLMoJD69UYMXMxaoEFnWql9YjQM,www.wlwt.com,1409229190508
422416,422935,Child who swallowed battery to have reconstruc...,http://www.newsnet5.com/news/local-news/child-...,NewsNet5.com,m,dpcLMoJD69UYMXMxaoEFnWql9YjQM,www.newsnet5.com,1409229190771
422417,422936,Phoenix boy undergoes surgery to repair throat...,http://www.wfsb.com/story/26368078/phoenix-boy...,WFSB,m,dpcLMoJD69UYMXMxaoEFnWql9YjQM,www.wfsb.com,1409229191071


In [3]:
import torch

In [4]:
from torch.utils.data import Dataset,DataLoader

In [5]:
from transformers import BertTokenizer, BertForSequenceClassification

In [6]:
import torch.nn as nn

In [7]:
from torch.optim import AdamW

In [8]:
from sklearn.metrics import accuracy_score

In [9]:
# Plain-text preview of the first five rows of the loaded frame.
print(df.head(n=5))

   ID                                              TITLE  \
0   1  Fed official says weak data caused by weather,...   
1   2  Fed's Charles Plosser sees high bar for change...   
2   3  US open: Stocks fall after Fed official hints ...   
3   4  Fed risks falling 'behind the curve', Charles ...   
4   5  Fed's Plosser: Nasty Weather Has Curbed Job Gr...   

                                                 URL          PUBLISHER  \
0  http://www.latimes.com/business/money/la-fi-mo...  Los Angeles Times   
1  http://www.livemint.com/Politics/H2EvwJSK2VE6O...           Livemint   
2  http://www.ifamagazine.com/news/us-open-stocks...       IFA Magazine   
3  http://www.ifamagazine.com/news/fed-risks-fall...       IFA Magazine   
4  http://www.moneynews.com/Economy/federal-reser...          Moneynews   

  CATEGORY                          STORY             HOSTNAME      TIMESTAMP  
0        b  ddUyU0VZz0BRneMioxUPQVP6sIxvM      www.latimes.com  1394470370698  
1        b  ddUyU0VZz0BRneMi

In [10]:
# Class balance: how many rows carry each category code.
category_counts = df["CATEGORY"].value_counts()
print(category_counts)

CATEGORY
e    152469
b    115967
t    108344
m     45639
Name: count, dtype: int64


In [11]:
# Keep only the headline and its category code, restricted to the four
# category codes used for classification.
# Rows and columns are selected in a single .loc call and the result is
# copied, so the column assignments made in later cells write to an
# independent DataFrame instead of to a slice of the raw frame (which is
# what triggers pandas' SettingWithCopyWarning).
df = df.loc[df["CATEGORY"].isin(["b", "t", "e", "m"]), ["TITLE", "CATEGORY"]].copy()

In [12]:
# Integer class ids are the categorical codes of the CATEGORY column.
category_column = df["CATEGORY"].astype("category")
df["label"] = category_column.cat.codes

In [13]:
# The headline is the model input; expose it under a "text" column.
df["text"] = df.loc[:, "TITLE"]

In [14]:
# Draw 80% of the rows for training; the fixed random_state keeps the
# draw repeatable.
train_df = df.sample(
    frac=0.8,
    random_state=42,
)

In [15]:
# Every row that was not sampled into train_df becomes the test split.
test_df = df.drop(index=train_df.index)

In [16]:
# Plain-text preview of the first five training rows.
print(train_df.head(n=5))

                                                    TITLE CATEGORY  label  \
280241  Only 2 people allowed in the delivery room: Pr...        e      1   
96315             A wealth of info, a welter of questions        b      0   
280410      Robin Thicke Naming Album After Paula Patton…        e      1   
284975  Actress And Civil Rights Activist Ruby Dee Die...        e      1   
379072                Feds to review Ga. Medicaid backlog        m      2   

                                                     text  
280241  Only 2 people allowed in the delivery room: Pr...  
96315             A wealth of info, a welter of questions  
280410      Robin Thicke Naming Album After Paula Patton…  
284975  Actress And Civil Rights Activist Ruby Dee Die...  
379072                Feds to review Ga. Medicaid backlog  


In [17]:
# Shrink the training split to 5000 rows (seeded draw).
train_df = train_df.sample(n=5000, random_state=42)

In [18]:
# Shrink the test split to 1000 rows (seeded draw).
test_df = test_df.sample(n=1000, random_state=42)

In [19]:
# Load the tokenizer that belongs to the checkpoint fine-tuned later in this
# notebook ("distilbert-base-uncased"). The datasets keep a reference to the
# tokenizer they are constructed with, so it has to be the same tokenizer
# that is saved next to the model rather than the one from a different
# checkpoint ("bert-base-uncased").
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

In [20]:
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        texts: Sequence of strings to classify.
        labels: Sequence of integer class ids, aligned position-by-position
            with ``texts``.
        tokenizer: Callable tokenizer. It is invoked with the keyword
            arguments shown in ``__getitem__`` and must return a mapping
            holding ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_len: Length every example is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Materialise both inputs as lists so that ``idx`` is always a plain
        # position, even when the caller passes a sequence-like object whose
        # own indexing is label-based (e.g. a pandas Series).
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` (the
            tokenizer's batch-of-one output with the leading dimension
            removed) and ``"labels"`` (a ``torch.long`` scalar tensor).
        """
        # Calling the tokenizer directly replaces the deprecated
        # ``encode_plus`` method; the keyword arguments are unchanged.
        encoding = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        return {
            # return_tensors="pt" adds a leading batch dimension of size 1;
            # drop it so the DataLoader can stack items into a batch itself.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }
# Wrap each split's headline strings and integer labels in a NewsDataset.
train_dataset = NewsDataset(
    texts=train_df["text"].tolist(),
    labels=train_df["label"].tolist(),
    tokenizer=tokenizer,
)
test_dataset = NewsDataset(
    texts=test_df["text"].tolist(),
    labels=test_df["label"].tolist(),
    tokenizer=tokenizer,
)

# Both loaders use batches of 16; only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16)

In [21]:
# ✅ Step 4: Define Model
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# The classification head is newly (randomly) initialised, so seed torch
# first to make a fresh run of this cell and the training that follows
# repeatable.
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# NOTE: this rebinds `tokenizer`. Dataset objects created in earlier cells
# keep the tokenizer instance they were constructed with.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Recover the category-code -> integer-id mapping with the same categorical
# encoding that produced df["label"], and store it in the model config so a
# reloaded model reports category codes instead of generic LABEL_n names.
label_names = list(df["CATEGORY"].astype("category").cat.categories)
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

Using device: cpu


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# ✅ Step 5: Training Loop (2 epochs)
for epoch_number in (1, 2):
    model.train()
    running_loss = 0.0

    for batch in train_loader:
        # The dataset yields exactly the keyword arguments the model takes
        # (input_ids, attention_mask, labels), so move them over together.
        batch_on_device = {key: tensor.to(device) for key, tensor in batch.items()}

        optimizer.zero_grad()
        # Passing labels makes the model return the loss alongside the logits.
        loss = model(**batch_on_device).loss
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report the mean per-batch loss for the epoch.
    mean_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch_number}, Loss: {mean_loss:.4f}")

Epoch 1, Loss: 0.5521
Epoch 2, Loss: 0.2409


In [23]:
# ✅ Step 6: Evaluation
model.eval()
preds = []
true_labels = []

# Gradients are not needed for scoring.
with torch.no_grad():
    for batch in test_loader:
        logits = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits
        # Predicted class = index of the largest logit in each row.
        preds.extend(logits.argmax(dim=1).cpu().numpy())
        true_labels.extend(batch["labels"].cpu().numpy())

acc = accuracy_score(true_labels, preds)
print(f"✅ Accuracy: {acc*100:.2f}%")

✅ Accuracy: 88.70%


In [25]:
# ✅ Step 7: Save Model
# Model and tokenizer are written to the same directory, model first.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./news_model")
print("✅ Model Saved!")

✅ Model Saved!


In [27]:
import os

In [29]:
# Show which files were written to the model directory.
saved_files = os.listdir("news_model")
print(saved_files)

['config.json', 'model.safetensors', 'special_tokens_map.json', 'tokenizer_config.json', 'vocab.txt']


In [33]:
# Rows whose category code is "m".
is_health = df["CATEGORY"] == "m"
health_news = df[is_health]

In [37]:
# Print the first 50 headlines of the "m" subset.
health_titles = health_news["TITLE"]
print(health_titles.head(50))

4207      Grown-ups: Put down the smartphones at mealtime
4208    Cellphone addiction may damage parent-child bo...
4209    Parents won't stop using smartphones even whil...
4210         Smartphones making parents ignore their kids
4211    Smartphones can loosen emotional bonding with ...
4212    Parents Distracted By Smartphones Ignore Their...
4213    Smartphones may threaten parent-child emotiona...
4214      Cell phone addiction may kill parent-child bond
4215    Parents often glued to cellphone while kids ea...
4216      Mobile Phones Affecting Parenting Skills: Study
4217        When smartphone is near, parenting may falter
4218         Study: Parents Using Smartphones Ignore Kids
4219      Mommies, get your noses out of your smartphones
4220    Smartphones: The cause of ignorance between pa...
4221    Parents absorbed in smartphones scold kids mor...
4222    Parents pay attention to their phones more tha...
4223    Are Our Mobile Devices Effecting The Relations...
4224    Parent