Write a crawler that crawls and collects news following business news websites and categorize the news based on title

1. Business Today
2. Economic Times
3. Mint
4. Money Control
5. Business World
6. Forbes India

In [None]:
# !pip install requests beautifulsoup4 newspaper3k lxml_html_clean --quiet
# !pip install transformers torch --quiet

In [170]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

import requests
from bs4 import BeautifulSoup
from newspaper import Article

In [171]:
print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [131]:
# Dataset with news headlines and categories
news_df = pd.read_csv('/content/NewsCategorizer.csv')
news_df.head()

Unnamed: 0,category,headline,links,short_description,keywords
0,WELLNESS,143 Miles in 35 Days: Lessons Learned,https://www.huffingtonpost.com/entry/running-l...,Resting is part of training. I've confirmed wh...,running-lessons
1,WELLNESS,Talking to Yourself: Crazy or Crazy Helpful?,https://www.huffingtonpost.com/entry/talking-t...,Think of talking to yourself as a tool to coac...,talking-to-yourself-crazy
2,WELLNESS,Crenezumab: Trial Will Gauge Whether Alzheimer...,https://www.huffingtonpost.com/entry/crenezuma...,The clock is ticking for the United States to ...,crenezumab-alzheimers-disease-drug
3,WELLNESS,"Oh, What a Difference She Made",https://www.huffingtonpost.com/entry/meaningfu...,"If you want to be busy, keep trying to be perf...",meaningful-life
4,WELLNESS,Green Superfoods,https://www.huffingtonpost.com/entry/green-sup...,"First, the bad news: Soda bread, corned beef a...",green-superfoods


In [132]:
# Dropping unnecessary columns
news_df.drop(columns=['links','short_description','keywords'], inplace=True)

In [133]:
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  50000 non-null  object
 1   headline  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [134]:
news_df['category'].value_counts(dropna=False)

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
WELLNESS,5000
POLITICS,5000
ENTERTAINMENT,5000
TRAVEL,5000
STYLE & BEAUTY,5000
PARENTING,5000
FOOD & DRINK,5000
WORLD NEWS,5000
BUSINESS,5000
SPORTS,5000


In [135]:
# Cleaning data
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text.strip()

news_df['category'] = news_df['category'].apply(clean_text)
news_df['headline'] = news_df['headline'].apply(clean_text)

In [136]:
news_df.head()

Unnamed: 0,category,headline
0,wellness,143 miles in 35 days lessons learned
1,wellness,talking to yourself crazy or crazy helpful
2,wellness,crenezumab trial will gauge whether alzheimers...
3,wellness,oh what a difference she made
4,wellness,green superfoods


In [137]:
# Encoding target features

ohe = OneHotEncoder()
encoded_categories = ohe.fit_transform(news_df[['category']])
encoded_df = pd.DataFrame(encoded_categories.toarray(), columns=ohe.get_feature_names_out(['category']))
news_df = pd.concat([news_df, encoded_df], axis=1)
news_df.head()

Unnamed: 0,category,headline,category_business,category_entertainment,category_food drink,category_parenting,category_politics,category_sports,category_style beauty,category_travel,category_wellness,category_world news
0,wellness,143 miles in 35 days lessons learned,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,wellness,talking to yourself crazy or crazy helpful,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,wellness,crenezumab trial will gauge whether alzheimers...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,wellness,oh what a difference she made,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,wellness,green superfoods,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [138]:
target_cols = ohe.get_feature_names_out(['category']).tolist()

In [140]:
X_train, X_test, y_train, y_test = train_test_split(news_df['headline'], news_df[target_cols], test_size=0.2, random_state=42)

In [139]:
# Finding total no.of unique words in headlines
unique_words = set()
news_df['headline'].str.split().apply(unique_words.update)
print(len(unique_words))

34216


In [142]:
# Finding the no of words in each headline
word_counts = news_df['headline'].apply(lambda x: len(x.split()))
word_counts.describe()

Unnamed: 0,headline
count,50000.0
mean,9.31842
std,3.13973
min,1.0
25%,7.0
50%,9.0
75%,11.0
max,43.0


In [154]:
# No of words at 95% percentile
word_counts.quantile(0.95)

np.float64(14.0)

In [156]:
vocab_size = 30000
embedding_dim = 100
max_len = 14
num_classes = len(target_cols)

tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences for consistent input size
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

In [162]:
# Creating our LSTM model with 1 Embedding layer, 1 LSTM layer, 3 Dense layers and 1 output Dense layer
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    LSTM(256),
    Dropout(0.5),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),
    Dense(32, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),
    Dense(num_classes, activation='softmax')
])

# Using Adam optimizer and cross entropy for loss
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy', 'precision', 'recall']
)

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train_pad, y_train,
    epochs=25,
    batch_size=50,
    validation_split=0.2,
    callbacks=[early_stopping]
)

Epoch 1/25
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 13ms/step - accuracy: 0.1474 - loss: 2.5357 - precision: 0.2005 - recall: 0.0202 - val_accuracy: 0.3074 - val_loss: 1.8183 - val_precision: 0.8595 - val_recall: 0.0642
Epoch 2/25
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 12ms/step - accuracy: 0.3459 - loss: 1.6950 - precision: 0.7252 - recall: 0.1075 - val_accuracy: 0.4509 - val_loss: 1.4430 - val_precision: 0.7838 - val_recall: 0.2026
Epoch 3/25
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - accuracy: 0.5424 - loss: 1.2228 - precision: 0.7824 - recall: 0.3037 - val_accuracy: 0.6104 - val_loss: 1.1749 - val_precision: 0.7894 - val_recall: 0.4504
Epoch 4/25
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - accuracy: 0.7094 - loss: 0.8467 - precision: 0.8506 - recall: 0.5705 - val_accuracy: 0.6453 - val_loss: 1.1681 - val_precision: 0.7512 - val_recall: 0.5579
Epoch 5/25
[1m64

In [163]:
# Generating classification report of model
y_pred = model.predict(X_test_pad)
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test.values, axis=1)

print(classification_report(y_test_classes, y_pred_classes))

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
              precision    recall  f1-score   support

           0       0.75      0.69      0.71       955
           1       0.74      0.70      0.72       985
           2       0.84      0.75      0.79      1021
           3       0.56      0.75      0.64      1030
           4       0.71      0.65      0.68      1034
           5       0.89      0.87      0.88       995
           6       0.87      0.78      0.82       986
           7       0.69      0.75      0.72      1008
           8       0.61      0.55      0.58      1009
           9       0.69      0.79      0.74       977

    accuracy                           0.73     10000
   macro avg       0.74      0.73      0.73     10000
weighted avg       0.73      0.73      0.73     10000



In [54]:
# List of news websites and their URLs
news_sites = {
    "Business Today": "https://www.businesstoday.in/latest",
    "Economic Times": "https://economictimes.indiatimes.com/news/latest-news",
    "Mint": "https://www.livemint.com/latest-news",
    "Money Control": "https://www.moneycontrol.com/news/business/",
    "Forbes India": "https://www.forbesindia.com/top-news/"
}

In [167]:
# Method to crawl and extract news headlines from news websites
def get_headlines(site, url, max_articles=5):
    """Fetches headlines from the given news site URL."""
    headlines = []
    try:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.content, "html.parser")

        if site == "Business Today":
            articles = soup.select("div.Section_widget_listing_content_section__wSvZN h2 a")[:max_articles]
        elif site == "Economic Times":
            articles = soup.select("ul.data li a")[:max_articles]
        elif site == "Mint":
            articles = soup.select("h2.headline a")[:max_articles]
        elif site == "Money Control":
            articles = soup.select("div.fleft ul li.clearfix h2 a")[:max_articles]
        elif site == "Forbes India":
            articles = soup.find_all("a", class_="jsx-7e2b30fcc84f6f01 ctnm")[:max_articles]
        else:
            articles = []
        for a in articles:
            title = a.get_text(strip=True)
            link = url + a.get('href', '')

            headlines.append({"title": title, "url": link})
    except Exception as e:
        print(f"Error fetching from {site}: {e}")
    return headlines

In [61]:
# Method to process headline into embedding and predict the category using model
def categorize_headline(title):
    title_seq = tokenizer.texts_to_sequences([title])
    title_pad = pad_sequences(title_seq, maxlen=max_len, padding='post', truncating='post')
    prediction = model.predict(title_pad, verbose='None')
    predicted_category = np.argmax(prediction)
    return target_cols[predicted_category]

In [168]:
# Crawl and store headlines
news_data = {}
for site, url in news_sites.items():
    news_data[site] = get_headlines(site, url)

# Classifying the headline using model and printing details
for site, articles in news_data.items():
    print(f"=== {site} ===")
    for article in articles:
        category = categorize_headline(article['title'])
        print(f"- [{category[9:]}] {article['title']} ({article['url']})")
    print("")

=== Business Today ===
- [wellness] INR vs USD: Why a stronger dollar doesn’t mean the rupee fell by the same percentage, explains Samir Arora (https://www.businesstoday.in/latesthttps://www.businesstoday.in/latest/economy/story/inr-vs-usd-why-a-stronger-dollar-doesnt-mean-the-rupee-fell-by-the-same-percentage-explains-samir-arora-491740-2025-08-29)
- [business] 'From delivery to house help': Startup founder warns gig economy is trapping India’s young (https://www.businesstoday.in/latesthttps://www.businesstoday.in/latest/trends/story/from-delivery-to-house-help-startup-founder-warns-gig-economy-is-trapping-indias-young-491738-2025-08-29)
- [business] Q1FY26 GDP growth: Five-quarter high growth gives hope for continued momentum (https://www.businesstoday.in/latesthttps://www.businesstoday.in/latest/economy/story/q1fy26-gdp-growth-five-quarter-high-growth-gives-hope-for-continued-momentum-491737-2025-08-29)
- [parenting] 'Supply chain disruptions, trade shifts posed challenges': Mukesh 