# NLP Assignment – Web Scraping & Text Processing

## Part A: Web Scraping

In [2]:
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Crawl three section pages of The Hindu, follow every story-card link, and
# store each article's title, URL and paragraph text in news_dataset.csv.
articles = []
headers = {"User-Agent": "Mozilla/5.0"}

BASE_URL = "https://www.thehindu.com"
# requests has no default timeout; without one a stalled server hangs the cell.
REQUEST_TIMEOUT = 10  # seconds

urls = [
    "https://www.thehindu.com/news/national/",
    "https://www.thehindu.com/news/international/",
    "https://www.thehindu.com/sci-tech/"
]

# A story can be listed in more than one section; fetch and store it only once.
seen_links = set()

for url in urls:
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        # Treat HTTP error pages (4xx/5xx) as failures instead of parsing them.
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Skipping section {url}: {exc}")
        continue
    finally:
        # Pause after every request, successful or not, to stay polite.
        time.sleep(1)

    soup = BeautifulSoup(response.text, "html.parser")

    for item in soup.find_all("a", class_="story-card"):
        href = item.get("href")
        if not href:
            # Anchor without a target: nothing to fetch.
            continue

        title = item.text.strip()
        # urljoin resolves relative hrefs against the site root and leaves
        # already-absolute hrefs untouched (plain concatenation doubled the host).
        link = urljoin(BASE_URL, href)
        if link in seen_links:
            continue
        seen_links.add(link)

        try:
            article_page = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT)
            article_page.raise_for_status()
            s2 = BeautifulSoup(article_page.text, "html.parser")
            content = " ".join(p.text for p in s2.find_all("p"))
        except Exception as exc:
            # Best-effort crawl: one bad article must not abort the run, but the
            # failure is reported, and Ctrl-C / kernel interrupt still works
            # because KeyboardInterrupt is not an Exception subclass.
            print(f"Skipping article {link}: {exc}")
            continue
        finally:
            time.sleep(1)

        articles.append({"title": title, "url": link, "content": content})

# Explicit columns keep the CSV header even when nothing was scraped, so the
# later pd.read_csv call does not fail on an empty file.
df = pd.DataFrame(articles, columns=["title", "url", "content"])
df.to_csv("news_dataset.csv", index=False)
df.head()

In [5]:
!pip install nltk


Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting click (from nltk)
  Downloading click-8.3.1-py3-none-any.whl.metadata (2.6 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2025.11.3-cp313-cp313-win_amd64.whl.metadata (41 kB)
Collecting tqdm (from nltk)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   -------------------- ------------------- 0.8/1.5 MB 4.1 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 3.8 MB/s eta 0:00:00
Downloading regex-2025.11.3-cp313-cp313-win_amd64.whl (277 kB)
Downloading click-8.3.1-py3-none-any.whl (108 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, regex, click, nltk
Successfully installed click-8.3.1 nltk-3.9.2 regex-2025.11.3 tqdm-4.67.1



[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


## Part B: NLP Pipeline

In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import string

# Reload the scraped articles from the CSV file produced by the scraping cell.
df = pd.read_csv(filepath_or_buffer="news_dataset.csv")

def clean_text(text):
    """Strip markup, non-ASCII characters and redundant whitespace from text.

    Args:
        text: Raw article text. Non-string values (e.g. the NaN that
            ``pd.read_csv`` produces for an empty cell, or ``None``) are
            treated as missing.

    Returns:
        The cleaned, single-spaced ASCII string; ``""`` for missing input.
    """
    if not isinstance(text, str):
        return ""

    # Remove tags first; DOTALL lets a tag that spans a line break match.
    text = re.sub(r"<.*?>", "", text, flags=re.DOTALL)
    # Turn every whitespace run (including non-breaking spaces) into one space
    # *before* dropping non-ASCII, so words separated by such spaces stay apart.
    text = re.sub(r"\s+", " ", text)
    text = text.encode("ascii", "ignore").decode()
    # Removing characters can leave adjacent spaces behind; collapse once more.
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Clean every raw article body and keep the result in its own column.
df["clean_content"] = df["content"].map(clean_text)

# NLTK's English stop-word list, held as a set for fast membership tests.
stop_words = {*stopwords.words("english")}

def preprocess(text):
    """Lower-case text, drop punctuation and remove English stop words.

    Stop words are matched both before and after punctuation removal, so a
    contraction such as "don't" is recognised as a stop word instead of
    surviving as "dont".

    Args:
        text: Cleaned article text (a string).

    Returns:
        The remaining words joined by single spaces.
    """
    strip_punct = str.maketrans("", "", string.punctuation)
    kept = []
    for raw in text.lower().split():
        bare = raw.translate(strip_punct)
        if not bare:
            # Token consisted only of punctuation.
            continue
        # Trim only the surrounding punctuation for the second lookup so that
        # "don't," or "(it's)" still match their stop-word entries.
        if bare in stop_words or raw.strip(string.punctuation) in stop_words:
            continue
        kept.append(bare)
    return " ".join(kept)

# Build the normalised text (lower-case, no punctuation, no stop words).
df["processed"] = df["clean_content"].map(preprocess)
df.head(n=5)

## Tokenization

In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize
# Recent NLTK releases (including the 3.9.x line) load the "punkt_tab"
# resource in word_tokenize / sent_tokenize; older releases use "punkt".
# Fetching both keeps the cell working on either.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

# Word tokens come from the normalised text; sentences come from the cleaned
# text, which still has the punctuation sentence splitting relies on.
df["tokens"] = df["processed"].apply(word_tokenize)
df["sentences"] = df["clean_content"].apply(sent_tokenize)
df.head()

## Language Statistics

In [None]:
# Per-article size metrics: number of word tokens and number of sentences.
df["word_count"] = df["tokens"].map(len)
df["sentence_count"] = df["sentences"].map(len)

# NOTE(review): Counter is imported but not used anywhere in this cell.
from collections import Counter

# Flatten the per-article token lists into one corpus-wide list of words.
all_words = [token for doc_tokens in df["tokens"] for token in doc_tokens]

# The vocabulary is the set of distinct tokens across all articles.
vocab = set(all_words)
print("Vocabulary Size:", len(vocab))
df.head()