In [2]:
import os
import random
import re
import time
from urllib.parse import urljoin, urlparse

import nltk
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm.notebook import tqdm

In [4]:
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Create directories for raw and cleaned data
os.makedirs('raw_data', exist_ok=True)
os.makedirs('cleaned_data', exist_ok=True)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\visha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\visha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Define 20 categories and 3 websites for each


In [5]:
categories = {
    'technology': [
        'https://techcrunch.com/', 
        'https://www.wired.com/', 
        'https://www.theverge.com/'
    ],
    'health': [
        'https://www.health.com/', 
        'https://www.mayoclinic.org/diseases-conditions', 
        'https://www.webmd.com/'
    ],
    'finance': [
        'https://www.bloomberg.com/', 
        'https://www.cnbc.com/', 
        'https://www.investopedia.com/'
    ],
    'sports': [
        'https://www.espn.com/', 
        'https://sports.yahoo.com/', 
        'https://www.cbssports.com/'
    ],
    'entertainment': [
        'https://variety.com/', 
        'https://www.hollywoodreporter.com/', 
        'https://deadline.com/'
    ],
    'science': [
        'https://www.scientificamerican.com/', 
        'https://www.livescience.com/', 
        'https://www.nature.com/news'
    ],
    'travel': [
        'https://www.travelandleisure.com/', 
        'https://www.lonelyplanet.com/', 
        'https://www.nationalgeographic.com/travel/'
    ],
    'food': [
        'https://www.allrecipes.com/',
        'https://www.epicurious.com/',
        'https://www.bonappetit.com/'
    ],
    'education': [
        'https://www.edutopia.org/',
        'https://www.edweek.org/',
        'https://www.insidehighered.com/'
    ],
    'environment': [
        'https://www.nationalgeographic.com/environment/',
        'https://www.nature.org/',
        'https://www.ecowatch.com/'
    ],
    'politics': [
        'https://www.politico.com/',
        'https://thehill.com/',
        'https://www.fivethirtyeight.com/'
    ],
    'fashion': [
        'https://www.vogue.com/',
        'https://www.elle.com/',
        'https://www.harpersbazaar.com/'
    ],
    'art': [
        'https://www.artnews.com/',
        'https://www.artforum.com/',
        'https://news.artnet.com/'
    ],
    'automotive': [
        'https://www.caranddriver.com/',
        'https://www.motortrend.com/',
        'https://www.autocar.co.uk/'
    ],
    'business': [
        'https://www.forbes.com/',
        'https://www.entrepreneur.com/',
        'https://hbr.org/'
    ],
    'real_estate': [
        'https://www.realtor.com/news/',
        'https://www.zillow.com/blog/',
        'https://www.redfin.com/news/'
    ],
    'gaming': [
        'https://www.ign.com/',
        'https://www.gamespot.com/',
        'https://kotaku.com/'
    ],
    'pets': [
        'https://www.akc.org/',
        'https://www.petmd.com/',
        'https://www.thesprucepets.com/'
    ],
    'history': [
        'https://www.history.com/',
        'https://www.historyextra.com/',
        'https://www.ancient-origins.net/'
    ],
    'psychology': [
        'https://www.psychologytoday.com/',
        'https://www.apa.org/',
        'https://www.verywellmind.com/'
    ]
}

Implement a web crawler to extract text from the websites

In [6]:
# Define headers to mimic a browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def get_article_links(url, max_articles=10):
    """Extract article links from the main page of a website"""
    try:
        response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Look for common article patterns (a tags with href)
        links = []
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            
            # Make sure the link is an article (contains keywords or patterns)
            article_indicators = ['/article/', '/news/', '/story/', '/post/', '/blog/']
            if any(indicator in href for indicator in article_indicators) or re.search(r'\d{4}/\d{2}', href):
                # Handle relative URLs
                if href.startswith('/'):
                    base_url = '/'.join(url.split('/')[:3])  # Get domain
                    href = base_url + href
                elif not href.startswith(('http://', 'https://')):
                    if url.endswith('/'):
                        href = url + href
                    else:
                        href = url + '/' + href
                
                # Only include links from the same domain
                if url.split('/')[2] in href:
                    links.append(href)
        
        # Remove duplicates and limit to max_articles
        return list(set(links))[:max_articles]
    except Exception as e:
        print(f"Error fetching links from {url}: {e}")
        return []

def extract_article_content(url):
    """Extract title, date, and content from an article page"""
    try:
        response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract title
        title = ""
        title_tag = soup.find('h1')
        if title_tag:
            title = title_tag.get_text().strip()
        
        # Extract date (common patterns)
        date = ""
        # Look for time tags
        time_tag = soup.find('time')
        if time_tag:
            date = time_tag.get_text().strip()
        else:
            # Look for elements with date-related classes or properties
            date_patterns = ['date', 'publish', 'posted', 'time']
            for pattern in date_patterns:
                date_element = soup.find(class_=lambda c: c and pattern in c.lower())
                if date_element:
                    date = date_element.get_text().strip()
                    break
        
        # Extract main content
        content = ""
        # Look for article tags
        article_tag = soup.find('article')
        if article_tag:
            paragraphs = article_tag.find_all('p')
            content = ' '.join([p.get_text().strip() for p in paragraphs])
        else:
            # Look for div with content classes
            content_div = soup.find('div', class_=lambda c: c and any(x in c.lower() for x in ['content', 'article', 'story', 'entry']))
            if content_div:
                paragraphs = content_div.find_all('p')
                content = ' '.join([p.get_text().strip() for p in paragraphs])
            else:
                # Just get all p tags in the body as a fallback
                paragraphs = soup.find_all('p')
                content = ' '.join([p.get_text().strip() for p in paragraphs])
        
        return {
            'url': url,
            'title': title,
            'date': date,
            'content': content
        }
    except Exception as e:
        print(f"Error extracting content from {url}: {e}")
        return {
            'url': url,
            'title': '',
            'date': '',
            'content': ''
        }

Crawl websites and store data

In [7]:
all_data = {}

for category, websites in tqdm(categories.items(), desc="Processing categories"):
    print(f"\nCrawling websites for category: {category}")
    category_data = []
    
    for website in websites:
        print(f"  Getting links from {website}")
        article_links = get_article_links(website)
        
        print(f"  Found {len(article_links)} articles, extracting content...")
        for link in article_links:
            # Add a random delay to be respectful to the websites
            time.sleep(random.uniform(1, 3))
            
            article_data = extract_article_content(link)
            if article_data['content']:  # Only add if we got some content
                category_data.append(article_data)
                print(f"  Extracted: {article_data['title'][:50]}..." if article_data['title'] else "  Extracted article")
    
    all_data[category] = category_data
    
    # Save raw data to file
    raw_file_path = os.path.join('raw_data', f"{category}_raw.txt")
    with open(raw_file_path, 'w', encoding='utf-8') as f:
        for article in category_data:
            f.write(f"URL: {article['url']}\n")
            f.write(f"TITLE: {article['title']}\n")
            f.write(f"DATE: {article['date']}\n")
            f.write(f"CONTENT:\n{article['content']}\n")
            f.write("-" * 80 + "\n")
    
    print(f"Saved {len(category_data)} articles for category '{category}'")

Processing categories:   0%|          | 0/20 [00:00<?, ?it/s]


Crawling websites for category: technology
  Getting links from https://techcrunch.com/
  Found 10 articles, extracting content...
  Extracted: Apple’s smart home hub reportedly delayed by Siri ...
  Extracted: Judge allows authors’ AI copyright lawsuit against...
  Extracted: How to stop doomscrolling...
  Extracted: US charges admins of Garantex for allegedly facili...
  Extracted: Scale AI is being investigated by the US Departmen...
  Extracted: ChatGPT: Everything you need to know about the AI-...
  Extracted: Why VCs ghost founders, or reject deals and never ...
  Extracted: Anthropic’s Claude Code tool had a bug that ‘brick...
  Extracted: SpaceX Starship spirals out of control in second s...
  Extracted: Google scrubs mentions of ‘diversity’ and ‘equity’...
  Getting links from https://www.wired.com/
  Found 10 articles, extracting content...
  Extracted: ‘Startup Nation’ Groups Say They’re Meeting Trump ...
  Extracted: With GPT-4.5, OpenAI Trips Over Its Own AGI Ambiti...
  

  Extracted: China’s other great wall is impressive, too—and st...
  Extracted: 10 family-friendly things to do in Texas...
  Extracted: 10 family-friendly hotels in Texas, from El Paso t...
  Extracted: A practical guide to travelling in southern Mexico...
  Extracted: The unexpected wine country you need to visit: New...
  Extracted: It's the 100th anniversary of The Great Gatsby and...
  Extracted: Meet the female rappers carving out a home in Nash...
  Extracted: National Geographic Masthead...
  Extracted: Where to try the 6 traditional dishes served on ‘N...
  Extracted: Where to travel in April...
Saved 10 articles for category 'travel'

Crawling websites for category: food
  Getting links from https://www.allrecipes.com/
  Found 3 articles, extracting content...
  Extracted: Secrets to the Perfect St. Paddy's Day Corned Beef...
  Extracted: Tips for Submitting Your Recipe to Allrecipes...
  Extracted: How To Make Irish Soda Bread...
  Getting links from https://www.epicurious.c

  Found 0 articles, extracting content...
Saved 20 articles for category 'automotive'

Crawling websites for category: business
  Getting links from https://www.forbes.com/
  Found 10 articles, extracting content...
  Extracted: Malaysian Tycoon Lim Han Weng’s HI Mobility Adding...
  Extracted: Forbes Editorial Values And Standards...
  Extracted: Trump Threatens New Tariffs On Canada As Soon As T...
  Extracted: Stocks Suffer Worst Week In 6 Months—Despite Frida...
  Extracted: Elon Musk Dubiously Blames Billionaires George Sor...
  Extracted: Here’s Where Trump’s Government Layoffs Are Target...
  Extracted: Gene Hackman Died Of Heart Disease—And Wife Killed...
  Extracted: Hong Kong’s 50 Richest 2025: Stock Market Rally Li...
  Extracted: Trump Won’t Deny There Could Be A Recession—Warns ...
  Extracted: CDC Plans Large Study On Long-Debunked Connection ...
  Getting links from https://www.entrepreneur.com/
  Found 2 articles, extracting content...
  Extracted: New Tech That Will Im

Clean the text using NLP preprocessing

In [8]:
def clean_text(text):
    """Apply NLP preprocessing to clean the text"""
    # Convert to lowercase
    text = text.lower()
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    
    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)
    
    # Remove special characters and numbers
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Rejoin tokens
    cleaned_text = ' '.join(tokens)
    
    # Remove extra whitespace
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    
    return cleaned_text

# Process and save cleaned data for each category
for category, articles in tqdm(all_data.items(), desc="Cleaning text data"):
    cleaned_texts = []
    
    for article in articles:
        cleaned_title = clean_text(article['title'])
        cleaned_content = clean_text(article['content'])
        
        cleaned_texts.append(f"{cleaned_title} {cleaned_content}")
    
    # Save cleaned data to file
    cleaned_file_path = os.path.join('cleaned_data', f"{category}_cleaned.txt")
    with open(cleaned_file_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(cleaned_texts))
    
    print(f"Saved cleaned text for category '{category}'")

Cleaning text data:   0%|          | 0/20 [00:00<?, ?it/s]

Saved cleaned text for category 'technology'
Saved cleaned text for category 'health'
Saved cleaned text for category 'finance'
Saved cleaned text for category 'sports'
Saved cleaned text for category 'entertainment'
Saved cleaned text for category 'science'
Saved cleaned text for category 'travel'
Saved cleaned text for category 'food'
Saved cleaned text for category 'education'
Saved cleaned text for category 'environment'
Saved cleaned text for category 'politics'
Saved cleaned text for category 'fashion'
Saved cleaned text for category 'art'
Saved cleaned text for category 'automotive'
Saved cleaned text for category 'business'
Saved cleaned text for category 'real_estate'
Saved cleaned text for category 'gaming'
Saved cleaned text for category 'pets'
Saved cleaned text for category 'history'
Saved cleaned text for category 'psychology'


In [11]:
# Create a simple feature extraction function
def extract_features(text, common_words_per_category):
    """Extract features based on the presence of category-specific words"""
    features = {}
    words = set(text.split())
    
    for category, common_words in common_words_per_category.items():
        # Count how many common words from each category appear in the text
        category_score = sum(1 for word in common_words if word in words)
        features[f"{category}_score"] = category_score
    
    return features

In [12]:
# Find most common words per category (simplified TF-IDF approach)
common_words_per_category = {}

for category in categories.keys():
    cleaned_file_path = os.path.join('cleaned_data', f"{category}_cleaned.txt")
    try:
        with open(cleaned_file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        
        # Get word frequencies
        words = text.split()
        word_freq = {}
        for word in words:
            if len(word) > 3:  # Only consider words longer than 3 characters
                word_freq[word] = word_freq.get(word, 0) + 1
        
        # Get top 20 words
        top_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:20]
        common_words_per_category[category] = [word for word, _ in top_words]
    except Exception as e:
        print(f"Error processing {category}: {e}")
        common_words_per_category[category] = []

In [13]:
# Prepare sample data for classifier demonstration
samples = []

for category in categories.keys():
    cleaned_file_path = os.path.join('cleaned_data', f"{category}_cleaned.txt")
    try:
        with open(cleaned_file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        
        # Take first 2 samples from each category
        for i, line in enumerate(lines[:2]):
            samples.append({
                'text': line,
                'actual_category': category
            })
    except Exception as e:
        print(f"Error getting samples for {category}: {e}")

# Classify samples
for sample in samples:
    features = extract_features(sample['text'], common_words_per_category)
    
    # Find category with highest score
    predicted_category = max(features.items(), key=lambda x: x[1])[0].split('_')[0]
    
    sample['predicted_category'] = predicted_category
    sample['features'] = features

In [14]:
# Calculate accuracy
correct = sum(1 for sample in samples if sample['actual_category'] == sample['predicted_category'])
accuracy = correct / len(samples) if samples else 0

print(f"Simple classifier accuracy: {accuracy:.2f} ({correct}/{len(samples)})")

# Display sample classifications
print("\nSample Classifications:")
for i, sample in enumerate(samples[:5]):  # Show first 5 samples
    print(f"Sample {i+1}:")
    print(f"  Actual category: {sample['actual_category']}")
    print(f"  Predicted category: {sample['predicted_category']}")
    print(f"  Text snippet: {sample['text'][:100]}...")
    print()

Simple classifier accuracy: 0.66 (23/35)

Sample Classifications:
Sample 1:
  Actual category: technology
  Predicted category: technology
  Text snippet: apples smart home hub reportedly delayed siri challenges latest ai amazon apps biotech health climat...

Sample 2:
  Actual category: technology
  Predicted category: technology
  Text snippet: judge allows authors ai copyright lawsuit meta move forward latest ai amazon apps biotech health cli...

Sample 3:
  Actual category: health
  Predicted category: health
  Text snippet: fda approves new treatment stroke march fda approved first newâ clotbustingâ drug nearly years offer...

Sample 4:
  Actual category: health
  Predicted category: health
  Text snippet: common vaginal issue really std study march â men whose female sex partners get itchy vaginal infect...

Sample 5:
  Actual category: finance
  Predicted category: finance
  Text snippet: evtols flying cars finally becoming reality video inventors fixated idea flying cars nearly

In [15]:
# Save the dataset information
dataset_info = {
    'name': 'MultiDomain-TextCorpus',
    'description': 'A comprehensive text dataset covering 20 different domains with articles collected from 3 websites per domain.',
    'categories': list(categories.keys()),
    'total_articles': sum(len(articles) for articles in all_data.values()),
    'articles_per_category': {category: len(articles) for category, articles in all_data.items()},
    'average_content_length': sum(len(article['content']) for articles in all_data.values() for article in articles) / 
                              sum(len(articles) for articles in all_data.values() if articles),
}

print("\nDataset Summary:")
print(f"Name: {dataset_info['name']}")
print(f"Description: {dataset_info['description']}")
print(f"Total categories: {len(dataset_info['categories'])}")
print(f"Total articles: {dataset_info['total_articles']}")
print(f"Average content length: {dataset_info['average_content_length']:.2f} characters")
print("\nArticles per category:")
for category, count in dataset_info['articles_per_category'].items():
    print(f"  {category}: {count}")


Dataset Summary:
Name: MultiDomain-TextCorpus
Description: A comprehensive text dataset covering 20 different domains with articles collected from 3 websites per domain.
Total categories: 20
Total articles: 222
Average content length: 6325.24 characters

Articles per category:
  technology: 20
  health: 4
  finance: 7
  sports: 20
  entertainment: 26
  science: 10
  travel: 10
  food: 13
  education: 14
  environment: 10
  politics: 0
  fashion: 11
  art: 14
  automotive: 20
  business: 21
  real_estate: 5
  gaming: 0
  pets: 1
  history: 6
  psychology: 10


In [16]:
# Export dataset information to CSV
dataset_stats = pd.DataFrame([{
    'Category': category,
    'Articles': count,
    'Websites': ', '.join(categories[category])
} for category, count in dataset_info['articles_per_category'].items()])

dataset_stats.to_csv('dataset_stats.csv', index=False)
print("\nDataset statistics saved to 'dataset_stats.csv'")


Dataset statistics saved to 'dataset_stats.csv'


Challenge: Demonstrate a use case - Create a simple document classifier
MultiDomain-TextCorpus Use Case: Document Classification


Dataset Name: MultiDomain-TextCorpus (WebText20): A Multi-Domain Text Dataset

Use Case: This dataset can be used for topic modeling, text classification, NLP-based recommendation systems, and AI-driven content summarization.