In [21]:
# Task 1: scrape links, article titles and descriptions from news home pages

import requests
from bs4 import BeautifulSoup

In [22]:
def extract_data(url, timeout=10):
    """Download a page and pull out its links and article title/description pairs.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        A ``(links, titles, descriptions)`` tuple of lists.
        ``links`` holds the ``href`` value of every ``<a>`` tag that has one,
        exactly as written in the page (so entries may be relative URLs).
        ``titles`` and ``descriptions`` hold the text of the first ``<h2>`` and
        first ``<p>`` of each ``<article>`` that contains both; the two lists
        are index-aligned and always the same length.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently scraping an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # href=True skips anchors without an href, which would otherwise
    # contribute None entries to the result.
    links = [anchor.get('href') for anchor in soup.find_all('a', href=True)]

    titles = []
    descriptions = []
    for article in soup.find_all('article'):
        title_tag = article.find('h2')
        description_tag = article.find('p')

        # Keep only articles that provide both parts so the lists stay aligned.
        if title_tag is None or description_tag is None:
            continue

        titles.append(title_tag.get_text())
        descriptions.append(description_tag.get_text())

    return links, titles, descriptions

In [23]:
# Home pages of the two news sites to scrape.
dawn_url = "https://www.dawn.com/"
bbc_url = "https://www.bbc.com/"

In [24]:
# Scrape the Dawn home page, then show each of the three result lists.
dawn_links, dawn_titles, dawn_descriptions = extract_data(dawn_url)
for heading, items in (
    ("Dawn Links:", dawn_links),
    ("Dawn Titles:", dawn_titles),
    ("Dawn Descriptions:", dawn_descriptions),
):
    print(heading, items)

Dawn Links: ['https://epaper.dawn.com', 'https://www.dawnnews.tv/watch-live', 'https://www.dawnnews.tv', 'https://images.dawn.com', 'https://herald.dawn.com', 'https://aurora.dawn.com', 'https://cityfm89.com', 'https://www.dawn.com/advertise', 'https://educationexpo.dawn.com', 'https://www.dawn.com/events/supplements', 'http://classifieds.dawn.com/', 'https://obituary.dawn.com', 'https://www.dawn.com/', 'https://epaper.dawn.com', '//www.dawn.com', '/latest-news', 'https://www.dawn.com/trends/gaza-invasion', '/pakistan', '/opinion', '/business', '/world', '/entertainment', '/prism', '/sport', '/magazines', '/tech', '/videos', '/popular', '/newspaper', 'https://www.dawnrelief.com/', '//www.dawn.com', '/latest-news', 'https://www.dawn.com/trends/gaza-invasion', '/pakistan', '/opinion', '/business', '/world', '/entertainment', '/prism', '/sport', '/magazines', '/tech', '/videos', '/popular', '/newspaper', 'https://www.dawnrelief.com/', '/live/elections-2024#1822566', '/live/elections-2024#

In [25]:
# Scrape the BBC home page, then show each of the three result lists.
bbc_links, bbc_titles, bbc_descriptions = extract_data(bbc_url)
for heading, items in (
    ("BBC Links:", bbc_links),
    ("BBC Titles:", bbc_titles),
    ("BBC Descriptions:", bbc_descriptions),
):
    print(heading, items)



BBC Links: ['/', '/', '/news', '/sport', '/business', '/innovation', '/culture', '/travel', '/future-planet', '/video', '/live', '/home', '/news', '/news/topics/c2vdnvdg6xxt', '/news/war-in-ukraine', '/news/topics/ce483qevngqt', '/news/us-canada', '/news/uk', '/news/politics', '/news/england', '/news/northern_ireland', '/news/northern_ireland/northern_ireland_politics', '/news/scotland', '/news/scotland/scotland_politics', '/news/wales', '/news/wales/wales_politics', '/news/world/africa', '/news/world/asia', '/news/world/asia/china', '/news/world/asia/india', '/news/world/australia', '/news/world/europe', '/news/world/latin_america', '/news/world/middle_east', '/news/in_pictures', '/news/reality_check', '/sport', '/business', '/business/future-of-business', '/business/technology-of-business', '/business/c-suite', '/innovation', '/innovation/technology', '/innovation/science', '/innovation/artificial-intelligence', '/culture', '/culture/film-tv', '/culture/music', '/culture/art', '/cult

In [35]:
# Task 2: clean and preprocess the scraped titles and descriptions

import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Extra directory NLTK should search for its corpora and tokenizer models.
# NOTE(review): "/path/to/nltk_data" looks like a placeholder — point it at the
# real nltk_data directory before relying on it.
NLTK_DATA_DIR = "/path/to/nltk_data"

# Guard the append so re-running this cell does not pile up duplicate entries.
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

# Function to clean and preprocess text
def clean_text(text):
    """Normalise a piece of scraped text into space-separated lemmas.

    Steps, in order: strip HTML tags, replace every non-letter character with
    a space, lowercase, tokenize with ``word_tokenize``, drop English
    stopwords, lemmatize each remaining token with ``WordNetLemmatizer`` and
    join the result with single spaces.

    Args:
        text: Raw string to clean. ``None`` or an empty string yields ``''``.

    Returns:
        The cleaned string (possibly empty).
    """
    if not text:
        return ''

    # Replace tags with a space (not '') so words separated only by markup,
    # e.g. "cats<br>sat", are not fused together. [^>]* also matches tags
    # that span a line break, which '.*?' does not without re.DOTALL.
    without_tags = re.sub(r'<[^>]*>', ' ', text)

    # Keep letters only; digits and punctuation become separators.
    letters_only = re.sub(r'[^a-zA-Z]', ' ', without_tags)

    tokens = word_tokenize(letters_only.lower())

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]

    return ' '.join(lemmas)

# Run every scraped title and description through clean_text.
clean_dawn_titles = list(map(clean_text, dawn_titles))
clean_dawn_descriptions = list(map(clean_text, dawn_descriptions))

clean_bbc_titles = list(map(clean_text, bbc_titles))
clean_bbc_descriptions = list(map(clean_text, bbc_descriptions))

# Show the cleaned lists, Dawn first and then BBC.
for heading, cleaned in (
    ("Cleaned Dawn Titles:", clean_dawn_titles),
    ("Cleaned Dawn Descriptions:", clean_dawn_descriptions),
    ("Cleaned BBC Titles:", clean_bbc_titles),
    ("Cleaned BBC Descriptions:", clean_bbc_descriptions),
):
    print(heading, cleaned)



Cleaned Dawn Titles: []
Cleaned Dawn Descriptions: []
Cleaned BBC Titles: ['russia blame ukraine blast destroys apartment']
Cleaned BBC Descriptions: ['least two people said killed section storey block collapsed belgorod']


In [36]:
import csv

# Persist the cleaned (title, description) pairs to data.csv:
# a header row first, then the Dawn rows followed by the BBC rows.
with open('data.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Title', 'Description'])
    # zip() pairs entries by index and stops at the shorter list.
    writer.writerows(zip(clean_dawn_titles, clean_dawn_descriptions))
    writer.writerows(zip(clean_bbc_titles, clean_bbc_descriptions))
