In [1]:
import csv
import os
import time
from random import randint
from urllib.parse import urldefrag, urljoin

import nltk
import requests
from bs4 import BeautifulSoup
from nltk import ne_chunk
from nltk.sentiment import SentimentIntensityAnalyzer

In [2]:
# NLTK resources used by the tokenizers, stop-word lookup, POS tagger,
# VADER sentiment analyzer and named-entity chunker in the cells below.
nltk_resources = [
    'punkt',
    'stopwords',
    'averaged_perceptron_tagger',
    'vader_lexicon',
    'maxent_ne_chunker',
    'words',
]
download_status = [nltk.download(resource) for resource in nltk_resources]
# Echo the status of the final download as the cell's output.
download_status[-1]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vetri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vetri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\vetri\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\vetri\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\vetri\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\vetri\AppData\Roaming\nltk_data...
[nltk_data]   Package words is

True

In [3]:

def scrape_and_store_data(url, timeout=10):
    """Fetch one page, compute NLP statistics for its article text, and append them to results.csv.

    The page is downloaded with a browser-like User-Agent. If the request
    fails, the error is printed and nothing is written. If the page has no
    ``<div class="article__body">`` element, the function returns without
    writing anything.

    Args:
        url: Absolute URL of the page to analyse.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait indefinitely on a stalled server.

    Returns:
        None. On success one row is appended to ``results.csv`` (a header row
        is written first when the file is empty).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.HTTPError as errh:
        print("HTTP Error:", errh)
        return
    except requests.exceptions.ConnectionError as errc:
        print("Error Connecting:", errc)
        return
    except requests.exceptions.Timeout as errt:
        print("Timeout Error:", errt)
        return
    except requests.exceptions.RequestException as err:
        print("Something went wrong!", err)
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # The article text is expected inside a 'div' tag with class 'article__body'.
    article_text = soup.find('div', {'class': 'article__body'})
    if article_text is None:
        return

    article = article_text.get_text()

    # Basic token / sentence counts
    tokenized_article = nltk.word_tokenize(article)
    num_words = len(tokenized_article)
    num_unique_words = len(set(tokenized_article))
    # Load the stop-word list once and use a set so each lookup is O(1)
    # instead of re-reading and scanning the list for every token.
    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    num_stopwords = sum(1 for word in tokenized_article if word.lower() in english_stopwords)
    num_sentences = len(nltk.sent_tokenize(article))

    # Part-of-speech tagging; noun tags all start with 'NN'
    pos_tags = nltk.pos_tag(tokenized_article)
    num_nouns = sum(1 for _, tag in pos_tags if tag.startswith('NN'))

    # Average word length
    avg_word_length = sum(len(word) for word in tokenized_article) / num_words if num_words > 0 else 0

    # Vocabulary Diversity (Type-Token Ratio - TTR)
    ttr = num_unique_words / num_words if num_words > 0 else 0

    # Sentiment Analysis
    sentiment_analyzer = SentimentIntensityAnalyzer()
    compound_score = sentiment_analyzer.polarity_scores(article)['compound']

    # Named Entity Recognition (NER): count the chunks that carry a label
    ner_tags = ne_chunk(pos_tags)
    num_entities = sum(1 for chunk in ner_tags if hasattr(chunk, 'label'))

    # Append the results to the CSV file
    with open('results.csv', 'a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Write the header only when the file is still empty
        if file.tell() == 0:
            writer.writerow(['URL', 'Number of Words', 'Number of Unique Words', 'Number of Stopwords', 'Number of Sentences', 'Number of Nouns', 'Average Word Length', 'Type-Token Ratio', 'Compound Sentiment Score', 'Number of Named Entities'])
        writer.writerow([url, num_words, num_unique_words, num_stopwords, num_sentences, num_nouns, avg_word_length, ttr, compound_score, num_entities])

    time.sleep(1)  # Add a delay to avoid overwhelming the server

def get_links(website, timeout=10):
    """Return the ``href`` value of every anchor tag found on a page.

    Args:
        website: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait indefinitely on a stalled server.

    Returns:
        A list of href strings exactly as they appear in the HTML (they may
        be relative paths or fragments). An empty list is returned, after
        printing the error, when the request fails.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    try:
        page = requests.get(website, headers=headers, timeout=timeout)
        page.raise_for_status()
    except requests.exceptions.HTTPError as errh:
        print("HTTP Error:", errh)
        return []
    except requests.exceptions.ConnectionError as errc:
        print("Error Connecting:", errc)
        return []
    except requests.exceptions.Timeout as errt:
        print("Timeout Error:", errt)
        return []
    except requests.exceptions.RequestException as err:
        print("Something went wrong!", err)
        return []

    soup = BeautifulSoup(page.content, 'html.parser')
    # href=True restricts the search to anchors that actually have an href.
    return [anchor['href'] for anchor in soup.find_all('a', href=True)]

# Example usage: collect every link on the topic index page and analyse each one.
website_url = 'https://www.webmd.com/a-to-z-guides/health-topics'
links_to_scrape = get_links(website_url)

print(links_to_scrape)

# Start from a clean results file; it is fine if none exists yet.
try:
    os.remove('results.csv')
except OSError:
    pass

# Resolve each href against the index page. urljoin handles root-relative
# paths ('/x' -> 'https://www.webmd.com/x') as well as page-relative ones;
# plain string concatenation would append them to the index page's own path.
visited_urls = set()
for link in links_to_scrape:
    # Drop the fragment so '#section' anchors collapse onto their page.
    full_url = urldefrag(urljoin(website_url, link)).url
    if not full_url.startswith(('http://', 'https://')):
        continue  # skip mailto:, javascript:, tel: and similar schemes
    if full_url in visited_urls:
        continue  # each page contributes at most one row to results.csv
    visited_urls.add(full_url)
    scrape_and_store_data(full_url)


['#main-container', 'https://www.webmd.com/', 'https://www.webmd.com', 'https://www.webmd.com/a-to-z-guides/health-topics', 'https://www.webmd.com/add-adhd/default.htm', 'https://www.webmd.com/allergies/default.htm', 'https://www.webmd.com/arthritis/default.htm', 'https://www.webmd.com/heart-disease/atrial-fibrillation/default.htm', 'https://www.webmd.com/breast-cancer/default.htm', 'https://www.webmd.com/cancer/default.htm', 'https://www.webmd.com/ibd-crohns-disease/crohns-disease/default.htm', 'https://www.webmd.com/depression/default.htm', 'https://www.webmd.com/diabetes/default.htm', 'https://www.webmd.com/dvt/default.htm', 'https://www.webmd.com/skin-problems-and-treatments/eczema/default.htm', 'https://www.webmd.com/eye-health/default.htm', 'https://www.webmd.com/heart-disease/default.htm', 'https://www.webmd.com/hiv-aids/default.htm', 'https://www.webmd.com/lung/default.htm', 'https://www.webmd.com/lupus/default.htm', 'https://www.webmd.com/mental-health/default.htm', 'https://w

In [4]:
import statistics

In [5]:
# Parser for each metric column of results.csv, in file order
# (the leading URL column is skipped).
METRIC_PARSERS = (int, int, int, int, int, float, float, float, int)


def load_metric_columns(results_path):
    """Read a per-article results CSV and return one list of values per metric.

    Args:
        results_path: Path to a CSV whose first row is a header and whose
            remaining rows hold a URL followed by nine metric values.

    Returns:
        A list of nine lists ordered like ``METRIC_PARSERS``. Every list is
        empty when the file is missing, empty, or contains only the header.

    Raises:
        ValueError: If a data row has fewer than nine metric cells or a cell
            cannot be parsed as the expected number type.
    """
    columns = [[] for _ in METRIC_PARSERS]
    try:
        # Read the file the same way it is written: UTF-8, with newline
        # handling left to the csv module.
        with open(results_path, 'r', newline='', encoding='utf-8') as file:
            reader = csv.reader(file)
            next(reader, None)  # Skip the header row (tolerates an empty file)
            for row in reader:
                if not row:
                    continue  # ignore blank lines
                values = [parse(cell) for parse, cell in zip(METRIC_PARSERS, row[1:])]
                if len(values) != len(METRIC_PARSERS):
                    raise ValueError(f"Malformed row in {results_path}: {row!r}")
                for column, value in zip(columns, values):
                    column.append(value)
    except FileNotFoundError:
        # No results file means no rows; the caller reports the empty result.
        pass
    return columns


# Read the results from the CSV file
(num_words, num_unique_words, num_stopwords, num_sentences, num_nouns,
 avg_word_length, ttr, compound_score, num_entities) = load_metric_columns('results.csv')

if num_words:
    # Calculate the average of each statistic
    average_num_words = statistics.mean(num_words)
    average_num_unique_words = statistics.mean(num_unique_words)
    average_num_stopwords = statistics.mean(num_stopwords)
    average_num_sentences = statistics.mean(num_sentences)
    average_num_nouns = statistics.mean(num_nouns)
    average_avg_word_length = statistics.mean(avg_word_length)
    average_ttr = statistics.mean(ttr)
    average_compound_score = statistics.mean(compound_score)
    average_num_entities = statistics.mean(num_entities)

    # Write the aggregated results to a file
    with open('aggregated_results.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['URL', 'Number of Words', 'Number of Unique Words', 'Number of Stopwords', 'Number of Sentences', 'Number of Nouns', 'Average Word Length', 'Type-Token Ratio', 'Compound Sentiment Score', 'Number of Named Entities'])
        writer.writerow(['Average', average_num_words, average_num_unique_words, average_num_stopwords, average_num_sentences, average_num_nouns, average_avg_word_length, average_ttr, average_compound_score, average_num_entities])
else:
    # statistics.mean() rejects empty data, so report the situation instead.
    print('No article statistics found in results.csv; nothing to aggregate.')