<a href="https://colab.research.google.com/github/vasupradha2003/Web_Data_ETL_Pipeline_Article_Word_Frequency_Analysis/blob/main/Web_Data_ETL_Pipeline_Article_Word_Frequency_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
from collections import Counter

import nltk
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

In [None]:
# Download the NLTK "stopwords" corpus. ETLPipeline reads the English list
# from it via stopwords.words("english"), so this has to run first.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Web Scraper class to extract text from the article
class WebScraper:
    """Download a web page and return its text content.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before the request is
            abandoned. Defaults to 10.
    """

    def __init__(self, url, timeout=10):
        self.url = url
        self.timeout = timeout

    def extract_article_text(self):
        """Fetch ``self.url`` and return the text of the parsed HTML.

        Returns:
            str: The page's text fragments joined by single spaces.

        Raises:
            requests.exceptions.RequestException: If the connection fails,
                the request times out, or the server answers with an HTTP
                error status (4xx/5xx).
        """
        # Without a timeout requests may block indefinitely on a dead server.
        response = requests.get(self.url, timeout=self.timeout)
        # Fail loudly instead of analysing the body of an error page.
        response.raise_for_status()

        # Pass raw bytes so BeautifulSoup can work out the encoding itself.
        soup = BeautifulSoup(response.content, "html.parser")

        # A separator keeps words from adjacent elements apart; without it
        # "<h1>Data</h1><p>Science</p>" would come out as "DataScience".
        return soup.get_text(separator=" ", strip=True)

In [None]:
# Text Processor class to clean and tokenize the article text
class TextProcessor:
    """Turn raw text into a list of lower-cased, stopword-free words.

    Args:
        nltk_stopwords: Collection of lower-case words to discard. A set
            gives the fastest membership checks.
    """

    # Runs of non-word characters at the very start or end of a token,
    # e.g. the comma in "data," or the brackets in "(science)".
    _EDGE_PUNCTUATION = re.compile(r"^\W+|\W+$")

    def __init__(self, nltk_stopwords):
        self.nltk_stopwords = nltk_stopwords

    def tokenize_and_clean(self, text):
        """Split ``text`` on whitespace and keep the meaningful words.

        Punctuation attached to either end of a token is removed first, so
        "data," and "Data." both count as "data". Tokens that still contain
        anything other than letters afterwards (digits, hyphens,
        apostrophes, ...) are dropped, as are stopwords.

        Args:
            text: The raw text to process.

        Returns:
            list[str]: Lower-cased words in their original order; duplicates
            are kept so they can be counted.
        """
        filtered_words = []
        for token in text.split():
            stripped = self._EDGE_PUNCTUATION.sub("", token)
            # Check on the un-lowered form: lower() can introduce combining
            # marks for a few characters, which would fail isalpha().
            if not stripped.isalpha():
                continue
            word = stripped.lower()
            if word not in self.nltk_stopwords:
                filtered_words.append(word)
        return filtered_words

In [None]:
# ETL Pipeline class that ties everything together
class ETLPipeline:
    """Extract a page's text, clean it, and tabulate word frequencies."""

    def __init__(self, url):
        self.url = url
        # Held as a set so each stopword membership check is cheap.
        self.nltk_stopwords = set(stopwords.words("english"))

    def run(self):
        """Run the extract, clean, count and tabulate steps in order.

        Returns:
            pandas.DataFrame: One row per distinct word, with columns
            "Words" and "Frequencies", ordered from most to least frequent.
            Each row keeps the index label it had before sorting.
        """
        # Extract: pull the raw text of the page.
        raw_text = WebScraper(self.url).extract_article_text()

        # Transform: tokenize, drop stopwords, then count what is left.
        tokens = TextProcessor(self.nltk_stopwords).tokenize_and_clean(raw_text)
        counts = Counter(tokens)

        # Load: put the counts into a table, highest frequency first.
        frequency_table = pd.DataFrame(counts.items(), columns=["Words", "Frequencies"])
        return frequency_table.sort_values(by="Frequencies", ascending=False)

In [None]:
# Entry point: build the pipeline for one article and keep the resulting
# frequency table in `result_df` so the following cell can display it.
if __name__ == "__main__":
    # Replace this with the URL of the new article you want to scrape.
    # NOTE(review): the trailing "#:~:text=..." part looks like a browser
    # text-highlight fragment; presumably it does not change which page is
    # fetched - confirm, and consider dropping it.
    article_url = "https://kahedu.edu.in/why-data-science-is-the-future-of-technology/#:~:text=One%20of%20the%20major%20reasons,realm%20and%20make%20decisions%20swiftly."

    # Create the ETL pipeline and run it; run() returns the sorted
    # word-frequency DataFrame.
    pipeline = ETLPipeline(article_url)
    result_df = pipeline.run()

In [None]:
# Show the first ten rows of the frequency-sorted table
print(result_df.iloc[:10])

           Words  Frequencies
0           data           57
29    department           50
1        science           39
102     research           25
39   engineering           24
3     technology           16
4       overview           16
75        policy           14
2         future           12
36      computer           11
