# KISI BI Website me Unique Word Find code

In [23]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from collections import Counter
import nltk
from nltk.corpus import stopwords

class WebScraper:
    def __init__(self, url):
        self.url = url

    def extract_article_text(self):
        response = requests.get(self.url)
        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')
        article_text = ' '.join([p.text for p in soup.find_all('p')])  # Extract text from <p> tags
        return article_text

class TextProcessor:
    def __init__(self, nltk_stopwords):
        self.nltk_stopwords = nltk_stopwords

    def tokenize_and_clean(self, text):
        words = text.split()
        filtered_words = [word.lower() for word in words if word.isalpha() and word.lower() not in self.nltk_stopwords]
        return filtered_words

class ETLPipeline:
    def __init__(self, url):
        self.url = url
        nltk.download('stopwords')
        self.nltk_stopwords = set(stopwords.words('english'))

    def run(self):
        scraper = WebScraper(self.url)
        article_text = scraper.extract_article_text()
        
        processor = TextProcessor(self.nltk_stopwords)
        filtered_words = processor.tokenize_and_clean(article_text)
        
        word_freq = Counter(filtered_words)
        df = pd.DataFrame(word_freq.items(), columns=['Words', 'Frequencies'])
        df = df.sort_values(by='Frequencies', ascending=False)
        return df

if __name__ == '__main__':
    article_url = 'https://intellipaat.com/blog/interview-question/python-interview-questions/'
    
    pipeline = ETLPipeline(article_url)
    result_df = pipeline.run()
    print(result_df.head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


        Words  Frequencies
2      python          261
9        data           89
300      used           89
129      code           78
322  function           76
