In [1]:
import os

In [2]:
# Show the kernel's working directory before it is changed to the project root.
%pwd

'c:\\DataScience\\Projects\\Text_analysis\\research'

In [3]:
# Step up from the notebook folder to the project root so that the `src.`
# imports and the relative config/artifact paths used below resolve.
# Guarded on the folder name so that re-running this cell does not keep
# climbing one directory higher each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:

# Confirm the working directory after the change made in the previous cell.
%pwd

'c:\\DataScience\\Projects\\Text_analysis'

In [5]:

from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class TextExtractionConfig:
    """Immutable settings for the text-extraction stage.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``.
    """

    # Working directory of the stage.
    root_dir: Path
    # Spreadsheet that lists the pages to process.
    input_file: Path
    # Folder for the stage's output files.
    # NOTE(review): not read anywhere in the visible notebook code - confirm usage.
    destination_folder: Path

In [6]:
from src.text_analysis.constants import *
from src.text_analysis.utils.common import read_yaml_file, create_directories

In [7]:
class ConfigurationManager:
    """Load the project configuration file and build per-stage config objects.

    The loaded configuration is kept on ``self.config`` and is accessed by
    attribute (``self.config.artifacts_root``, ``self.config.text_extraction``).
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH):
        """Read the YAML configuration and request the artifacts root directory.

        Args:
            config_filepath: Path of the YAML file handed to ``read_yaml_file``.
                Defaults to the project's ``CONFIG_FILE_PATH`` constant.
        """
        self.config = read_yaml_file(config_filepath)

        create_directories([self.config.artifacts_root])

    def get_text_extraction_config(self) -> TextExtractionConfig:
        """Build the settings object for the text-extraction stage.

        Passes the stage's ``root_dir`` to ``create_directories`` and returns
        the ``text_extraction`` section of the configuration as a
        ``TextExtractionConfig``.

        Returns:
            TextExtractionConfig whose three fields are ``pathlib.Path``
            objects, matching the dataclass annotations.
        """
        section = self.config.text_extraction

        create_directories([section.root_dir])

        # The dataclass declares its fields as Path, so convert the raw
        # configuration values instead of passing them through unchanged.
        return TextExtractionConfig(
            root_dir=Path(section.root_dir),
            input_file=Path(section.input_file),
            destination_folder=Path(section.destination_folder),
        )

In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


In [9]:
class TextExtraction:
    """Download the pages listed in the input spreadsheet and save their text.

    The spreadsheet at ``config.input_file`` must have ``URL_ID`` and ``URL``
    columns. For every row the page is fetched, page chrome is stripped, the
    article body is located, and the text is written to
    ``<config.destination_folder>/<URL_ID>.txt``.
    """

    # Tags removed from the page before any text is extracted.
    _BOILERPLATE_TAGS = ("script", "style", "nav", "footer", "header", "aside")

    # CSS selectors tried in order; the first one that matches is the article body.
    _ARTICLE_SELECTORS = (
        "article",
        "[class*='article']",
        "[class*='post']",
        "main",
        ".content",
        "#content",
    )

    # Seconds to wait on the server so that one stalled request cannot hang the run.
    _REQUEST_TIMEOUT = 30

    def __init__(self, config: "TextExtractionConfig"):
        """Store the stage settings.

        Args:
            config: Object exposing ``input_file`` and ``destination_folder``.
        """
        self.config = config

    def extract_text(self):
        """Fetch every listed URL and write one text file per URL_ID.

        A URL that cannot be downloaded (connection error, timeout, or HTTP
        error status) is reported with ``print`` and skipped, so a single bad
        link does not abort the rest of the batch.

        Returns:
            None
        """
        df = pd.read_excel(self.config.input_file)

        # Write where the configuration says to, and make sure the folder exists.
        out_dir = Path(self.config.destination_folder)
        out_dir.mkdir(parents=True, exist_ok=True)

        for url_id, url in zip(df["URL_ID"], df["URL"]):
            print(url_id, url)

            try:
                html_content = self._fetch_html(url)
            except requests.RequestException as exc:
                print(f"Skipping {url_id}: {exc}")
                continue

            title, text = self._parse_article(html_content)

            # The file is named after the URL_ID; the title falls back to the
            # URL_ID when the page has neither an <h1> nor a <title>.
            document = self._format_document(title or url_id, url, text)
            (out_dir / f"{url_id}.txt").write_text(document, encoding="utf-8")

    def _fetch_html(self, url):
        """Return the HTML of ``url``; raise ``requests.RequestException`` on failure."""
        # NOTE(review): HEADERS is not defined in this notebook; presumably it
        # comes from the star import of the project's constants - confirm.
        response = requests.get(url, headers=HEADERS, timeout=self._REQUEST_TIMEOUT)
        # Treat 4xx/5xx as failures so error pages are not saved as articles.
        response.raise_for_status()
        return response.text

    def _parse_article(self, html_content):
        """Return ``(title, text)`` extracted from an HTML document."""
        soup = BeautifulSoup(html_content, "html.parser")

        for element in soup.find_all(list(self._BOILERPLATE_TAGS)):
            element.decompose()

        title_tag = soup.find("h1") or soup.find("title")
        title = title_tag.get_text().strip() if title_tag else ""

        article_content = None
        for selector in self._ARTICLE_SELECTORS:
            article_content = soup.select_one(selector)
            if article_content:
                break

        # No selector matched: fall back to the whole page body instead of
        # failing on a None result.
        if article_content is None:
            article_content = soup.body or soup

        return title, article_content.get_text(separator="\n", strip=True)

    @staticmethod
    def _format_document(title, url, text):
        """Return the file content: a short header block followed by the text."""
        return (
            f"Title: {title}\n"
            f"Source URL: {url}\n"
            + "=" * 50 + "\n\n"
            + text
        )

                

In [10]:
# Run the text-extraction stage end to end: load the configuration, build the
# stage settings, then fetch and save the text of every listed URL.
# Errors propagate on their own, so no try/except wrapper is needed.
# The variable names are kept as-is in case later cells refer to them.
config = ConfigurationManager()
data_ingestion_config = config.get_text_extraction_config()
data_ingestion = TextExtraction(config=data_ingestion_config)
data_ingestion.extract_text()

[2024-11-14 17:38:52,135: INFO:common: yaml file: config\config.yaml loaded successfully]
[2024-11-14 17:38:52,135: INFO:common: createD Directory at:artifacts]
[2024-11-14 17:38:52,135: INFO:common: createD Directory at:artifacts/text_extraction]
Netclan20241017 https://insights.blackcoffer.com/ai-and-ml-based-youtube-analytics-and-content-creation-tool-for-optimizing-subscriber-engagement-and-content-strategy/
Netclan20241018 https://insights.blackcoffer.com/enhancing-front-end-features-and-functionality-for-improved-user-experience-and-dashboard-accuracy-in-partner-hospital-application/
Netclan20241019 https://insights.blackcoffer.com/roas-dashboard-for-campaign-wise-google-ads-budget-tracking-using-google-ads-ap/
Netclan20241020 https://insights.blackcoffer.com/efficient-processing-and-analysis-of-financial-data-from-pdf-files-addressing-formatting-inconsistencies-and-ensuring-data-integrity-for-a-toyota-dealership-management-firm/
Netclan20241021 https://insights.blackcoffer.com/d