## Data Ingestion

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import sys

In [2]:
%pwd

'c:\\Personal AI Projects\\FORAGE JOB SIMULATIONS\\British Airline Data Science Virtual Internship\\customer-reviews-analysis\\trials'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Personal AI Projects\\FORAGE JOB SIMULATIONS\\British Airline Data Science Virtual Internship\\customer-reviews-analysis'

#### src/reviewAnalyzer/entity/config_entity.py

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings consumed by the data-ingestion (scraping) stage.

    Instances are built by ``ConfigurationManager.get_data_ingestion_config``
    from the ``data_ingestion`` section of the loaded configuration.
    """

    # Working directory of the stage; created before this object is built.
    root_dir: Path
    # Base URL; "/page/<n>/?sortby=...&pagesize=<page_size>" is appended per page.
    source_URL: str
    # CSV file the scraped reviews are written to; scraping is skipped if it exists.
    local_data_file: Path
    # Number of result pages to request (pages 1..pages inclusive).
    pages: int
    # Value sent as the "pagesize" query parameter of every page request.
    page_size: int

#### src/reviewAnalyzer/config/configuration.py

In [8]:
from src.reviewAnalyzer.constants import *
from src.reviewAnalyzer.utils.common import read_yaml, create_directories

In [9]:
class ConfigurationManager:
    """Load the project YAML configuration and build per-stage config objects.

    Instantiating the manager reads the config file and creates the
    directory named by its ``artifacts_root`` entry.
    """

    def __init__(self, config_filepath=CONFIG_FILE_PATH):
        """Read the configuration and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the YAML config file. Defaults to
                the project-wide ``CONFIG_FILE_PATH`` constant.
        """
        self.config = read_yaml(config_filepath)
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Return the settings for the data-ingestion stage.

        The stage's ``root_dir`` is created as a side effect.

        Returns:
            A ``DataIngestionConfig`` populated from the ``data_ingestion``
            section of the loaded configuration.
        """
        section = self.config.data_ingestion
        create_directories([section.root_dir])

        # ``DataIngestionConfig`` declares both locations as ``Path``, but the
        # values read from the config file are presumably plain strings, so
        # convert them here. ``Path()`` leaves an existing path unchanged.
        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            source_URL=section.source_URL,
            local_data_file=Path(section.local_data_file),
            pages=section.pages,
            page_size=section.page_size,
        )



#### src/reviewAnalyzer/components/data_ingestion.py

In [10]:
from src.reviewAnalyzer import logger
from src.reviewAnalyzer.utils.common import get_size
from src.reviewAnalyzer.entity.config_entity import DataIngestionConfig
from pathlib import Path

In [11]:
class DataIngestion:
    """Scrape paginated review pages and store the review texts as a CSV."""

    # Seconds to wait on each page request. ``requests`` applies no timeout
    # by default, so a stalled connection would otherwise block forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, config: DataIngestionConfig):
        """Keep the ingestion settings for later use by ``scrape_data``."""
        self.config = config

    def _page_url(self, page):
        """Return the URL of results page ``page``, sorted by post date descending."""
        return (
            f"{self.config.source_URL}/page/{page}/"
            f"?sortby=post_date%3ADesc&pagesize={self.config.page_size}"
        )

    def _scrape_page(self, page):
        """Download one results page and return the review texts found on it.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.RequestException: On connection problems or timeout.
        """
        response = requests.get(self._page_url(page), timeout=self.REQUEST_TIMEOUT)
        # Fail loudly on 4xx/5xx: parsing an error page would silently yield
        # zero reviews, and the CSV written afterwards would make every later
        # run skip scraping because the file already exists.
        response.raise_for_status()

        parsed_content = BeautifulSoup(response.content, 'html.parser')
        return [
            block.get_text()
            for block in parsed_content.find_all("div", {"class": "text_content"})
        ]

    def scrape_data(self):
        """Scrape all configured pages into ``local_data_file``.

        Nothing is downloaded when the target file already exists; only its
        size is logged. Otherwise pages ``1..pages`` are fetched in order and
        the collected texts are saved as a CSV with a single ``reviews``
        column (plus the default pandas index).

        Raises:
            requests.RequestException: If any page request fails; no file is
                written in that case.
        """
        if os.path.exists(self.config.local_data_file):
            logger.info(f"File already exists of size: {get_size(Path(self.config.local_data_file))}")
            return

        reviews = []
        for i in range(1, self.config.pages + 1):
            print(f"Scraping page {i}")
            reviews.extend(self._scrape_page(i))
            print(f"   ---> {len(reviews)} total reviews")
        logger.info("Scraping complete!")

        # Build the frame in one step and save it to the configured CSV file.
        df = pd.DataFrame({"reviews": reviews})
        df.to_csv(self.config.local_data_file)

        logger.info(f"Scraped File location: {self.config.local_data_file}")




#### src/reviewAnalyzer/pipeline/stage_01_data_ingestion.py

In [12]:
class DataIngestionPipeline:
    """Driver for the data-ingestion stage: load settings, then scrape."""

    def __init__(self):
        """Hold no state; every collaborator is created inside ``main``."""

    def main(self):
        """Build the ingestion settings and run the scraper with them."""
        ingestion_settings = ConfigurationManager().get_data_ingestion_config()
        DataIngestion(config=ingestion_settings).scrape_data()


### main.py

In [13]:
from src.reviewAnalyzer.exceptions import CustomException

In [14]:
STAGE_NAME = "Data Ingestion Stage"

# Run the ingestion stage, logging its start and end. Any failure is re-raised
# as the project's CustomException.
try:
    logger.info("*******************")
    logger.info(f">>>>>> {STAGE_NAME} started <<<<<<")
    data_ingestion = DataIngestionPipeline()
    data_ingestion.main()
    # The log output wraps each record in "[...]"; the embedded "]\n\n[" appears
    # intended to close that bracket early and print a separator line.
    logger.info(f">>>>>> {STAGE_NAME} completed <<<<<<]\n\n[x==========x")
except Exception as e:
    # Chain explicitly so the original error is recorded as the direct cause.
    raise CustomException(e, sys) from e

[2024-07-09 20:31:51,337: INFO: 4178946079: *******************]
[2024-07-09 20:31:51,339: INFO: 4178946079: >>>>>> Data Ingestion Stage started <<<<<<]
[2024-07-09 20:31:51,381: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-07-09 20:31:51,385: INFO: common: created directory at: artifacts]
[2024-07-09 20:31:51,388: INFO: common: created directory at: artifacts/data_ingestion]
Scraping page 1
   ---> 100 total reviews
Scraping page 2
   ---> 200 total reviews
Scraping page 3
   ---> 300 total reviews
Scraping page 4
   ---> 400 total reviews
Scraping page 5
   ---> 500 total reviews
Scraping page 6
   ---> 600 total reviews
Scraping page 7
   ---> 700 total reviews
Scraping page 8
   ---> 800 total reviews
Scraping page 9
   ---> 900 total reviews
Scraping page 10
   ---> 1000 total reviews
[2024-07-09 20:32:33,156: INFO: 1812307141: Scraping complete!]
[2024-07-09 20:32:33,457: INFO: 1812307141: Scraped File location: artifacts/data_ingestion/data.csv]
[2024-0