# Project Level Imports

In [1]:
import os
import time
import logging
import pandas as pd
import numpy as np
import pandas as pd
import logging
import time
from queue import Queue
from threading import Thread
from logging.handlers import RotatingFileHandler

# Project Configuration

### 1. Set up logging.

In [2]:
#=====================================================================================================================================================#
#== Function level imports ==#

from queue import Queue
from threading import Thread
from sqlalchemy import create_engine, Table, Column, MetaData, String, DateTime
from logging.handlers import RotatingFileHandler
from datetime import datetime

#== Defines a global dataframe to store log entries ==#

# Shared in-memory buffer for log records. Its columns use the same names as the
# 'website_log' table columns (timestamp, level, message).
log_df = pd.DataFrame(columns=['timestamp', 'level', 'message'])

#== Defines a class for logging handler ==#

class DataFrameLoggingHandler(logging.Handler):
    """
    Logging handler that forwards each record to a queue as a
    ``(timestamp, level, message)`` tuple.

    The handler keeps a reference to ``dataframe`` but never writes to it
    itself; it only enqueues entries.
    """

    def __init__(self, dataframe, queue):
        """
        Args:
            dataframe: DataFrame associated with this handler (stored only).
            queue: Queue that receives one tuple per emitted record.
        """
        logging.Handler.__init__(self)
        self.queue = queue
        self.dataframe = dataframe

    def emit(self, record):
        """Enqueue the record; on any failure defer to ``handleError``."""
        try:
            # The timestamp is taken when the record is handled, not from the record.
            entry = (pd.Timestamp.now(), record.levelname, record.getMessage())
            self.queue.put(entry)
        except Exception:
            self.handleError(record)

#== Defines a function to process log queue ==#

def process_log_queue(dataframe, queue, push_interval=10):
    """
    Run forever: drain queued log entries into the DataFrame, flush the
    DataFrame to the database, then sleep for ``push_interval`` seconds.

    Args:
        dataframe: DataFrame that buffers (timestamp, level, message) rows.
        queue: Queue of (timestamp, level, message) tuples to drain.
        push_interval: Seconds to sleep between drain/flush cycles.

    This function never returns, so it is meant to run in a background thread.
    """
    while True:
        # Move everything currently queued into the buffer, one row per entry.
        while True:
            if queue.empty():
                break
            ts, lvl, msg = queue.get()
            next_row = len(dataframe)
            dataframe.loc[next_row] = [ts, lvl, msg]

        # Flush only when something is buffered, then reset the buffer.
        # The buffer is emptied whether or not the push succeeded, because
        # push_to_database reports failures by logging instead of raising.
        buffered = len(dataframe)
        if buffered > 0:
            push_to_database(dataframe)
            dataframe.drop(dataframe.index, inplace=True)

        time.sleep(push_interval)

#== Evaluates whether the table already exists in the database ==#

def create_table_if_not_exists(engine):
    """
    Ensure the 'website_log' table exists in the database ``engine`` points to.

    The table has three non-nullable, lowercase-named columns: 'timestamp'
    (DateTime), 'level' (String) and 'message' (String).

    Args:
        engine: SQLAlchemy engine bound to the target database.
    """
    schema = MetaData()

    # Constructing the Table registers it on ``schema``, so no reference is kept.
    Table(
        'website_log',
        schema,
        Column('timestamp', DateTime, nullable=False),
        Column('level', String, nullable=False),
        Column('message', String, nullable=False),
    )

    # create_all skips tables that are already present in the database.
    schema.create_all(engine)

#== Pushes the log data to the database table ==#

def push_to_database(dataframe, connection_string='postgresql://postgres:password@localhost:5432/project_gemma'):
    """
    Append the log DataFrame to the 'website_log' table, creating the table first
    if it does not exist.

    Args:
        dataframe: DataFrame of log rows. Its column names are lowercased in place
            before the insert.
        connection_string: SQLAlchemy database URL. Defaults to the local
            'project_gemma' PostgreSQL database.
            NOTE(review): the default embeds credentials; prefer passing a URL
            taken from configuration.

    Errors are logged rather than raised, so callers cannot tell from the return
    value whether the push succeeded.
    """
    engine = None
    try:
        engine = create_engine(connection_string)

        # Ensure lowercase column names in DataFrame
        dataframe.columns = [col.lower() for col in dataframe.columns]

        # Create the table 'website_log' if it doesn't exist
        create_table_if_not_exists(engine)

        # Insert DataFrame into the table 'website_log' (append mode)
        dataframe.to_sql('website_log', engine, if_exists='append', index=False)
    except Exception as e:
        logging.error(f"Error pushing data to PostgreSQL: {str(e)}")
    finally:
        # A fresh engine is built on every call; dispose of it so its connection
        # pool does not linger after the push.
        if engine is not None:
            engine.dispose()

#== Logging setup ==#

def setup_logging(log_file='app_log.log', log_level=logging.INFO):
    """
    Configure the root logger to write to a rotating file, the console, and the
    in-memory log DataFrame (via a queue), and return the root logger.

    Args:
        log_file: Path of the rotating log file (10 MB per file, 5 backups).
        log_level: Level applied to the root logger and to every handler.

    Returns:
        logging.Logger: The configured root logger.

    Note:
        Every call starts a new daemon thread running ``process_log_queue``, so
        this is intended to be called once per session.
    """
    logger = logging.getLogger()

    # Remove AND close existing handlers. Merely clearing the list would leave a
    # previous RotatingFileHandler's file open when this is run again.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    logger.setLevel(log_level)

    # Define handlers
    log_queue = Queue()
    handlers = [
        RotatingFileHandler(log_file, maxBytes=10485760, backupCount=5),  # 10 MB per file
        logging.StreamHandler(),
        DataFrameLoggingHandler(log_df, log_queue)
    ]

    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    for handler in handlers:
        handler.setLevel(log_level)
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    # Start a daemon thread to process the log queue and push to the database
    log_thread = Thread(target=process_log_queue, args=(log_df, log_queue), daemon=True)
    log_thread.start()

    return logger

# Usage: configure the root logger, then emit a first record through it.
logger = setup_logging(log_file='app_log.log', log_level=logging.INFO)
logger.info("Logging initialized and set up successfully.")

#== Function to Display Website Log ==#

# Report the shape (rows, columns) of the in-memory log buffer at this point.
logging.info(f"Log size: {log_df.shape}")

def display_log_df(log_df, rows=5, exclude_column=None):
    """
    Return the first ``rows`` rows of a log DataFrame, optionally hiding columns.

    Args:
        log_df: DataFrame of log entries; it is not modified.
        rows: Number of leading rows to return (passed to ``DataFrame.head``).
        exclude_column: A single column label, or a list/set of labels, to leave
            out of the preview. Labels that are not present are ignored.

    Returns:
        DataFrame: The requested preview.
    """
    # Log the preview size (not the function object) so the message is informative.
    logging.info(f"Viewing the log: up to {rows} of {len(log_df)} rows")

    if exclude_column is not None:
        # Tuples are treated as a single label so tuple-valued column names still work.
        if isinstance(exclude_column, (list, set, frozenset)):
            hidden = list(exclude_column)
        else:
            hidden = [exclude_column]
        log_df = log_df.drop(columns=hidden, errors='ignore')

    return log_df.head(rows)

# Preview up to the first 20 buffered log rows; the bare name on the last line
# displays the result as the notebook cell's output.
preview_log = display_log_df(log_df, rows=20)
preview_log

2024-10-27 20:42:01,543 - INFO - Logging initialized and set up successfully.
2024-10-27 20:42:01,548 - INFO - Log size: (0, 3)
2024-10-27 20:42:01,552 - INFO - Viewing the log: <function display_log_df at 0x7afb6158ba30>


Unnamed: 0,timestamp,level,message


### 2. Initialize paths for output parquet file for the main dataframe that is being analyzed.

In [3]:
def initialize_paths():
    """
    Build the project's base, JDBC and parquet-output paths, logging each one.

    Returns:
        tuple: ``(base_dir, jdbc_dir, output_path)`` as strings.

    Raises:
        Exception: Any error raised while building the paths is logged and
            then re-raised unchanged.
    """
    try:
        # <home>/git/project_gemma
        home = os.path.expanduser('~')
        project_root = os.path.join(home, 'git', 'project_gemma')
        logging.info(f"Base directory initialized: {project_root}")

        # <base>/jdbc
        jdbc_root = os.path.join(project_root, "jdbc")
        logging.info(f"JDBC directory initialized: {jdbc_root}")

        # <base>/jdbc/output.parquet
        parquet_file = os.path.join(jdbc_root, "output.parquet")
        logging.info(f"Output path initialized: {parquet_file}")
    except Exception as err:
        logging.error(f"Error initializing paths: {str(err)}")
        raise  # Propagate after logging
    else:
        return project_root, jdbc_root, parquet_file

# Example usage
# NOTE(review): logging.basicConfig() only configures a root logger that has no
# handlers. If setup_logging() has already run in this session, the call below
# has no effect and the existing handlers/format stay in place.
if __name__ == "__main__":
    logger = logging.getLogger()  # Root logger
    logger.setLevel(logging.INFO)
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')

    base_dir, jdbc_dir, output_path = initialize_paths()

2024-10-27 20:42:01,993 - INFO - Base directory initialized: /home/tron/git/project_gemma
2024-10-27 20:42:01,995 - INFO - JDBC directory initialized: /home/tron/git/project_gemma/jdbc
2024-10-27 20:42:02,000 - INFO - Output path initialized: /home/tron/git/project_gemma/jdbc/output.parquet


# Extract Data

Set up the database and define the table structure for data extraction.

In [4]:
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine, Column, Integer, String, Text, TIMESTAMP
from sqlalchemy.orm import sessionmaker, declarative_base  # Updated import

# Database setup
# NOTE(review): the connection URL embeds a username and password; replace them
# for your environment rather than relying on these defaults.
DATABASE_URL = 'postgresql://postgres:password@localhost:5432/project_gemma'  # Update with your credentials
engine = create_engine(DATABASE_URL)
Base = declarative_base()  # Base class that the ORM model below inherits from
Session = sessionmaker(bind=engine)
session = Session()  # Module-level session used by scrape_html to add and commit rows

# Define the table structure
class ProjectGutenberg(Base):
    """ORM model for the 'cagliostro_gutenberg' table; scrape_html stores one row per scraped paragraph."""
    __tablename__ = 'cagliostro_gutenberg'  # Table this model maps to
    id = Column(Integer, primary_key=True)  # Primary key
    chapter_title = Column(String(255), nullable=True)  # Text of the <h2> heading the paragraph was found under
    paragraph = Column(Text, nullable=True)              # Text of the scraped <p> element
    quote = Column(Text, nullable=True)                  # Optional quote; scrape_html always stores None here
    source_url = Column(String(255), nullable=True)     # URL the row was scraped from (at most 255 characters)
    created_at = Column(TIMESTAMP, nullable=True)        # Row timestamp; not populated by scrape_html in this file
    title = Column(String(255), nullable=True)           # Page-level title, taken from the page's <h1>
    content = Column(Text, nullable=True)                 # Free-form content; not populated by scrape_html in this file

# Create every table registered on Base (here, 'cagliostro_gutenberg') if it doesn't already exist
Base.metadata.create_all(engine)

Define the web-scraping logic and push the scraped content to the database.

In [5]:
def _extract_paragraph_rows(soup, html_url):
    """Return one row dict per <p> sibling found between each <h2> and the next <h2>."""
    main_heading = soup.find('h1')
    # Pages without an <h1> get a NULL title instead of raising AttributeError.
    title = main_heading.get_text(strip=True) if main_heading is not None else None

    rows = []
    for chapter in soup.find_all('h2'):
        chapter_title_text = chapter.get_text(strip=True)

        # Request both <p> and <h2> siblings. Filtering on 'p' alone would never
        # yield the next heading, so every chapter would also collect the
        # paragraphs of all chapters after it.
        for sibling in chapter.find_next_siblings(['p', 'h2']):
            if sibling.name == 'h2':
                break  # Reached the next chapter title

            rows.append({
                'chapter_title': chapter_title_text,
                'paragraph': sibling.get_text(strip=True),
                'quote': None,  # Handle quotes separately if needed
                'source_url': html_url,
                'title': title,
            })
    return rows


def scrape_html(html_url, timeout=30):
    """
    Fetch a page, split it into per-chapter paragraphs, and store them in the database.

    The page's <h1> is used as the title and each <h2> as a chapter title. Only
    <p> elements that are siblings of an <h2> (up to the next <h2> sibling) are
    collected.

    Args:
        html_url: URL of the HTML page to scrape.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        list[dict]: The rows that were stored, each with the keys
        'chapter_title', 'paragraph', 'quote', 'source_url' and 'title'.

    Raises:
        requests.RequestException: If the request fails, times out, or returns
            an error status.
        Exception: Any database error is re-raised after the session is rolled back.
    """
    # Fetch the HTML content; without a timeout a stalled server would hang the call.
    response = requests.get(html_url, timeout=timeout)
    response.raise_for_status()  # Raise an error for bad responses
    soup = BeautifulSoup(response.content, 'html.parser')

    chapters = _extract_paragraph_rows(soup, html_url)

    # Insert data into the database. Roll back on failure so the shared
    # module-level session stays usable for later calls.
    try:
        session.add_all([ProjectGutenberg(**row) for row in chapters])
        session.commit()
    except Exception:
        session.rollback()
        raise

    logging.info(f"Data from {html_url} has been successfully scraped and stored.")
    return chapters

# Example usage: scrape a single page and store its paragraphs in the database.
html_url = 'https://www.forbes.com/sites/alisondurkee/2024/10/25/america-first-agenda-what-to-know-about-the-project-2025-alternative-that-trump-isnt-disavowing/?'  # Replace with the actual HTML URL
scrape_data = scrape_html(html_url)


2024-10-27 20:42:06,067 - INFO - Data from https://www.forbes.com/sites/alisondurkee/2024/10/25/america-first-agenda-what-to-know-about-the-project-2025-alternative-that-trump-isnt-disavowing/? has been successfully scraped and stored.
