In [8]:
import logging
import os
import time
from logging.handlers import RotatingFileHandler
from queue import Queue
from threading import Thread

import pandas as pd

from logging_configuration import setup_logging, log_df, display_log_df


In [10]:
# Root-logger setup: emit INFO and above, each record prefixed with a
# timestamp and its severity. Note that basicConfig() is a no-op when the
# root logger already has handlers attached (e.g. on a re-run of this cell).
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

def _log_schema(df):
    """Log one row per column of ``df``: its name, dtype and distinct-value count."""
    schema = df.dtypes.reset_index()
    schema.columns = ['Column Name', 'Data Type']
    # DataFrame.nunique() skips NaN by default, so an all-NaN column reports 0.
    schema['n_unique'] = df.nunique().values

    try:
        # Imported lazily: tabulate is only needed to pretty-print the table,
        # so its absence should degrade the formatting, not break the load.
        from tabulate import tabulate
        table = tabulate(schema, headers='keys', tablefmt='psql')
    except ImportError:
        table = schema.to_string()

    logging.info("Schema of the loaded dataset:")
    logging.info(f"\n{table}")


def load_data(file_name="cagliostro_gutenberg.csv", data_dir=None):
    """Read a CSV file into a DataFrame and log a short summary of it.

    Parameters
    ----------
    file_name : str, optional
        Name of the CSV file to read. Defaults to the original hard-coded
        ``"cagliostro_gutenberg.csv"``.
    data_dir : str or os.PathLike, optional
        Directory containing ``file_name``. When ``None`` (the default) the
        original location ``../csv`` relative to the working directory is used.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or ``None`` when the file is missing, cannot be
        parsed, or contains no rows. Every failure is logged rather than
        raised.
    """
    if data_dir is None:
        data_dir = os.path.join("..", "csv")
    file_path = os.path.join(data_dir, file_name)

    try:
        # Check for the file up front so the log message names the path.
        if not os.path.exists(file_path):
            logging.error(f"File not found: {file_path}")
            return None

        df = pd.read_csv(file_path)
    except pd.errors.EmptyDataError:
        # Raised by pandas for a file with no columns to parse (e.g. zero bytes).
        logging.error("The file is empty or contains no data.")
        return None
    except pd.errors.ParserError:
        logging.error("Error parsing the CSV file. Please check the file format.")
        return None
    except Exception as e:
        logging.error(f"An unexpected error occurred: {str(e)}")
        return None

    # A header-only file parses fine but yields zero rows.
    if df.empty:
        logging.warning(f"{file_name} is empty. No data to load.")
        return None

    logging.info(f"{file_name} imported successfully!")
    logging.info(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")

    # The schema dump is purely diagnostic: a failure while building or
    # formatting it must not throw away data that was read successfully.
    try:
        _log_schema(df)
    except Exception as e:
        logging.warning(f"Could not log the dataset schema: {str(e)}")

    return df

# Load the dataset into `df` for the cells below. load_data() logs any
# failure and returns None instead of raising, so `df` may be None here.
df = load_data()


2024-10-27 12:27:02,875 - INFO - cagliostro_gutenberg.csv imported successfully!
2024-10-27 12:27:02,878 - INFO - There are 1775 rows and 8 columns.
2024-10-27 12:27:02,906 - INFO - Schema of the loaded dataset:
2024-10-27 12:27:02,912 - INFO - 
+----+---------------+-------------+------------+
|    | Column Name   | Data Type   |   n_unique |
|----+---------------+-------------+------------|
|  0 | id            | int64       |       1775 |
|  1 | chapter_title | object      |          1 |
|  2 | paragraph     | object      |        908 |
|  3 | quote         | float64     |          0 |
|  4 | source_url    | object      |          1 |
|  5 | created_at    | float64     |          0 |
|  6 | title         | object      |          1 |
|  7 | content       | float64     |          0 |
+----+---------------+-------------+------------+


In [6]:
# Show the loaded frame's dimensions as (rows, columns).
# NOTE(review): load_data() returns None on failure, in which case this
# expression raises AttributeError.
df.shape

(1775, 8)