In [None]:
import pandas as pd
import logging
from logging_configuration import setup_logging, log_df
from load_from_csv import load_data
from tabulate import tabulate
from tabulate_style import tab_fmt

#=====================================================================================================================================================#
#== Configure logging with specified log file and level ==#
# setup_logging is a project-local helper (imported from logging_configuration); it is
# asked here to use 'app_log.log' as the log file and logging.INFO as the level.
# The returned object is kept as the notebook-wide `logger`.
logger = setup_logging(log_file='app_log.log', log_level=logging.INFO)

In [1]:

#=====================================================================================================================================================#
#== Function to Explode Specified Column of DataFrame into Tokens ==#
def explode_to_tokens(df, column='paragraph'):
    """
    Explodes the specified text column of a DataFrame into whitespace-separated tokens.

    Each cell is split on runs of any whitespace (spaces, tabs, newlines), so repeated
    spaces or embedded line breaks never produce empty-string tokens. Missing values and
    cells that contain no tokens contribute nothing to the result.

    Parameters:
    - df (pd.DataFrame): The input DataFrame containing the text data.
    - column (str): The column name to be exploded into tokens (default is 'paragraph').

    Returns:
    - pd.Series: One token per row, named after `column`, in the original reading order
      and re-indexed from 0.

    Raises:
    - ValueError: If `column` is not a column of `df`.
    """
    # Check if the specified column exists in the DataFrame
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame.")

    # Split on any whitespace run (no explicit separator) rather than a single literal
    # space, then put each token on its own row.
    tokens = df[column].str.split().explode()

    # Missing cells and cells without tokens surface as NaN after explode(); drop them,
    # and defensively drop empty strings so only real tokens remain.
    tokens = tokens.dropna()
    tokens = tokens[tokens != '']

    # Discard the source-row labels so tokens are numbered consecutively
    return tokens.reset_index(drop=True)

2024-10-27 13:49:25,827 - INFO - Logging initialized and set up successfully.
2024-10-27 13:49:25,830 - INFO - Viewing the log: <function display_log_df at 0x723c64ba3520>


Empty DataFrame
Columns: [timestamp, level, message]
Index: []


### Loading the data and calling `explode_to_tokens`, with previews rendered via `tab_fmt` from `tabulate_style`

In [2]:
#=====================================================================================================================================================#
#== Load the main dataframe ==#
# load_data is a project-local helper (imported from load_from_csv); it is called with
# no arguments, so the source file is chosen inside the helper. The preview of `df`
# happens in a later cell via tab_fmt.
df = load_data()

2024-10-27 13:49:28,424 - INFO - cagliostro_gutenberg.csv imported successfully!
2024-10-27 13:49:28,426 - INFO - There are 1775 rows and 8 columns.
2024-10-27 13:49:28,451 - INFO - Schema of the loaded dataset:
2024-10-27 13:49:28,458 - INFO - 
+----+---------------+-------------+------------+
|    | Column Name   | Data Type   |   n_unique |
|----+---------------+-------------+------------|
|  0 | id            | int64       |       1775 |
|  1 | chapter_title | object      |          1 |
|  2 | paragraph     | object      |        908 |
|  3 | quote         | float64     |          0 |
|  4 | source_url    | object      |          1 |
|  5 | created_at    | float64     |          0 |
|  6 | title         | object      |          1 |
|  7 | content       | float64     |          0 |
+----+---------------+-------------+------------+


In [7]:
# Render a preview of the loaded frame in tabulate's 'grid' style.
# NOTE(review): the second argument is presumably the number of rows to show — confirm in tabulate_style.
tab_fmt(df, 3, style='grid')

2024-10-27 13:50:00,995 - INFO - +----+------+-----------------+-----------------------------------------------------------+----------------------------------------------------------------+------------+
|    |   id | chapter_title   | paragraph                                                 | source_url                                                     | title      |
|  0 |   24 | CHAPTER I       | THE POWER OF PREJUDICE                                    | https://www.gutenberg.org/cache/epub/74618/pg74618-images.html | CAGLIOSTRO |
+----+------+-----------------+-----------------------------------------------------------+----------------------------------------------------------------+------------+
|  1 |   25 | CHAPTER I       | I                                                         | https://www.gutenberg.org/cache/epub/74618/pg74618-images.html | CAGLIOSTRO |
+----+------+-----------------+-----------------------------------------------------------+--------------------------

In [4]:
#=====================================================================================================================================================#
#== Explode the Specified Column into Tokens ==#
# NOTE: despite the `_df` suffix, explode_to_tokens returns a pandas Series (one token
# per row, named 'paragraph'), not a DataFrame.
tokens_df = explode_to_tokens(df, column='paragraph')

In [5]:
# Render a preview of the exploded tokens in tabulate's 'grid' style.
# NOTE(review): the second argument is presumably the number of rows to show — confirm in tabulate_style.
tab_fmt(tokens_df, 5, style='grid')

2024-10-27 13:49:32,598 - INFO - +---------+-------------+
|   index | paragraph   |
|       0 | THE         |
+---------+-------------+
|       1 | POWER       |
+---------+-------------+
|       2 | OF          |
+---------+-------------+
|       3 | PREJUDICE   |
+---------+-------------+
|       4 | I           |
+---------+-------------+
