In [9]:
import logging
import os
import sys

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from wordcloud import WordCloud
#from tabulate import tabulate
#import plotly.io as pio
#import plotly.express as px


sys.path.append(os.path.abspath(os.path.join('..', 'utils')))  # Adjust the path based on your structure
#from logging_configuration import setup_logging
from load_from_csv import load_data
from token_distribution import analyze_distribution
#from tabulate_style import tab_fmt

# Load the dataset through the project's CSV loader.
df = load_data()

# Ask analyze_distribution for a DataFrame result, passing 50 and 1500 as bounds.
# NOTE(review): the logged "Filtered stats_df" output shows a `token_count`
# column, while generate_word_cloud reads `token` and `count` columns --
# confirm the frame returned here actually has that schema.
token_counts = analyze_distribution(
    df,
    lower_bound=50,
    upper_bound=1500,
    keywords=None,
    return_type='dataframe',
)

#=====================================================================================================================================================#
#== Generate a word cloud from token counts ==#

def generate_word_cloud(token_counts, lower_bound=None, upper_bound=None):
    """Display a word cloud whose word sizes follow the supplied token counts.

    Parameters
    ----------
    token_counts : pandas.DataFrame
        Must contain a ``token`` column and a ``count`` column whose values
        can be cast to ``int``. The caller's frame is left unmodified.
    lower_bound : int or None, optional
        Inclusive minimum count a token needs to be drawn. ``None`` (the
        default) disables the lower limit.
    upper_bound : int or None, optional
        Inclusive maximum count a token may have to be drawn. ``None`` (the
        default) disables the upper limit.

    Returns
    -------
    None
        The figure is shown via ``fig.show()``. When no token survives the
        filtering, a warning is logged and nothing is drawn.
    """
    # Build a new frame instead of assigning into the argument, so the
    # caller's DataFrame keeps its original `count` column.
    counts = token_counts.assign(count=token_counts['count'].astype(int))

    # Non-positive counts are always dropped; each bound is applied only when
    # it was given (comparing the column against None would raise TypeError).
    keep = counts['count'] > 0
    if lower_bound is not None:
        keep &= counts['count'] >= lower_bound
    if upper_bound is not None:
        keep &= counts['count'] <= upper_bound
    counts = counts[keep]

    # Nothing left to draw after filtering
    if counts.empty:
        logging.warning("No tokens found within the specified bounds.")
        return

    # Sum counts per token so duplicated rows add up, then hand the mapping to
    # WordCloud directly. This avoids materialising sum(count) repeated words
    # and avoids WordCloud re-tokenising the text and altering the counts.
    frequencies = counts.groupby('token', sort=False)['count'].sum().to_dict()

    # Generate the word cloud image
    wordcloud = WordCloud(
        width=1400,
        height=800,
        background_color='black',
        colormap='viridis',
    ).generate_from_frequencies(frequencies)

    # Rendered image as a NumPy array
    image_array = wordcloud.to_array()

    # Create a Plotly figure holding the image as its only trace
    fig = go.Figure()
    fig.add_trace(go.Image(z=image_array))

    # Hide grid lines, zero lines and tick labels so only the image is visible
    hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False)
    fig.update_layout(title='Word Cloud',
                      xaxis=hidden_axis,
                      yaxis=hidden_axis,
                      template='plotly_dark')

    # Show the word cloud
    fig.show()

    logging.info("Word cloud has been displayed successfully!")

# Usage (after analyzing paragraphs): call generate_word_cloud(token_counts, ...) -- see the next cell.

2024-10-27 14:38:39,745 - INFO - cagliostro_gutenberg.csv imported successfully!
2024-10-27 14:38:39,747 - INFO - There are 1775 rows and 8 columns.
2024-10-27 14:38:39,772 - INFO - Schema of the loaded dataset:
2024-10-27 14:38:39,779 - INFO - 
+----+---------------+-------------+------------+
|    | Column Name   | Data Type   |   n_unique |
|----+---------------+-------------+------------|
|  0 | id            | int64       |       1775 |
|  1 | chapter_title | object      |          1 |
|  2 | paragraph     | object      |        908 |
|  3 | quote         | float64     |          0 |
|  4 | source_url    | object      |          1 |
|  5 | created_at    | float64     |          0 |
|  6 | title         | object      |          1 |
|  7 | content       | float64     |          0 |
+----+---------------+-------------+------------+


2024-10-27 14:38:40,182 - INFO - Filtered stats_df:
+------+---------------+
|      |   token_count |
|------+---------------|
|    2 |           129 |
|    3 |           200 |
|    4 |            82 |
|    5 |           168 |
|    6 |           212 |
|    8 |           132 |
|   11 |           135 |
|   12 |           119 |
|   13 |           132 |
|   14 |           149 |
|   17 |           135 |
|   18 |           107 |
|   21 |            50 |
|   22 |           152 |
|   23 |           172 |
|   24 |            99 |
|   25 |           158 |
|   26 |           129 |
|   29 |           193 |
|   31 |           184 |
|   36 |            61 |
|   38 |            77 |
|   41 |           142 |
|   42 |           104 |
|   43 |           173 |
|   44 |           142 |
|   48 |            90 |
|   49 |           155 |
|   50 |           163 |
|   51 |           164 |
|   53 |            78 |
|   57 |            60 |
|   58 |            72 |
|   59 |            92 |
|   61 |            73 

### Calling the function

In [None]:
# Draw the cloud for tokens whose count lies between 50 and 5000 (both inclusive).
generate_word_cloud(
    token_counts,
    lower_bound=50,
    upper_bound=5000,
)