In [1]:
import pandas as pd
import logging
import os
import sys
from tabulate import tabulate
import plotly.io as pio
import plotly.graph_objects as go
import plotly.express as px
from nltk.sentiment import SentimentIntensityAnalyzer

sys.path.append(os.path.abspath(os.path.join('..', 'utils')))  # Adjust the path based on your structure
from logging_configuration import setup_logging
from load_from_csv import load_data
from tabulate_style import tab_fmt
from explode_tokens import explode_to_tokens
from custom_plotly_template import get_custom_layout, set_custom_template
from token_analysis import analyze_token_sentiment

# Call the function to set the custom template
set_custom_template() 

2024-10-27 19:11:10,947 - INFO - Logging initialized and set up successfully.
2024-10-27 19:11:10,950 - INFO - Viewing the log: <function display_log_df at 0x797dd4353400>


Empty DataFrame
Columns: [timestamp, level, message]
Index: []


In [14]:
def analyze_token_sentiment_statistics(tokens_df, sentiment_types=['compound'], return_type='both'):
    """
    Analyzes and visualizes statistical data for sentiment scores in tokens_df.

    Parameters:
    - tokens_df: DataFrame containing individual tokens and sentiment scores.
    - sentiment_types: list of str, sentiment scores to analyze, e.g., ['compound', 'pos'].
    - return_type: str, 'dataframe', 'plot', or 'both' for output preference.

    Returns:
    - DataFrame with statistical summaries or None based on return_type.
    """
    # Ensure tokens_df has the correct structure
    for sentiment_type in sentiment_types:
        if sentiment_type not in tokens_df.columns:
            logging.error(f"The DataFrame must contain a '{sentiment_type}' column.")
            return None

    # Initialize a dictionary to store statistical summaries
    stats_summary = {}

    # Calculate statistics for each sentiment type
    for sentiment_type in sentiment_types:
        sentiment_data = tokens_df[sentiment_type].dropna()
        stats_summary[sentiment_type] = {
            'mean': sentiment_data.mean(),
            'median': sentiment_data.median(),
            'std_dev': sentiment_data.std(),
            'min': sentiment_data.min(),
            'max': sentiment_data.max(),
            '95th_percentile': round(sentiment_data.quantile(0.95), 2),
        }
        
        logging.info(f"Statistical Summary for {sentiment_type}:\n{stats_summary[sentiment_type]}")

    # Create a DataFrame for the summary
    stats_df = pd.DataFrame(stats_summary).T
    stats_df.columns = ['Mean', 'Median', 'Standard Deviation', 'Min', 'Max', '95th Percentile']

    # Plot statistics if required
    if return_type in ['plot', 'both']:
        for sentiment_type in sentiment_types:
            # Histogram of sentiment scores
            fig = px.histogram(tokens_df, x=sentiment_type, nbins=30,
                               title=f'Distribution of {sentiment_type.capitalize()} Sentiment Scores',
                               labels={sentiment_type: 'Sentiment Score', 'count': 'Count'},
                               color_discrete_sequence=px.colors.sequential.Jet)
            fig.show()

            # Box plot for outliers
            box_fig = px.box(tokens_df, y=sentiment_type,
                             title=f'Box Plot of {sentiment_type.capitalize()} Sentiment Scores',
                             labels={sentiment_type: 'Sentiment Score'})
            box_fig.show()

    return stats_df if return_type in ['dataframe', 'both'] else None

In [15]:
df = load_data()

2024-10-27 20:00:24,389 - INFO - cagliostro_gutenberg.csv imported successfully!
2024-10-27 20:00:24,392 - INFO - There are 1775 rows and 8 columns.
2024-10-27 20:00:24,424 - INFO - Schema of the loaded dataset:
2024-10-27 20:00:24,438 - INFO - 
+----+---------------+-------------+------------+
|    | Column Name   | Data Type   |   n_unique |
|----+---------------+-------------+------------|
|  0 | id            | int64       |       1775 |
|  1 | chapter_title | object      |          1 |
|  2 | paragraph     | object      |        908 |
|  3 | quote         | float64     |          0 |
|  4 | source_url    | object      |          1 |
|  5 | created_at    | float64     |          0 |
|  6 | title         | object      |          1 |
|  7 | content       | float64     |          0 |
+----+---------------+-------------+------------+


In [16]:
tokens_df = explode_to_tokens(df, column='paragraph')
tokens_df = pd.DataFrame(tokens_df)
#tab_fmt(tokens_df, 5, style='psql')

In [None]:
# Assuming you have a DataFrame named tokens_df ready
print("Input DataFrame Shape:", tokens_df.shape)
print("Input DataFrame Head:\n", tokens_df.head())

result = analyze_token_sentiment(tokens_df, sentiment_types=['compound', 'pos'], return_type='both')

# Check if sentiment scores were successfully added
if tokens_df is not None and 'compound' in tokens_df.columns:
    stats_summary = analyze_token_sentiment_statistics(tokens_df, sentiment_types=['compound'], return_type='dataframe')
else:
    logging.error("Sentiment analysis did not produce the expected output.")

Input DataFrame Shape: (132635, 1)
Input DataFrame Head:
    paragraph
0        THE
1      POWER
2         OF
3  PREJUDICE
4          I


In [6]:
import pandas as pd
import logging
import plotly.express as px
from nltk import sent_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Initialize Sentiment Intensity Analyzer
sia = SentimentIntensityAnalyzer()

def analyze_sentence_sentiment_statistics_one(paragraph_df, sentiment_types=['compound'], return_type='both'):
    """
    Analyzes and visualizes statistical data for sentiment scores at the sentence level.

    Parameters:
    - paragraph_df: DataFrame containing a 'paragraph' column.
    - sentiment_types: list of str, sentiment scores to analyze, e.g., ['compound', 'pos'].
    - return_type: str, 'dataframe', 'plot', or 'both' for output preference.

    Returns:
    - DataFrame with statistical summaries or None based on return_type.
    """
    # Ensure paragraph_df has the correct structure
    if 'paragraph' not in paragraph_df.columns:
        logging.error("The DataFrame must contain a 'paragraph' column.")
        return None

    # Initialize a list to hold sentence-level sentiment data
    sentence_sentiments = []

    # Analyze sentiment for each paragraph at the sentence level
    for index, row in paragraph_df.iterrows():
        paragraph = row['paragraph']
        sentences = sent_tokenize(paragraph)  # Tokenize into sentences

        for sentence in sentences:
            scores = sia.polarity_scores(sentence)
            sentence_sentiment = {**{'sentence': sentence, 'paragraph_index': index}, **{sentiment_type: scores[sentiment_type] for sentiment_type in sentiment_types}}
            sentence_sentiments.append(sentence_sentiment)

    # Create a DataFrame from the sentence-level sentiment data
    sentences_df = pd.DataFrame(sentence_sentiments)

    # Ensure each sentiment type column exists in the DataFrame
    for sentiment_type in sentiment_types:
        if sentiment_type not in sentences_df.columns:
            logging.error(f"The DataFrame must contain a '{sentiment_type}' column.")
            return None

    # Initialize a dictionary to store statistical summaries
    stats_summary = {}

    # Calculate statistics for each sentiment type
    for sentiment_type in sentiment_types:
        sentiment_data = sentences_df[sentiment_type].dropna()
        stats_summary[sentiment_type] = {
            'mean': sentiment_data.mean(),
            'median': sentiment_data.median(),
            'std_dev': sentiment_data.std(),
            'min': sentiment_data.min(),
            'max': sentiment_data.max(),
            '95th_percentile': round(sentiment_data.quantile(0.95), 2),
        }

        logging.info(f"Statistical Summary for {sentiment_type}:\n{stats_summary[sentiment_type]}")

    # Create a DataFrame for the summary
    stats_df = pd.DataFrame(stats_summary).T
    stats_df.columns = ['Mean', 'Median', 'Standard Deviation', 'Min', 'Max', '95th Percentile']

    # Plot statistics if required
    if return_type in ['plot', 'both']:
        for sentiment_type in sentiment_types:
            # Histogram of sentiment scores
            fig = px.histogram(sentences_df, x=sentiment_type, nbins=30,
                               title=f'Distribution of {sentiment_type.capitalize()} Sentiment Scores',
                               labels={sentiment_type: 'Sentiment Score', 'count': 'Count'},
                               color_discrete_sequence=px.colors.sequential.Jet)
            fig.show()

            # Box plot for outliers
            box_fig = px.box(sentences_df, y=sentiment_type,
                             title=f'Box Plot of {sentiment_type.capitalize()} Sentiment Scores',
                             labels={sentiment_type: 'Sentiment Score'})
            box_fig.show()

    return stats_df if return_type in ['dataframe', 'both'] else None


In [7]:
stats_df = analyze_sentence_sentiment_statistics_one(paragraph_df, sentiment_types=['compound'], return_type='both')


NameError: name 'paragraph_df' is not defined

In [13]:
import pandas as pd
import logging
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import plotly.express as px

# Initialize Sentiment Intensity Analyzer
sia = SentimentIntensityAnalyzer()

def analyze_paragraph_sentiment_one(paragraph_df, sentiment_types=['compound'], return_type='dataframe'):
    """
    Analyzes sentiment for paragraphs in paragraph_df using NLTK's VADER sentiment analysis.

    Parameters:
    - paragraph_df: DataFrame containing a 'paragraph' column.
    - sentiment_types: list of str, sentiment scores to calculate, e.g., ['compound', 'pos'].
    - return_type: str, 'dataframe', 'plot', or 'both' for output preference.

    Returns:
    - DataFrame with sentiment scores or None based on return_type.
    """
    paragraph_df = load_data()
    # Ensure paragraph_df has the correct structure
    if 'paragraph' not in paragraph_df.columns:
        logging.error("The DataFrame must contain a 'paragraph' column.")
        return None

    # Filter out empty strings
    paragraph_df = paragraph_df[paragraph_df['paragraph'].str.strip().astype(bool)].copy()

    # Initialize sentiment score columns
    for sentiment_type in sentiment_types:
        paragraph_df.loc[:, sentiment_type] = [None] * len(paragraph_df)

    # Calculate sentiment for each paragraph with error handling
    for i, paragraph in enumerate(paragraph_df['paragraph']):
        try:
            scores = sia.polarity_scores(paragraph)
            for sentiment_type in sentiment_types:
                paragraph_df.at[i, sentiment_type] = scores[sentiment_type]
        except Exception as e:
            logging.error(f"Error calculating sentiment for paragraph index {i}: {e}")
            for sentiment_type in sentiment_types:
                paragraph_df.at[i, sentiment_type] = None

    # Calculate the 95th percentile for each sentiment type
    for sentiment_type in sentiment_types:
        percentile_95_sentiment = round(paragraph_df[sentiment_type].quantile(0.95), 2)
        logging.info(f"95th Percentile Sentiment Score ({sentiment_type}): {percentile_95_sentiment}")

    # Plot sentiment distribution if required
    if return_type in ['plot', 'both']:
        for sentiment_type in sentiment_types:
            fig = px.histogram(paragraph_df, x=sentiment_type, nbins=30,
                               title=f'Distribution of {sentiment_type.capitalize()} Sentiment Scores',
                               labels={sentiment_type: 'Sentiment Score Value', 'count': 'Frequency'},
                               color_discrete_sequence=px.colors.sequential.Jet)
            fig.update_layout(xaxis_title='Sentiment Score Value',
                              yaxis_title='Frequency',
                              title_x=0.5)  # Center the title
            fig.show()

    return paragraph_df if return_type in ['dataframe', 'both'] else None

# Sample DataFrame
data = {
    'paragraph': [
        "This is the first paragraph. It has some positive sentiment!",
        "The second paragraph is neutral.",
        "Oh no! This third paragraph is very negative.",
        "Another great paragraph full of positive vibes."
    ]
}

# Call the sentiment analysis function
result_df = analyze_paragraph_sentiment_one(paragraph_df, sentiment_types=['compound', 'pos', 'neg'], return_type='both')

2024-10-27 20:00:01,232 - INFO - cagliostro_gutenberg.csv imported successfully!
2024-10-27 20:00:01,235 - INFO - There are 1775 rows and 8 columns.
2024-10-27 20:00:01,262 - INFO - Schema of the loaded dataset:
2024-10-27 20:00:01,269 - INFO - 
+----+---------------+-------------+------------+
|    | Column Name   | Data Type   |   n_unique |
|----+---------------+-------------+------------|
|  0 | id            | int64       |       1775 |
|  1 | chapter_title | object      |          1 |
|  2 | paragraph     | object      |        908 |
|  3 | quote         | float64     |          0 |
|  4 | source_url    | object      |          1 |
|  5 | created_at    | float64     |          0 |
|  6 | title         | object      |          1 |
|  7 | content       | float64     |          0 |
+----+---------------+-------------+------------+


2024-10-27 20:00:06,429 - INFO - 95th Percentile Sentiment Score (compound): 0.96
2024-10-27 20:00:06,435 - INFO - 95th Percentile Sentiment Score (pos): 0.23
2024-10-27 20:00:06,440 - INFO - 95th Percentile Sentiment Score (neg): 0.2


In [9]:
from collections import Counter

# Generate word frequency
word_freq = Counter(' '.join(paragraph_df['paragraph']).split())
word_freq_df = pd.DataFrame(word_freq.items(), columns=['word', 'count'])

# Create a word cloud with Plotly
fig = px.bar(word_freq_df.sort_values('count', ascending=False).head(20),
              x='word', y='count',
              title='Top 20 Words Frequency',
              labels={'word': 'Words', 'count': 'Frequency'})
fig.update_layout(xaxis_title='Words', yaxis_title='Frequency', title_x=0.5)
fig.show()

NameError: name 'paragraph_df' is not defined