In [None]:
import os

# Move up one directory to the project root so the project-local imports
# below and any relative paths resolve from there.
# The PROJECT_ROOT sentinel makes this cell idempotent: without it, every
# re-run of the cell in the same kernel would climb one more directory.
if "PROJECT_ROOT" not in globals():
    os.chdir('../')
    PROJECT_ROOT = os.getcwd()

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from data.data_processing import DataProcessor 
from utils.dataVisualizer import DataVisualizer
from models.sentiment_model import ModelTrainer

In [None]:
# Create one instance of each project helper; later cells use them for
# plotting (dataVisualizer), loading/preprocessing (processor) and
# dataset tokenization (trainer).
dataVisualizer = DataVisualizer()
processor = DataProcessor()
trainer = ModelTrainer() 

In [None]:
# Load the full dataset through the project's DataProcessor.
# NOTE(review): later cells read df['text'], so a 'text' column is expected — confirm.
df = processor.load_data()

In [None]:
# Analyze the ratings distribution of the full dataset
dataVisualizer.analyze_ratings_distribution(df)

# Analyze the sentiment distribution of the full dataset, i.e. before
# create_balanced_dataset is applied in the next cell
dataVisualizer.analyze_sentiment_distribution(df)

In [None]:
# Derive the balanced dataset from the full one; df itself is kept so both
# can be compared in later cells.
df_balanced = processor.create_balanced_dataset(df)

In [None]:
# Report the size of the balanced dataset, then re-run the sentiment
# distribution analysis on it for comparison with the full dataset.
print(f"Total samples: {len(df_balanced)}")
dataVisualizer.analyze_sentiment_distribution(df_balanced)

In [None]:
# Analyze the lengths of the raw review texts in the balanced dataset.
dataVisualizer.analyze_text_lengths(df_balanced['text'])

In [None]:
# Take the first five texts of the balanced dataset as worked examples.
sample_texts = df_balanced['text'].head(5)

print("Detailed preprocessing examples from dataset:\n")
for text in sample_texts:
    # preprocess_text returns the cleaned text together with a sarcasm flag.
    processed, is_sarcastic = processor.preprocess_text(text)
    # One print call with sep="\n" emits exactly what four consecutive
    # print calls would: each piece followed by a newline.
    print(
        f"Original: {text}\n\n",
        f"Processed: {processed}\n\n",
        f"Sarcastic: {is_sarcastic}\n\n",
        "-" * 80 + "\n",
        sep="\n",
    )

In [None]:
# Run the batch preprocessing and collect its summary statistics.
# NOTE(review): this runs on the full `df`, while the neighbouring cells use
# `df_balanced` — confirm which dataset is intended. `processed_texts` is not
# used by any cell visible here.
processed_texts, analysis = processor.process_batch(df['text'])

# Header plus one line per counter reported in the analysis mapping.
print(
    "Batch Analysis Statistics:",
    f"Sarcasm detected: {analysis['sarcasm_count']}",
    f"Negations found: {analysis['negation_count']}",
    f"Special tokens: {analysis['special_tokens_count']}",
    f"URLs found: {analysis['url_count']}",
    sep="\n",
)


In [None]:
print("Data Quality Checks:")

# Missing-value counts (per column, assuming df_balanced is a pandas DataFrame).
print("\nNull values:")
print(df_balanced.isnull().sum())

# Number of rows flagged as duplicates by DataFrame.duplicated().
duplicate_row_count = df_balanced.duplicated().sum()
print("\nDuplicate rows:", duplicate_row_count)

In [None]:
# Get tokenized dataset from ModelTrainer
encoded_data = trainer.prepare_dataset(df_balanced['text'])

# Analyze token lengths; the visualizer returns a suggested sequence length.
suggested_length = dataVisualizer.analyze_token_lengths(encoded_data)

# Round the suggestion UP to the next multiple of 16 and cap it at 512
# (presumably the model's maximum sequence length — TODO confirm).
# Ceiling division, -(-x // 16), gives the same result as (x + 15) // 16 for
# integers but also stays correct when the suggestion is a float (e.g. a
# percentile): (112.5 + 15) // 16 * 16 would round DOWN to 112.0.
# int() guarantees MAX_LENGTH is a plain integer rather than a float.
MAX_LENGTH = min(512, int(-(-suggested_length // 16) * 16))
print(f"\nRecommended MAX_LENGTH: {MAX_LENGTH}")

In [None]:
# Render word clouds for the balanced dataset.
dataVisualizer.visualize_wordclouds(df_balanced)

In [None]:
# Called on the class rather than the dataVisualizer instance because
# display_processed_reviews is a static method.
DataVisualizer.display_processed_reviews(df_balanced)

In [None]:
# Text-signal analysis for the full dataset
dataVisualizer.analyze_text_signals(df)

# Same analysis for the balanced dataset, for side-by-side comparison
dataVisualizer.analyze_text_signals(df_balanced)