In [None]:
# Cell 1: Imports
import os

# The original `os.chdir('../')` is relative to the current working directory,
# so re-running this cell in the same kernel climbed one more level each time.
# The target directory is resolved once (on the first run, when the kernel is
# still in its starting directory) and remembered in `_PROJECT_ROOT`; every
# later run changes to that same absolute path, which makes the cell idempotent.
# NOTE(review): the project-local imports below presumably rely on the kernel
# sitting in the project root — confirm against the repository layout.
if '_PROJECT_ROOT' not in globals():
    _PROJECT_ROOT = os.path.abspath('../')
os.chdir(_PROJECT_ROOT)  # Moving up one directory to the root

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from data.data_processing import DataProcessor, TextSignals, SarcasmAugmenter
from utils.dataVisualizer import DataVisualizer
from models.sentiment_model import ModelTrainer

In [None]:
# Cell 2: Create instances
# These three objects are notebook-level names reused by the cells below.
dataProcessor = DataProcessor()
# The visualizer receives the processor created above through its
# `data_processor` keyword argument.
dataVisualizer = DataVisualizer(data_processor=dataProcessor)
trainer = ModelTrainer()

In [None]:
# Cell 3: Load raw dataset and analyze initial distribution
# `df` holds whatever DataProcessor.load_data() returns; it is passed unchanged
# to both analysis calls below.
df = dataProcessor.load_data()
print("\nInitial Distribution Analysis:")
dataVisualizer.analyze_ratings_distribution(df)
# Last expression of the cell: if this call returns a value, Jupyter displays it.
dataVisualizer.analyze_sentiment_distribution(df)

In [None]:
# Cell 4: Run the processor's preparation step and unpack its result
data = dataProcessor.prepare_data()

# `data['dataframes']` is indexed by split key; pull the three splits out in a
# fixed train / val / test order.
train_df, val_df, test_df = (
    data['dataframes'][split_key] for split_key in ('train', 'val', 'test')
)

# Kept alongside the frames for later use.
model_inputs = data['model_inputs']

In [None]:
# Cell 5: Sample counts and label distributions for each data split
named_splits = (
    ('Training', train_df),
    ('Validation', val_df),
    ('Test', test_df),
)

for label, frame in named_splits:
    print(f"\n{label} Set Analysis:")
    print(f"Total samples: {len(frame)}")

    # Sentiment counts are shown ordered by label value rather than by frequency.
    print("\nSentiment Distribution:")
    sentiment_counts = frame['sentiment'].value_counts()
    print(sentiment_counts.sort_index())

    # Sarcasm counts keep value_counts()' default (most frequent first) order.
    print("\nSarcasm Distribution:")
    sarcasm_counts = frame['is_sarcastic'].value_counts()
    print(sarcasm_counts)

In [None]:
# Cell 6: Text length analysis for every split
print("\nText Length Analysis Across Splits:")

split_labels = ('Training', 'Validation', 'Test')
split_frames = (train_df, val_df, test_df)

for label, frame in zip(split_labels, split_frames):
    print(f"\n{label} Set Text Lengths:")
    # Only the 'text' column is handed to the visualizer.
    texts = frame['text']
    dataVisualizer.analyze_text_lengths(texts)

In [None]:
# Cell 7: Token length analysis and MAX_LENGTH recommendation
train_texts = train_df['text']
encoded_data = trainer.prepare_dataset(train_texts)
suggested_length = dataVisualizer.analyze_token_lengths(encoded_data)

# For an integer suggestion this rounds up to the next multiple of 16; the
# result is then capped at 512.
# NOTE(review): if analyze_token_lengths returns a non-integer (e.g. a
# percentile), MAX_LENGTH ends up a float — confirm the return type.
rounded_up = (suggested_length + 15) // 16 * 16
MAX_LENGTH = min(512, rounded_up)

print(f"\nRecommended MAX_LENGTH: {MAX_LENGTH}")

In [None]:
# Cell 8: Visualize word distributions
# Passes the training dataframe to DataVisualizer.visualize_wordclouds.
# Only the training split is visualized here; validation and test are not.
dataVisualizer.visualize_wordclouds(train_df)

In [None]:
# Cell 9: Sample reviews analysis
# Asks the visualizer to display processed reviews from the training dataframe,
# requesting 10 of them via `num_samples`.
# NOTE(review): how the samples are chosen (random vs. first rows) is decided
# inside display_processed_reviews — confirm if reproducibility matters.
dataVisualizer.display_processed_reviews(train_df, num_samples=10)

In [None]:
# Cell 10: Text signals analysis
# Prints a header, then runs the visualizer's text-signal analysis on the
# training dataframe only.
print("\nText Signals Analysis for Training Set:")
dataVisualizer.analyze_text_signals(train_df)

In [None]:
# Cell 11: Data quality checks (missing values and duplicate rows per split)
print("Data Quality Checks:")

quality_targets = {
    'Training': train_df,
    'Validation': val_df,
    'Test': test_df,
}

for label, frame in quality_targets.items():
    print(f"\n{label} Set:")

    # Per-column count of null entries.
    print("Null values:")
    null_counts = frame.isnull().sum()
    print(null_counts)

    # Rows that repeat an earlier row in full are counted as duplicates.
    duplicate_count = frame.duplicated().sum()
    print(f"Duplicate rows: {duplicate_count}")