In [None]:
# emoji_sentiment_analysis\notebooks\data_exploration.ipynb

import sys
from pathlib import Path

# Make the project package importable from this notebook: the parent of the
# current working directory is put at the front of sys.path (only once).
project_root = Path().resolve().parent
_root_entry = str(project_root)  # sys.path is compared and extended with str
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

from emoji_sentiment_analysis.config import RAW_DATA_DIR, TEXT_COL, TARGET_COL

import pandas as pd
import matplotlib.pyplot as plt
import re
from loguru import logger

# --- Data Exploration ---

# 1. Load the raw data
# The CSV location is built from the RAW_DATA_DIR value imported from config.
raw_data_path = RAW_DATA_DIR / "1k_data_emoji_tweets_senti_posneg.csv"
try:
    df = pd.read_csv(raw_data_path)
except FileNotFoundError:
    # Log a hint for the reader, then let the original exception propagate.
    logger.error(f"File not found at {raw_data_path}. Make sure it exists.")
    raise
else:
    logger.info(f"Successfully loaded data from {raw_data_path}")

# 2. Initial inspection
# Show the first rows, print a spacer, then print the column/dtype summary.
logger.info("Initial Data Inspection:")
display(df.head())
print("\n")
df.info()

# 3. Rename columns for consistency
# Map the raw CSV headers onto the column names defined in config.py.
column_mapping = dict(post=TEXT_COL, sentiment=TARGET_COL)
df = df.rename(columns=column_mapping)
logger.info("Columns renamed to match config.py.")
display(df.head())

# 4. Check label distribution
# Count the rows per label, ordered by label value rather than by frequency.
logger.info("Sentiment Label Distribution:")
label_counts = df[TARGET_COL].value_counts().sort_index()
print(label_counts)

# 5. Visualize label distribution
# Draw the counts as a bar chart with horizontal tick labels and a dashed
# horizontal grid.
plt.figure(figsize=(6, 4))
ax = label_counts.plot.bar(color=['#FF5733', '#336CFF'], rot=0)
ax.set_title('Distribution of Sentiment Labels')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

# 6. Check for unique emojis (optional)
# This is a basic check to see what the model can learn from
#
# The character class lists explicit emoji-bearing Unicode ranges. A single
# catch-all span such as U+24C2..U+1F251 must not be used here: it also covers
# CJK ideographs, kana, Hangul, private-use and presentation-form characters,
# so ordinary non-English text would be reported as "emoji".
# The trailing "+" means adjacent emoji are returned together as one match.
EMOJI_PATTERN = re.compile(
    r'['
    r'\U0001F600-\U0001F64F'  # emoticons
    r'\U0001F300-\U0001F5FF'  # miscellaneous symbols and pictographs
    r'\U0001F680-\U0001F6FF'  # transport and map symbols
    r'\U0001F700-\U0001F77F'  # alchemical symbols
    r'\U0001F780-\U0001F7FF'  # geometric shapes extended
    r'\U0001F800-\U0001F8FF'  # supplemental arrows-C
    r'\U0001F900-\U0001F9FF'  # supplemental symbols and pictographs
    r'\U0001FA00-\U0001FA6F'  # chess symbols
    r'\U0001FA70-\U0001FAFF'  # symbols and pictographs extended-A
    r'\u24C2'                 # circled M
    r'\u25AA\u25AB\u25B6\u25C0\u25FB-\u25FE'  # emoji squares and triangles
    r'\u2600-\u26FF'          # miscellaneous symbols
    r'\u2700-\u27BF'          # dingbats
    r'\u2934\u2935'           # curved arrows
    r'\u2B05-\u2B07\u2B1B\u2B1C\u2B50\u2B55'  # arrows, large squares, star, circle
    r'\u3030\u303D\u3297\u3299'  # wavy dash, part alternation mark, circled ideographs
    r'\U0001F004\U0001F0CF'   # mahjong red dragon, playing-card joker
    r'\U0001F170-\U0001F19A'  # squared letters and words (A, B, O, P, CL, OK, ...)
    r'\U0001F1E6-\U0001F1FF'  # regional indicators (flag pairs)
    r'\U0001F200-\U0001F251'  # enclosed ideographic supplement
    r']+',
    flags=re.UNICODE
)


def find_all_emojis(text):
    """Return every run of consecutive emoji characters found in ``text``.

    Args:
        text: Value to search. It is converted with ``str()`` first, so
            non-string values (for example ``None`` or a float NaN) are
            searched as their string form instead of raising.

    Returns:
        A list of strings in order of appearance. Each item is one maximal
        run of characters matched by ``EMOJI_PATTERN``; the list is empty
        when nothing matches.
    """
    return EMOJI_PATTERN.findall(str(text))

# Flatten the per-row matches from find_all_emojis into one list, then
# de-duplicate once and reuse the set for both the count and the sample.
all_emojis = [
    match
    for matches in map(find_all_emojis, df[TEXT_COL])
    for match in matches
]
unique_emojis = set(all_emojis)
logger.info(f"Total unique emojis found: {len(unique_emojis)}")
print(f"Sample of unique emojis: {list(unique_emojis)[:10]}")