# Stock News Sentiment Analysis - Exploratory Data Analysis

**Dataset**: Apple Stock News Articles with Sentiment Labels  
**Period**: January 2019 - April 2019  
**Features**: Date, News, Open, High, Low, Close, Volume, Label  
**Labels**: -1 (Negative), 0 (Neutral), 1 (Positive)

---

This notebook performs comprehensive exploratory data analysis on stock news sentiment data.

## 1. Setup and Configuration

In [None]:
# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# For text processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# Download required NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Visualization settings
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)
plt.rcParams['font.size'] = 11
%matplotlib inline

print('✓ All libraries loaded successfully!')

## 2. Load Dataset

**Important**: Update `file_path` in the cell below so that it points to your local copy of `stock_news.csv`.

In [None]:
# OPTION 1: If using the modular project structure
# Uncomment and use this if you have the full project structure
# import sys
# from pathlib import Path
# sys.path.append(str(Path.cwd().parent))
# from src.data.data_loader import DataLoader
# data_loader = DataLoader()
# df = data_loader.load_data()

# OPTION 2: Direct file path (RECOMMENDED for standalone notebook)
# Update this path to match your file location
file_path = r'C:\pathto\stock_news.csv'

# Alternative: If file is in the same directory
# file_path = 'stock_news.csv'

# Alternative: If file is in parent directory
# file_path = '../data/raw/stock_news.csv'

# Read the CSV into the notebook-wide DataFrame used by every later cell
df = pd.read_csv(file_path)

# Report the size of what was loaded, then preview it
row_total, column_total = df.shape
print('✓ Dataset loaded successfully!')
print(f'Shape: {row_total:,} rows × {column_total} columns')
print('\nFirst few rows:')
df.head()

## 3. Dataset Overview

In [None]:
# Header banner
rule = '=' * 80
print(rule)
print('DATASET INFORMATION')
print(rule)

# Overall size
print(f'\nTotal Records: {len(df):,}')
print(f'Number of Features: {len(df.columns)}')

# One line per column: name, dtype and number of non-null entries
print('\nColumn Names and Types:')
for col in df.columns:
    column = df[col]
    dtype_name = str(column.dtype)
    print(f'  {col:15s}: {dtype_name:12s} (Non-null: {column.count():,})')

# deep=True also counts the Python string payloads, not just the pointers
bytes_used = df.memory_usage(deep=True).sum()
print(f'\nMemory Usage: {bytes_used / 1024**2:.2f} MB')

In [None]:
# Data quality assessment: missing values, duplicate rows, numeric summary
print('\nDATA QUALITY CHECKS')
print('=' * 80)

# Null count per column; only columns that actually have gaps are listed
missing_data = df.isnull().sum()
columns_with_gaps = missing_data[missing_data > 0]
if columns_with_gaps.empty:
    print('\n✓ No missing values found!')
else:
    print('\nMissing Values:')
    print(columns_with_gaps)

# Rows that repeat an earlier row in every column
duplicates = df.duplicated().sum()
duplicate_share = duplicates / len(df) * 100
print(f'\nDuplicate Rows: {duplicates:,} ({duplicate_share:.2f}%)')

# Statistical summary (left as the last expression so the notebook renders it)
print('\nNumerical Features Summary:')
df.describe().round(2)

## 4. Sentiment Distribution Analysis

In [None]:
# Sentiment distribution: counts and percentage share per label
label_map = {-1: 'Negative', 0: 'Neutral', 1: 'Positive'}

label_counts = df['Label'].value_counts().sort_index()
label_pct = (label_counts / len(df) * 100).round(2)

# Display names in the same (sorted) order as label_counts
label_names = [label_map[value] for value in label_counts.index]

print('=' * 80)
print('SENTIMENT DISTRIBUTION')
print('=' * 80)
for lbl, name in sorted(label_map.items()):
    # Skip classes that do not occur in the data
    if lbl not in label_counts.index:
        continue
    count = label_counts[lbl]
    pct = label_pct[lbl]
    print(f'{name:10s} ({lbl:2d}): {count:5d} articles ({pct:5.2f}%)')

# Check for class imbalance: ratio of the largest class to the smallest
max_count = label_counts.max()
min_count = label_counts.min()
imbalance_ratio = max_count / min_count
print(f'\nClass Imbalance Ratio: {imbalance_ratio:.2f}:1')
if imbalance_ratio > 2:
    for advice in ('⚠️  Significant class imbalance detected!',
                   '   → Consider using stratified sampling',
                   '   → Use weighted metrics for evaluation'):
        print(advice)

In [None]:
# Visualize sentiment distribution as a bar chart (counts) and a pie chart (shares)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Palette in label order (-1, 0, 1). It stays a three-item list because other
# cells in this notebook reuse `colors` positionally.
colors = ['#e74c3c', '#95a5a6', '#2ecc71']

# Pick colours by label value rather than by position so that a class missing
# from the data neither shifts the colours nor breaks the pie chart, whose
# `explode` argument must have exactly one entry per wedge.
color_by_label = dict(zip(sorted(label_map), colors))
present_colors = [color_by_label[lbl] for lbl in label_counts.index]
wedge_offsets = [0.05] * len(label_counts)

# Bar chart
bars = axes[0].bar(label_names, label_counts.values, color=present_colors,
                   alpha=0.8, edgecolor='black', linewidth=1.5)
axes[0].set_xlabel('Sentiment', fontweight='bold', fontsize=13)
axes[0].set_ylabel('Number of Articles', fontweight='bold', fontsize=13)
axes[0].set_title('Sentiment Distribution (Counts)', fontweight='bold', fontsize=14)
axes[0].grid(axis='y', alpha=0.3, linestyle='--')

# Add count labels on bars (count plus share of all articles)
for bar in bars:
    height = bar.get_height()
    axes[0].text(bar.get_x() + bar.get_width()/2., height,
                 f'{int(height):,}\n({height/len(df)*100:.1f}%)',
                 ha='center', va='bottom', fontweight='bold', fontsize=11)

# Pie chart
wedges, texts, autotexts = axes[1].pie(label_counts.values, labels=label_names,
                                       autopct='%1.1f%%', colors=present_colors,
                                       startangle=90, explode=wedge_offsets,
                                       shadow=True, textprops={'fontsize': 12, 'fontweight': 'bold'})
axes[1].set_title('Sentiment Distribution (Percentage)', fontweight='bold', fontsize=14)

plt.tight_layout()
plt.show()

## 5. Temporal Analysis

In [None]:
# Convert date and analyze temporal patterns
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values('Date').reset_index(drop=True)

# Compute the covered range once instead of re-scanning the column per print
first_day = df['Date'].min()
last_day = df['Date'].max()
span_days = (last_day - first_day).days

print('TEMPORAL COVERAGE')
print('='*80)
print(f"Start Date: {first_day.strftime('%B %d, %Y')}")
print(f"End Date:   {last_day.strftime('%B %d, %Y')}")
print(f"Duration:   {span_days} days")
# A data set whose rows all share one date has a span of 0 days; treat it as
# one day so the average does not raise ZeroDivisionError.
print(f"\nArticles per Day (avg): {len(df) / max(span_days, 1):.1f}")

# Articles per day (only dates that have at least one article appear here)
daily_counts = df.groupby('Date').size()
print(f"\nDaily Article Statistics:")
print(f"  Min:    {daily_counts.min():3d} articles")
print(f"  Max:    {daily_counts.max():3d} articles")
print(f"  Mean:   {daily_counts.mean():6.1f} articles")
print(f"  Median: {daily_counts.median():6.1f} articles")

In [None]:
# Sentiment over time: stacked article counts per label, plus total daily volume
fig, axes = plt.subplots(2, 1, figsize=(16, 10))

# Sentiment distribution over time (rows: dates, columns: labels present in the data)
sentiment_time = df.groupby(['Date', 'Label']).size().unstack(fill_value=0)

# Resolve colours and legend names from the label values of the columns that
# actually exist, so a class missing from the data cannot mislabel the rest.
color_by_label = dict(zip(sorted(label_map), colors))
area_colors = [color_by_label[lbl] for lbl in sentiment_time.columns]
area_names = [label_map[lbl] for lbl in sentiment_time.columns]

sentiment_time.plot(kind='area', ax=axes[0], color=area_colors, alpha=0.7, stacked=True)
axes[0].set_xlabel('Date', fontweight='bold', fontsize=12)
axes[0].set_ylabel('Number of Articles', fontweight='bold', fontsize=12)
axes[0].set_title('News Articles Over Time (by Sentiment)', fontweight='bold', fontsize=14)
axes[0].legend(area_names, loc='upper left', fontsize=11)
axes[0].grid(True, alpha=0.3)

# Daily article volume.
# A pandas bar plot treats the index as categories and prints each one as
# text, so the dates are formatted first to keep the time-of-day part off the
# tick labels. A copy is used so `daily_counts` keeps its datetime index.
daily_for_plot = daily_counts.copy()
daily_for_plot.index = pd.to_datetime(daily_for_plot.index).strftime('%Y-%m-%d')
daily_for_plot.plot(kind='bar', ax=axes[1], color='steelblue', alpha=0.7, width=0.8)
axes[1].set_xlabel('Date', fontweight='bold', fontsize=12)
axes[1].set_ylabel('Number of Articles', fontweight='bold', fontsize=12)
axes[1].set_title('Daily News Volume', fontweight='bold', fontsize=14)
axes[1].grid(True, alpha=0.3, axis='y')
axes[1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 6. Stock Price Analysis

In [None]:
# Stock price statistics
print('STOCK PRICE STATISTICS')
print('='*80)
print(f"\nPrice Range:")
print(f"  Lowest Close:  ${df['Close'].min():.2f}")
print(f"  Highest Close: ${df['Close'].max():.2f}")
print(f"  Mean Close:    ${df['Close'].mean():.2f}")
print(f"  Price Change:  ${df['Close'].max() - df['Close'].min():.2f}")

print(f"\nVolume Statistics:")
print(f"  Mean Volume:   {df['Volume'].mean():,.0f}")
print(f"  Max Volume:    {df['Volume'].max():,.0f}")

# Calculate returns.
# df holds one row per article, so several rows can share a Date. Taking
# pct_change() over those rows would give every same-day row a 0% return and
# dilute the mean and volatility. The return is therefore computed on one
# Close per date and then mapped back onto each article row.
# NOTE(review): assumes Close is identical across a date's rows; the last row
# of each date is used - confirm against the CSV.
daily_close = df.groupby('Date')['Close'].last()
daily_return = daily_close.pct_change() * 100
df['Daily_Return'] = df['Date'].map(daily_return)
df['Price_Change'] = df['Close'] - df['Open']

# Statistics are taken over dates (one value per day), not over articles
print(f"\nReturns Analysis:")
print(f"  Mean Daily Return: {daily_return.mean():.2f}%")
print(f"  Volatility (Std):  {daily_return.std():.2f}%")
print(f"  Max Gain:          {daily_return.max():.2f}%")
print(f"  Max Loss:          {daily_return.min():.2f}%")

df[['Open', 'High', 'Low', 'Close', 'Volume']].describe().round(2)

In [None]:
# Plot stock prices: price trends, trading volume and daily returns
fig, axes = plt.subplots(3, 1, figsize=(16, 12))

# df holds one row per article, so a date can occur many times. Drawing every
# row would stack semi-transparent bars on the same x position, making the
# shading depend on how many articles a day has. Plot one row per date instead.
# NOTE(review): assumes the price/volume columns repeat unchanged across a
# date's rows, so keeping the first row loses nothing - confirm against the CSV.
daily = df.drop_duplicates(subset='Date')

# Price trends
axes[0].plot(daily['Date'], daily['Open'], label='Open', alpha=0.6, linewidth=1.5)
axes[0].plot(daily['Date'], daily['High'], label='High', alpha=0.6, linewidth=1.5)
axes[0].plot(daily['Date'], daily['Low'], label='Low', alpha=0.6, linewidth=1.5)
axes[0].plot(daily['Date'], daily['Close'], label='Close', alpha=0.9, linewidth=2.5, color='navy')
axes[0].set_ylabel('Price ($)', fontweight='bold', fontsize=12)
axes[0].set_title('Apple Stock Price Trends (Jan-Apr 2019)', fontweight='bold', fontsize=14)
axes[0].legend(loc='best', fontsize=11)
axes[0].grid(True, alpha=0.3)

# Volume
axes[1].bar(daily['Date'], daily['Volume'], color='steelblue', alpha=0.6, width=0.8)
axes[1].set_ylabel('Volume', fontweight='bold', fontsize=12)
axes[1].set_title('Trading Volume', fontweight='bold', fontsize=14)
axes[1].grid(True, alpha=0.3, axis='y')
axes[1].ticklabel_format(style='plain', axis='y')

# Daily returns (red for losses, green otherwise; NaN compares False and so
# falls into the green branch, but a NaN bar has no height to draw)
colors_returns = ['red' if x < 0 else 'green' for x in daily['Daily_Return']]
axes[2].bar(daily['Date'], daily['Daily_Return'], color=colors_returns, alpha=0.6, width=0.8)
axes[2].axhline(y=0, color='black', linestyle='-', linewidth=1)
axes[2].set_xlabel('Date', fontweight='bold', fontsize=12)
axes[2].set_ylabel('Daily Return (%)', fontweight='bold', fontsize=12)
axes[2].set_title('Daily Returns', fontweight='bold', fontsize=14)
axes[2].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## 7. Text Analysis

In [None]:
# Text statistics: per-article character count, word count and mean word length
# Every entry is coerced with str() first, so non-string values are measured
# by their string form.
news_as_text = df['News'].map(str)
df['text_length'] = news_as_text.map(len)
df['word_count'] = news_as_text.map(lambda article: len(article.split()))
df['avg_word_len'] = news_as_text.map(
    lambda article: np.mean([len(token) for token in article.split()])
)

print('TEXT STATISTICS')
print('=' * 80)
mean_chars = df['text_length'].mean()
mean_words = df['word_count'].mean()
mean_word_len = df['avg_word_len'].mean()
print(f"Average Characters per Article: {mean_chars:.0f}")
print(f"Average Words per Article:      {mean_words:.0f}")
print(f"Average Word Length:            {mean_word_len:.2f} chars")

# Extremes (the character and word extremes are each taken independently)
print("\nText Length Range:")
print(f"  Shortest: {df['text_length'].min():4d} chars ({df['word_count'].min():3d} words)")
print(f"  Longest:  {df['text_length'].max():4d} chars ({df['word_count'].max():3d} words)")

print("\nDetailed Statistics:")
df[['text_length', 'word_count', 'avg_word_len']].describe().round(2)

In [None]:
# Text length visualization: overall histograms (top row) and per-sentiment views (bottom row)
fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# Top row: character and word distributions, each with mean/median markers
overall_panels = [
    (axes[0,0], 'text_length', 'skyblue', 'Characters', 'Character Count Distribution'),
    (axes[0,1], 'word_count', 'lightcoral', 'Words', 'Word Count Distribution'),
]
for panel, column, fill, x_title, heading in overall_panels:
    values = df[column]
    panel.hist(values, bins=50, color=fill, edgecolor='black', alpha=0.7)
    panel.axvline(values.mean(), color='red', linestyle='--', linewidth=2, label='Mean')
    panel.axvline(values.median(), color='green', linestyle='--', linewidth=2, label='Median')
    panel.set_xlabel(x_title, fontweight='bold')
    panel.set_ylabel('Frequency', fontweight='bold')
    panel.set_title(heading, fontweight='bold')
    panel.legend()
    panel.grid(axis='y', alpha=0.3)

# Bottom left: overlaid character-count histograms, one per sentiment present
labels_present = df['Label'].unique()
by_sentiment = axes[1,0]
for label, name, color in zip([-1, 0, 1], ['Negative', 'Neutral', 'Positive'], colors):
    if label not in labels_present:
        continue
    subset = df.loc[df['Label'] == label, 'text_length']
    by_sentiment.hist(subset, bins=30, alpha=0.5, label=name, color=color)
by_sentiment.set_xlabel('Characters', fontweight='bold')
by_sentiment.set_ylabel('Frequency', fontweight='bold')
by_sentiment.set_title('Text Length by Sentiment', fontweight='bold')
by_sentiment.legend()
by_sentiment.grid(axis='y', alpha=0.3)

# Bottom right: word-count box plot per sentiment, boxes tinted with the palette
box_panel = axes[1,1]
data_by_label = [
    df.loc[df['Label'] == lbl, 'word_count'].values
    for lbl in sorted(labels_present)
]
bp = box_panel.boxplot(data_by_label, labels=label_names, patch_artist=True)
for patch, tint in zip(bp['boxes'], colors):
    patch.set_facecolor(tint)
    patch.set_alpha(0.6)
box_panel.set_ylabel('Word Count', fontweight='bold')
box_panel.set_title('Word Count Distribution by Sentiment', fontweight='bold')
box_panel.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Word Frequency Analysis

In [None]:
# Text preprocessing function
# Cache for the stop-word set so it is read from NLTK once, not once per article.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, built on first use."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def clean_text(text):
    """Clean and tokenize text.

    The input is converted with ``str()`` and lower-cased, URL-like runs
    (starting with ``http``, ``https`` or ``www``) are removed, and every
    character that is neither an ASCII letter nor whitespace is deleted
    (not replaced by a space, so ``"u.s."`` becomes ``"us"``). The result is
    split with ``word_tokenize``.

    Args:
        text: Value to clean; any object is accepted because it is passed
            through ``str()`` first.

    Returns:
        list[str]: Tokens longer than two characters that are not English
        stop words, in their original order.
    """
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    stop_words = _english_stop_words()
    return [w for w in word_tokenize(text) if w not in stop_words and len(w) > 2]

# Process all text: gather cleaned tokens overall and per sentiment label
print('Processing text for word frequency analysis...')
all_words = []
label_words = {label: [] for label in df['Label'].unique()}

# Walk the two needed columns in lockstep instead of materialising each row
for article, article_label in zip(df['News'], df['Label']):
    tokens = clean_text(article)
    all_words.extend(tokens)
    label_words[article_label].extend(tokens)

total_tokens = len(all_words)
distinct_tokens = len(set(all_words))
print('\n✓ Processing complete!')
print(f'Total words (after cleaning): {total_tokens:,}')
print(f'Unique words: {distinct_tokens:,}')
# Share of distinct tokens among all tokens
print(f'Vocabulary richness: {distinct_tokens/total_tokens*100:.2f}%')

In [None]:
# Top words: frequency table over every cleaned token
word_freq = Counter(all_words)
top_30 = word_freq.most_common(30)

print('\nTOP 30 MOST COMMON WORDS')
print('=' * 80)
# Ranked listing, most frequent first
for rank, entry in enumerate(top_30, start=1):
    term, occurrences = entry
    print(f'{rank:2d}. {term:20s}: {occurrences:6d} occurrences')

In [None]:
# Visualize word frequencies: ranked bar chart next to a word cloud
fig, axes = plt.subplots(1, 2, figsize=(18, 6))
bar_panel, cloud_panel = axes

# Bar chart - top 20 (split the (word, count) pairs into parallel tuples)
top_20 = word_freq.most_common(20)
words, counts = zip(*top_20)
y_pos = np.arange(len(words))

bar_panel.barh(y_pos, counts, color='steelblue', alpha=0.8, edgecolor='black')
bar_panel.set_yticks(y_pos)
bar_panel.set_yticklabels(words)
# Flip the axis so the most frequent word sits at the top
bar_panel.invert_yaxis()
bar_panel.set_xlabel('Frequency', fontweight='bold', fontsize=12)
bar_panel.set_title('Top 20 Most Common Words', fontweight='bold', fontsize=14)
bar_panel.grid(axis='x', alpha=0.3)

# Word cloud built from the cleaned tokens joined back into one string
corpus_text = ' '.join(all_words)
cloud_builder = WordCloud(width=800, height=500, background_color='white',
                          colormap='viridis', max_words=100)
wc = cloud_builder.generate(corpus_text)
cloud_panel.imshow(wc, interpolation='bilinear')
cloud_panel.axis('off')
cloud_panel.set_title('Word Cloud (All Text)', fontweight='bold', fontsize=14)

plt.tight_layout()
plt.show()

In [None]:
# Word clouds by sentiment: one panel per label, each with its own colormap
fig, axes = plt.subplots(1, 3, figsize=(20, 6))

cloud_specs = [
    (-1, 'Negative', 'Reds'),
    (0, 'Neutral', 'Greys'),
    (1, 'Positive', 'Greens'),
]
for panel, (label, name, cmap_name) in zip(axes, cloud_specs):
    tokens = label_words.get(label)
    # Nothing is drawn for a label that is absent or has no tokens
    if not tokens:
        continue
    wc = WordCloud(width=600, height=400, background_color='white',
                   colormap=cmap_name, max_words=80).generate(' '.join(tokens))
    panel.imshow(wc, interpolation='bilinear')
    panel.axis('off')
    panel.set_title(f'{name} Sentiment', fontweight='bold', fontsize=14)

plt.tight_layout()
plt.show()

## 9. Correlation Analysis

In [None]:
# Correlation matrix over the numeric columns that exist in df
candidate_cols = ('Open', 'High', 'Low', 'Close', 'Volume', 'Label',
                  'text_length', 'word_count', 'Daily_Return')
# Columns created by earlier cells are included only if those cells were run
num_cols = [name for name in candidate_cols if name in df.columns]
corr = df[num_cols].corr()

print('CORRELATION ANALYSIS')
print('=' * 80)
print('\nCorrelation Matrix:')
print(corr.round(3))

In [None]:
# Visualize correlation as a lower-triangle heatmap
plt.figure(figsize=(12, 10))
# Hide the diagonal and everything above it (the matrix is symmetric)
mask = np.triu(np.ones_like(corr, dtype=bool))
heatmap_options = dict(
    annot=True, cmap='coolwarm', center=0, fmt='.2f',
    square=True, linewidths=1.5,
    cbar_kws={'label': 'Correlation Coefficient'},
    mask=mask, vmin=-1, vmax=1,
)
sns.heatmap(corr, **heatmap_options)
plt.title('Correlation Heatmap', fontweight='bold', fontsize=16, pad=20)
plt.tight_layout()
plt.show()

# High correlations: list each column pair once (strictly above the diagonal)
print('\nHigh Correlations (|r| > 0.7):')
print('=' * 80)
upper_rows, upper_cols = np.triu_indices(len(corr.columns), k=1)
strong_pairs = [
    (corr.columns[r], corr.columns[c], corr.iloc[r, c])
    for r, c in zip(upper_rows, upper_cols)
    if abs(corr.iloc[r, c]) > 0.7
]
for left, right, coefficient in strong_pairs:
    print(f'{left:15s} <-> {right:15s}: {coefficient:6.3f}')
found = bool(strong_pairs)
if not found:
    print('No strong correlations (|r| > 0.7) found')

## 10. Sentiment-Price Relationship

In [None]:
# Aggregate by sentiment: summary statistics of price, volume and return per label
metric_summaries = {
    'Close': ['mean', 'std', 'min', 'max'],
    'Volume': ['mean', 'std'],
    'Daily_Return': ['mean', 'std'],
}
grouped_by_label = df.groupby('Label')
sent_stats = grouped_by_label.agg(metric_summaries).round(2)

print('STOCK METRICS BY SENTIMENT')
print('=' * 80)
print(sent_stats)

In [None]:
# Visualize sentiment-price relationship: four bar charts of per-label averages
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Shared pieces: label order for the bars, bar styling, and one groupby object
label_order = sorted(df['Label'].unique())
bar_style = dict(color=colors, alpha=0.8, edgecolor='black', linewidth=1.5)
per_label = df.groupby('Label')

# Avg close price (top left), annotated with the dollar value
close_panel = axes[0,0]
avg_close = per_label['Close'].mean()
bars = close_panel.bar(label_names, [avg_close[lbl] for lbl in label_order], **bar_style)
close_panel.set_ylabel('Average Closing Price ($)', fontweight='bold', fontsize=12)
close_panel.set_title('Average Closing Price by Sentiment', fontweight='bold', fontsize=13)
close_panel.grid(axis='y', alpha=0.3)
for bar in bars:
    height = bar.get_height()
    close_panel.text(bar.get_x() + bar.get_width()/2., height,
                     f'${height:.2f}', ha='center', va='bottom', fontweight='bold')

# Avg volume (top right), plain tick labels instead of scientific notation
volume_panel = axes[0,1]
avg_vol = per_label['Volume'].mean()
bars = volume_panel.bar(label_names, [avg_vol[lbl] for lbl in label_order], **bar_style)
volume_panel.set_ylabel('Average Volume', fontweight='bold', fontsize=12)
volume_panel.set_title('Average Trading Volume by Sentiment', fontweight='bold', fontsize=13)
volume_panel.grid(axis='y', alpha=0.3)
volume_panel.ticklabel_format(style='plain', axis='y')

# Price change (bottom left): close minus open, with a zero reference line
change_panel = axes[1,0]
avg_chg = per_label['Price_Change'].mean()
bars = change_panel.bar(label_names, [avg_chg[lbl] for lbl in label_order], **bar_style)
change_panel.axhline(0, color='black', linewidth=1.5)
change_panel.set_ylabel('Average Price Change ($)', fontweight='bold', fontsize=12)
change_panel.set_title('Average Intraday Price Change by Sentiment', fontweight='bold', fontsize=13)
change_panel.grid(axis='y', alpha=0.3)

# Daily return (bottom right), annotated above positive and below negative bars
return_panel = axes[1,1]
avg_ret = per_label['Daily_Return'].mean()
bars = return_panel.bar(label_names, [avg_ret[lbl] for lbl in label_order], **bar_style)
return_panel.axhline(0, color='black', linewidth=1.5)
return_panel.set_ylabel('Average Daily Return (%)', fontweight='bold', fontsize=12)
return_panel.set_title('Average Daily Return by Sentiment', fontweight='bold', fontsize=13)
return_panel.grid(axis='y', alpha=0.3)
for bar in bars:
    height = bar.get_height()
    anchor = 'bottom' if height >= 0 else 'top'
    return_panel.text(bar.get_x() + bar.get_width()/2., height,
                      f'{height:.2f}%', ha='center',
                      va=anchor, fontweight='bold')

plt.tight_layout()
plt.show()

## 11. Sample Articles

In [None]:
# Show sample articles: up to three randomly drawn articles per sentiment
print('='*80)
print('SAMPLE NEWS ARTICLES BY SENTIMENT')
print('='*80)

for label, name in label_map.items():
    # Filter once per label; an empty result means the label is not in the data
    label_rows = df[df['Label'] == label]
    if label_rows.empty:
        continue
    print(f'\n{name.upper()} SENTIMENT ({label:+d})')
    print('-' * 80)
    # No random_state is set, so the selection changes between runs
    samples = label_rows.sample(min(3, len(label_rows)))
    for i, (idx, row) in enumerate(samples.iterrows(), 1):
        print(f"\n[{i}] Date: {row['Date'].strftime('%Y-%m-%d')}")
        print(f"    Stock: Close=${row['Close']:.2f}, Volume={row['Volume']:,}")
        # Coerce with str() like the text-analysis cells do, so a missing
        # (non-string) News value cannot break the slicing/len() below.
        article = str(row['News'])
        news_text = article[:250] + '...' if len(article) > 250 else article
        print(f"    News: {news_text}")

## 12. Key Insights & Summary

In [None]:
# Final summary of the analysis, recomputed from df and the notebook globals
# (label_map, label_counts, label_pct, imbalance_ratio, all_words, word_freq).
print('='*80)
print('KEY INSIGHTS FROM EXPLORATORY DATA ANALYSIS')
print('='*80)

# Date range, computed once
first_day, last_day = df["Date"].min(), df["Date"].max()
span_days = (last_day - first_day).days

print(f'\n1. DATASET OVERVIEW')
print(f'   • Total articles: {len(df):,}')
print(f'   • Time period: {span_days} days '
      f'({first_day.strftime("%b %Y")} to {last_day.strftime("%b %Y")})')
# A single-date data set has a 0-day span; count it as one day to avoid
# dividing by zero.
print(f'   • Daily volume: {len(df)/max(span_days, 1):.1f} articles/day')

print(f'\n2. SENTIMENT DISTRIBUTION')
for l in sorted(label_map.keys()):
    if l in label_counts.index:
        print(f'   • {label_map[l]:8s}: {label_counts[l]:4d} ({label_pct[l]:5.1f}%)')
print(f'   • Imbalance ratio: {imbalance_ratio:.2f}:1 '
      f'({"⚠️ Significant" if imbalance_ratio > 2 else "✓ Balanced"})')

print(f'\n3. TEXT CHARACTERISTICS')
print(f'   • Avg length: {df["word_count"].mean():.0f} words ({df["text_length"].mean():.0f} chars)')
print(f'   • Vocabulary: {len(set(all_words)):,} unique words')
# most_common(1) is empty when no tokens survived cleaning
top_terms = word_freq.most_common(1)
if top_terms:
    top_word, top_hits = top_terms[0]
    print(f'   • Top keyword: "{top_word}" ({top_hits:,} times)')

# Returns are recomputed here from one Close per date, so the figures are
# day-level and do not depend on how the article-level Daily_Return column
# was filled.
# NOTE(review): assumes Close is identical across a date's rows - confirm.
daily_close = df.groupby('Date')['Close'].last()
daily_return = daily_close.pct_change() * 100
article_return = df['Date'].map(daily_return)

print(f'\n4. STOCK PRICE INSIGHTS')
print(f'   • Price range: ${df["Close"].min():.2f} - ${df["Close"].max():.2f}')
print(f'   • Mean price: ${df["Close"].mean():.2f}')
print(f'   • Avg daily return: {daily_return.mean():.2f}%')
print(f'   • Volatility (σ): {daily_return.std():.2f}%')

print(f'\n5. SENTIMENT-PRICE RELATIONSHIP')
for l in sorted(df['Label'].unique()):
    in_class = df['Label'] == l
    class_return = article_return[in_class].mean()
    class_price = df.loc[in_class, 'Close'].mean()
    print(f'   • {label_map[l]:8s}: Avg return={class_return:+6.2f}%, Avg price=${class_price:.2f}')

# Data quality is judged on the loaded columns only. Columns derived in this
# notebook are excluded: Daily_Return is NaN on the first date by
# construction, which would otherwise always be reported as a missing value.
derived_cols = ['Daily_Return', 'Price_Change', 'text_length', 'word_count', 'avg_word_len']
raw = df.drop(columns=[c for c in derived_cols if c in df.columns])
missing_total = int(raw.isnull().sum().sum())
cell_total = max(raw.shape[0] * raw.shape[1], 1)

print(f'\n6. DATA QUALITY')
print(f'   • Missing values: {missing_total}')
print(f'   • Duplicates: {raw.duplicated().sum()}')
print(f'   • Data completeness: {(1 - missing_total/cell_total)*100:.1f}%')
print(f'   • Quality: ✓ GOOD' if missing_total == 0 else '   • Quality: ⚠️ NEEDS ATTENTION')

print('\n' + '='*80)
print('✅ EXPLORATORY DATA ANALYSIS COMPLETED SUCCESSFULLY')
print('='*80)
print('\nNext Steps:')
print('  1. Preprocess text data (tokenization, stopword removal, lemmatization)')
print('  2. Split data into train/validation/test sets (70/15/15)')
print('  3. Generate embeddings (Word2Vec, GloVe, FastText, Sentence Transformers)')
print('  4. Train Random Forest models with hyperparameter tuning')
print('  5. Evaluate and compare model performances')