# In [None]:
# Data Exploration Notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import warnings
warnings.filterwarnings('ignore')

# Load data
# Both CSV paths are relative, so they resolve against the current working
# directory of the process running this script/notebook.
fake_df = pd.read_csv('../data/raw/Fake.csv')
true_df = pd.read_csv('../data/raw/True.csv')

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("Fake News Dataset Info:")
fake_df.info()
print("\nTrue News Dataset Info:")
true_df.info()

# Basic statistics: number of rows (articles) in each dataset.
print(f"Fake news articles: {len(fake_df)}")
print(f"True news articles: {len(true_df)}")

# Check for missing values: per-column count of null cells.
print("\nMissing values in Fake dataset:")
print(fake_df.isnull().sum())
print("\nMissing values in True dataset:")
print(true_df.isnull().sum())

# Text length analysis
# Missing articles are replaced by '' before measuring so they count as empty
# text (0 characters, 0 words) instead of being measured through whatever
# string representation a missing value happens to get.
fake_text_clean = fake_df['text'].fillna('').astype(str)
true_text_clean = true_df['text'].fillna('').astype(str)

# Character counts are stored on the frames as a new 'text_length' column.
fake_df['text_length'] = fake_text_clean.str.len()
true_df['text_length'] = true_text_clean.str.len()

# Word counts: number of whitespace-separated tokens per article.
fake_words = fake_text_clean.str.split().str.len()
true_words = true_text_clean.str.split().str.len()

# The two classes are overlaid on the same axes, so both histograms must use
# the same bin edges. Passing bins=50 to each call separately would let each
# call derive its edges from its own data range, making bar heights cover
# different intervals and therefore not comparable.
length_bins = np.histogram_bin_edges(
    np.concatenate([
        fake_df['text_length'].to_numpy(),
        true_df['text_length'].to_numpy(),
    ]),
    bins=50,
)
word_bins = np.histogram_bin_edges(
    np.concatenate([fake_words.to_numpy(), true_words.to_numpy()]),
    bins=50,
)

plt.figure(figsize=(12, 5))

# Left panel: distribution of character counts.
plt.subplot(1, 2, 1)
plt.hist(fake_df['text_length'], bins=length_bins, alpha=0.7, label='Fake', color='red')
plt.hist(true_df['text_length'], bins=length_bins, alpha=0.7, label='True', color='blue')
plt.xlabel('Text Length')
plt.ylabel('Frequency')
plt.title('Distribution of Text Lengths')
plt.legend()

# Right panel: distribution of word counts.
plt.subplot(1, 2, 2)
plt.hist(fake_words, bins=word_bins, alpha=0.7, label='Fake', color='red')
plt.hist(true_words, bins=word_bins, alpha=0.7, label='True', color='blue')
plt.xlabel('Word Count')
plt.ylabel('Frequency')
plt.title('Distribution of Word Counts')
plt.legend()

plt.tight_layout()
plt.show()

# Word clouds
# Each dataset's articles are concatenated into one space-separated string,
# which is the input WordCloud.generate() receives below.
fake_text = ' '.join(fake_df['text'].astype(str))
true_text = ' '.join(true_df['text'].astype(str))

# Both clouds are built with the same canvas settings, fake first, then true.
cloud_options = dict(width=800, height=400, background_color='white')
fake_wordcloud = WordCloud(**cloud_options).generate(fake_text)
true_wordcloud = WordCloud(**cloud_options).generate(true_text)

plt.figure(figsize=(15, 6))

# Draw the two clouds side by side: fake on the left, true on the right.
cloud_panels = [
    (fake_wordcloud, 'Fake News Word Cloud'),
    (true_wordcloud, 'True News Word Cloud'),
]
for panel_index, (cloud, heading) in enumerate(cloud_panels, start=1):
    plt.subplot(1, 2, panel_index)
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(heading)
    plt.axis('off')  # hide ticks and frame; the image has no meaningful axes

plt.tight_layout()
plt.show()