#### SMS Spam Collection - Exploratory Data Analysis
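This notebook performs exploratory data analysis (EDA) on the SMS Spam Collection dataset: the class balance, message length by class, word clouds for each class, and the 20 most frequent unigrams in spam. Every figure is saved as a PNG under `../plots` and then closed, so the plots are written to disk rather than displayed inline.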

In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
import os

In [14]:
# Setup
# Make sure the output folder for saved figures exists (no error if it already does).
os.makedirs("../plots", exist_ok=True)

# Plot styling: seaborn's default theme first, then the viridis palette on top of it.
sns.set_theme()
sns.set_palette("viridis")

In [15]:
# Load data
# Read the first two CSV columns, name them label/text, encode the label as
# 0 (ham) / 1 (spam), and derive per-message character and word counts.
df = (
    pd.read_csv('../data/spam.csv', encoding='latin-1', usecols=[0, 1])
    .set_axis(['label', 'text'], axis=1)
    .assign(
        # Values other than 'ham' / 'spam' are not in the mapping and become NaN.
        label=lambda frame: frame['label'].map({'ham': 0, 'spam': 1}),
        char_count=lambda frame: frame['text'].str.len(),
        # Word count = number of whitespace-separated tokens in the message.
        word_count=lambda frame: frame['text'].str.split().map(len),
    )
)

In [16]:
# Plot 1: Class Distribution
# Bar chart of the number of messages per class. Each bar is annotated with its
# absolute count and its share of all messages; the figure is saved and closed.
plt.figure(figsize=(8, 5))
ax = sns.countplot(x='label', data=df)
plt.title('Message Type Distribution')
plt.xlabel('Message Type')
plt.ylabel('Count')
# Labels are encoded as 0 = ham and 1 = spam in the load-data cell.
plt.xticks([0, 1], ['Legitimate', 'Fraudulent'])
for p in ax.patches:
    # Bar heights are returned as floats; cast to int so the annotation reads
    # e.g. "747" rather than "747.0".
    height = int(p.get_height())
    ax.annotate(f'{height}\n({height/len(df):.1%})',
                (p.get_x() + p.get_width() / 2., height),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.savefig('../plots/eda_class_dist.png', dpi=300)
plt.close()

In [17]:
# Plot 2: Char Count by Label
# Box plot comparing message length (in characters) between the two classes.
plt.figure(figsize=(8, 5))
sns.boxplot(x='label', y='char_count', data=df)
plt.title('Character Count Distribution by Message Type')
# Without explicit labels the axes would show the raw column names
# ('label', 'char_count'); use readable names, as the class-distribution plot does.
plt.xlabel('Message Type')
plt.ylabel('Character Count')
# Labels are encoded as 0 = ham and 1 = spam in the load-data cell.
plt.xticks([0, 1], ['Legitimate', 'Fraudulent'])
plt.savefig('../plots/eda_text_length.png', dpi=300)
plt.close()

In [18]:
# Word Clouds
def make_wordcloud(series, title, random_state=42):
    """Render a word cloud for a collection of messages and save it as a PNG.

    Parameters
    ----------
    series : iterable of str
        The messages to visualise; they are joined with single spaces into one
        text before the cloud is generated.
    title : str
        Figure title. It also determines the output file name: the title is
        lower-cased, spaces become underscores, and the result is written to
        ``../plots/eda_<name>.png``.
    random_state : int or None, default 42
        Seed passed to ``WordCloud`` so that word placement and colours are the
        same on every run. Pass ``None`` for a different layout each run.

    Returns
    -------
    None
        The figure is saved at 300 dpi and then closed; nothing is returned.
    """
    corpus = ' '.join(series)
    # A fixed seed keeps the saved image reproducible across Restart & Run All.
    cloud = WordCloud(
        width=1200,
        height=600,
        background_color='white',
        colormap='viridis',
        random_state=random_state,
    ).generate(corpus)

    plt.figure(figsize=(12, 6))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')  # the image has no meaningful axes
    plt.title(title)
    plt.savefig(f'../plots/eda_{title.lower().replace(" ", "_")}.png', dpi=300)
    plt.close()

# One word cloud per class: label 1 is spam, label 0 is ham.
make_wordcloud(df.loc[df['label'] == 1, 'text'], 'Fraudulent Message Lexicon')
make_wordcloud(df.loc[df['label'] == 0, 'text'], 'Legitimate Communication Patterns')

In [19]:
# Top 20 Unigrams in Spam
# Count single-word terms (English stop words removed) across the spam messages,
# keep the 20 most frequent, and save them as a horizontal bar chart.
vec = CountVectorizer(stop_words='english', ngram_range=(1, 1))
spam_vec = vec.fit_transform(df.loc[df['label'] == 1, 'text'])

# Column-wise totals: one summed count per vocabulary term, read back as [0, column].
sum_words = spam_vec.sum(axis=0)
words_freq = [
    (term, sum_words[0, column])
    for term, column in vec.vocabulary_.items()
]
# Stable sort, highest count first; then keep only the top 20 (term, count) pairs.
words_freq.sort(key=lambda pair: pair[1], reverse=True)
words_freq = words_freq[:20]

plt.figure(figsize=(10, 6))
sns.barplot(
    x=[count for term, count in words_freq],
    y=[term for term, count in words_freq],
)
plt.xlabel('Frequency')
plt.title('Top 20 Spam Unigrams')
plt.tight_layout()
plt.savefig('../plots/top_spam_unigrams.png', dpi=300)
plt.close()