# EDA for What's Up, Docs?

Baseline exploratory analysis of the DrivenData summarization dataset.


In [None]:
import json
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Use the seaborn-flavoured matplotlib theme for every figure in the notebook.
plt.style.use("seaborn-v0_8")

# Input locations; the relative path is resolved against the current working directory.
DATA_DIR = Path("../data")
TRAIN_PATH = DATA_DIR.joinpath("train.csv")
TEST_FEATURES_PATH = DATA_DIR.joinpath("test_features.csv")

# Echo the resolved paths as the cell output.
(TRAIN_PATH, TEST_FEATURES_PATH)


In [None]:
# Read the training split first, then the test features, using pandas' CSV defaults.
train_df, test_features_df = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_FEATURES_PATH)
)

# Preview the first rows of the training data.
train_df.head()


In [None]:
# Print a concise summary of the training frame (columns, non-null counts, dtypes).
train_df.info()


In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
# NOTE: `datetime_is_numeric` is deliberately not passed. The keyword was
# removed in pandas 2.0 (datetime columns are always summarised numerically
# there) and passing it raises TypeError on current pandas releases.
train_df.describe(include="all")


In [None]:
# Derive character- and whitespace-token-level lengths for documents and summaries.
doc_text = train_df["text"]
ref_summary = train_df["summary"]
length_features = {
    "text_char_len": doc_text.str.len(),
    "text_word_len": doc_text.str.split().str.len(),
    "summary_char_len": ref_summary.str.len(),
    "summary_word_len": ref_summary.str.split().str.len(),
}
train_df = train_df.assign(**length_features)

# Describe the new columns, including upper-tail percentiles.
train_df[list(length_features)].describe(percentiles=[0.5, 0.75, 0.9, 0.95, 0.99])


In [None]:
# Side-by-side histograms of word counts: documents on the left, summaries on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
histogram_panels = (
    ("text_word_len", "Document Word Length Distribution"),
    ("summary_word_len", "Summary Word Length Distribution"),
)
for panel_ax, (length_column, panel_title) in zip(axes, histogram_panels):
    sns.histplot(train_df[length_column], bins=50, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Word count")

plt.tight_layout()
plt.show()


In [None]:
# Print three reproducibly sampled (document, summary) pairs for a qualitative look.
sample_pairs = train_df.sample(3, random_state=42)
for _, pair in sample_pairs.iterrows():
    print(f"paper_id: {pair['paper_id']}")
    print("Document snippet:")
    # Show only the first 500 characters, flattened onto a single line.
    snippet = pair["text"][:500].replace("\n", " ")
    print(snippet + "...")
    print("Summary:")
    print(pair["summary"])
    print("-" * 80)



In [None]:
# Count rows whose document text / summary repeats an earlier row (first occurrence not counted).
duplicate_docs = train_df["text"].duplicated().sum()
duplicate_summaries = train_df["summary"].duplicated().sum()

print(f"Duplicate documents: {duplicate_docs}")
print(f"Duplicate summaries: {duplicate_summaries}")


In [None]:
# Count missing values per column of the training frame.
train_df.isna().sum()


In [None]:
# Preview the first rows of the test features.
test_features_df.head()


In [None]:
# Headline numbers: split sizes and mean word counts (rounded to two decimals).
n_train_rows = len(train_df)
n_test_rows = len(test_features_df)
mean_text_words = round(train_df["text_word_len"].mean(), 2)
mean_summary_words = round(train_df["summary_word_len"].mean(), 2)

print(f"Train samples: {n_train_rows:,}")
print(f"Test samples: {n_test_rows:,}")
print("Average text word length:", mean_text_words)
print("Average summary word length:", mean_summary_words)


In [None]:
# Ratio of summary word count to document word count, per row.
summary_words = train_df["summary_word_len"]
document_words = train_df["text_word_len"]
train_df = train_df.assign(summary_to_text_ratio=summary_words / document_words)

# Distribution of the ratio, including upper percentiles.
train_df["summary_to_text_ratio"].describe(percentiles=[0.5, 0.75, 0.9, 0.95])


In [None]:
# Show the three longest documents (by word count) with their summary lengths.
outlier_columns = ["paper_id", "text_word_len", "summary_word_len"]
longest_documents = train_df.nlargest(3, "text_word_len")
longest_documents[outlier_columns]


## Key Takeaways

- Documents average ~6.3k words with a long tail past 9.7k words (90th percentile) and a max over 34k, so truncation or chunking is required.
- Summaries are much shorter (mean ~184 words) with low variance, so we can keep targets largely intact.
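- Summary-to-text ratio has a median near 0.03, i.e. summaries are roughly 3% of the document length, confirming very high compression. This suggests a highly abstractive style, but a length ratio alone does not measure abstractiveness (that would need an n-gram overlap check).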
- No missing values detected in `text` or `summary`; data quality looks clean aside from length outliers.
- Sample pairs show academic, multi-paragraph structure; SentencePiece tokenizer defaults should cope but we must watch token budgets.

