# Exploratory Data Analysis

Stack Overflow time series exploration.

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path

# Load the weekly per-tag question counts (parsing "week" as a datetime)
# and order the rows by tag, then chronologically within each tag.
DATA_PATH = Path("../data/raw/tag_question_counts.csv")
df = pd.read_csv(DATA_PATH, parse_dates=["week"]).sort_values(["tag", "week"])

print(f"Shape: {df.shape}")
# Preview the first ten rows as the cell output
df.head(10)

## Basic Statistics

In [None]:
# Per-tag descriptive statistics of the weekly counts, rounded to one
# decimal and ordered from the highest to the lowest mean volume.
stat_names = ["count", "mean", "std", "min", "median", "max"]
summary = (
    df.groupby("tag")["question_count"]
    .agg(stat_names)
    .round(1)
    .sort_values("mean", ascending=False)
)

# Overall coverage of the dataset: date span and the set of tags present
first_week = df["week"].min().date()
last_week = df["week"].max().date()
print(f"Date range: {first_week} to {last_week}")
print(f"Tags: {sorted(df['tag'].unique())}\n")
summary

## Question Volume Over Time by Tag

In [None]:
# Interactive line chart of weekly question counts, one coloured trace per tag
axis_labels = {"week": "Week", "question_count": "Questions", "tag": "Tag"}
fig = px.line(
    df,
    x="week",
    y="question_count",
    color="tag",
    labels=axis_labels,
    title="Weekly Question Count by Tag",
)
# Unified hover mode and an explicit legend title
fig.update_layout(legend_title_text="Tag", hovermode="x unified")
fig.show()

In [None]:
# Small multiples: one panel per tag, wrapped two panels per row
facet_options = dict(facet_col="tag", facet_col_wrap=2, height=1200)
fig = px.line(
    df,
    x="week",
    y="question_count",
    title="Weekly Question Count — Per Tag",
    labels={"week": "", "question_count": "Questions"},  # blank x-axis title
    **facet_options,
)
# No colour dimension is used, so the legend carries no information
fig.update_layout(showlegend=False)
fig.update_xaxes(tickangle=45)
fig.show()

## Missing Weeks Check

Verify whether every tag has a continuous weekly record with no gaps.

In [None]:
# Build the expected weekly calendar over the dataset's full date range and
# report, per tag, how many of those weeks have no row.
#
# The calendar steps in 7-day increments from the earliest observed week, so
# it lines up with the data whichever weekday the weeks are anchored on.
# (A fixed "W-MON" grid only matches Monday-anchored data; for any other
# anchor it would report every week as missing.)
full_weeks = pd.date_range(df["week"].min(), df["week"].max(), freq="7D")
expected_weeks = set(full_weeks)  # built once and reused for every tag

missing_report = []
for tag, grp in df.groupby("tag"):
    # Expected weeks that never appear in this tag's rows, in chronological order
    missing = sorted(expected_weeks.difference(grp["week"]))
    missing_report.append({"tag": tag, "missing_weeks": len(missing), "total_expected": len(full_weeks)})
    if missing:
        print(f"{tag}: {len(missing)} missing week(s) — first gap at {missing[0].date()}")

# Tags with the most gaps first
missing_df = pd.DataFrame(missing_report).sort_values("missing_weeks", ascending=False)
if missing_df["missing_weeks"].sum() == 0:
    print("No missing weeks found for any tag.")
missing_df

## Anomaly / Outlier Detection

Flag weeks where the question count deviates more than 3 standard deviations from the rolling mean (13-week window) for each tag.

In [None]:
# Rolling z-score outlier detection, computed independently for each tag.
# A week is flagged when its count lies more than THRESHOLD rolling standard
# deviations away from the centered WINDOW-week rolling mean.
WINDOW = 13
THRESHOLD = 3

outlier_frames = []
for _, tag_rows in df.groupby("tag"):
    counts = tag_rows["question_count"]
    # One centered window object serves both the mean and the std
    centered = counts.rolling(WINDOW, center=True)
    window_mean = centered.mean()
    window_std = centered.std()
    z = (counts - window_mean) / window_std
    # assign() returns a new frame, leaving the group's source rows untouched;
    # comparisons against NaN z-scores are False, so those weeks are not flagged
    outlier_frames.append(
        tag_rows.assign(
            rolling_mean=window_mean,
            rolling_std=window_std,
            z_score=z,
            is_outlier=z.abs() > THRESHOLD,
        )
    )

df_with_outliers = pd.concat(outlier_frames)
outliers = df_with_outliers.loc[df_with_outliers["is_outlier"]]
print(f"Found {len(outliers)} outlier week(s) across all tags:\n")
outliers[["week", "tag", "question_count", "rolling_mean", "z_score"]].round(2)

In [None]:
# Overlay the flagged outlier weeks on each tag's time series.
#
# Each tag gets one explicit colour, used for both its line and its outlier
# markers, and the two traces share a legendgroup. This ties the markers
# visually to their series and makes them hide together with it when the
# tag's legend entry is toggled; the marker trace has no legend entry of
# its own.
palette = px.colors.qualitative.Plotly
fig = go.Figure()

# groupby yields the tags in sorted order and slices each tag's rows once
for idx, (tag, tag_data) in enumerate(df_with_outliers.groupby("tag")):
    color = palette[idx % len(palette)]  # cycle if there are more tags than colours
    group = str(tag)
    tag_outliers = tag_data[tag_data["is_outlier"]]

    fig.add_trace(go.Scatter(
        x=tag_data["week"], y=tag_data["question_count"],
        mode="lines", name=tag, opacity=0.6,
        line=dict(color=color), legendgroup=group,
    ))
    if not tag_outliers.empty:
        fig.add_trace(go.Scatter(
            x=tag_outliers["week"], y=tag_outliers["question_count"],
            mode="markers", name=f"{tag} outliers",
            marker=dict(size=8, symbol="x", color=color),
            legendgroup=group, showlegend=False,
        ))

fig.update_layout(
    title="Outlier Weeks Highlighted (3σ from 13-week rolling mean)",
    xaxis_title="Week", yaxis_title="Question Count",
    hovermode="x unified",
)
fig.show()

## Distribution of Weekly Counts

In [None]:
# One box per tag summarising the spread of its weekly question counts
box_labels = {"tag": "Tag", "question_count": "Weekly Questions"}
fig = px.box(
    df,
    x="tag",
    y="question_count",
    color="tag",
    labels=box_labels,
    title="Distribution of Weekly Question Counts by Tag",
)
# The x-axis already names each tag, so the colour legend is redundant
fig.update_layout(showlegend=False)
fig.show()