In [9]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

In [10]:
# Dataset root, relative to the notebook's working directory.
LIAR_DIR = Path("Data/liar_dataset")   # splits are read from clean/liar_train.csv / clean/liar_val.csv / clean/liar_test.csv
# Output folder for every figure this notebook saves.
FIG_DIR  = Path("Figures")
# Create the folder (and any missing parents) up front; a no-op if it already exists.
FIG_DIR.mkdir(parents=True, exist_ok=True)

In [11]:
def load(csv):
    """Read one dataset split and return it with a word-count column.

    Parameters
    ----------
    csv : str or path-like
        CSV file that contains at least the columns ``text`` and ``label``.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` (cast to str), ``label`` (cast to int) and ``len``
        (number of whitespace-separated tokens in ``text``). Rows where
        ``text`` or ``label`` is missing are dropped; the surviving rows keep
        their original index values.

    Raises
    ------
    KeyError
        If the file lacks a ``text`` or ``label`` column.
    """
    kept = pd.read_csv(csv)[["text", "label"]].dropna()
    # assign() returns a new frame and evaluates keywords in order, so the
    # `len` lambda already sees the string-cast `text` column.
    return kept.assign(
        text=kept["text"].astype(str),
        label=kept["label"].astype(int),
        len=lambda frame: frame["text"].str.split().str.len(),
    )

In [12]:
# Load the three cleaned splits (train, then val, then test) from LIAR_DIR/clean.
train, val, test = (
    load(LIAR_DIR / "clean" / f"liar_{part}.csv")
    for part in ("train", "val", "test")
)

In [15]:
def label_counts(df, name):
    """Tabulate how many rows of ``df`` carry label 0 and label 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``label`` column.
    name : str
        Split name written into the ``split`` column of every output row.

    Returns
    -------
    pandas.DataFrame
        Exactly two rows (label 0, then label 1) with the columns ``split``,
        ``label`` and ``count``. A label that never occurs gets a count of 0;
        labels other than 0 and 1 are ignored.
    """
    classes = [0, 1]
    tally = df["label"].value_counts()
    # Force both classes to be present so an absent one shows up as 0, not NaN.
    tally = tally.reindex(classes).fillna(0).astype(int)
    columns = {
        "split": name,
        "label": classes,
        "count": [tally.get(cls, 0) for cls in classes],
    }
    return pd.DataFrame(columns)

In [16]:
# Long-format table of class counts: two rows (label 0 / label 1) per split,
# stacked in train, val, test order with a fresh 0..5 index.
lab = pd.concat(
    [
        label_counts(frame, split_name)
        for split_name, frame in (("train", train), ("val", val), ("test", test))
    ],
    ignore_index=True,
)

In [17]:
# Draw and save one class-balance bar chart per split (files land in FIG_DIR).
for split in ("train", "val", "test"):
    sub = lab.loc[lab["split"] == split]
    classes = sub["label"].astype(str)   # string ticks -> categorical x-axis
    totals = sub["count"]

    fig, ax = plt.subplots(figsize=(4, 3))
    ax.bar(classes, totals)
    ax.set(
        title=f"Label Balance — {split}",
        xlabel="label (0=false-ish, 1=true-ish)",
        ylabel="count",
    )
    # Write each bar's count with the text sitting on top of the bar.
    for x, y in zip(classes, totals):
        ax.text(x, y, str(y), ha="center", va="bottom", fontsize=9)

    fig.tight_layout()
    fig.savefig(FIG_DIR / f"liar_label_balance_{split}.png", dpi=160)
    plt.close(fig)   # release the figure; many are created in this notebook

In [18]:
# Histogram of per-statement word counts for each split, saved under FIG_DIR.
#
# Word counts are integers, so the bin edges are pinned to half-integers with
# an integer bin width and shared by all three splits. Asking matplotlib for
# 40 equal-width bins instead derives the edges from each split's own min/max,
# which yields non-integer widths: neighbouring bins then cover a different
# number of integer values (spurious comb-like dips) and the x-range can
# differ from one split's figure to the next.
LEN_CLIP = 80        # word counts above this are folded into the last bin
LEN_BIN_WIDTH = 2    # number of integer word counts covered by each bin
# Edges -0.5, 1.5, ..., 81.5: every bin holds exactly LEN_BIN_WIDTH integers
# and the last edge lies above LEN_CLIP, so clipped values are never dropped.
len_bin_edges = np.arange(0, LEN_CLIP + 2 * LEN_BIN_WIDTH, LEN_BIN_WIDTH) - 0.5

for split, df in [("train", train), ("val", val), ("test", test)]:
    fig, ax = plt.subplots(figsize=(4, 3))
    # clip long tails for readability
    ax.hist(df["len"].clip(upper=LEN_CLIP), bins=len_bin_edges)
    ax.set_title(f"Text Length (words) — {split}")
    # Label is built from the same constant as the clip, so they cannot drift apart.
    ax.set_xlabel(f"words (clipped at {LEN_CLIP})")
    ax.set_ylabel("count")
    fig.tight_layout()
    fig.savefig(FIG_DIR/f"liar_text_length_{split}.png", dpi=160)
    plt.close(fig)

In [20]:
# Per-split summary: row count, mean word count and 95th-percentile word count.
stats = {
    split_name: dict(
        n=len(frame),
        len_mean=float(frame["len"].mean()),
        len_p95=float(frame["len"].quantile(0.95)),
    )
    for split_name, frame in (("train", train), ("val", val), ("test", test))
}
print("✅ saved figures in:", FIG_DIR.resolve())
print("📊 length stats:", stats)

✅ saved figures in: C:\Users\moham\OneDrive\Desktop\Courses\MAHE Internship polysemy\Dual Branch Fake News Detection Framework versions\0.3\notebooks\visualizations\Figures
📊 length stats: {'train': {'n': 10227, 'len_mean': 17.95482546201232, 'len_p95': 33.0}, 'val': {'n': 1282, 'len_mean': 17.949297971918877, 'len_p95': 32.0}, 'test': {'n': 1276, 'len_mean': 18.052507836990596, 'len_p95': 32.0}}
