In [1]:
import re
from pathlib import Path
import pandas as pd

In [2]:
RAW_DIR = Path(r"Data/liar_dataset")           # where your train.tsv / valid.tsv / test.tsv live
OUT_DIR = Path(r"Data/liar_dataset/clean")     # cleaned CSVs will be written here
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
# LIAR has 6 classes; we'll collapse -> binary
name_to_bin = {
    "pants-fire": 0,
    "false": 0,
    "barely-true": 0,
    "half-true": 1,
    "mostly-true": 1,
    "true": 1,
}
# some LIAR dumps use numeric 0..5; map them to names first
idx_to_name = {
    0: "pants-fire",
    1: "false",
    2: "barely-true",
    3: "half-true",
    4: "mostly-true",
    5: "true",
}

In [4]:
def load_liar_tsv(path: Path) -> pd.DataFrame:
    """
    Expected LIAR columns (no header):
      0: statement ID (sometimes)
      1: label (string or int 0..5)
      2: statement text
      ... (more columns we ignore)
    We keep only columns [label, text] -> rename to ['label','text'].
    """
    # robust read
    df = pd.read_csv(path, sep="\t", header=None, quoting=3, on_bad_lines="skip", dtype=str)
    if df.shape[1] < 3:
        raise ValueError(f"{path} doesn't look like LIAR (need at least 3 columns). Got shape={df.shape}")

    # take label + text
    df = df[[1, 2]].copy()
    df.columns = ["label", "text"]

    # normalize whitespace
    df["text"] = df["text"].astype(str).apply(lambda s: re.sub(r"\s+", " ", s).strip())

    # drop empties & very short texts (<= 3 words)
    df = df.dropna(subset=["label", "text"])
    df = df[df["text"].str.split().str.len() > 3]

    # map labels to binary
    # case A: labels are numeric strings "0".."5"
    if df["label"].str.fullmatch(r"[0-5]").all():
        df["label"] = df["label"].astype(int).map(idx_to_name).map(name_to_bin)
    else:
        # case B: labels are names; lowercase + trim then map
        df["label"] = df["label"].str.strip().str.lower().map(name_to_bin)

    # drop rows with unknown labels
    df = df.dropna(subset=["label"])
    df["label"] = df["label"].astype(int)

    # deduplicate identical texts (keep first)
    df = df.drop_duplicates(subset=["text"]).reset_index(drop=True)
    return df


In [5]:
def save_split(in_file: str, out_file: str):
    src = RAW_DIR / in_file
    dst = OUT_DIR / out_file
    df = load_liar_tsv(src)
    df.to_csv(dst, index=False)
    print(f"✅ Saved {dst}  | rows={len(df)}  | label counts:\n{df['label'].value_counts().to_string()}\n")

In [6]:
if __name__ == "__main__":
    save_split("train.tsv", "liar_train.csv")
    save_split("valid.tsv", "liar_val.csv")
    save_split("test.tsv",  "liar_test.csv")
    print("🎉 Done! Clean files are in:", OUT_DIR.resolve())


FileNotFoundError: [Errno 2] No such file or directory: 'Data\\liar_dataset\\train.tsv'