In [1]:
# ==========================================
# Cell 1: Configuration
#
# Edit the three settings below, then run Cell 2.
# Cell 2 writes the cleaned dataset to:
#   01_raw_inbox/{PROJECT_NAME}/raw.csv
# ==========================================

# -- Project name (matches NB01 PROJECT_NAME) --
PROJECT_NAME = "cold_start"

# -- CSV file to read and clean --
# NOTE(review): with the values in this cell, this appears to be the same file
# that Cell 2 saves to (see its "Saved to" line), i.e. the source is
# overwritten in place. Keep a copy of the original elsewhere if you need it.
SOURCE_FILE = "01_raw_inbox/cold_start/raw.csv"

# -- Optional row cap --
# None keeps the full dataset; an integer smaller than the cleaned row count
# makes Cell 2 draw a random sample of that many rows.
SAMPLE_N = None

In [2]:
# ==========================================
# Cell 2: Load, Clean & Save
#
# Steps (in execution order):
#   1. Read source CSV
#   2. Normalize column names (lowercase, strip whitespace)
#   3. Drop pandas index artifacts (Unnamed: columns)
#   4. Auto-detect text column from known aliases and rename it to "text"
#   5. Normalize the sentiment column (lowercase, strip whitespace), if present
#   6. Strip whitespace from all string columns
#   7. Drop rows with empty/NaN text
#   8. Deduplicate on text (keep last)
#   9. Optional: random sample
#  10. Save to 01_raw_inbox/{PROJECT_NAME}/raw.csv
# ==========================================

import os
import pandas as pd
from config import DIRS

# -- Accepted text column aliases (DATA_SPEC.md §1) --
# Order matters: the first alias found in the file becomes the "text" column.
TEXT_ALIASES = ["text", "content", "body", "comment", "review", "tweet"]


def _strip_if_str(value):
    """Return ``value`` without surrounding whitespace when it is a ``str``.

    Any other value (NaN/None, numbers, ...) is returned unchanged, so missing
    data stays missing instead of being turned into the literal text "nan".
    """
    return value.strip() if isinstance(value, str) else value


def _lower_if_str(value):
    """Return ``value`` lower-cased when it is a ``str``; otherwise unchanged."""
    return value.lower() if isinstance(value, str) else value


# -- Load --
df = pd.read_csv(SOURCE_FILE)
print(f"Loaded: {SOURCE_FILE}")
print(f"  Shape: {df.shape}")
print(f"  Columns: {list(df.columns)}")

# -- Normalize column names --
df.columns = df.columns.str.lower().str.strip()

# -- Drop Unnamed index artifacts --
# pandas names header-less columns "Unnamed: N"; matching on the colon avoids
# dropping a legitimate column whose name merely starts with "unnamed".
unnamed_cols = [c for c in df.columns if c.startswith("unnamed:")]
if unnamed_cols:
    df = df.drop(columns=unnamed_cols)
    print(f"  Dropped index artifacts: {unnamed_cols}")

# -- Auto-detect and rename text column --
text_col = next((alias for alias in TEXT_ALIASES if alias in df.columns), None)

if text_col is None:
    raise ValueError(
        f"No text column found. Expected one of {TEXT_ALIASES}, "
        f"got: {list(df.columns)}"
    )

if text_col != "text":
    df = df.rename(columns={text_col: "text"})
    print(f"  Renamed '{text_col}' -> 'text'")

# -- Normalize sentiment column if present --
# Only real strings are touched; missing labels stay missing rather than
# becoming a bogus "nan" class in the saved file.
if "sentiment" in df.columns:
    df["sentiment"] = df["sentiment"].map(_lower_if_str).map(_strip_if_str)

# -- Strip whitespace from all string columns --
# Element-wise and type-aware: no astype(str), so NaN and non-string values
# in mixed columns are preserved as they were read.
for col in df.select_dtypes(include=["object", "string"]).columns:
    df[col] = df[col].map(_strip_if_str)

# -- Drop empty/NaN text --
before = len(df)
# Compare on a string view so a non-string text column (e.g. all numbers)
# does not raise. A literal "nan" is still dropped, as before: pandas' default
# NA parsing would read it back as missing when the saved CSV is re-loaded.
text_as_str = df["text"].astype(str)
has_text = df["text"].notna() & text_as_str.ne("") & text_as_str.ne("nan")
df = df.loc[has_text].copy()
dropped_empty = before - len(df)

# -- Deduplicate on text (keep last) --
before = len(df)
df = df.drop_duplicates(subset=["text"], keep="last")
dropped_dupes = before - len(df)

# -- Optional: random sample --
if SAMPLE_N is not None and SAMPLE_N < len(df):
    df = df.sample(n=SAMPLE_N, random_state=42)
    print(f"  Sampled {SAMPLE_N} rows (random_state=42)")

# -- Save --
output_dir = f"{DIRS['raw']}/{PROJECT_NAME}"
os.makedirs(output_dir, exist_ok=True)
output_path = f"{output_dir}/raw.csv"
if os.path.abspath(output_path) == os.path.abspath(SOURCE_FILE):
    # The data is already in memory, so this is safe to do, but the original
    # file contents are lost once the cleaned version is written.
    print("  Note: output path is the source file; it will be overwritten")
df.to_csv(output_path, index=False)

print("\n--- Cleaning Summary ---")
print(f"  Empty text dropped:  {dropped_empty}")
print(f"  Duplicates dropped:  {dropped_dupes}")
print(f"  Final rows:          {len(df)}")
print(f"  Final columns:       {list(df.columns)}")
print(f"  Saved to:            {output_path}")

Loaded: 01_raw_inbox/cold_start/raw.csv
  Shape: (1000, 5)
  Columns: ['Unnamed: 0.1', 'Unnamed: 0', 'id', 'sentiment', 'text']
  Dropped index artifacts: ['unnamed: 0.1', 'unnamed: 0']

--- Cleaning Summary ---
  Empty text dropped:  10
  Duplicates dropped:  8
  Final rows:          982
  Final columns:       ['id', 'sentiment', 'text']
  Saved to:            ./01_raw_inbox/cold_start/raw.csv
