In [2]:
#!/usr/bin/env python3
"""
train_model.py
- Expects CSV with columns: text_clean and label
- Saves model pipeline to social-media-sentiment-analysis/models/model_pipeline.joblib

This variant is robust when running inside Jupyter/IPython:
- exposes train_model(...) function for programmatic usage
- uses parse_known_args() so ipykernel injected args won't break parsing
- avoids auto-running CLI parse when executed inside a notebook/IPython session (prints usage help instead)
- handles stratify fallback when a stratified split is not possible
"""
import argparse
import os
from typing import Optional, Dict, Any
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib
import sys


def train_model(
    input_csv: str,
    output_path: str = "social-media-sentiment-analysis/models/model_pipeline.joblib",
    test_size: float = 0.2,
    random_state: int = 42,
) -> Dict[str, Any]:
    """
    Train a TF-IDF + LogisticRegression pipeline on the processed CSV.

    Rows whose label is missing are dropped (a warning is written to stderr)
    instead of being trained on as a literal "nan" class, and a missing
    text_clean value is treated as an empty document.

    Args:
      input_csv: path to a CSV containing 'text_clean' and 'label' columns.
      output_path: where the fitted pipeline is written with joblib.
      test_size: passed to train_test_split as the held-out fraction.
      random_state: passed to train_test_split as the split seed.

    Returns a dict with keys:
      - report: classification report (dict)
      - model_path: path to saved model
      - n_samples: number of samples used (rows with a label)

    Raises:
      FileNotFoundError: input_csv does not exist.
      SystemExit: a required column is missing (kept as-is for existing callers).
      ValueError: fewer than two distinct labels remain, so no classifier can be fit.
    """
    if not os.path.exists(input_csv):
        raise FileNotFoundError(f"Input CSV not found: {input_csv}")

    df = pd.read_csv(input_csv, encoding="utf-8")
    if "text_clean" not in df.columns:
        raise SystemExit("Input must contain 'text_clean' column.")
    if "label" not in df.columns:
        raise SystemExit("Input must contain 'label' column for supervised training.")

    # A missing label would otherwise survive astype(str) as the string "nan"
    # and silently become its own class.
    labelled = df.dropna(subset=["label"])
    n_dropped = len(df) - len(labelled)
    if n_dropped:
        print(
            f"Warning: dropped {n_dropped} row(s) with a missing label.",
            file=sys.stderr,
        )

    # Empty cleaned text is read back from CSV as NaN; keep it as an empty
    # document rather than the token "nan".
    X = labelled["text_clean"].map(lambda t: "" if pd.isna(t) else str(t))
    y = labelled["label"].astype(str)

    class_counts = y.value_counts()
    if len(class_counts) < 2:
        raise ValueError(
            "Training requires at least two distinct labels; "
            f"found {len(class_counts)} after dropping rows with missing labels."
        )

    split_kwargs = {"test_size": test_size, "random_state": random_state}
    splits = None
    # Stratification needs at least 2 samples per class ...
    if bool((class_counts >= 2).all()):
        try:
            splits = train_test_split(X, y, stratify=y, **split_kwargs)
        except ValueError:
            # ... and can still be rejected, e.g. when the test split would hold
            # fewer samples than there are classes. Fall back below; a genuinely
            # invalid test_size will raise again from the plain split.
            splits = None

    if splits is None:
        print(
            "Warning: Stratified split not possible (a class has <2 samples or the split is "
            "too small for the number of classes). Proceeding with non-stratified split.",
            file=sys.stderr,
        )
        splits = train_test_split(X, y, **split_kwargs)

    X_train, X_test, y_train, y_test = splits

    pipeline = Pipeline(
        [
            ("tfidf", TfidfVectorizer(ngram_range=(1, 2), max_features=20000)),
            ("clf", LogisticRegression(max_iter=1000, class_weight="balanced")),
        ]
    )

    pipeline.fit(X_train, y_train)

    preds = pipeline.predict(X_test)
    report_text = classification_report(y_test, preds)
    print("Classification report on test set:")
    print(report_text)

    # Save pipeline. dirname() is "" for a bare filename, and os.makedirs("")
    # raises, so only create the directory when there is one.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    joblib.dump(pipeline, output_path)
    print("Saved model pipeline to", output_path)

    return {
        "report": classification_report(y_test, preds, output_dict=True),
        "model_path": output_path,
        "n_samples": len(labelled),
    }


def main(argv: Optional[list] = None):
    """
    Command-line entry point: parse options and delegate to train_model.

    argv is the list of argument strings to parse; when it is None argparse
    falls back to sys.argv[1:]. Unrecognised arguments are ignored so that
    extra argv entries (e.g. those injected by a Jupyter kernel) do not abort
    parsing. Returns whatever train_model returns.
    """
    cli = argparse.ArgumentParser(prog="train_model.py")
    cli.add_argument(
        "--input",
        required=True,
        help="processed csv with text_clean and label",
    )
    cli.add_argument(
        "--output",
        default="social-media-sentiment-analysis/models/model_pipeline.joblib",
        help="path to save trained model (joblib)",
    )
    cli.add_argument(
        "--test-size",
        type=float,
        default=0.2,
        help="test set fraction",
    )
    cli.add_argument(
        "--random-state",
        type=int,
        default=42,
        help="random seed for splitting",
    )

    # parse_known_args(None) reads sys.argv[1:], so one call covers both cases.
    opts, _extras = cli.parse_known_args(argv)

    return train_model(
        input_csv=opts.input,
        output_path=opts.output,
        test_size=opts.test_size,
        random_state=opts.random_state,
    )


if __name__ == "__main__":
    # Inside Jupyter/IPython the CLI is not run; a usage hint is printed instead.
    # Any failure while probing for IPython is treated as "not interactive".
    try:
        from IPython import get_ipython  # type: ignore

        running_interactively = get_ipython() is not None
    except Exception:
        running_interactively = False

    if not running_interactively:
        main()
    else:
        print(
            "".join(
                (
                    "Detected IPython/Jupyter environment. The CLI entrypoint will not run automatically here.\n",
                    "To train in a notebook, either:\n",
                    "  from train_model import train_model\n",
                    "  train_model('data/processed/tweets_clean.csv', output_path='models/model_pipeline.joblib')\n\n",
                    "Or run from a terminal:\n",
                    "  python train_model.py --input data/processed/tweets_clean.csv --output social-media-sentiment-analysis/models/model_pipeline.joblib\n",
                )
            )
        )

Detected IPython/Jupyter environment. The CLI entrypoint will not run automatically here.
To train in a notebook, either:
  from train_model import train_model
  train_model('data/processed/tweets_clean.csv', output_path='models/model_pipeline.joblib')

Or run from a terminal:
  python train_model.py --input data/processed/tweets_clean.csv --output social-media-sentiment-analysis/models/model_pipeline.joblib



In [3]:
# Quick training script for local testing
# - Reads social-media-sentiment-analysis/data/raw/tweets_scraped.csv
# - If not present, instructs to run create_sample_data.py first
# - Trains TF-IDF + LogisticRegression and saves pipeline to:
#     social-media-sentiment-analysis/models/model_pipeline.joblib
# Usage (from repo root, with venv active):
#   python .\social-media-sentiment-analysis\train_quick.py

import os
import sys
from pathlib import Path

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib

# Project-relative locations used by the quick-training cell.
ROOT = Path("social-media-sentiment-analysis")
# Raw input CSV read by main().
RAW = ROOT.joinpath("data", "raw", "tweets_scraped.csv")
# Location for the cleaned CSV; not referenced by the code visible in this cell.
PROCESSED = ROOT.joinpath("data", "processed", "tweets_clean.csv")
# Destination of the fitted pipeline.
MODEL_OUT = ROOT.joinpath("models", "model_pipeline.joblib")

def simple_preprocess_text(s):
    """
    Normalise one piece of raw text for vectorisation.

    Non-string input yields "". Otherwise the text is lower-cased, then in
    order: URLs starting with "http" are removed, @mentions are removed, "#"
    characters are removed (the tag word stays), every character other than
    a-z, 0-9, whitespace or an apostrophe becomes a space, and whitespace runs
    collapse to single spaces with the ends trimmed.
    """
    import re

    if isinstance(s, str):
        # (pattern, replacement) pairs, applied in this order.
        substitutions = (
            (r"http\S+", ""),
            (r"@\w+", ""),
            (r"#", ""),
            (r"[^a-z0-9\s']", " "),
            (r"\s+", " "),
        )
        cleaned = s.lower()
        for pattern, replacement in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned)
        return cleaned.strip()
    return ""

def main():
    """
    Quick local training run: load RAW, clean the text, fit and save a pipeline.

    Reads the CSV at RAW (columns 'text' and 'label'), derives 'text_clean' with
    simple_preprocess_text, fits a TF-IDF + LogisticRegression pipeline on an
    80/20 split, prints a classification report and writes the pipeline to
    MODEL_OUT. Exits with status 1 (via sys.exit) when the input file or
    required columns are missing, or when fewer than two distinct labels remain.
    """
    if not RAW.exists():
        print(f"Input CSV not found: {RAW}")
        print("Run create_sample_data.py first or place a CSV at the path above.")
        sys.exit(1)

    print("Loading raw data:", RAW)
    df = pd.read_csv(RAW, encoding="utf-8")
    if "text" not in df.columns or "label" not in df.columns:
        print("CSV must have columns: text,label")
        sys.exit(1)

    # A missing label would otherwise survive astype(str) as the string "nan"
    # and be trained on as its own class.
    n_rows = len(df)
    df = df.dropna(subset=["label"]).copy()
    if len(df) < n_rows:
        print(
            f"Warning: dropped {n_rows - len(df)} row(s) with a missing label.",
            file=sys.stderr,
        )

    # Map missing text to "" before cleaning; a blanket astype(str) would turn
    # NaN into the word "nan" and bypass the helper's non-string guard.
    df["text_clean"] = (
        df["text"]
        .map(lambda t: "" if pd.isna(t) else str(t))
        .apply(simple_preprocess_text)
    )

    X = df["text_clean"]
    y = df["label"].astype(str)

    class_counts = y.value_counts()
    if len(class_counts) < 2:
        print("Need at least two distinct labels to train.")
        sys.exit(1)

    # small train/test split; stratify when possible, otherwise fall back
    split_kwargs = {"test_size": 0.2, "random_state": 42}
    splits = None
    if bool((class_counts >= 2).all()):
        try:
            splits = train_test_split(X, y, stratify=y, **split_kwargs)
        except ValueError:
            # e.g. the test split would hold fewer samples than there are classes
            splits = None
    if splits is None:
        print(
            "Warning: Stratified split not possible; proceeding with non-stratified split.",
            file=sys.stderr,
        )
        splits = train_test_split(X, y, **split_kwargs)
    X_train, X_test, y_train, y_test = splits

    pipeline = Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(1, 2), max_features=10000)),
        ("clf", LogisticRegression(max_iter=1000, class_weight="balanced"))
    ])

    print("Training pipeline...")
    pipeline.fit(X_train, y_train)

    print("Evaluating on test set...")
    preds = pipeline.predict(X_test)
    # NOTE(review): a perfect score here may only reflect near-duplicate rows
    # shared between train and test — confirm before trusting the metric.
    print(classification_report(y_test, preds))

    os.makedirs(MODEL_OUT.parent, exist_ok=True)
    joblib.dump(pipeline, MODEL_OUT)
    print("Saved model pipeline to:", MODEL_OUT)

# Run the quick-training flow when this code is executed directly (script or
# notebook cell); nothing runs on import.
if __name__ == "__main__":
    main()

Loading raw data: social-media-sentiment-analysis\data\raw\tweets_scraped.csv
Training pipeline...
Evaluating on test set...
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00        40
     neutral       1.00      1.00      1.00        40
    positive       1.00      1.00      1.00        40

    accuracy                           1.00       120
   macro avg       1.00      1.00      1.00       120
weighted avg       1.00      1.00      1.00       120

Saved model pipeline to: social-media-sentiment-analysis\models\model_pipeline.joblib
