In [1]:
# === ONE-CELL IMDB SENTIMENT TRAINING + INFERENCE (Kaggle-friendly, offline-ready) ===
# - Auto-finds IMDB CSV under /kaggle/input
# - Fine-tunes a local HuggingFace model if available; else falls back to sklearn baseline
# - Runs a pipeline on 10 test samples (readable labels: 'negative' / 'positive')

import os
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"] = "1"

from pathlib import Path
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# ---------------- IMDB CSV auto-discovery ----------------
def find_imdb_csv():
    base = Path("/kaggle/input")
    # Try canonical name first
    hits = list(base.rglob("IMDB Dataset.csv"))
    if hits:
        hits.sort(key=lambda p: -p.stat().st_size)
        return hits[0]
    # Fallback to imdb-ish CSVs
    patterns = ["*IMDB*Dataset*.csv", "*imdb*/*.csv", "*IMDB*/*.csv", "*imdb*.csv"]
    for pat in patterns:
        hits = list(base.rglob(pat))
        if hits:
            hits.sort(key=lambda p: -p.stat().st_size)
            return hits[0]
    return None

csv_path = find_imdb_csv()
if not csv_path:
    # List what IS attached so the user can see why discovery failed.
    all_csvs = list(Path("/kaggle/input").rglob("*.csv"))
    raise FileNotFoundError(
        "IMDB CSV not found under /kaggle/input.\n"
        "Attached CSVs:\n  " + "\n  ".join(map(str, all_csvs[:50]))
    )

print("Loading:", csv_path)
df = pd.read_csv(csv_path)

# Expect 'review' and 'sentiment' columns.
# Everything downstream reads df["review"], so fail here with a clear message
# instead of a KeyError in the middle of training.
if "review" not in df.columns:
    raise ValueError("Expected a 'review' column with the review text.")
if "label" not in df.columns:
    if "sentiment" not in df.columns:
        raise ValueError("Expected a 'sentiment' column with 'positive'/'negative'.")
    # Normalise case/whitespace so e.g. "Positive " still maps; anything else
    # would become NaN and break the stratified splits below, so reject it.
    normalized_sentiment = df["sentiment"].astype(str).str.strip().str.lower()
    df["label"] = normalized_sentiment.map({"negative": 0, "positive": 1})
    unmapped = df["label"].isna()
    if unmapped.any():
        bad_values = sorted(df.loc[unmapped, "sentiment"].astype(str).unique())[:10]
        raise ValueError(
            f"Unexpected 'sentiment' values (expected 'positive'/'negative'): {bad_values}"
        )

print("Label distribution (full):")
print(df["label"].value_counts(normalize=True).round(4))

# ---------------- Splits (matches your approach) ----------------
# 90% train+val / 10% held-out test, stratified on the label.
trainval_df, test_df = train_test_split(
    df, test_size=0.10, stratify=df["label"], random_state=42
)
# Small 10% sample of train+val for quick experiments, itself split 90/10.
subset_frac = 0.10
subset_df = trainval_df.sample(frac=subset_frac, random_state=42)
train_subset, val_subset = train_test_split(
    subset_df, test_size=0.10, stratify=subset_df["label"], random_state=42
)
print(f"Subset train size: {len(train_subset)} | val size: {len(val_subset)}")

# Full train/val (for a more serious run, like your later cell)
train_full, val_full = train_test_split(
    trainval_df, test_size=0.10, stratify=trainval_df["label"], random_state=42
)

# ---------------- Local HF model discovery (no internet) ----------------
def looks_like_hf_model_dir(d: Path) -> bool:
    """Return True when ``d`` looks like a saved HuggingFace model directory.

    A directory qualifies when it contains ``config.json``, at least one
    recognised weight file and at least one recognised tokenizer file.
    Besides the legacy ``pytorch_model.bin`` / ``tf_model.h5`` names, the
    safetensors format and sharded-checkpoint index files are accepted, as are
    BPE (``vocab.json``) and SentencePiece tokenizer files.

    Args:
        d: Candidate path; non-directories always yield False.
    """
    weight_files = (
        "pytorch_model.bin",
        "model.safetensors",
        "tf_model.h5",
        "pytorch_model.bin.index.json",   # sharded .bin checkpoint
        "model.safetensors.index.json",   # sharded safetensors checkpoint
    )
    tokenizer_files = (
        "tokenizer.json",
        "vocab.txt",
        "vocab.json",
        "spiece.model",
        "sentencepiece.bpe.model",
        "tokenizer.model",
    )
    # Cheap rejections first: most paths visited by a recursive scan are files.
    if not d.is_dir():
        return False
    if not (d / "config.json").is_file():
        return False
    has_weights = any((d / name).is_file() for name in weight_files)
    has_tok = any((d / name).is_file() for name in tokenizer_files)
    return has_weights and has_tok

def find_local_hf_model(prefer=("distilbert", "bert")) -> Path | None:
    base = Path("/kaggle/input")
    if not base.exists():
        return None
    candidates = []
    for d in base.rglob("*"):
        if looks_like_hf_model_dir(d):
            name = d.name.lower()
            score = 0
            for i, key in enumerate(prefer):
                if key in name:
                    score += (len(prefer) - i)  # earlier keys higher
            candidates.append((score, d))
    if not candidates:
        return None
    candidates.sort(key=lambda x: x[0], reverse=True)
    return candidates[0][1]

# Look for an attached HuggingFace checkpoint; None means nothing usable was found.
local_model_dir = find_local_hf_model()
print("Local HF model:", local_model_dir or "NOT FOUND (will fallback to sklearn if needed)")

# ---------------- HF training utilities ----------------
# Use the GPU when torch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

def compute_metrics(eval_pred):
    """Score predictions for the HuggingFace ``Trainer``.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is the
    argmax over the last logits axis.

    Returns:
        Dict with ``accuracy`` plus macro-averaged ``precision``, ``recall``
        and ``f1`` (undefined ratios count as 0 via ``zero_division=0``).
    """
    raw_scores, gold = eval_pred
    predicted = np.argmax(raw_scores, axis=-1)
    macro_p, macro_r, macro_f1, _support = precision_recall_fscore_support(
        gold, predicted, average="macro", zero_division=0
    )
    return {
        "accuracy": accuracy_score(gold, predicted),
        "precision": macro_p,
        "recall": macro_r,
        "f1": macro_f1,
    }

class HFTextDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Texts are coerced to ``str`` and labels copied into a list at construction.
    Each item is the tokenizer output (truncated and padded to ``max_len``)
    with the leading batch axis squeezed away, plus a ``labels`` long tensor.
    """

    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = [str(text) for text in texts]
        self.labels = list(labels)
        self.tok = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tok(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        # The tokenizer returns tensors with a batch axis of size 1; drop it so
        # the default collate function can stack samples into a batch.
        sample = {}
        for key, tensor in encoded.items():
            sample[key] = tensor.squeeze(0)
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

def prepare_datasets(train_df, val_df, tokenizer, text_col="review", label_col="label", max_len=256):
    """Wrap the train and validation frames as ``HFTextDataset`` objects.

    Both frames are read through the same ``text_col`` / ``label_col`` columns
    and share the tokenizer and ``max_len``.

    Returns:
        ``(train_dataset, val_dataset)``.
    """
    def build(frame):
        return HFTextDataset(frame[text_col], frame[label_col], tokenizer, max_len)

    return build(train_df), build(val_df)

# ---------------- Try HF offline fine-tuning; else sklearn baseline ----------------
# Any failure inside the try block (transformers missing, model not cached,
# training or inference error) is reported and leaves did_hf False, so the
# sklearn fallback further down runs instead.
did_hf = False
try:
    import inspect

    from transformers import (
        AutoTokenizer, AutoModelForSequenceClassification,
        Trainer, TrainingArguments, pipeline
    )
    if local_model_dir is not None:
        local_only = True
        model_name = str(local_model_dir)
    else:
        # If you KNOW you have internet, you can set local_only=False and a model name, e.g. 'distilbert-base-uncased'
        local_only = True  # keep offline-safe by default
        model_name = "distilbert-base-uncased"  # will fail offline if not cached

    print("Loading tokenizer/model from:", model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name, local_files_only=local_only)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=2, local_files_only=local_only
    ).to(device)

    # ensure readable labels in model config
    model.config.id2label = {0: "negative", 1: "positive"}
    model.config.label2id = {"negative": 0, "positive": 1}

    # datasets (use the fuller split for meaningful training, adjust epochs as needed)
    train_ds_full, val_ds_full = prepare_datasets(train_full, val_full, tokenizer, max_len=256)

    # transformers renamed TrainingArguments(evaluation_strategy=...) to
    # eval_strategy and Trainer(tokenizer=...) to processing_class. Use whichever
    # name the installed version accepts; otherwise the resulting TypeError would
    # be swallowed below and the run would silently drop to the sklearn baseline.
    training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
    eval_strategy_key = (
        "eval_strategy" if "eval_strategy" in training_arg_names else "evaluation_strategy"
    )
    trainer_arg_names = inspect.signature(Trainer.__init__).parameters
    tokenizer_key = "processing_class" if "processing_class" in trainer_arg_names else "tokenizer"

    training_args = TrainingArguments(
        output_dir=f'/kaggle/working/results/{Path(model_name).name}_full',
        save_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_steps=100,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        save_total_limit=2,
        report_to=[],
        **{eval_strategy_key: "epoch"},
    )

    trainer_full = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds_full,
        eval_dataset=val_ds_full,
        compute_metrics=compute_metrics,
        **{tokenizer_key: tokenizer},
    )

    trainer_full.train()
    metrics = trainer_full.evaluate()
    print("HF eval:", metrics)
    trainer_full.save_model()
    print("Model saved to:", training_args.output_dir)

    # -------- Inference with pipeline on 10 samples --------
    device_idx = 0 if torch.cuda.is_available() else -1
    clf_pipeline = pipeline(
        task="text-classification",
        model=trainer_full.model,
        tokenizer=tokenizer,
        device=device_idx,
        batch_size=32,
        truncation=True,
    )

    test_samples = test_df.sample(10, random_state=42)
    reviews = test_samples["review"].tolist()
    true_labels = test_samples["label"].tolist()

    preds = clf_pipeline(reviews)

    def to_int_label(p):
        """Map a pipeline prediction dict to 0/1 via the model's label2id.

        Falls back to the trailing digit for generic names such as 'LABEL_1'.
        """
        lbl = p["label"].lower()
        if lbl in model.config.label2id:
            return model.config.label2id[lbl]
        return 1 if lbl.endswith("1") else 0

    correct = 0
    for i, (review, p, y) in enumerate(zip(reviews, preds, true_labels), 1):
        pred_lbl = "positive" if to_int_label(p) == 1 else "negative"
        true_lbl = "positive" if y == 1 else "negative"
        correct += int(pred_lbl == true_lbl)
        print(f"\nReview #{i}:")
        # Show at most the first 300 characters of each review.
        print(review[:300] + ("..." if len(review) > 300 else ""))
        print(f"Predicted: {pred_lbl} (score={p['score']:.4f}), True: {true_lbl}")

    acc = correct / len(true_labels)
    print(f"\nPipeline sample accuracy: {acc:.2%}")

    # Only reached when training AND the sample inference both succeeded.
    did_hf = True

except Exception as e:
    print("HF route unavailable. Falling back to sklearn baseline.\nReason:", repr(e))

if not did_hf:
    # --------- Sklearn Fallback (always works offline) ---------
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline

    # Unigram+bigram TF-IDF features feeding a logistic-regression classifier.
    tfidf_step = ("tfidf", TfidfVectorizer(ngram_range=(1,2), max_features=50000))
    clf_step = ("clf", LogisticRegression(max_iter=300))
    pipe = Pipeline(steps=[tfidf_step, clf_step])

    # Fit on the full training split, score on the full validation split.
    pipe.fit(train_full["review"], train_full["label"])
    pred = pipe.predict(val_full["review"])

    val_truth = val_full["label"]
    acc = accuracy_score(val_truth, pred)
    prec, rec, f1, _ = precision_recall_fscore_support(
        val_truth, pred, zero_division=0, average="macro"
    )
    print(f"Sklearn baseline — acc: {acc:.4f}, prec: {prec:.4f}, rec: {rec:.4f}, f1: {f1:.4f}")

# Persist the splits so they can be inspected or downloaded after the run.
out_dir = Path("/kaggle/working")
out_dir.mkdir(parents=True, exist_ok=True)
for split_file, split_frame in (
    ("train_subset.csv", train_subset),
    ("val_subset.csv", val_subset),
    ("test_full_10pct.csv", test_df),
):
    split_frame.to_csv(out_dir / split_file, index=False)
print("Saved splits to /kaggle/working/")


Loading: /kaggle/input/imdb-dataset-csv/IMDB Dataset.csv
Label distribution (full):
label
1    0.5
0    0.5
Name: proportion, dtype: float64
Subset train size: 4050 | val size: 450
Local HF model: NOT FOUND (will fallback to sklearn if needed)


2025-09-07 17:05:04.393627: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757264704.610067      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757264704.685495      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading tokenizer/model from: distilbert-base-uncased
HF route unavailable. Falling back to sklearn baseline.
Reason: OSError("We couldn't connect to 'https://huggingface.co' to load the files, and couldn't find them in the cached files.\nCheckout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.")
Sklearn baseline — acc: 0.9060, prec: 0.9060, rec: 0.9060, f1: 0.9060
Saved splits to /kaggle/working/


In [2]:
# Offline-friendly IMDB training: HuggingFace (local-only) with fallback to scikit-learn.
# - Auto-finds IMDB CSV under /kaggle/input
# - Forces transformers offline (no downloads)
# - Loads a local model dir under /kaggle/input (e.g., *distilbert-base-uncased*)
# - If no local model found, runs a TF-IDF + LogisticRegression baseline

import os
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"] = "1"

from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split

# ---------------- IMDB CSV auto-discovery ----------------
def find_imdb_csv():
    base = Path("/kaggle/input")
    # Try canonical filename
    hits = list(base.rglob("IMDB Dataset.csv"))
    if hits:
        hits.sort(key=lambda p: -p.stat().st_size)
        return hits[0]
    # Fallback: anything imdb-ish
    patterns = ["*IMDB*Dataset*.csv", "*imdb*/*.csv", "*IMDB*/*.csv", "*imdb*.csv"]
    for pat in patterns:
        hits = list(base.rglob(pat))
        if hits:
            hits.sort(key=lambda p: -p.stat().st_size)
            return hits[0]
    return None

csv_path = find_imdb_csv()
if not csv_path:
    # List what IS attached so the user can see why discovery failed.
    all_csvs = list(Path("/kaggle/input").rglob("*.csv"))
    raise FileNotFoundError(
        "IMDB csv not found under /kaggle/input.\n"
        "Attached CSVs:\n  " + "\n  ".join(map(str, all_csvs[:50]))
    )
print("Loading:", csv_path)

# ---- Load & label ----
df = pd.read_csv(csv_path)
# Validate the schema up front: the code below reads 'sentiment' here and
# 'review' during training, and a bare KeyError is a poor diagnostic.
missing_cols = [col for col in ("review", "sentiment") if col not in df.columns]
if missing_cols:
    raise ValueError(f"Expected 'review' and 'sentiment' columns; missing: {missing_cols}")
# Normalise case/whitespace before mapping; any other value would turn into a
# NaN label and break the stratified split below, so reject it explicitly.
normalized_sentiment = df["sentiment"].astype(str).str.strip().str.lower()
df["label"] = normalized_sentiment.map({"negative": 0, "positive": 1})
unmapped = df["label"].isna()
if unmapped.any():
    bad_values = sorted(df.loc[unmapped, "sentiment"].astype(str).unique())[:10]
    raise ValueError(
        f"Unexpected 'sentiment' values (expected 'positive'/'negative'): {bad_values}"
    )
print("Label distribution (full):")
print(df["label"].value_counts(normalize=True).round(4))

# ---- Split like your original code ----
# 90% train+val / 10% held-out test, stratified on the label.
trainval_df, test_df = train_test_split(
    df, test_size=0.10, stratify=df["label"], random_state=42
)
# Small 10% sample of train+val, itself split 90/10 into train/val.
subset_frac = 0.10
subset_df = trainval_df.sample(frac=subset_frac, random_state=42)
train_subset, val_subset = train_test_split(
    subset_df, test_size=0.10, stratify=subset_df["label"], random_state=42
)
print(f"Subset train size: {len(train_subset)} | val size: {len(val_subset)}")

# ---------------- Local HF model discovery (no internet) ----------------
def looks_like_hf_model_dir(d: Path) -> bool:
    """Tell whether ``d`` is a directory holding a saved HuggingFace model.

    Requires ``config.json`` plus at least one weight file and one
    tokenizer-ish file. Weight detection covers the legacy
    ``pytorch_model.bin`` / ``tf_model.h5`` names as well as safetensors and
    sharded-checkpoint index files; tokenizer detection also covers BPE
    (``vocab.json``) and SentencePiece files.

    Args:
        d: Candidate path; anything that is not a directory yields False.
    """
    if not d.is_dir():
        return False

    def has_any(names):
        return any((d / name).is_file() for name in names)

    if not has_any(("config.json",)):
        return False
    has_weights = has_any((
        "pytorch_model.bin",
        "model.safetensors",
        "tf_model.h5",
        "pytorch_model.bin.index.json",   # sharded .bin checkpoint
        "model.safetensors.index.json",   # sharded safetensors checkpoint
    ))
    has_tok = has_any((
        "tokenizer.json",
        "vocab.txt",
        "vocab.json",
        "spiece.model",
        "sentencepiece.bpe.model",
        "tokenizer.model",
    ))
    return has_weights and has_tok

def find_local_hf_model(prefer=("distilbert", "bert")) -> Path | None:
    base = Path("/kaggle/input")
    if not base.exists(): 
        return None
    # Score directories that look like HF model repos
    candidates = []
    for d in base.rglob("*"):
        if d.is_dir() and looks_like_hf_model_dir(d):
            name = d.name.lower()
            score = 0
            for i, key in enumerate(prefer):
                if key in name:
                    score += (len(prefer) - i)  # prefer earlier keys
            # Prefer smaller models (distilbert) over bert if both exist
            candidates.append((score, -d.stat().st_size if hasattr(d, "stat") else 0, d))
    if not candidates:
        return None
    candidates.sort(key=lambda x: (x[0], x[1]), reverse=True)
    return candidates[0][2]

# Resolve the checkpoint once; a None result routes the run to the sklearn baseline.
local_model_dir = find_local_hf_model()
print("Local HF model:", local_model_dir or "NOT FOUND (will fallback to sklearn)")

# ---------------- Option A: HF offline fine-tune ----------------
def run_hf_offline(train_df, val_df, model_dir: Path):
    """Fine-tune a locally stored HuggingFace checkpoint for one epoch.

    Loads tokenizer and model from ``model_dir`` with ``local_files_only=True``,
    trains on ``train_df``, evaluates accuracy on ``val_df``, prints the
    metrics, and saves model + tokenizer to
    ``/kaggle/working/hf-offline-model``.

    Args:
        train_df: Frame with ``review`` (text) and ``label`` columns.
        val_df: Frame with the same columns, used for evaluation.
        model_dir: Directory containing the saved model and tokenizer.

    Returns:
        The metrics dict produced by ``trainer.evaluate()``.
    """
    import inspect

    import torch
    from torch.utils.data import Dataset
    import transformers

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_dir, local_files_only=True
    )

    class TextClsDS(Dataset):
        """Tokenizes one review per lookup, padded/truncated to ``max_len``."""

        def __init__(self, df, tokenizer, max_len=256):
            self.texts = df["review"].tolist()
            self.labels = df["label"].tolist()
            self.tok = tokenizer
            self.max_len = max_len
        def __len__(self): return len(self.texts)
        def __getitem__(self, idx):
            t = str(self.texts[idx])
            enc = self.tok(
                t, truncation=True, padding="max_length", max_length=self.max_len, return_tensors="pt"
            )
            # Drop the size-1 batch axis so samples can be stacked by the collator.
            item = {k: v.squeeze(0) for k, v in enc.items()}
            item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
            return item

    train_ds = TextClsDS(train_df, tokenizer)
    val_ds   = TextClsDS(val_df,   tokenizer)

    # Build model with classification head (num_labels=2); load base weights from dir
    model = transformers.AutoModelForSequenceClassification.from_pretrained(
        model_dir, num_labels=2, local_files_only=True
    )
    # Store readable class names in the config so the saved model reports
    # 'negative'/'positive' instead of generic label ids.
    model.config.id2label = {0: "negative", 1: "positive"}
    model.config.label2id = {"negative": 0, "positive": 1}

    # transformers renamed TrainingArguments(evaluation_strategy=...) to
    # eval_strategy and Trainer(tokenizer=...) to processing_class; use the
    # name the installed version accepts so neither call fails on a rename.
    training_arg_names = inspect.signature(transformers.TrainingArguments.__init__).parameters
    eval_strategy_key = (
        "eval_strategy" if "eval_strategy" in training_arg_names else "evaluation_strategy"
    )
    trainer_arg_names = inspect.signature(transformers.Trainer.__init__).parameters
    tokenizer_key = "processing_class" if "processing_class" in trainer_arg_names else "tokenizer"

    args = transformers.TrainingArguments(
        output_dir="/kaggle/working/hf-out",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        num_train_epochs=1,        # keep quick
        learning_rate=2e-5,
        weight_decay=0.01,
        save_strategy="no",
        logging_steps=50,
        report_to=[],
        **{eval_strategy_key: "epoch"},
    )

    trainer = transformers.Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=lambda p: {
            "accuracy": (p.predictions.argmax(axis=-1) == p.label_ids).mean().item()
        },
        **{tokenizer_key: tokenizer},
    )

    trainer.train()
    metrics = trainer.evaluate()
    print("HF offline eval:", metrics)
    # Save model if you like
    model.save_pretrained("/kaggle/working/hf-offline-model")
    tokenizer.save_pretrained("/kaggle/working/hf-offline-model")
    return metrics

# ---------------- Option B: Fallback sklearn baseline ----------------
def run_sklearn_baseline(train_df, val_df):
    """Train a TF-IDF + logistic-regression baseline and print its accuracy.

    Fits on the ``review`` / ``label`` columns of ``train_df`` and reports
    accuracy on ``val_df``, rounded to four decimals.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score
    from sklearn.pipeline import Pipeline

    # Unigram+bigram TF-IDF features, capped at 50k terms.
    vectorizer = TfidfVectorizer(max_features=50000, ngram_range=(1,2))
    classifier = LogisticRegression(max_iter=200)
    baseline = Pipeline(steps=[("tfidf", vectorizer), ("clf", classifier)])

    baseline.fit(train_df["review"], train_df["label"])
    val_predictions = baseline.predict(val_df["review"])
    val_accuracy = accuracy_score(val_df["label"], val_predictions)
    print("Sklearn baseline accuracy:", round(val_accuracy, 4))

# ---------------- Run ----------------
if local_model_dir:
    try:
        run_hf_offline(train_subset, val_subset, local_model_dir)
    except Exception as e:
        # Best-effort: any HF failure is reported and the baseline runs instead.
        print("HF offline route failed, falling back to sklearn. Error:", e)
        run_sklearn_baseline(train_subset, val_subset)
else:
    run_sklearn_baseline(train_subset, val_subset)

# Save splits for inspection
out_dir = Path("/kaggle/working")
# Create the output directory if it is missing (e.g. when run outside Kaggle)
# so the writes below do not fail after all the training work is done.
out_dir.mkdir(parents=True, exist_ok=True)
train_subset.to_csv(out_dir / "train_subset.csv", index=False)
val_subset.to_csv(out_dir / "val_subset.csv", index=False)
test_df.to_csv(out_dir / "test_full_10pct.csv", index=False)
print("Saved splits to /kaggle/working/")


Loading: /kaggle/input/imdb-dataset-csv/IMDB Dataset.csv
Label distribution (full):
label
1    0.5
0    0.5
Name: proportion, dtype: float64
Subset train size: 4050 | val size: 450
Local HF model: NOT FOUND (will fallback to sklearn)
Sklearn baseline accuracy: 0.8244
Saved splits to /kaggle/working/
