In [75]:
# ==========================================
# Unified Single-Cell Code: Train & Evaluate on the Full Dataset
# ==========================================

import os
import copy
import numpy as np
import pandas as pd
import torch
from IPython.display import Markdown, display
from datasets import Dataset, load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model, TaskType
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Optional helper to display Markdown in notebooks
def md(text):
    display(Markdown(text))

# --------------------------
# 1) Configurable Parameters
# --------------------------
DATA_FILE = "Suicide_Detection.csv"
TRAIN_FRACTION = 0.8         # 80% for train, 20% for test
TRAIN_PERCENTAGE_INNER = 0.2 # Use 20% of that training portion for demonstration
num_train_epochs = 2
max_length = 256
batch_size = 2
grad_accum_steps = 2
learning_rate = 1e-4

# LoRA config
lora_rank = 16
lora_alpha = 16
lora_dropout = 0.05

base_model_path = "/Users/ehsan/.llama/checkpoints/Llama3.2-1B-hf"
output_dir = "./checkpoints/llama3_seqcls_lora"

# --------------------------
# 2) Load the CSV
# --------------------------
df = pd.read_csv(DATA_FILE)
print(f"Loaded CSV with shape: {df.shape}")

# Create a map from "row_id" -> original text, so we can later retrieve text for misclassifications
original_text_map = {}
for i, row in df.iterrows():
    # We'll assume "Unnamed: 0" is a unique ID in the CSV
    row_id = row["Unnamed: 0"]
    original_text_map[row_id] = row["text"]

# --------------------------
# 3) Create a Hugging Face Dataset
# --------------------------
dataset_dict = {
    "row_id": df["Unnamed: 0"].tolist(),
    "class": df["class"].tolist(),
    "text": df["text"].tolist()
}
dataset = Dataset.from_dict(dataset_dict)

label2id = {"non-suicide": 0, "suicide": 1}
def keep_minimal_columns(example):
    return {
        "row_id": example["row_id"],
        "labels": label2id[example["class"]],
        "text": example["text"]
    }

dataset_min = dataset.map(keep_minimal_columns)

# --------------------------
# 4) Train/Test split
# --------------------------
# We do an 80%/20% split for training/evaluation
split_dset = dataset_min.train_test_split(test_size=1 - TRAIN_FRACTION, seed=42)
train_dataset_full = split_dset["train"]  # 80% portion
eval_dataset_full  = split_dset["test"]   # 20% portion

print(f"Train set size: {len(train_dataset_full)}, Test set size: {len(eval_dataset_full)}")

# Then subselect 20% of the training set if you only want a quick demonstration:
train_size = int(len(train_dataset_full) * TRAIN_PERCENTAGE_INNER)
train_dataset = train_dataset_full.shuffle(seed=42).select(range(train_size))
eval_dataset  = eval_dataset_full

print("Final TRAIN size:", len(train_dataset))
print("Final EVAL size:", len(eval_dataset))

# --------------------------
# 5) Tokenizer & LoRA Model Setup
# --------------------------
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

model_base = AutoModelForSequenceClassification.from_pretrained(
    base_model_path,
    num_labels=2,
    device_map="auto"
)
model_base.config.pad_token_id = tokenizer.pad_token_id

# Setup LoRA
lora_config = LoraConfig(
    r=lora_rank,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    task_type=TaskType.SEQ_CLS
)
model = get_peft_model(model_base, lora_config)

# --------------------------
# 6) Tokenize the Train/Eval
# --------------------------
def tokenize_function(example):
    return tokenizer(
        example["text"],
        truncation=True,
        max_length=max_length,
        padding="max_length"
    )

train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset  = eval_dataset.map(tokenize_function,  batched=True)

# Remove leftover "text" field so the collator sees only input_ids, attention_mask, labels, row_id
train_dataset = train_dataset.remove_columns(["text"])
eval_dataset  = eval_dataset.remove_columns(["text"])

# Convert to PyTorch format
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels", "row_id"])
eval_dataset.set_format("torch",  columns=["input_ids", "attention_mask", "labels", "row_id"])

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

# --------------------------
# 7) Trainer Setup & Training
# --------------------------
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    preds = np.argmax(logits, axis=-1)
    accuracy = (preds == labels).mean()
    return {"accuracy": accuracy}

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=grad_accum_steps,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    fp16=False,   # keep False if using Apple MPS
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.save_model("./lora_suicide_model_seqcls")
print("Done training! Model + LoRA adapter saved to: ./lora_suicide_model_seqcls")

# --------------------------
# 8) Evaluate & Print Misclassifications
# --------------------------
predictions = trainer.predict(eval_dataset)
pred_probs  = predictions.predictions
pred_labels = pred_probs.argmax(-1)
true_labels = predictions.label_ids

acc = (pred_labels == true_labels).mean()
print(f"Evaluation Accuracy = {acc:.4f}")

print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, pred_labels))

print("\nClassification Report:")
print(classification_report(true_labels, pred_labels, target_names=["non-suicide","suicide"]))

# Identify misclassifications
wrong_mask   = (pred_labels != true_labels)
wrong_indices = np.where(wrong_mask)[0]  # dataset indices in eval_dataset

print("\nNumber of misclassifications:", len(wrong_indices))

max_show = 5
label_name_map = {0: "non-suicide", 1: "suicide"}

for i, ds_idx in enumerate(wrong_indices[:max_show]):
    ds_idx_py = int(ds_idx)  # Convert np.int64 -> Python int for indexing
    sample_row = eval_dataset[ds_idx_py]

    row_id = int(sample_row["row_id"])
    predicted = label_name_map[pred_labels[ds_idx_py]]
    actual = label_name_map[true_labels[ds_idx_py]]

    # Retrieve original text from your map
    original_text = original_text_map[row_id]
    snippet = original_text[:300] + ("..." if len(original_text) > 300 else "")

    print(f"\n--- Misclassified Sample {i} ---")
    print(f"Row ID: {row_id}")
    print(f"Predicted: {predicted}, Actual: {actual}")
    print("Snippet of text:", snippet)


Loaded CSV with shape: (232074, 3)


Map:   0%|          | 0/232074 [00:00<?, ? examples/s]

Train set size: 185659, Test set size: 46415
Final TRAIN size: 37131
Final EVAL size: 46415


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /Users/ehsan/.llama/checkpoints/Llama3.2-1B-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/37131 [00:00<?, ? examples/s]

Map:   0%|          | 0/46415 [00:00<?, ? examples/s]



  0%|          | 0/18566 [00:00<?, ?it/s]

{'loss': 3.1094, 'grad_norm': 106.77273559570312, 'learning_rate': 9.994613810190672e-05, 'epoch': 0.0}
{'loss': 1.6751, 'grad_norm': 71.1974105834961, 'learning_rate': 9.989227620381343e-05, 'epoch': 0.0}
{'loss': 2.0439, 'grad_norm': 54.92241287231445, 'learning_rate': 9.983841430572014e-05, 'epoch': 0.0}
{'loss': 1.3188, 'grad_norm': 111.18098449707031, 'learning_rate': 9.978455240762685e-05, 'epoch': 0.0}
{'loss': 1.0407, 'grad_norm': 46.394065856933594, 'learning_rate': 9.973069050953357e-05, 'epoch': 0.01}
{'loss': 1.0606, 'grad_norm': 26.005048751831055, 'learning_rate': 9.967682861144027e-05, 'epoch': 0.01}
{'loss': 0.8373, 'grad_norm': 70.67825317382812, 'learning_rate': 9.962296671334699e-05, 'epoch': 0.01}
{'loss': 0.6686, 'grad_norm': 84.9291763305664, 'learning_rate': 9.95691048152537e-05, 'epoch': 0.01}
{'loss': 0.7721, 'grad_norm': 5.769374370574951, 'learning_rate': 9.95152429171604e-05, 'epoch': 0.01}
{'loss': 1.4556, 'grad_norm': 141.23388671875, 'learning_rate': 9.94

  0%|          | 0/23208 [00:00<?, ?it/s]

{'eval_loss': 0.06478647142648697, 'eval_accuracy': 0.9896369708068512, 'eval_runtime': 3197.0799, 'eval_samples_per_second': 14.518, 'eval_steps_per_second': 7.259, 'epoch': 1.0}
{'loss': 0.7928, 'grad_norm': 0.0004550080338958651, 'learning_rate': 4.99622966713347e-05, 'epoch': 1.0}
{'loss': 0.0755, 'grad_norm': 1.7167056398648128e-07, 'learning_rate': 4.990843477324141e-05, 'epoch': 1.0}
{'loss': 0.0, 'grad_norm': 1.0739310027929605e-06, 'learning_rate': 4.985457287514812e-05, 'epoch': 1.0}
{'loss': 0.0, 'grad_norm': 1.7877606296679005e-05, 'learning_rate': 4.980071097705483e-05, 'epoch': 1.0}
{'loss': 0.0, 'grad_norm': 2.887624532377231e-06, 'learning_rate': 4.9746849078961547e-05, 'epoch': 1.01}
{'loss': 0.0, 'grad_norm': 1.5866982721490785e-05, 'learning_rate': 4.969298718086826e-05, 'epoch': 1.01}
{'loss': 0.0004, 'grad_norm': 7.744497487749413e-09, 'learning_rate': 4.963912528277497e-05, 'epoch': 1.01}
{'loss': 0.0, 'grad_norm': 4.528665158431977e-06, 'learning_rate': 4.9585263

  0%|          | 0/23208 [00:00<?, ?it/s]

{'eval_loss': 0.04327120631933212, 'eval_accuracy': 0.9931487665625337, 'eval_runtime': 3199.9013, 'eval_samples_per_second': 14.505, 'eval_steps_per_second': 7.253, 'epoch': 2.0}
{'train_runtime': 16410.0484, 'train_samples_per_second': 4.525, 'train_steps_per_second': 1.131, 'train_loss': 0.10475210716941649, 'epoch': 2.0}
Done training! Model + LoRA adapter saved to: ./lora_suicide_model_seqcls


  0%|          | 0/23208 [00:00<?, ?it/s]

Evaluation Accuracy = 0.9931

Confusion Matrix:
[[23134   132]
 [  186 22963]]

Classification Report:
              precision    recall  f1-score   support

 non-suicide       0.99      0.99      0.99     23266
     suicide       0.99      0.99      0.99     23149

    accuracy                           0.99     46415
   macro avg       0.99      0.99      0.99     46415
weighted avg       0.99      0.99      0.99     46415


Number of misclassifications: 318

--- Misclassified Sample 0 ---
Row ID: 346690
Predicted: suicide, Actual: non-suicide
Snippet of text: So I’m really upset at my friend right now (I hardly post here, just didn’t know where to put it. I don’t necessarily need advice, I just need to spell this out. Thanks to anyone that reads to the end) So I’m (16 M) kind of pissed at my friend (16 F) right now. Light backstory, I met her online, we’...

--- Misclassified Sample 1 ---
Row ID: 245540
Predicted: non-suicide, Actual: suicide
Snippet of text: i have to wait another 


--- Misclassified Sample 0 ---
Row ID: 1396
Predicted: non-suicide, Actual: suicide
Snippet of text: I have a good lifeseeing some of the posts here, there are so many people with actual problems in life and reasons to wanna die. I feel guilty since my life is actually pretty good yet I have suicidal thoughts daily for no reason

--- Misclassified Sample 1 ---
Row ID: 384
Predicted: non-suicide, Actual: suicide
Snippet of text: I'm ending it on new year's day.I've posted on here before and the only response was a troll comment so idk why I'm posting again. I guess I just need to organize my thoughts somehow. I've been looking for a better job for well over a year now and I can't even get anyone to look at my resume. Ive be...

--- Misclassified Sample 2 ---
Row ID: 953
Predicted: suicide, Actual: non-suicide
Snippet of text: Love you so much I wish I could W ha

--- Misclassified Sample 3 ---
Row ID: 1447
Predicted: suicide, Actual: non-suicide
Snippet of text: Today me and my girlfri

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /Users/ehsan/.llama/checkpoints/Llama3.2-1B-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: mps


Evaluating:   0%|          | 0/581 [00:00<?, ?batch/s]


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`text` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

KeyError: "Column test not in the dataset. Current columns in the dataset: ['row_id', 'labels', 'input_ids', 'attention_mask']"

In [47]:
from transformers import DataCollatorWithPadding
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"


def debug_collate_fn(features, data_collator):
    """
    Custom collate function that tries to pad/convert the batch,
    but if an error occurs, it prints the offending samples.
    """
    try:
        return data_collator(features)
    except Exception as e:
        print("Error in collate_fn. Printing batch features:")
        for i, feature in enumerate(features):
            print(f"\n--- Feature {i} ---")
            for k, v in feature.items():
                print(f"{k} => {v} (type: {type(v)})")
        # Optionally re-raise the original error to stop execution
        raise e


data_collator = DataCollatorWithPadding(tokenizer, padding="longest")


def wrapper_collate_fn(features):
    return debug_collate_fn(features, data_collator)


eval_loader = DataLoader(
    eval_dataset_small_copy,
    batch_size=16,
    shuffle=False,
    collate_fn=wrapper_collate_fn
)

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Using device:", device)
model.to(device)

all_preds = []
all_labels = []
all_indices = []

model.eval()
with torch.no_grad():
    offset = 0
    for batch in tqdm(eval_loader, desc="Evaluating", unit="batch"):
        batch_size = batch["input_ids"].size(0)

        # Map batch items to dataset indices
        batch_indices = list(range(offset, offset + batch_size))
        offset += batch_size

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # numeric (0 = non-suicide, 1 = suicide)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)

        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())
        all_indices.extend(batch_indices)

# --------------------------
# 7) Compute Metrics
# --------------------------
all_preds = np.array(all_preds)
all_labels = np.array(all_labels)
all_indices = np.array(all_indices)

acc = accuracy_score(all_labels, all_preds)
print("Accuracy on eval dataset:", acc)

print("\nConfusion Matrix:")
print(confusion_matrix(all_labels, all_preds))

print("\nClassification Report:")
print(classification_report(all_labels, all_preds,
      target_names=["non-suicide", "suicide"]))

wrong_indices = all_indices[all_preds != all_labels]
print("Number of misclassifications:", len(wrong_indices))

# --------------------------
# 8) Inspect Misclassifications
# --------------------------


def print_misclassified_samples(
    dataset,
    indices,
    all_preds,
    title="Misclassified",
    max_show=5
):
    """
    dataset: the dataset with output_all_columns=True
    indices: numeric indices into dataset
    all_preds: array of predicted classes
    """
    print(f"\n--- {title} (showing up to {max_show} examples) ---\n")
    label_name_map = {0: "non-suicide", 1: "suicide"}
    for i, idx in enumerate(indices[:max_show]):
        idx = int(idx)  # Convert numpy.int64 to native int
        # Retrieve full row (including "text", "class", etc.)
        row = dataset[idx]

        text = row["text"] if "text" in row else "N/A"
        actual_label_str = row["class"] if "class" in row else row["labels"]
        predicted_num = all_preds[idx]
        predicted_label_str = label_name_map[predicted_num]

        print(f"Index in subset: {idx}")
        print(f"Actual Label: {actual_label_str}")
        print(f"Predicted Label: {predicted_label_str}")
        snippet = text[:300] + ("..." if len(text) > 300 else "")
        print(f"Text: {snippet}\n---")


# Identify false positives vs. false negatives
all_preds = all_preds.astype(int)
all_labels = all_labels.astype(int)

false_positives = []
false_negatives = []
for i in range(len(all_preds)):
    if all_preds[i] == 1 and all_labels[i] == 0:
        false_positives.append(all_indices[i])
    elif all_preds[i] == 0 and all_labels[i] == 1:
        false_negatives.append(all_indices[i])

print(f"\nFalse positives: {len(false_positives)}")
print(f"False negatives: {len(false_negatives)}")

# Example usage to visualize some misclassifications
print_misclassified_samples(eval_dataset_small_copy,
                            false_positives, all_preds, "False Positives")
print_misclassified_samples(eval_dataset_small_copy,
                            false_negatives, all_preds, "False Negatives")

Using device: mps


Evaluating:   0%|          | 0/581 [00:00<?, ?batch/s]

Error in collate_fn. Printing batch features:

--- Feature 0 ---
labels => 0 (type: <class 'torch.Tensor'>)
input_ids => tensor([128000,     40,  47177,     11,    279,   7757,    706,   1903,   1274,
          2288,  17668,  10882,    449,   1694,  28201,   1274,     13,    358,
          1766,    279,   1207,    436,     14,   1736,    267,     13,   2876,
         82495,     11,   7000,    315,    430,     13,   4702,    653,   2534,
           292,    382,   3957,   1070,    264,   1486,   1405,   1274,   5616,
          1427,    520,   5694,    323,   2019,    330,  36661,    499,   1440,
            11,   7344,    358,  13434,    956,    656,    420,  17619,  12241,
            40,   3974,    304,    264,  28859,  69110,  30651,     13,    578,
          3823,   7102,    706,    912,  21648,  14926,     13, 128001, 128001,
        128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
        128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,




ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`text` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [44]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
!pip install --upgrade torch transformers

python(37723) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
86777.17s - pydevd: Sending message related to process being replaced timed-out after 5 seconds




In [37]:
import re


def remove_invisibles(s):
    """
    Removes/strips out some common invisible characters
    that can cause shape mismatch or dimension errors.
    Adjust the regex as needed for your specific case.
    """
    # This pattern catches zero-width spaces, left-to-right markers, etc.
    # You can expand it if you find more codepoints cause trouble.
    return re.sub(r"[\u200B-\u200F\u202A-\u202E\ufeff]+", "", s)


def debug_collate_fn(features, data_collator):
    """
    Custom collate function that tries to pad/convert the batch,
    but if an error occurs, it prints the offending samples.
    """
    try:
        return data_collator(features)
    except Exception as e:
        print("Error in collate_fn. Printing batch features:")
        for i, feat in enumerate(features):
            print(f"\n--- Feature {i} ---")
            for k, v in feat.items():
                print(f"{k} => {v} (type: {type(v)})")
        raise e


# Single-sample approach
for idx in range(len(eval_dataset_small_copy)):
    try:
        single_item = [eval_dataset_small_copy[idx]]  # one-sample 'batch'
        _ = debug_collate_fn(single_item, data_collator)
    except Exception as e:
        print(
            f"\nError occurred at dataset index {idx}.\nAttempting to fix text.\n")

        # 1) Inspect the text
        row = eval_dataset_small_copy[idx]
        text_before = row["text"]
        print("Text before fix:", repr(text_before))

        # 2) Remove any invisible/unicode characters
        cleaned_text = remove_invisibles(text_before)

        # 3) Replace the text in the dataset with the cleaned version
        eval_dataset_small_copy[idx]["text"] = cleaned_text

        # 4) Try again:
        try:
            single_item_fixed = [eval_dataset_small_copy[idx]]
            _ = debug_collate_fn(single_item_fixed, data_collator)
            print("Fix successful. Updated text for index", idx)
        except Exception as e2:
            print("Fix unsuccessful. Still erroring out:\n", e2)
            print("Skipping or removing item from dataset.")
            # Optionally remove the item from the dataset:
            # eval_dataset_small_copy = eval_dataset_small_copy.select(
            #    [i for i in range(len(eval_dataset_small_copy)) if i != idx]
            # )
            break  # or continue

Error in collate_fn. Printing batch features:

--- Feature 0 ---
labels => 0 (type: <class 'torch.Tensor'>)
input_ids => tensor([128000,     40,  47177,     11,    279,   7757,    706,   1903,   1274,
          2288,  17668,  10882,    449,   1694,  28201,   1274,     13,    358,
          1766,    279,   1207,    436,     14,   1736,    267,     13,   2876,
         82495,     11,   7000,    315,    430,     13,   4702,    653,   2534,
           292,    382,   3957,   1070,    264,   1486,   1405,   1274,   5616,
          1427,    520,   5694,    323,   2019,    330,  36661,    499,   1440,
            11,   7344,    358,  13434,    956,    656,    420,  17619,  12241,
            40,   3974,    304,    264,  28859,  69110,  30651,     13,    578,
          3823,   7102,    706,    912,  21648,  14926,     13]) (type: <class 'torch.Tensor'>)
attention_mask => tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [57]:
# Assume you have the original dataset or eval_dataset_small
# Let's call it sample = eval_dataset_small[0]
# We'll do everything by hand for that one sample

sample = eval_dataset_small[0]

print("  We'll do everything by hand for that one sample Raw sample:")
# Copy only the relevant keys
sample_minimal = {
    "input_ids": sample["input_ids"],
    "attention_mask": sample["attention_mask"],
    "labels": sample["labels"]
}

print("Minimal sample keys:", sample_minimal.keys())

  We'll do everything by hand for that one sample Raw sample:
Minimal sample keys: dict_keys(['input_ids', 'attention_mask', 'labels'])


In [58]:
from transformers import DataCollatorWithPadding
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer, padding="longest")

single_batch = [sample_minimal]

try:
    out = data_collator(single_batch)
    print("Collation success!\n", out)
except Exception as e:
    print("Collation error:\n", e)

Collation success!
 {'input_ids': tensor([[128000,     40,  47177,     11,    279,   7757,    706,   1903,   1274,
           2288,  17668,  10882,    449,   1694,  28201,   1274,     13,    358,
           1766,    279,   1207,    436,     14,   1736,    267,     13,   2876,
          82495,     11,   7000,    315,    430,     13,   4702,    653,   2534,
            292,    382,   3957,   1070,    264,   1486,   1405,   1274,   5616,
           1427,    520,   5694,    323,   2019,    330,  36661,    499,   1440,
             11,   7344,    358,  13434,    956,    656,    420,  17619,  12241,
             40,   3974,    304,    264,  28859,  69110,  30651,     13,    578,
           3823,   7102,    706,    912,  21648,  14926,     13, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 12

In [29]:
%%writefile check_text_issues.py

import pandas as pd

def check_csv_for_tokenization_issues(csv_file="Suicide_Detection.csv", text_col="text"):
    """
    Checks several potential pitfalls that cause 'ValueError: too many dimensions "str"' 
    or other tokenization issues with text data.
    1) Ensures text_col is always a string
    2) Checks for extremely long text entries
    3) Prints columns to confirm text_col exists
    4) Shows an example row
    """

    # 1) Load the CSV into a pandas DataFrame
    df = pd.read_csv(csv_file)
    print(f"Loaded {len(df)} rows from '{csv_file}'")

    # 2) Identify suspicious rows where text_col is not a str
    if text_col not in df.columns:
        print(f"ERROR: Column '{text_col}' not found in CSV. Columns are: {df.columns}")
        return
    
    # Check the column type
    mask = ~df[text_col].apply(lambda x: isinstance(x, str))
    problem_rows = df[mask]
    print(f"Number of suspicious rows (non-string '{text_col}'):", len(problem_rows))

    if len(problem_rows) > 0:
        print("\nSample suspicious rows:\n", problem_rows.head(5))
        print("Consider merging lists or removing them before tokenization.\n")
    else:
        print("No suspicious rows found. All entries in 'text' are strings.\n")

    # 3) Check text length
    df["text_length"] = df[text_col].apply(lambda x: len(x) if isinstance(x, str) else -1)
    print("Text length stats:\n", df["text_length"].describe(), "\n")

    # 4) Print columns
    print("Columns in CSV:", df.columns.tolist(), "\n")

    # 5) Show an example row (if any)
    if len(df) > 0:
        print("Example row:\n")
        print(df.iloc[0])
    else:
        print("No rows in the dataset to preview.")

if __name__ == "__main__":
    check_csv_for_tokenization_issues()

Writing check_text_issues.py
