In [13]:
!pip install numpy pandas nltk scikit-learn xgboost matplotlib tqdm joblib torch openpyxl transformers datasets accelerate -q

Python(48245) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [14]:

import os
import math
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

from transformers import BertTokenizer, BertForSequenceClassification


In [None]:
# Input / output locations (relative paths)
FILE_PATH = "../data/suhi_data.xlsx"                      # source workbook read by the loaders
OUTPUT_CSV = "../data/processed_text_dataframe.csv"       # processed text/label table
BERT_CLF_DIRECTORY = "../local_uploads/bert_textclf_ckpt" # Trainer checkpoint directory

# Training constants
TEST_SPLIT_RATIO, N_COMPONENT = 0.2, 50

# Data constants: the values each filter column is compared against.
# The misspelled names (FETAURE_SUBSET, SUMMERIZED) are kept as-is because
# load_and_process_with_only_filter refers to them by these exact names.
ENGAGED = 1
SUMMERIZED = 0
PATIENT_SUBSET = VECTORIZE_SUBSET = FETAURE_SUBSET = 2


In [None]:
def load_and_process():
    """Load the source workbook and return a clean two-column text/label frame.

    Steps:
      1. Read FILE_PATH with pandas and keep rows where ``engaged == ENGAGED``.
      2. Keep ``COMBINED_NOTES`` / ``day_readmit`` and rename them to
         ``text`` / ``label``.
      3. Keep only rows whose label is exactly 0 or 1, then cast it to int.
      4. Coerce text to ``str`` and drop empty / whitespace-only rows.

    Prints the first three rows as a quick sanity check.

    Returns:
        pandas.DataFrame: columns ``text`` (str) and ``label`` (int, 0/1)
        with a fresh 0..n-1 index.
    """
    # Load & filter source rows
    source = pd.read_excel(FILE_PATH)
    source = source[source['engaged'] == ENGAGED].reset_index(drop=True)

    # Keep only the two columns and rename
    needed = source[['COMBINED_NOTES', 'day_readmit']].copy()
    needed = needed.rename(columns={'COMBINED_NOTES': 'text', 'day_readmit': 'label'})

    # Clean label: validate BEFORE casting. Casting first would truncate
    # fractional values (0.5 -> 0) and silently let them through the 0/1 filter;
    # non-numeric entries are coerced to NaN here instead of raising.
    needed['label'] = pd.to_numeric(needed['label'], errors='coerce')
    needed = needed[needed['label'].isin([0, 1])].copy()  # NaN never matches isin
    needed['label'] = needed['label'].astype(int)

    # Clean text: coerce to string, drop empty/whitespace-only
    needed['text'] = needed['text'].fillna('').astype(str)
    needed = needed[needed['text'].str.strip() != ''].reset_index(drop=True)

    # NOTE(review): this echoes raw note text into the notebook output;
    # clear the output before sharing if the notes are sensitive.
    print(needed.head(3))
    return needed



def load_and_process_with_only_filter():
    """Load the source workbook and keep only rows that pass every subset filter.

    Reads FILE_PATH with pandas and keeps the rows where all of these hold:
    ``engaged == ENGAGED``, ``PATIENT_SUBSET == PATIENT_SUBSET``,
    ``vector_subset == VECTORIZE_SUBSET``, ``FETAURE_SUBSET == FETAURE_SUBSET``
    and ``SUMMERIZED == SUMMERIZED`` (column name on the left, module constant
    on the right).

    Returns:
        pandas.DataFrame: the matching rows, all original columns, with a
        fresh 0..n-1 index.
    """
    # Load & filter source rows
    df = pd.read_excel(FILE_PATH)

    # Combine the conditions with element-wise `&`, each comparison in its own
    # parentheses. Python's `and` asks for the truth value of a whole Series
    # and raises "ValueError: The truth value of a Series is ambiguous".
    # NOTE(review): column names are used exactly as originally written
    # (mixed case, including the 'FETAURE'/'SUMMERIZED' spellings) - confirm
    # they match the workbook headers.
    keep = (
        (df['engaged'] == ENGAGED)
        & (df['PATIENT_SUBSET'] == PATIENT_SUBSET)
        & (df['vector_subset'] == VECTORIZE_SUBSET)
        & (df['FETAURE_SUBSET'] == FETAURE_SUBSET)
        & (df['SUMMERIZED'] == SUMMERIZED)
    )
    return df[keep].reset_index(drop=True)


In [None]:
def save_new_data_text_dataframe(dataframe):
    """Write ``dataframe`` to OUTPUT_CSV as CSV, omitting the index column."""
    csv_options = {"index": False}
    dataframe.to_csv(OUTPUT_CSV, **csv_options)

def save_new_data_text_dataframe_with_most_rows(df):
    """Write ``df`` to "with_all.csv" in the current working directory, omitting the index."""
    destination = "with_all.csv"
    df.to_csv(destination, index=False)

# Build the cleaned text/label frame and persist it to "with_all.csv".
# NOTE(review): a later cell reloads `df` from OUTPUT_CSV, but nothing in the
# visible notebook writes that file (save_new_data_text_dataframe is defined
# and never called) - confirm OUTPUT_CSV exists and is current before running
# the notebook from a fresh kernel.
df = load_and_process()
save_new_data_text_dataframe_with_most_rows(df)

                                                text  label
0                 sdoh emotional support closed case      0
1  looking for housing in the same and needs acce...      1
2  sdoh/ emotional support emotiional support/ ca...      0


In [30]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import torch
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, set_seed)
from datasets import Dataset, DatasetDict


In [31]:
# Replace the in-memory `df` with the table stored at OUTPUT_CSV.
# NOTE(review): no visible cell writes OUTPUT_CSV in this notebook - verify the
# file is present and up to date, otherwise this fails or loads stale data.
df = pd.read_csv(OUTPUT_CSV)  # if loading from disk


In [32]:
# 1) Hyperparameters
MODEL  = "bert-base-uncased"   # checkpoint name for tokenizer and model
MAXLEN = 128                   # max_length used when tokenizing
BATCH  = 16                    # per-device train/eval batch size
EPOCHS = 100                   # as requested
LR     = 1e-3                  # as requested (note: very high for BERT)
SEED   = 5

# Seed first, then ask cuDNN for deterministic behaviour (no auto-tuned kernels)
set_seed(SEED)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [33]:
# 2) Clean & validate your two columns
required_columns = {'text', 'label'}
assert required_columns.issubset(df.columns), "df must have 'text' and 'label'"

# Work on an independent copy that holds just those two columns
df = df.loc[:, ['text', 'label']].copy()

In [34]:
# drop rows with NaN/empty text
# Missing values must be dropped BEFORE the string cast: astype(str) turns NaN
# into the literal text 'nan', which would pass the empty-string filter and end
# up in the training data (and would make a later dropna a no-op).
df = df.dropna(subset=['text']).copy()
df['text'] = df['text'].astype(str).str.strip()
# reset_index returns a new frame, so later column assignments don't write into a slice
df = df[df['text'] != ''].reset_index(drop=True)


In [35]:
# coerce label to int and restrict to {0,1}
df['label'] = df['label'].astype(int)

# Any value outside {0, 1} is a data error; fail loudly here
unexpected_labels = set(df['label'].unique()) - {0, 1}
assert not unexpected_labels, "label must be 0/1 only"

In [36]:
# 3) Split train / val / test: 70 / 10 / 20 (stratified)
def _stratified_split(frame, holdout_fraction):
    """Split `frame` into (rest, holdout), stratified on 'label' and seeded with SEED."""
    return train_test_split(
        frame,
        test_size=holdout_fraction,
        random_state=SEED,
        stratify=frame['label'],
    )


# Carve off the 20% test set first
train_val_df, test_df = _stratified_split(df, 0.20)

# Validation is sized against the FULL dataset, so express it as a fraction of
# the rows that remain after the test split
val_ratio = (0.10 * len(df)) / max(1, len(train_val_df))  # ~10% overall
train_df, val_df = _stratified_split(train_val_df, val_ratio)

In [37]:
print(f"Train={len(train_df)}, Val={len(val_df)}, Test={len(test_df)}")

# --------------------------
# 4) Convert to HF datasets
# Each split is re-indexed from 0 before conversion
split_frames = {"train": train_df, "validation": val_df, "test": test_df}
hf_splits = {
    split_name: Dataset.from_pandas(frame.reset_index(drop=True))
    for split_name, frame in split_frames.items()
}
train_ds, val_ds, test_ds = (hf_splits[key] for key in ("train", "validation", "test"))
raw = DatasetDict(hf_splits)

Train=541, Val=78, Test=155


In [38]:
# 5) Tokenizer & tokenization
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)


def tokenize_fn(batch):
    """Tokenize the batch's "text" entries, truncated/padded to MAXLEN tokens."""
    encode_options = {"truncation": True, "padding": "max_length", "max_length": MAXLEN}
    return tokenizer(batch["text"], **encode_options)


# While mapping, drop every column except the raw text and its label
kept_columns = ("text", "label")
extra_columns = [name for name in raw["train"].column_names if name not in kept_columns]
tokenized = (
    raw.map(tokenize_fn, batched=True, remove_columns=extra_columns)
       .rename_column("label", "labels")
)

# Expose only the tensors listed here, as torch tensors
tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map: 100%|██████████| 541/541 [00:00<00:00, 2939.41 examples/s]
Map: 100%|██████████| 78/78 [00:00<00:00, 3471.33 examples/s]
Map: 100%|██████████| 155/155 [00:00<00:00, 4190.63 examples/s]


In [39]:
# 6) Model
# Load the MODEL checkpoint with a 2-way classification head (labels are 0/1).
# The head weights (classifier.weight / classifier.bias) are not in the
# checkpoint and are freshly initialised, so the model must be trained before use.
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
# 7) Metrics (Accuracy + ROC-AUC)
def compute_metrics(eval_pred):
    """Compute accuracy and ROC-AUC from an ``(logits, labels)`` pair.

    Logits are converted to class probabilities with a softmax over axis 1;
    the predicted class is the arg-max, and the AUC score uses column 1
    (the positive class).

    Returns:
        dict: ``{"accuracy": ..., "roc_auc": ...}``; ``roc_auc`` is NaN when
        the labels contain fewer than two distinct classes (AUC is undefined).
    """
    logits, labels = eval_pred
    class_probs = torch.tensor(logits).softmax(dim=1).numpy()   # (N,2)
    predicted = class_probs.argmax(axis=1)

    metrics = {"accuracy": accuracy_score(labels, predicted)}

    # AUC needs both classes present; otherwise report NaN instead of raising
    has_both_classes = len(np.unique(labels)) >= 2
    if has_both_classes:
        metrics["roc_auc"] = roc_auc_score(labels, class_probs[:, 1])
    else:
        metrics["roc_auc"] = float("nan")
    return metrics

In [45]:
# 8) Training args (AdamW under the hood; learning rate comes from LR = 0.001)
args = TrainingArguments(
    output_dir=BERT_CLF_DIRECTORY,
    per_device_train_batch_size=BATCH,
    per_device_eval_batch_size=BATCH,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    weight_decay=0.0,                 # behave like Adam (no decay)
    evaluation_strategy="epoch",      # evaluate on the validation split every epoch
    save_strategy="epoch",            # must match the evaluation cadence for best-model tracking
    # A checkpoint is written every epoch; without a limit EPOCHS full
    # checkpoints pile up in output_dir. The best checkpoint is still retained
    # because load_best_model_at_end is enabled.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="roc_auc",  # key returned by compute_metrics
    greater_is_better=True,
    logging_strategy="steps",
    logging_steps=50,
    seed=SEED,
    report_to="none",                 # no external experiment trackers
    fp16=torch.cuda.is_available(),   # mixed precision only when a CUDA GPU is present
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [46]:
# 9) Train
# Long-running: the recorded run was at roughly 96 s per step out of 3400 steps
# when it was interrupted, so the cells below only have a usable model once
# this call is allowed to finish.
trainer.train()
# Path of the checkpoint that scored best on metric_for_best_model
print("Best checkpoint:", trainer.state.best_model_checkpoint)


  1%|          | 24/3400 [25:20<89:51:35, 95.82s/it]  

KeyboardInterrupt: 

In [None]:
# 10) Evaluate (Val & Test)
# Run the same evaluation on both held-out splits, validation first
val_metrics, test_metrics = (
    trainer.evaluate(tokenized[split_name]) for split_name in ("validation", "test")
)
print("\nValidation:", val_metrics)
print("Test:", test_metrics)

In [None]:
# 11) Save model & tokenizer
# Both artifacts go into the same directory
best_model_dir = "bert_textclf_best"
trainer.save_model(best_model_dir)
tokenizer.save_pretrained(best_model_dir)

In [None]:
# 12) Predict on test (optional CSV)
test_predictions = trainer.predict(tokenized["test"])
pred_logits = test_predictions.predictions
pred_probs = torch.tensor(pred_logits).softmax(dim=1).numpy()
pred_labels = pred_probs.argmax(axis=1)

# Attach the predictions to a fresh, 0-indexed copy of the test rows
out = test_df.reset_index(drop=True).assign(
    pred_label=pred_labels,
    score=pred_probs[:, 1],      # positive-class probability
)
out.to_csv("bert_test_predictions.csv", index=False)
print("Saved predictions to bert_test_predictions.csv")