In [1]:

from transformers import pipeline
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score
from sklearn.preprocessing import StandardScaler
import torch
import pandas as pd
import numpy as np
# from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from torch.utils.data import Dataset
from sklearn.model_selection import StratifiedKFold, KFold
from transformers import pipeline

# Check for GPU availability
if torch.cuda.is_available():
    print("CUDA (GPU support) is available and enabled!")
    device = torch.device("cuda")
else:
    print("CUDA (GPU support) is not available. Falling back to CPU.")
    device = torch.device("cpu")

# Ensure PyTorch uses all available threads
torch.set_num_threads(torch.get_num_threads())

def data_pre_processing(df):
    timestamp_columns = [col for col in df.columns if "timestamp" in col]
    other_non_imp_column_to_remove = ["id", "general_relapse_class"]
    columns_with_all_nan = df.columns[df.isna().all(axis=0)].tolist()
    print("shape of the dataframe before dropping columns", df.shape)
    df.drop(timestamp_columns + other_non_imp_column_to_remove + columns_with_all_nan, axis=1, inplace=True)
    print("shape of the dataframe after dropping columns", df.shape)
    categorical = df.select_dtypes(include=['bool', 'object']).columns.tolist()
    numerical = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
    print("Length of categorical and numerical columns:", len(categorical), len(numerical))
    imputer = KNNImputer(n_neighbors=5, weights='uniform')
    imputed_data = imputer.fit_transform(df[numerical])
    df_imputed_numerical = pd.DataFrame(imputed_data, columns=numerical, index=df.index)
    df[numerical] = df_imputed_numerical
    return pd.get_dummies(df, columns=categorical, drop_first=True)

# In[3]:

df = pd.read_csv("/home/ubuntu/Tabular_Machine_Learning_Using_LLM/Data/patient_features_early_stage.csv")
df.rename(columns={'relapse?': 'general_relapse_class'}, inplace=True)
y = df["general_relapse_class"].values
df_encoded = data_pre_processing(df)
X = df_encoded.values
df_encoded.head()

def row_to_sentence_full(row):
    sentence_parts = [
        f"A patient of age {row['age']} started chemotherapy at {row['chemotherapy@t1_start_time']} days",
        f"underwent radiotherapy for {row['radiotherapy@t1_duration_days']} days",
        f"and had surgery at {row['surgery@t1_time']} days",
        f"with a history of smoking {row['nb_cig_packs_year']} cigarette packs per year",
        f"and {row['nb_cigs_day']} cigarettes per day",
    ]
    family_history = []
    if row['family_lung_cancer']:
        family_history.append("lung cancer")
    if row['family_other_cancer']:
        family_history.append("other types of cancer")
    if family_history:
        sentence_parts.append(f"and a family history that includes: {', '.join(family_history)}")
    else:
        sentence_parts.append("and no family history of cancer")
    for col in row.index:
        if pd.api.types.is_numeric_dtype(row[col]) and not '@' in col and col not in ['age', 'Unnamed: 0']:
            sentence_parts.append(f"{col.replace('_', ' ')} of {row[col]}")
        elif pd.api.types.is_bool_dtype(row[col]) and row[col]:
            sentence_parts.append(f"and has a condition of {col.replace('@', ' at ').replace('_', ' ')}")
    sentence = ', '.join(sentence_parts) + '.'
    return sentence


texts = df_encoded.apply(row_to_sentence_full, axis=1).tolist()
labels = y.tolist()


candidate_labels = ['non relapse','relapse']
classifier = pipeline('zero-shot-classification', model="facebook/bart-large-mnli", device = device)
# classifier = pipeline('zero-shot-classification', model='roberta-large-mnli',device = device)
# classifier = pipeline('zero-shot-classification', model = "distilbert/distilbert-base-uncased", device = device)
# classifier = pipeline('zero-shot-classification',model = "mistralai/Mistral-7B-v0.1",device = device)

# Print the device being used by the model
print("Model device:", classifier.device)

# Stratified K-Fold Cross-Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_results = []

for fold, (train_idx, val_idx) in enumerate(skf.split(texts, labels)):
    print(f"Training fold {fold + 1}/5")
    predictions = []
    y_pred_prob = []
    train_texts_fold = [texts[i] for i in train_idx]
    val_texts_fold = [texts[i] for i in val_idx]
    train_labels_fold = [labels[i] for i in train_idx]
    val_labels_fold = [labels[i] for i in val_idx]
    res = classifier(val_texts_fold, candidate_labels)
    for i in res:
        predictions.append(1 if i['labels'][0] == 'relapse' else 0)  # Predicted label (1 for relapse, 0 for non relapse)
        y_pred_prob.append(i['scores'][0] if i['labels'][0] == 'relapse' else i['scores'][1])  # Probability of relapse
        

    fold_results.append({
        "accuracy": accuracy_score(val_labels_fold, predictions),
        "precision": precision_score(val_labels_fold, predictions),
        "recall": recall_score(val_labels_fold, predictions),
        "f1": f1_score(val_labels_fold, predictions),
        "auc_roc": roc_auc_score(val_labels_fold, y_pred_prob),
        "auc_pr": average_precision_score(val_labels_fold, y_pred_prob)
    })

# Calculate mean and std dev of metrics
metrics_mean = {metric: np.mean([result[metric] for result in fold_results]) for metric in fold_results[0]}
metrics_std = {metric: np.std([result[metric] for result in fold_results]) for metric in fold_results[0]}

# Report metrics
for metric in metrics_mean:
    print(f"{metric}: {metrics_mean[metric]:.3f} ± {metrics_std[metric]:.3f}")



  from .autonotebook import tqdm as notebook_tqdm


CUDA (GPU support) is available and enabled!
shape of the dataframe before dropping columns (1348, 76)
shape of the dataframe after dropping columns (1348, 70)
Length of categorical and numerical columns: 59 11
Model device: cuda
Training fold 1/5
Training fold 2/5
Training fold 3/5
Training fold 4/5
Training fold 5/5
accuracy: 0.549 ± 0.024
precision: 0.386 ± 0.027
recall: 0.401 ± 0.029
f1: 0.393 ± 0.024
auc_roc: 0.483 ± 0.022
auc_pr: 0.375 ± 0.025


In [2]:

# classifier = pipeline('zero-shot-classification', model='roberta-large-mnli')
# sequence_to_classify = "A patient of age 58.0 started chemotherapy at 4.0 days, underwent radiotherapy for 46.0 days, and had surgery at 4.0 days, with a history of smoking 20.0 cigarette packs per year, and 10.0 cigarettes per day, and a family history that includes: other types of cancer."
# candidate_labels = ['relapse', 'non relapse']
# classifier(sequence_to_classify, candidate_labels)