In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score
from sklearn.preprocessing import StandardScaler
import torch
import pandas as pd
import numpy as np
# from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
# from torch.utils.data import Dataset
from datasets import Dataset
from sklearn.model_selection import StratifiedKFold, KFold
from transformers import pipeline
import random


def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Applies ``seed`` to Python's ``random`` module, NumPy's global generator
    and PyTorch. When CUDA is available the same seed is also applied to all
    CUDA devices.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)
        
   
# Fix all RNG seeds before any split or model is created.
set_seed(42)

# Put cuDNN in deterministic mode and switch its benchmark mode off.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Presumably stops the Hugging Face Trainer from reporting to Weights & Biases.
import os
os.environ["WANDB_DISABLED"] = "true"

# Pick the compute device, preferring the GPU when CUDA is usable.
has_cuda = torch.cuda.is_available()
print(
    "CUDA (GPU support) is available and enabled!"
    if has_cuda
    else "CUDA (GPU support) is not available. Falling back to CPU."
)
device = torch.device("cuda" if has_cuda else "cpu")

# Re-apply PyTorch's current thread count.
# NOTE(review): this feeds the current value straight back, so it looks like a
# no-op rather than "use all available threads" - confirm the intent.
torch.set_num_threads(torch.get_num_threads())

def data_pre_processing(df):
    """Build the encoded feature table from the raw patient DataFrame.

    Steps:
      1. Drop every column whose name contains "timestamp", the "id" and
         "general_relapse_class" columns, and every column that is entirely NaN.
      2. Impute missing values in the float64/int64 columns with a
         5-nearest-neighbour ``KNNImputer`` (uniform weights). The imputer is
         fit on the frame that is passed in.
      3. One-hot encode the bool/object columns with ``drop_first=True``.

    The input frame is NOT modified; all work happens on a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table. Must contain the "id" and "general_relapse_class" columns,
        otherwise ``DataFrame.drop`` raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        Imputed and one-hot encoded features, indexed like ``df``.
    """
    timestamp_columns = [col for col in df.columns if "timestamp" in col]
    other_non_imp_column_to_remove = ["id", "general_relapse_class"]
    columns_with_all_nan = df.columns[df.isna().all(axis=0)].tolist()

    print("shape of the dataframe before dropping columns", df.shape)
    # drop() without inplace returns a new frame, so the caller's df stays intact.
    features = df.drop(
        columns=timestamp_columns + other_non_imp_column_to_remove + columns_with_all_nan
    )
    print("shape of the dataframe after dropping columns", features.shape)

    categorical = features.select_dtypes(include=['bool', 'object']).columns.tolist()
    numerical = features.select_dtypes(include=['float64', 'int64']).columns.tolist()
    print("Length of categorical and numerical columns:", len(categorical), len(numerical))

    # Skip imputation when no numeric column is left: there is nothing to
    # impute and the imputer cannot be fit on a frame with zero columns.
    if numerical:
        imputer = KNNImputer(n_neighbors=5, weights='uniform')
        imputed_data = imputer.fit_transform(features[numerical])
        # Re-wrap with the original index so the assignment aligns row by row.
        features[numerical] = pd.DataFrame(imputed_data, columns=numerical, index=features.index)

    return pd.get_dummies(features, columns=categorical, drop_first=True)

# In[3]:

# Load the raw patient table and give the target column its working name.
df = pd.read_csv(
    "/home/ubuntu/Tabular_Machine_Learning_Using_LLM/Data/patient_features_early_stage.csv"
).rename(columns={'relapse?': 'general_relapse_class'})

# Read the labels first: data_pre_processing removes the target column from the features.
y = df["general_relapse_class"].values

df_encoded = data_pre_processing(df)
X = df_encoded.values
df_encoded.head()

def row_to_sentence_full(row):
    """Render one encoded patient row as a single English sentence.

    The sentence opens with fixed clauses for age, chemotherapy start,
    radiotherapy duration, surgery time and smoking history, then a clause
    describing the family cancer history. After that every column of the row
    is visited in order:

    * a boolean value that is True adds "and has a condition of <column>"
      (with '@' rendered as ' at ' and '_' as a space);
    * a numeric value adds "<column> of <value>" unless the column name
      contains '@' or is 'age' / 'Unnamed: 0'.

    Value kinds are decided with ``isinstance`` on the cell itself. Dtype
    predicates such as ``pd.api.types.is_numeric_dtype`` are meant for
    arrays/dtypes; a row holding plain Python scalars carries no dtype on its
    cells, so those predicates cannot classify them. Booleans are tested
    first because ``bool`` is a subclass of ``int``.

    Parameters
    ----------
    row : pandas.Series
        One row of the encoded frame. Must contain the keys 'age',
        'chemotherapy@t1_start_time', 'radiotherapy@t1_duration_days',
        'surgery@t1_time', 'nb_cig_packs_year', 'nb_cigs_day',
        'family_lung_cancer' and 'family_other_cancer' (``KeyError`` otherwise).

    Returns
    -------
    str
        The clauses joined by ', ' and terminated with '.'.
    """
    sentence_parts = [
        f"A patient of age {row['age']} started chemotherapy at {row['chemotherapy@t1_start_time']} days",
        f"underwent radiotherapy for {row['radiotherapy@t1_duration_days']} days",
        f"and had surgery at {row['surgery@t1_time']} days",
        f"with a history of smoking {row['nb_cig_packs_year']} cigarette packs per year",
        f"and {row['nb_cigs_day']} cigarettes per day",
    ]

    family_history = []
    if row['family_lung_cancer']:
        family_history.append("lung cancer")
    if row['family_other_cancer']:
        family_history.append("other types of cancer")
    if family_history:
        sentence_parts.append(f"and a family history that includes: {', '.join(family_history)}")
    else:
        sentence_parts.append("and no family history of cancer")

    for col, value in row.items():
        if isinstance(value, (bool, np.bool_)):
            # Only conditions that are present are mentioned; False adds nothing.
            if value:
                sentence_parts.append(f"and has a condition of {col.replace('@', ' at ').replace('_', ' ')}")
        elif isinstance(value, (int, float, np.integer, np.floating)):
            if '@' not in col and col not in ('age', 'Unnamed: 0'):
                sentence_parts.append(f"{col.replace('_', ' ')} of {value}")

    return ', '.join(sentence_parts) + '.'


# Pair each generated sentence with its label in a two-column frame.
labels = y.tolist()
texts = df_encoded.apply(row_to_sentence_full, axis=1).tolist()

data = dict(text=texts, label=labels)
new_df = pd.DataFrame(data)


In [None]:
new_df.head()

# Rebind X / y to the sentence column and its labels.
X, y = new_df["text"], new_df["label"]

# Separate the rows by class so that each class can be split on its own.
relapse_df = new_df.loc[y.eq(1)]
non_relapse_df = new_df.loc[y.eq(0)]

In [None]:
def _split_one_class(class_df):
    """Split one class's rows into (train, val, test): 100 test rows, then 50 validation rows."""
    remainder, test_part = train_test_split(class_df, test_size=100, random_state=42)
    train_part, val_part = train_test_split(remainder, test_size=50, random_state=42)
    return train_part, val_part, test_part


# For relapse patients
relapse_train, relapse_val, relapse_test = _split_one_class(relapse_df)

# For non-relapse patients
non_relapse_train, non_relapse_val, non_relapse_test = _split_one_class(non_relapse_df)

# Stack the two classes back together, split by split.
train_df = pd.concat([relapse_train, non_relapse_train])
test_df = pd.concat([relapse_test, non_relapse_test])
val_df = pd.concat([relapse_val, non_relapse_val])

# Wrap each pandas split as a Hugging Face Dataset.
train_dataset, test_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df, val_df)
)

In [None]:
# # Split into train+validation and test sets with stratification
# X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# # Further split train+validation into train and validation sets with stratification
# X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, stratify=y_train_val, random_state=42)  # 0.25 * 0.8 = 0.2 of the original

# # Convert pandas DataFrames back to Hugging Face Datasets
# train_dataset = Dataset.from_pandas(pd.concat([X_train, y_train], axis=1))
# val_dataset = Dataset.from_pandas(pd.concat([X_val, y_val], axis=1))
# test_dataset = Dataset.from_pandas(pd.concat([X_test, y_test], axis=1))

In [None]:
# Human-readable class names.
# NOTE(review): not referenced anywhere else in the visible notebook - confirm whether it is still needed.
candidate_labels = ['non relapse','relapse']
# Tokenizer for the DistilBERT checkpoint; a later cell loads this same tokenizer again.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
# X_train, X_test = train_test_split(new_df, test_size=0.2, random_state=42, shuffle=True)

# data = Dataset.from_pandas(new_df)
# train_val_test_split = data.train_test_split(test_size=0.2, seed=42)
# train_val_dataset = train_val_test_split['train']
# test_dataset = train_val_test_split['test']
# train_val_split = train_val_dataset.train_test_split(test_size=0.25, seed=42)  # 0.25 x 0.8 = 0.2
# train_dataset = train_val_split['train']
# val_dataset = train_val_split['test']


# train_test_split = data.train_test_split(test_size=0.2, seed=42)

# Access the training and test sets
# train_data = train_test_split['train']
# test_data = train_test_split['test']


In [None]:
# Print a summary of each split: train, validation, then test.
for split in (train_dataset, val_dataset, test_dataset):
    print(split)

In [None]:
test_dataset[0]

In [None]:
# AutoTokenizer is already imported in the notebook's first cell, so it is not re-imported here.
# Load the tokenizer that matches the DistilBERT checkpoint used for the model.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
# tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neo-125M')
# tokenizer.pad_token = tokenizer.eos_token


In [None]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch, truncating to at most 512 tokens.

    No padding is applied here. The ``DataCollatorWithPadding`` handed to the
    Trainer pads each batch to its own longest sequence at collation time;
    padding inside ``Dataset.map`` would instead pad every example to the
    longest sequence of the whole map chunk and make that dynamic padding
    pointless.

    Parameters
    ----------
    examples : mapping
        Batch provided by ``Dataset.map(..., batched=True)``; only
        ``examples["text"]`` is read.

    Returns
    -------
    The tokenizer's output for the batch.
    """
    return tokenizer(examples["text"], truncation=True, max_length=512)


# Tokenize every split; batched=True passes many rows per call to the tokenizer.
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

In [None]:
tokenized_train

In [None]:
from transformers import DataCollatorWithPadding
# Collator later passed to the Trainer; built around the notebook's tokenizer so batches are padded with it.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import evaluate
# Accuracy metric object; compute_metrics calls its .compute(predictions=..., references=...).
accuracy = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``accuracy`` metric.

    ``eval_pred`` unpacks into (model outputs, reference labels). The
    predicted class of each row is the arg-max over axis 1 of the outputs.
    Returns whatever ``accuracy.compute`` returns.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(references=references, predictions=predicted_classes)

In [None]:
# Class index -> display name, and the inverse lookup derived from it.
id2label = dict(enumerate(("No_Relapse", "Relapse")))
label2id = {name: index for index, name in id2label.items()}

In [None]:
# AutoModelForSequenceClassification, TrainingArguments and Trainer are already
# imported in the notebook's first cell, so they are not re-imported here.
# Load the DistilBERT checkpoint for 2-label sequence classification, attach the
# label maps, and move the model to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
).to(device)


# model = AutoModelForSequenceClassification.from_pretrained('EleutherAI/gpt-neo-125M', num_labels=2,
#                                                            id2label=id2label, label2id=label2id).to(device)

In [None]:
# Fine-tuning configuration: 50 epochs at lr 2e-5 with batch size 16 for both
# training and evaluation, evaluating and checkpointing once per epoch.
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=50,
    weight_decay=0.01,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",
    # NOTE(review): one checkpoint per epoch over 50 epochs; presumably all are
    # kept on disk since no save limit is set - confirm disk budget.
    save_strategy="epoch",
    # Reload the best checkpoint when training ends.
    # NOTE(review): no metric_for_best_model is given, so "best" follows the library default - confirm.
    load_best_model_at_end=True
    
)

# Wire the model, data splits, tokenizer, collator and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    # NOTE(review): newer transformers releases deprecate `tokenizer=` in favour
    # of `processing_class=` - confirm against the installed version.
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the validation split is evaluated at the end of every epoch.
trainer.train()

In [None]:
# Run the trained model over the test split; only the first returned element (the logits) is used.
logits, _label_ids, _predict_metrics = trainer.predict(tokenized_test)

# Predicted class = index of the largest logit in each row.
predictions = np.argmax(logits, axis=-1)


In [None]:
predictions

In [None]:
# tokenized_test['label']

In [None]:
# Reference labels and the softmax probability of class index 1, computed once and reused.
test_labels = tokenized_test['label']
relapse_scores = torch.softmax(torch.tensor(logits), dim=-1).cpu().numpy()[:, 1]

# Threshold metrics take the hard predictions; the two ranking metrics take the class-1 probability.
for metric_label, metric_fn, metric_input in (
    ("accuracy:", accuracy_score, predictions),
    ("precision:", precision_score, predictions),
    ("recall:", recall_score, predictions),
    ("f1:", f1_score, predictions),
    ("auc_roc:", roc_auc_score, relapse_scores),
    ("auc_pr:", average_precision_score, relapse_scores),
):
    print(metric_label, metric_fn(test_labels, metric_input))

In [None]:
# classifier = pipeline("relapse_prediction", model="my_awesome_model")