In [17]:
!pip install transformers torch -q

In [2]:
import pandas as pd
import numpy as np
import torch
import joblib
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report

from lightgbm import LGBMClassifier

In [5]:
TRAIN_FILE_PATH = '/content/Processed_Train_clinical_trials_data.xlsx'
MODEL_SAVE_PATH = 'clinical_trial_predictor.pkl'

In [7]:
TEST_FILE_PATH = '/content/Processed_Test_clinical_trials_data.xlsx'
PREDICTIONS_SAVE_PATH = 'test_data_predictions.csv'

In [3]:
class TextEmbeddingTransformer(BaseEstimator, TransformerMixin):
    """
    A custom scikit-learn transformer to generate text embeddings using a
    pre-trained Transformer model from Hugging Face.
    """
    def __init__(self, model_name='distilbert-base-uncased', batch_size=32):
        self.model_name = model_name
        self.batch_size = batch_size
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        """
        Generates embeddings for the input text data.
        """
        if not isinstance(X, pd.Series):
            X = pd.Series(X)

        all_embeddings = []
        print(f"Generating embeddings with {self.model_name} on {self.device}...")
        for i in tqdm(range(0, len(X), self.batch_size)):
            batch = X[i:i+self.batch_size].fillna('').tolist()
            inputs = self.tokenizer(
                batch, return_tensors='pt', truncation=True, padding=True, max_length=512
            ).to(self.device)
            with torch.no_grad():
                outputs = self.model(**inputs)
            cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            all_embeddings.append(cls_embeddings)

        return np.concatenate(all_embeddings, axis=0)

In [12]:

class ClinicalTrialPredictor:
    """
    A complete pipeline to train a model for predicting clinical trial outcomes
    and make predictions on new data.
    """
    def __init__(self):
        self.pipeline = None
        self._define_feature_sets()

    def _define_feature_sets(self):
        """
        Defines the column lists for different feature types.
        """
        self.numeric_features = [
            'Has_Results', 'Low_Enrollment', 'Results_Delay_Days', 'Suspended_Terminated'
        ]
        self.categorical_features = [
            'Sponsor', 'Funder Type', 'Allocation', 'Intervention Model', 'Masking',
            'Primary Purpose', 'BIOLOGICAL_1', 'COMBINATION_PRODUCT_1', 'DEVICE_1',
            'DRUG_1', 'DRUG_2', 'DRUG_3', 'OTHER_1', 'OTHER_2',
            'PROCEDURE_1', 'RADIATION_1'
        ]
        self.text_features = ['Conditions', 'Study_Context', 'Outcome_Details']

    def _create_pipeline(self):
        """
        Builds the full scikit-learn pipeline, including preprocessing for all
        data types and the final estimator model.
        """
        numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
        ])

        conditions_transformer = Pipeline(steps=[('embeddings', TextEmbeddingTransformer())])
        context_transformer = Pipeline(steps=[('embeddings', TextEmbeddingTransformer())])
        outcome_transformer = Pipeline(steps=[('embeddings', TextEmbeddingTransformer())])

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, self.numeric_features),
                ('cat', categorical_transformer, self.categorical_features),
                ('cond_emb', conditions_transformer, 'Conditions'),
                ('cont_emb', context_transformer, 'Study_Context'),
                ('outc_emb', outcome_transformer, 'Outcome_Details')
            ],
            remainder='drop'
        )

        self.pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', LGBMClassifier(class_weight='balanced', random_state=42))
        ])

    def train(self, train_filepath):
        """
        Loads training data, splits it for validation, trains the pipeline,
        and evaluates its performance.
        """
        print("--- Starting Training Process ---")
        df = pd.read_excel(train_filepath)

        y = df['Outcome_numeric']
        X = df.drop(columns=['Outcome_numeric', 'Outcome'])

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        self._create_pipeline()

        print("\nTraining model on the training set...")
        self.pipeline.fit(X_train, y_train)

        print("\nEvaluating model on the validation set...")
        y_pred = self.pipeline.predict(X_test)
        print("\n--- Validation Classification Report ---")
        print(classification_report(y_test, y_pred))

        print("\nRe-training model on the full dataset...")
        self.pipeline.fit(X, y)
        print("Training complete. Model is ready.")

    def predict(self, test_filepath, output_csv_path='predictions.csv'):
        """
        Loads new data, predicts the outcome, calculates the probability of
        FAILURE on a 0-100 scale, and saves the results to a CSV file.
        """
        if self.pipeline is None:
            raise RuntimeError("Model has not been trained yet. Please call the 'train' method first.")

        print(f"\n--- Making predictions on new data from {test_filepath} ---")
        df_test = pd.read_excel(test_filepath)

        predictions_numeric = self.pipeline.predict(df_test)
        predicted_probabilities = self.pipeline.predict_proba(df_test)

        failure_probabilities = (predicted_probabilities[:, 1] * 100).round(2)

        df_test['Outcome_numeric'] = predictions_numeric
        df_test['Failure_Probability'] = failure_probabilities
        df_test['Outcome'] = df_test['Outcome_numeric'].apply(lambda x: 'Fail' if x == 1 else 'Approved')

        df_test.to_csv(output_csv_path, index=False)
        print(f"Predictions saved successfully to {output_csv_path}")
        return df_test

    def save_model(self, filepath='clinical_trial_predictor.pkl'):
        """
        Saves the entire trained pipeline to a file.
        """
        if self.pipeline is None:
            raise RuntimeError("Model has not been trained yet. Cannot save an empty model.")

        print(f"\nSaving model to {filepath}...")
        joblib.dump(self.pipeline, filepath)
        print("Model saved successfully.")

    @classmethod
    def load_from_file(cls, filepath):
        """
        Loads a pre-trained pipeline from a file and returns a new instance
        of the predictor class.
        """
        print(f"Loading model from {filepath}...")
        pipeline = joblib.load(filepath)
        predictor = cls()
        predictor.pipeline = pipeline
        print("Model loaded successfully.")
        return predictor

In [6]:
predictor = ClinicalTrialPredictor()
predictor.train(TRAIN_FILE_PATH)
predictor.save_model(MODEL_SAVE_PATH)

--- Starting Training Process ---

Training model on the training set...
Generating embeddings with distilbert-base-uncased on cuda...


100%|██████████| 32/32 [00:01<00:00, 17.79it/s]


Generating embeddings with distilbert-base-uncased on cuda...


100%|██████████| 32/32 [00:13<00:00,  2.41it/s]


Generating embeddings with distilbert-base-uncased on cuda...


100%|██████████| 32/32 [00:18<00:00,  1.70it/s]


[LightGBM] [Info] Number of positive: 614, number of negative: 404
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.074875 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 587761
[LightGBM] [Info] Number of data points in the train set: 1018, number of used features: 2342
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000

Evaluating model on the validation set...
Generating embeddings with distilbert-base-uncased on cuda...


100%|██████████| 8/8 [00:00<00:00, 17.49it/s]


Generating embeddings with distilbert-base-uncased on cuda...


100%|██████████| 8/8 [00:02<00:00,  3.18it/s]


Generating embeddings with distilbert-base-uncased on cuda...


100%|██████████| 8/8 [00:05<00:00,  1.49it/s]



--- Validation Classification Report ---
              precision    recall  f1-score   support

           0       0.65      0.63      0.64       101
           1       0.76      0.77      0.77       154

    accuracy                           0.72       255
   macro avg       0.70      0.70      0.70       255
weighted avg       0.72      0.72      0.72       255


Re-training model on the full dataset...
Generating embeddings with distilbert-base-uncased on cuda...


100%|██████████| 40/40 [00:02<00:00, 19.82it/s]


Generating embeddings with distilbert-base-uncased on cuda...


100%|██████████| 40/40 [00:13<00:00,  2.94it/s]


Generating embeddings with distilbert-base-uncased on cuda...


100%|██████████| 40/40 [00:23<00:00,  1.67it/s]


[LightGBM] [Info] Number of positive: 768, number of negative: 505
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.088031 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 587822
[LightGBM] [Info] Number of data points in the train set: 1273, number of used features: 2350
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training complete. Model is ready.

Saving model to clinical_trial_predictor.pkl...
Model saved successfully.


In [13]:
loaded_predictor = ClinicalTrialPredictor.load_from_file(MODEL_SAVE_PATH)
prediction_results = loaded_predictor.predict(TEST_FILE_PATH, PREDICTIONS_SAVE_PATH)

print("\n--- Prediction Results (First 5 Rows) ---")
print(prediction_results.head())

Loading model from clinical_trial_predictor.pkl...
Model loaded successfully.

--- Making predictions on new data from /content/Processed_Test_clinical_trials_data.xlsx ---
Generating embeddings with distilbert-base-uncased on cuda...


100%|██████████| 9/9 [00:00<00:00, 23.10it/s]


Generating embeddings with distilbert-base-uncased on cuda...


100%|██████████| 9/9 [00:03<00:00,  2.81it/s]


Generating embeddings with distilbert-base-uncased on cuda...


100%|██████████| 9/9 [00:04<00:00,  1.84it/s]


Generating embeddings with distilbert-base-uncased on cuda...


100%|██████████| 9/9 [00:00<00:00, 24.89it/s]


Generating embeddings with distilbert-base-uncased on cuda...


100%|██████████| 9/9 [00:03<00:00,  2.72it/s]


Generating embeddings with distilbert-base-uncased on cuda...


100%|██████████| 9/9 [00:04<00:00,  1.88it/s]

Predictions saved successfully to test_data_predictions.csv

--- Prediction Results (First 5 Rows) ---
    NCT Number  Has_Results  Low_Enrollment  Results_Delay_Days  \
0  NCT03765918            0               0                  -1   
1  NCT05572515            0               0                  -1   
2  NCT05057494            0               0                  -1   
3  NCT05020236            0               0                  -1   
4  NCT06091865            0               0                  -1   

   Suspended_Terminated                                         Conditions  \
0                     0                            Head and Neck Neoplasms   
1                     0            Relapsed or Refractory Multiple Myeloma   
2                     0  Chronic Lymphocytic Leukemia or Small Lymphocy...   
3                     0                                   Multiple Myeloma   
4                     0              Diffuse Large B-cell Lymphoma (DLBCL)   

                         


