<a href="https://colab.research.google.com/github/urmilapol/urmilapolprojects/blob/master/Copy_of_thyroidimagefinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import h5py
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv3D, MaxPooling3D, Flatten, Dense, Dropout, TimeDistributed, Conv2D, MaxPooling2D, LSTM, concatenate
from tensorflow.keras.optimizers import Adam
import os

# --- 1. Data Loading ---

# Default file names for the cine-clip archive and the metadata table.
# Both are bare names (no directory part), so they are looked up relative to
# the current working directory.
HDF5_FILE: str = 'dataset.hdf5'
METADATA_FILE: str = 'metadata.csv'

def load_data(hdf5_path, metadata_path, image_key='images'):
    """
    Load the cine-clip image array and the clinical metadata table.

    Args:
        hdf5_path: Path to the HDF5 file holding the image data
            (e.g. '/content/sample_data/dataset.hdf5' on Colab).
        metadata_path: Path to the metadata CSV
            (e.g. '/content/sample_data/metadata.csv' on Colab).
        image_key: Name of the HDF5 dataset to read. The default 'images' is a
            placeholder; use the key list printed below to pick the right one
            for your file (it may be e.g. 'cine_clips').

    Returns:
        (images, metadata_df) on success. On any failure an explanatory message
        is printed and (None, None) is returned, so callers must check for None.

    Note:
        `f[image_key][:]` reads the whole dataset into memory, which can be
        memory intensive. For large files consider loading in batches or
        on-the-fly during training instead.
    """
    try:
        with h5py.File(hdf5_path, 'r') as f:
            # Show the available top-level keys so a wrong `image_key` is easy to diagnose
            print(f"Keys in HDF5 file: {list(f.keys())}")
            # Slicing with [:] copies the data out of the file, so `images`
            # remains valid after the file is closed.
            images = f[image_key][:]
            # The model below expects (num_clips, num_frames, height, width, channels)
            print(f"Shape of loaded images: {images.shape}")

        metadata_df = pd.read_csv(metadata_path)
        print(f"Shape of metadata: {metadata_df.shape}")
        return images, metadata_df
    except Exception as e:
        # Deliberate best-effort: report the problem and signal failure with None
        # values instead of aborting the notebook cell.
        print(f"Error loading data: {e}")
        print("Please ensure 'dataset.hdf5' and 'metadata.csv' are in the correct path "
              "and inspect the HDF5 file structure to access the correct keys.")
        return None, None

# Example usage:
# images, metadata_df = load_data(HDF5_FILE, METADATA_FILE)

# For the purpose of a runnable example, let's create dummy data
# In a real scenario, you'd load the actual data
num_clips = 100
frames_per_clip = 30
img_height = 128
img_width = 128
channels = 1 # Grayscale ultrasound images

# Seeded generator: the dummy dataset (and therefore the split and the training
# run that follow) is the same on every execution.
dummy_rng = np.random.default_rng(42)

# Draw the clips directly as float32. Generating float64 first and casting
# afterwards would briefly hold an extra array twice the size of the result
# (100 * 30 * 128 * 128 values -> ~393 MB in float64 vs ~197 MB in float32).
images = dummy_rng.random(
    (num_clips, frames_per_clip, img_height, img_width, channels),
    dtype=np.float32,
)
metadata_df = pd.DataFrame({
    'patient_id': [f'P{i}' for i in range(num_clips)],
    'nodule_id': [f'N{i}' for i in range(num_clips)],
    'age': dummy_rng.integers(20, 80, num_clips),  # upper bound is exclusive: 20..79
    'gender': dummy_rng.choice(['M', 'F'], num_clips),
    'lesion_size_mm': dummy_rng.uniform(5, 50, num_clips),
    'ti_rads': dummy_rng.integers(2, 6, num_clips), # TI-RADS 2 to 5
    'histopathological_diagnosis': dummy_rng.choice(['benign', 'malignant'], num_clips, p=[0.8, 0.2])
})
print("Using dummy data for demonstration.")
print(f"Dummy images shape: {images.shape}")
print(f"Dummy metadata shape: {metadata_df.shape}")


# --- 2. Preprocessing ---

def preprocess_metadata(df):
    """
    Add model-ready columns to the metadata table.

    The DataFrame passed in is modified in place (pass a copy to keep the
    original untouched). The following columns are added:
      - 'gender_encoded': integer codes for 'gender'
      - 'age_scaled', 'lesion_size_mm_scaled', 'ti_rads_scaled':
        standardized versions of the three numeric columns
      - 'diagnosis_encoded': integer codes for 'histopathological_diagnosis'

    All encoders and the scaler are fitted on the rows of `df` itself. The
    gender encoder and the scaler are local to this function and are not
    returned.

    Returns:
        (df, diagnosis_encoder): the same DataFrame with the added columns and
        the fitted LabelEncoder for the diagnosis column.
    """
    # Categorical feature: gender -> integer code
    df['gender_encoded'] = LabelEncoder().fit_transform(df['gender'])

    # Numeric features: standardize, then store each result column under its
    # '<name>_scaled' counterpart
    numeric_columns = ['age', 'lesion_size_mm', 'ti_rads']
    scaled_columns = ['age_scaled', 'lesion_size_mm_scaled', 'ti_rads_scaled']
    standardized = StandardScaler().fit_transform(df[numeric_columns])
    for column_name, column_values in zip(scaled_columns, standardized.T):
        df[column_name] = column_values

    # Target: diagnosis label -> integer code
    diagnosis_encoder = LabelEncoder()
    diagnosis_codes = diagnosis_encoder.fit_transform(df['histopathological_diagnosis'])
    df['diagnosis_encoded'] = diagnosis_codes
    # Malignant will likely be 1 and benign 0; the printed class order is the
    # authoritative mapping (position in classes_ == encoded value)
    print(f"Diagnosis classes: {diagnosis_encoder.classes_}")

    return df, diagnosis_encoder

# Preprocess a copy so the frame handed to preprocess_metadata is not the one mutated
metadata_df, label_encoder = preprocess_metadata(metadata_df.copy())

# Clips and metadata rows are paired purely by position here (clip i <-> row i),
# which holds for the dummy data; real data needs an explicit image-to-row mapping.
clinical_feature_columns = ['age_scaled', 'gender_encoded', 'lesion_size_mm_scaled', 'ti_rads_scaled']
image_data = images # This would be your actual loaded cine-clip data
clinical_data = metadata_df[clinical_feature_columns].values
labels = metadata_df['diagnosis_encoded'].values

# 80/20 train/test split, stratified on the label so both parts keep the class ratio
split_arrays = train_test_split(
    image_data,
    clinical_data,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)
X_img_train, X_img_test, X_clin_train, X_clin_test, y_train, y_test = split_arrays

print(f"X_img_train shape: {X_img_train.shape}")
print(f"X_clin_train shape: {X_clin_train.shape}")
print(f"y_train shape: {y_train.shape}")


# --- 3. Model Building (Deep Learning for Cine-clips + Clinical Data) ---

def build_hybrid_model(input_shape_images, input_shape_clinical):
    """
    Build the two-input model used for diagnosis prediction.

    One branch runs a 3D CNN over the cine-clip, the other a small dense layer
    over the clinical features; their outputs are concatenated and fed to a
    dense head ending in a single sigmoid unit.

    Args:
        input_shape_images: Shape of one clip without the batch axis; passed
            straight to the 'image_input' Input layer.
        input_shape_clinical: Number of clinical features (an int); the
            'clinical_input' layer takes vectors of this length.

    Returns:
        An uncompiled Keras Model with inputs [image_input, clinical_input]
        and one output named 'output_diagnosis'.
    """
    # --- Cine-clip branch: three Conv3D + MaxPooling3D stages with growing width ---
    img_input = Input(shape=input_shape_images, name='image_input')
    volume = img_input
    for n_filters in (32, 64, 128):
        volume = Conv3D(n_filters, (3, 3, 3), activation='relu', padding='same')(volume)
        volume = MaxPooling3D((2, 2, 2), padding='same')(volume)
    flat_volume = Flatten()(volume)
    img_features = Dense(128, activation='relu')(flat_volume)
    img_features = Dropout(0.5)(img_features)

    # --- Clinical branch: one dense layer with light dropout ---
    clinical_input = Input(shape=(input_shape_clinical,), name='clinical_input')
    clinical_hidden = Dense(32, activation='relu')(clinical_input)
    clinical_features = Dropout(0.2)(clinical_hidden)

    # --- Fusion and classification head ---
    combined_features = concatenate([img_features, clinical_features])
    head = Dense(64, activation='relu')(combined_features)
    head = Dropout(0.3)(head)
    # Single sigmoid unit: binary classification
    output = Dense(1, activation='sigmoid', name='output_diagnosis')(head)

    return Model(inputs=[img_input, clinical_input], outputs=output)

# Per-sample input shapes (no batch axis) for the two model branches
input_shape_images = (frames_per_clip, img_height, img_width, channels)
input_shape_clinical = clinical_data.shape[1] # Number of clinical features

# Build the network and print its layer overview
model = build_hybrid_model(input_shape_images, input_shape_clinical)
model.summary()

# Compile for binary classification; accuracy and AUC are tracked alongside the loss
optimizer = Adam(learning_rate=0.0001)
auc_metric = tf.keras.metrics.AUC(name='auc')
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy', auc_metric],
)

# --- 4. Training ---

# Tune batch_size and epochs to your hardware and dataset size
batch_size = 8 # Smaller batch size for cine-clips due to memory
epochs = 10

# Input pipelines. The dict keys must match the names of the model's Input layers.
train_inputs = {"image_input": X_img_train, "clinical_input": X_clin_train}
train_dataset = tf.data.Dataset.from_tensor_slices((train_inputs, y_train))
train_dataset = train_dataset.shuffle(buffer_size=1024)
train_dataset = train_dataset.batch(batch_size)
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)

# The test pipeline is not shuffled
test_inputs = {"image_input": X_img_test, "clinical_input": X_clin_test}
test_dataset = tf.data.Dataset.from_tensor_slices((test_inputs, y_test))
test_dataset = test_dataset.batch(batch_size)
test_dataset = test_dataset.prefetch(tf.data.AUTOTUNE)


print("\nStarting model training...")
# Note: the test split is also used as the validation set during training
history = model.fit(train_dataset, epochs=epochs, validation_data=test_dataset)
print("Model training complete.")


# --- 5. Evaluation ---

print("\nEvaluating model performance on test set...")
# evaluate() yields the loss followed by the compiled metrics (accuracy, auc)
test_results = model.evaluate(test_dataset)
loss, accuracy, auc = test_results
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test AUC: {auc:.4f}")

# --- 6. Prediction Example ---

def predict_new_patient(model, new_image_clip, new_clinical_data, label_encoder, threshold=0.5):
    """
    Make a prediction for a single new patient.

    Args:
        model: Trained model exposing `predict`; it is called with the
            two-element list `[image_batch, clinical_batch]`.
        new_image_clip: One cine-clip without a batch axis.
        new_clinical_data: One clinical feature vector without a batch axis.
            It must already be preprocessed with the same scalers/encoders
            that were used for the training data.
        label_encoder: Object whose `inverse_transform` maps class indices
            back to diagnosis labels.
        threshold: Class index 1 is predicted when the model output is strictly
            greater than this value. Defaults to 0.5.

    Returns:
        (predicted_diagnosis, prediction_proba): the decoded label and the raw
        model output for this sample (the value at position [0][0] of the
        prediction).
    """
    # The model works on batches, so add a leading batch axis of size 1
    new_image_clip_batch = np.expand_dims(new_image_clip, axis=0)
    new_clinical_data_batch = np.expand_dims(new_clinical_data, axis=0)

    prediction_proba = model.predict([new_image_clip_batch, new_clinical_data_batch])[0][0]
    # int() handles NumPy and plain Python scalars alike, whereas `.astype`
    # exists only on NumPy values.
    predicted_class_idx = int(prediction_proba > threshold)
    predicted_diagnosis = label_encoder.inverse_transform([predicted_class_idx])[0]

    return predicted_diagnosis, prediction_proba

# Demo: pick one random case from the test split and run it through the model.
# For genuinely new patients, the clinical values must first be transformed with
# the same scalers/encoders that were used during training!
sample_index = np.random.randint(0, len(X_img_test))
sample_image_clip, sample_clinical_data = X_img_test[sample_index], X_clin_test[sample_index]
true_label_idx = y_test[sample_index]
true_diagnosis = label_encoder.inverse_transform([true_label_idx])[0]

predicted_diagnosis, prediction_proba = predict_new_patient(
    model,
    sample_image_clip,
    sample_clinical_data,
    label_encoder,
)

print("\n--- Prediction for a Sample Patient ---")
print(f"True Diagnosis: {true_diagnosis}")
print(f"Predicted Diagnosis: {predicted_diagnosis}")
print(f"Prediction Probability (Malignant): {prediction_proba:.4f}")

# You can also save your trained model
# model.save('thyroid_nodule_detection_model.h5')
# print("Model saved to thyroid_nodule_detection_model.h5")
