<a href="https://colab.research.google.com/github/this-is-singh19/tbdetectx/blob/master/ssdresnet50.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import EfficientNetB0

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
dataset_path = '/content/drive/My Drive/Dataset/imgs'
os.chdir(dataset_path)

In [4]:
dirlist = ['../imgs/health/', '../imgs/sick/', '../imgs/tb']
classes = ['Healthy', 'Sick', 'Tuberculosis']
filepaths = []
labels = []
for d, c in zip(dirlist, classes):
    flist = os.listdir(d)
    for f in flist:
        fpath = os.path.join(d, f)
        filepaths.append(fpath)
        labels.append(c)
print ('filepaths: ', len(filepaths), '   labels: ', len(labels))

filepaths:  8408    labels:  8408


In [5]:
Fseries = pd.Series(filepaths, name='file_paths')
Lseries = pd.Series(labels, name='labels')

# Ensure lengths match
min_length = min(len(Fseries), len(Lseries))
Fseries = Fseries[:min_length]
Lseries = Lseries[:min_length]

# Create the DataFrame with named columns
df = pd.concat([Fseries, Lseries], axis=1)
df.columns = ['file_paths', 'labels']

# Count occurrences of each label
label_counts = df['labels'].value_counts()
print(label_counts)

Healthy         3814
Sick            3809
Tuberculosis     785
Name: labels, dtype: int64


In [6]:
file_count = 1500
samples = []

for category in df['labels'].unique():
    category_slice = df.query("labels == @category")

    if len(category_slice) < file_count:
        # If the number of files in the category is less than file_count,
        # sample with replacement to fill up the required number of samples
        samples.append(category_slice.sample(file_count, replace=True, random_state=1))
    else:
        samples.append(category_slice.sample(file_count, replace=False, random_state=1))

df = pd.concat(samples, axis=0).sample(frac=1.0, random_state=1).reset_index(drop=True)
print(df['labels'].value_counts())
print(len(df))

Sick            1500
Healthy         1500
Tuberculosis    1500
Name: labels, dtype: int64
4500


In [7]:
def split_data(df, train_size=0.7, valid_size=0.15, test_size=0.15, random_state=None):
    """
    Split the data into training, validation, and test sets.

    Parameters:
    - df: DataFrame containing the data to be split.
    - train_size: The proportion of data to include in the training set (default: 0.7).
    - valid_size: The proportion of data to include in the validation set (default: 0.15).
    - test_size: The proportion of data to include in the test set (default: 0.15).
    - random_state: Seed for random number generation (optional).

    Returns:
    - train_df: DataFrame for training.
    - valid_df: DataFrame for validation.
    - test_df: DataFrame for testing.
    """
    if train_size + valid_size + test_size != 1.0:
        raise ValueError("The sum of train_size, valid_size, and test_size should be 1.0")

    # Split the data into training and test sets
    train_and_valid_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)

    # Further split the training and validation data
    train_df, valid_df = train_test_split(train_and_valid_df,
                                          train_size=train_size / (train_size + valid_size),
                                          random_state=random_state)

    return train_df, valid_df, test_df

def print_label_counts(df, set_name):
    """
    Print label counts for a given DataFrame.

    Parameters:
    - df: DataFrame for which label counts should be printed.
    - set_name: Name of the data set (e.g., "Training", "Validation", "Test").
    """
    print(f"{set_name} Set Label Counts:")
    label_counts = df['labels'].value_counts()
    print(label_counts)

# Split the data into train, validation, and test sets
train_df, valid_df, test_df = split_data(df, train_size=0.7, valid_size=0.15, test_size=0.15, random_state=0)

# Print label counts for each set
print_label_counts(train_df, "Training")
print_label_counts(valid_df, "Validation")
print_label_counts(test_df, "Test")

Training Set Label Counts:
Sick            1066
Healthy         1044
Tuberculosis    1040
Name: labels, dtype: int64
Validation Set Label Counts:
Healthy         233
Tuberculosis    229
Sick            213
Name: labels, dtype: int64
Test Set Label Counts:
Tuberculosis    231
Healthy         223
Sick            221
Name: labels, dtype: int64


In [13]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Define the input shape
input_shape = (224, 224, 3)

# Create the ResNet50 base model
resnet50 = ResNet50(input_shape=input_shape, include_top=False, weights='imagenet')

# Allow some layers of ResNet50 to be trainable
for layer in resnet50.layers[:-10]:
    layer.trainable = False

# Add SSD layers on top of the base model
ssd_model = models.Sequential()
ssd_model.add(resnet50)
ssd_model.add(layers.Conv2D(512, (3, 3), padding='same'))
ssd_model.add(layers.BatchNormalization())
ssd_model.add(layers.Activation('relu'))
ssd_model.add(layers.MaxPooling2D((2, 2), strides=2))
ssd_model.add(layers.Conv2D(1024, (3, 3), padding='same'))
ssd_model.add(layers.Conv2D(2048, (3, 3), padding='same'))
ssd_model.add(layers.Flatten())
ssd_model.add(layers.Dense(512, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)))
ssd_model.add(layers.BatchNormalization())
ssd_model.add(layers.Dropout(0.5))
ssd_model.add(layers.Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)))
ssd_model.add(layers.BatchNormalization())
ssd_model.add(layers.Dropout(0.5))
ssd_model.add(layers.Dense(3, activation='softmax'))

# Compile the SSD model with a learning rate scheduler
initial_learning_rate = 0.0001
lr_scheduler = ReduceLROnPlateau(factor=0.1, patience=3, min_lr=1e-7)

ssd_model.compile(optimizer=Adam(learning_rate=initial_learning_rate), loss='categorical_crossentropy', metrics=['accuracy'])

# Define the data generators for training, validation, and test with increased data augmentation
target_size = (224, 224)
batch_size = 4

train_datagen = ImageDataGenerator(preprocessing_function=tf.keras.applications.efficientnet.preprocess_input, horizontal_flip=True)
test_datagen = ImageDataGenerator(preprocessing_function=tf.keras.applications.efficientnet.preprocess_input)
train_gen = train_datagen.flow_from_dataframe(train_df, x_col='file_paths', y_col='labels', target_size=target_size, batch_size=batch_size, color_mode='rgb', class_mode='categorical')
valid_gen = test_datagen.flow_from_dataframe(valid_df, x_col='file_paths', y_col='labels', target_size=target_size, batch_size=batch_size, color_mode='rgb', class_mode='categorical')
test_gen = test_datagen.flow_from_dataframe(test_df, x_col='file_paths', y_col='labels', target_size=target_size, batch_size=batch_size, color_mode='rgb', class_mode='categorical')


# Train the SSD model with learning rate scheduler
ssd_model.fit(train_generator, epochs=50, validation_data=valid_generator, callbacks=[lr_scheduler])

# Evaluate the SSD model on the test set
test_loss, test_accuracy = ssd_model.evaluate(test_generator)
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

# Save the SSD model
ssd_model.save('ssd_resnet50.h5')


Found 3150 validated image filenames belonging to 3 classes.
Found 675 validated image filenames belonging to 3 classes.
Found 675 validated image filenames belonging to 3 classes.
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Accuracy: 90.37%


  saving_api.save_model(


In [26]:
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score, recall_score


y_pred_prob = ssd_model.predict(test_generator)

# Calculate accuracy
y_pred = np.argmax(y_pred_prob, axis=1)

# Calculate AUC
auc = roc_auc_score(y_true, y_pred_prob, multi_class='ovr', average='weighted')
print("AUC:", auc)

# Calculate average precision
from sklearn.preprocessing import label_binarize

# Binarize the true labels
y_true_bin = label_binarize(y_true, classes=['Tuberculosis'])  # Replace [...] with your class labels

# Calculate average precision for each class
average_precision = []
for i in range(y_true_bin.shape[1]):
    class_average_precision = average_precision_score(y_true_bin[:, i], y_pred_prob[:, i])
    average_precision.append(class_average_precision)

# Calculate the weighted average of class average precisions
weighted_average_precision = np.average(average_precision)

print("Weighted Average Precision:", weighted_average_precision)

# Calculate average recall
average_recall = recall_score(y_true, y_pred, average='weighted')
print("Average Recall:", average_recall)


AUC: 0.5060577596426101
Weighted Average Precision: 0.0
Average Recall: 0.3422222222222222


  mask |= (ar1 == a)
