<a href="https://colab.research.google.com/github/this-is-singh19/tbdetectx/blob/master/ssd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import EfficientNetB0

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
dataset_path = '/content/drive/My Drive/Dataset/imgs'
os.chdir(dataset_path)

In [4]:
dirlist = ['../imgs/health/', '../imgs/sick/', '../imgs/tb']
classes = ['Healthy', 'Sick', 'Tuberculosis']
filepaths = []
labels = []
for d, c in zip(dirlist, classes):
    flist = os.listdir(d)
    for f in flist:
        fpath = os.path.join(d, f)
        filepaths.append(fpath)
        labels.append(c)
print ('filepaths: ', len(filepaths), '   labels: ', len(labels))

filepaths:  8408    labels:  8408


In [5]:
Fseries = pd.Series(filepaths, name='file_paths')
Lseries = pd.Series(labels, name='labels')

# Ensure lengths match
min_length = min(len(Fseries), len(Lseries))
Fseries = Fseries[:min_length]
Lseries = Lseries[:min_length]

# Create the DataFrame with named columns
df = pd.concat([Fseries, Lseries], axis=1)
df.columns = ['file_paths', 'labels']

# Count occurrences of each label
label_counts = df['labels'].value_counts()
print(label_counts)

Healthy         3814
Sick            3809
Tuberculosis     785
Name: labels, dtype: int64


In [6]:
file_count = 1500
samples = []

for category in df['labels'].unique():
    category_slice = df.query("labels == @category")

    if len(category_slice) < file_count:
        # If the number of files in the category is less than file_count,
        # sample with replacement to fill up the required number of samples
        samples.append(category_slice.sample(file_count, replace=True, random_state=1))
    else:
        samples.append(category_slice.sample(file_count, replace=False, random_state=1))

df = pd.concat(samples, axis=0).sample(frac=1.0, random_state=1).reset_index(drop=True)
print(df['labels'].value_counts())
print(len(df))

Sick            1500
Healthy         1500
Tuberculosis    1500
Name: labels, dtype: int64
4500


In [7]:
def split_data(df, train_size=0.7, valid_size=0.15, test_size=0.15, random_state=None):
    """
    Split the data into training, validation, and test sets.

    Parameters:
    - df: DataFrame containing the data to be split.
    - train_size: The proportion of data to include in the training set (default: 0.7).
    - valid_size: The proportion of data to include in the validation set (default: 0.15).
    - test_size: The proportion of data to include in the test set (default: 0.15).
    - random_state: Seed for random number generation (optional).

    Returns:
    - train_df: DataFrame for training.
    - valid_df: DataFrame for validation.
    - test_df: DataFrame for testing.
    """
    if train_size + valid_size + test_size != 1.0:
        raise ValueError("The sum of train_size, valid_size, and test_size should be 1.0")

    # Split the data into training and test sets
    train_and_valid_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)

    # Further split the training and validation data
    train_df, valid_df = train_test_split(train_and_valid_df,
                                          train_size=train_size / (train_size + valid_size),
                                          random_state=random_state)

    return train_df, valid_df, test_df

def print_label_counts(df, set_name):
    """
    Print label counts for a given DataFrame.

    Parameters:
    - df: DataFrame for which label counts should be printed.
    - set_name: Name of the data set (e.g., "Training", "Validation", "Test").
    """
    print(f"{set_name} Set Label Counts:")
    label_counts = df['labels'].value_counts()
    print(label_counts)

# Split the data into train, validation, and test sets
train_df, valid_df, test_df = split_data(df, train_size=0.7, valid_size=0.15, test_size=0.15, random_state=0)

# Print label counts for each set
print_label_counts(train_df, "Training")
print_label_counts(valid_df, "Validation")
print_label_counts(test_df, "Test")

Training Set Label Counts:
Sick            1066
Healthy         1044
Tuberculosis    1040
Name: labels, dtype: int64
Validation Set Label Counts:
Healthy         233
Tuberculosis    229
Sick            213
Name: labels, dtype: int64
Test Set Label Counts:
Tuberculosis    231
Healthy         223
Sick            221
Name: labels, dtype: int64


In [10]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import VGG16
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam

# Define the input shape
input_shape = (224, 224, 3)

# Create the VGG16 base model
vgg16 = VGG16(input_shape=input_shape, include_top=False, weights='imagenet')

# Freeze the weights of the base model
for layer in vgg16.layers:
    layer.trainable = False

# Add SSD layers on top of the base model
ssd_model = models.Sequential()
ssd_model.add(vgg16)
ssd_model.add(layers.Conv2D(512, (3, 3), activation='relu', padding='same'))
ssd_model.add(layers.MaxPooling2D((2, 2), strides=2))
ssd_model.add(layers.Conv2D(1024, (3, 3), activation='relu', padding='same'))
ssd_model.add(layers.Conv2D(2048, (3, 3), activation='relu', padding='same'))
ssd_model.add(layers.Flatten())
ssd_model.add(layers.Dense(512, activation='relu'))
ssd_model.add(layers.Dropout(0.5))
ssd_model.add(layers.Dense(128, activation='relu'))
ssd_model.add(layers.Dropout(0.5))
ssd_model.add(layers.Dense(3, activation='softmax'))

# Compile the SSD model
ssd_model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Define the data generators for training, validation, and test
train_datagen = ImageDataGenerator(rescale=1./255, rotation_range=10, width_shift_range=0.1, height_shift_range=0.1,
                                   horizontal_flip=True)
valid_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_dataframe(train_df, x_col='file_paths', y_col='labels', target_size=(224, 224),
                                                  batch_size=32)
valid_generator = valid_datagen.flow_from_dataframe(valid_df, x_col='file_paths', y_col='labels', target_size=(224, 224),
                                                  batch_size=32)
test_generator = test_datagen.flow_from_dataframe(test_df, x_col='file_paths', y_col='labels', target_size=(224, 224),
                                                 batch_size=32)

# Train the SSD model
ssd_model.fit_generator(train_generator, epochs=10, validation_data=valid_generator)

# Evaluate the SSD model on the test set
ssd_model.evaluate_generator(test_generator)

# Save the SSD model
ssd_model.save('ssd_vgg16.h5')


Found 3150 validated image filenames belonging to 3 classes.
Found 675 validated image filenames belonging to 3 classes.
Found 675 validated image filenames belonging to 3 classes.


  ssd_model.fit_generator(train_generator, epochs=10, validation_data=valid_generator)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  ssd_model.evaluate_generator(test_generator)


In [11]:
ssd_model.save('ssd_model_final.h5')

In [12]:
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score, recall_score

# Assuming you have test data and labels
test_data, test_labels = next(valid_generator)

# Predictions from the model
predictions = ssd_model.predict(test_data)

# Convert one-hot encoded labels to single labels
true_labels = np.argmax(test_labels, axis=1)
predicted_labels = np.argmax(predictions, axis=1)

# Accuracy
accuracy = accuracy_score(true_labels, predicted_labels)
print(f'Accuracy: {accuracy}')

# AUC
# AUC is meaningful for binary classification problems, so you might need to adjust this part based on your specific use case.
# For simplicity, let's assume you are evaluating a binary classification task.
# You can use roc_auc_score for multi-class problems with one-hot encoded labels as well.
auc = roc_auc_score(test_labels[:, 1], predictions[:, 1])
print(f'AUC: {auc}')

# Average Precision
# You need to calculate average precision separately for each class and then take the average
average_precision = average_precision_score(test_labels, predictions, average='micro')
print(f'Ave. Prec.: {average_precision}')

# Average Recall
# You need to calculate recall separately for each class and then take the average
average_recall = recall_score(true_labels, predicted_labels, average='macro')
print(f'Ave. Rec.: {average_recall}')


Accuracy: 0.90625
AUC: 0.9708333333333333
Ave. Prec.: 0.9706805506414882
Ave. Rec.: 0.9048821548821548
