In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf
from tensorflow.keras import layers, models
import os

In [2]:
# Install the required library
!pip install gdown

# Provide the file ID
file_id = '1r-oNYTPiPCOUzSjChjCIYTdkjBTugqxR'

# Create a download link
download_link = f'https://drive.google.com/uc?id={file_id}'

# Download the file
!gdown $download_link


Downloading...
From (original): https://drive.google.com/uc?id=1r-oNYTPiPCOUzSjChjCIYTdkjBTugqxR
From (redirected): https://drive.google.com/uc?id=1r-oNYTPiPCOUzSjChjCIYTdkjBTugqxR&confirm=t&uuid=dbc10697-eff6-4fd0-b717-db2eea68e853
To: /content/TBX11K.zip
100% 3.31G/3.31G [00:49<00:00, 67.5MB/s]


In [3]:
"""
# Install the required library
!pip install zipfile

# Specify the name of the downloaded ZIP file
zip_file_path = '/content/TBX11K.zip'

# Specify the extraction directory
extracted_path = '/folders/1LqXQFENogsBlh9N-z9jFaid0qlgNzEVE'

# Extract the contents of the ZIP file
import zipfile
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_path)
"""
# Specify the name of the downloaded ZIP file
zip_file_path = '/content/TBX11K.zip'

# Specify the extraction directory
extracted_path = '/content/Dataset'

# Extract the contents of the ZIP file
import zipfile
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_path)


In [4]:
dirlist = ['/content/Dataset/TBX11K/imgs/health', '/content/Dataset/TBX11K/imgs/sick', '/content/Dataset/TBX11K/imgs/tb']
classes = ['Healthy', 'Sick', 'Tuberculosis']
filepaths = []
labels = []
for d, c in zip(dirlist, classes):
    flist = os.listdir(d)
    for f in flist:
        fpath = os.path.join(d, f)
        filepaths.append(fpath)
        labels.append(c)
print ('filepaths: ', len(filepaths), '   labels: ', len(labels))

filepaths:  8400    labels:  8400


In [5]:
Fseries = pd.Series(filepaths, name='file_paths')
Lseries = pd.Series(labels, name='labels')

# Ensure lengths match
min_length = min(len(Fseries), len(Lseries))
Fseries = Fseries[:min_length]
Lseries = Lseries[:min_length]

# Create the DataFrame with named columns
df = pd.concat([Fseries, Lseries], axis=1)
df.columns = ['file_paths', 'labels']

# Count occurrences of each label
label_counts = df['labels'].value_counts()
print(label_counts)

Healthy         3800
Sick            3800
Tuberculosis     800
Name: labels, dtype: int64


In [6]:
file_count = 1500
samples = []

for category in df['labels'].unique():
    category_slice = df.query("labels == @category")

    if len(category_slice) < file_count:
        # If the number of files in the category is less than file_count,
        # sample with replacement to fill up the required number of samples
        samples.append(category_slice.sample(file_count, replace=True, random_state=1))
    else:
        samples.append(category_slice.sample(file_count, replace=False, random_state=1))

df = pd.concat(samples, axis=0).sample(frac=1.0, random_state=1).reset_index(drop=True)
print(df['labels'].value_counts())
print(len(df))

Sick            1500
Healthy         1500
Tuberculosis    1500
Name: labels, dtype: int64
4500


In [7]:
def split_data(df, train_size=0.7, valid_size=0.15, test_size=0.15, random_state=None):
    """
    Split the data into training, validation, and test sets.

    Parameters:
    - df: DataFrame containing the data to be split.
    - train_size: The proportion of data to include in the training set (default: 0.7).
    - valid_size: The proportion of data to include in the validation set (default: 0.15).
    - test_size: The proportion of data to include in the test set (default: 0.15).
    - random_state: Seed for random number generation (optional).

    Returns:
    - train_df: DataFrame for training.
    - valid_df: DataFrame for validation.
    - test_df: DataFrame for testing.
    """
    if train_size + valid_size + test_size != 1.0:
        raise ValueError("The sum of train_size, valid_size, and test_size should be 1.0")

    # Split the data into training and test sets
    train_and_valid_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)

    # Further split the training and validation data
    train_df, valid_df = train_test_split(train_and_valid_df,
                                          train_size=train_size / (train_size + valid_size),
                                          random_state=random_state)

    return train_df, valid_df, test_df

def print_label_counts(df, set_name):
    """
    Print label counts for a given DataFrame.

    Parameters:
    - df: DataFrame for which label counts should be printed.
    - set_name: Name of the data set (e.g., "Training", "Validation", "Test").
    """
    print(f"{set_name} Set Label Counts:")
    label_counts = df['labels'].value_counts()
    print(label_counts)

# Split the data into train, validation, and test sets
train_df, valid_df, test_df = split_data(df, train_size=0.7, valid_size=0.15, test_size=0.15, random_state=0)

# Print label counts for each set
print_label_counts(train_df, "Training")
print_label_counts(valid_df, "Validation")
print_label_counts(test_df, "Test")

Training Set Label Counts:
Sick            1066
Healthy         1044
Tuberculosis    1040
Name: labels, dtype: int64
Validation Set Label Counts:
Healthy         233
Tuberculosis    229
Sick            213
Name: labels, dtype: int64
Test Set Label Counts:
Tuberculosis    231
Healthy         223
Sick            221
Name: labels, dtype: int64


In [8]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.applications import VGG16  # Import VGG16

# Define the backbone network (VGG16)
base_model = VGG16(
    include_top=False,
    input_shape=(224, 224, 3),
    weights='imagenet'
)

# Define the region proposal network (RPN)
# This is a simplified version for demonstration purposes
# In practice, you would use a pre-trained RPN or custom implementation
rpn_input = base_model.output
rpn_output = Conv2D(256, (3, 3), activation='relu')(rpn_input)
rpn_output = MaxPooling2D(pool_size=(2, 2))(rpn_output)
rpn_output = Flatten()(rpn_output)
rpn_output = Dense(256, activation='relu')(rpn_output)

# Create the Fast R-CNN model
fast_rcnn_input = Input(shape=(224, 224, 3))  # Specify the input shape
roi_pooling_layer = Model(inputs=base_model.input, outputs=base_model.get_layer('block5_pool').output)  # Use the correct layer name for VGG16
roi_pooled_features = roi_pooling_layer(fast_rcnn_input)

fast_rcnn_output = GlobalAveragePooling2D()(roi_pooled_features)
fast_rcnn_output = Dense(128, activation='relu')(fast_rcnn_output)
fast_rcnn_output = tf.keras.layers.BatchNormalization()(fast_rcnn_output)
fast_rcnn_output = tf.keras.layers.Dropout(0.2)(fast_rcnn_output)
fast_rcnn_output = Dense(3, activation='softmax')(fast_rcnn_output)

fast_rcnn_model = Model(inputs=fast_rcnn_input, outputs=fast_rcnn_output)

# Optional: You may want to freeze the weights of the backbone network
for layer in base_model.layers:
    layer.trainable = False

# Compile the Fast R-CNN model with an appropriate loss and optimizer
fast_rcnn_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Print a summary of the Fast R-CNN model
fast_rcnn_model.summary()


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 model (Functional)          (None, 7, 7, 512)         14714688  
                                                                 
 global_average_pooling2d (  (None, 512)               0         
 GlobalAveragePooling2D)                                         
                                                                 
 dense_1 (Dense)             (None, 128)               65664     
                                                                 
 batch_normalization (Batch  (None, 128)               512       
 Normalization)                                     

In [9]:
target_size = (224, 224)
batch_size = 4

train_datagen = ImageDataGenerator(preprocessing_function=tf.keras.applications.efficientnet.preprocess_input, horizontal_flip=True)
test_datagen = ImageDataGenerator(preprocessing_function=tf.keras.applications.efficientnet.preprocess_input)
train_gen = train_datagen.flow_from_dataframe(train_df, x_col='file_paths', y_col='labels', target_size=target_size, batch_size=batch_size, color_mode='rgb', class_mode='categorical')
valid_gen = test_datagen.flow_from_dataframe(valid_df, x_col='file_paths', y_col='labels', target_size=target_size, batch_size=batch_size, color_mode='rgb', class_mode='categorical')
test_gen = test_datagen.flow_from_dataframe(test_df, x_col='file_paths', y_col='labels', target_size=target_size, batch_size=batch_size, color_mode='rgb', class_mode='categorical')


Found 3150 validated image filenames belonging to 3 classes.
Found 675 validated image filenames belonging to 3 classes.
Found 675 validated image filenames belonging to 3 classes.


In [10]:
train_steps_per_epoch = len(train_gen)
valid_steps_per_epoch = len(valid_gen)

# Define the number of epochs
num_epochs = 20  # You can adjust this as needed

# Fit the Fast R-CNN model
history = fast_rcnn_model.fit_generator(
    train_gen,
    steps_per_epoch=train_steps_per_epoch,
    epochs=num_epochs,
    validation_data=valid_gen,
    validation_steps=valid_steps_per_epoch
)

# Evaluate the model on the test set
test_results = fast_rcnn_model.evaluate(test_gen)

# Print test results (e.g., test loss and test accuracy)
print("Test Loss:", test_results[0])
print("Test Accuracy:", test_results[1])

Epoch 1/20


  history = fast_rcnn_model.fit_generator(


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test Loss: 0.18719060719013214
Test Accuracy: 0.936296284198761


In [17]:
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score, recall_score
from keras.utils import to_categorical

# Assuming fast_rcnn_model is already defined and compiled

# Generate predictions for the test set
y_pred_prob = fast_rcnn_model.predict(test_gen)
y_true = to_categorical(test_gen.classes)  # Convert to one-hot encoding

# Calculate accuracy
y_pred = np.argmax(y_pred_prob, axis=1)
accuracy = accuracy_score(np.argmax(y_true, axis=1), y_pred)
print("Accuracy:", accuracy)

# Calculate AUC
auc = roc_auc_score(y_true, y_pred_prob, multi_class='ovr', average='weighted')
print("AUC:", auc)

# Calculate average precision
average_precision = average_precision_score(y_true, y_pred_prob, average='weighted')
print("Average Precision:", average_precision)

# Calculate average recall
average_recall = recall_score(np.argmax(y_true, axis=1), y_pred, average='weighted')
print("Average Recall:", average_recall)

# Evaluate the model on the test set using the Keras evaluate method
test_results = fast_rcnn_model.evaluate(test_gen)

# Print test results (e.g., test loss and test accuracy)
print("Test Loss:", test_results[0])
print("Test Accuracy:", test_results[1])


Accuracy: 0.3362962962962963
AUC: 0.508706382236329
Average Precision: 0.3471450962031932
Average Recall: 0.3362962962962963
Test Loss: 0.18719060719013214
Test Accuracy: 0.936296284198761


In [18]:
print("Test Loss:", test_results[0])
print("Test Accuracy:", test_results[1])

# Generate predictions for the test set
y_pred_prob = fast_rcnn_model.predict(test_gen)
y_true = to_categorical(test_gen.classes)  # Convert to one-hot encoding

# Calculate AUC
auc = roc_auc_score(y_true, y_pred_prob, multi_class='ovr', average='weighted')
print("AUC:", auc)

# Calculate average precision
average_precision = average_precision_score(y_true, y_pred_prob, average='weighted')
print("Average Precision:", average_precision)

# Calculate average recall
y_pred = np.argmax(y_pred_prob, axis=1)
average_recall = recall_score(np.argmax(y_true, axis=1), y_pred, average='weighted')

Test Loss: 0.18719060719013214
Test Accuracy: 0.936296284198761
AUC: 0.47768736131814893
Average Precision: 0.3259831710336418


In [19]:
fast_rcnn_model.save("fast_rcnn_model.h5")

  saving_api.save_model(
