In [240]:
# Importing libraries
import os
import numpy as np
from google.colab import drive
import pandas as pd
import tensorflow as tf
from tensorflow.keras import utils
from tensorflow.keras.preprocessing import image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout ,BatchNormalization
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input

In [241]:
# retrieving dataset from Google Drive
drive.mount('/content/drive')
train_path = '/content/drive/My Drive/Datasets/Data/train'
test_path = '/content/drive/My Drive/Datasets/Data/test'
val_path = '/content/drive/My Drive/Datasets/Data/valid'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [242]:
#no. of classes
classes = os.listdir(train_path)
classes

['normal',
 'adenocarcinoma_left.lower.lobe_T2_N0_M0_Ib',
 'large.cell.carcinoma_left.hilum_T2_N2_M0_IIIa',
 'squamous.cell.carcinoma_left.hilum_T1_N2_M0_IIIa']

In [243]:
print("Train Data:")
for c in classes:
    print(c, len(os.listdir(train_path+'/'+c)))

Train Data:
normal 148
adenocarcinoma_left.lower.lobe_T2_N0_M0_Ib 195
large.cell.carcinoma_left.hilum_T2_N2_M0_IIIa 115
squamous.cell.carcinoma_left.hilum_T1_N2_M0_IIIa 155


In [244]:
print("Validation Data:")
for c in classes:
    print(c, len(os.listdir(train_path+'/'+c)))

Validation Data:
normal 148
adenocarcinoma_left.lower.lobe_T2_N0_M0_Ib 195
large.cell.carcinoma_left.hilum_T2_N2_M0_IIIa 115
squamous.cell.carcinoma_left.hilum_T1_N2_M0_IIIa 155


In [245]:
#checking image size -1
img = utils.load_img("/content/drive/My Drive/Datasets/Data/train/adenocarcinoma_left.lower.lobe_T2_N0_M0_Ib/ad4.png",target_size=(150,150))
img = np.array(img)
img.shape


(150, 150, 3)

In [246]:
#checking image size -2
img = utils.load_img("/content/drive/My Drive/Datasets/Data/train/adenocarcinoma_left.lower.lobe_T2_N0_M0_Ib/ad10.png")
img = np.array(img)
img.shape

(874, 1200, 3)

In [247]:
# Load and preprocess the data as images have different sizes
def load_and_preprocess_data(data_path):
    data = []
    labels = []
    for category in os.listdir(data_path):
        category_path = os.path.join(data_path, category)
        for img_file in os.listdir(category_path):
            img_path = os.path.join(category_path, img_file)
            img = image.load_img(img_path, target_size=(150, 150))
            img_array = image.img_to_array(img)
            img_array = preprocess_input(img_array)
            data.append(img_array)
            labels.append(category)
    return np.array(data), labels

In [248]:
# Load and preprocess training data
X_train, y_train = load_and_preprocess_data(train_path)

# Load and preprocess validation data
X_val, y_val = load_and_preprocess_data(val_path)

# Load and preprocess test data
X_test, y_test = load_and_preprocess_data(test_path)

In [249]:
X_train.shape

(613, 150, 150, 3)

In [250]:
#labels for train and val data
category2label = {'normal':0,'adenocarcinoma_left.lower.lobe_T2_N0_M0_Ib':1,
 'large.cell.carcinoma_left.hilum_T2_N2_M0_IIIa':2,
 'squamous.cell.carcinoma_left.hilum_T1_N2_M0_IIIa': 3}
label2category = {0:'normal',1:'adenocarcinoma_left.lower.lobe_T2_N0_M0_Ib',
 2:'large.cell.carcinoma_left.hilum_T2_N2_M0_IIIa',
 3:'squamous.cell.carcinoma_left.hilum_T1_N2_M0_IIIa'}

In [251]:
#label_encoding train data
y_train_encoded = np.array([category2label[label] for label in y_train])

In [252]:
#label_encoding val data
y_val_encoded = np.array([category2label[label] for label in y_val])

In [253]:
#labels for test data(as test data have different labels than train data)
category2label1 = {'normal':0,'adenocarcinoma':1,
 'large.cell.carcinoma':2,
 'squamous.cell.carcinoma': 3}
label2category1 = {0:'normal',1:'adenocarcinoma',
 2:'large.cell.carcinoma',
 3:'squamous.cell.carcinoma'}

In [254]:
#label_encoding test data
y_test_encoded = np.array([category2label1[label] for label in y_test])

In [255]:
#encoded test data
y_test_encoded

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [256]:
#using cnn to extract features
model =Sequential()
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(150, 150, 3)))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(256, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(512, (3, 3), activation='relu'))
model.add(Conv2D(1024, (3, 3), activation='relu'))
model.add(Dropout(0.2))
model.add(Flatten())


In [257]:
#cnn architecture
model.summary()

Model: "sequential_60"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_360 (Conv2D)         (None, 148, 148, 32)      896       
                                                                 
 max_pooling2d_145 (MaxPool  (None, 74, 74, 32)        0         
 ing2D)                                                          
                                                                 
 conv2d_361 (Conv2D)         (None, 72, 72, 64)        18496     
                                                                 
 max_pooling2d_146 (MaxPool  (None, 36, 36, 64)        0         
 ing2D)                                                          
                                                                 
 conv2d_362 (Conv2D)         (None, 34, 34, 128)       73856     
                                                                 
 max_pooling2d_147 (MaxPool  (None, 17, 17, 128)     

In [258]:
#storing these features in variables
X_train_features = model.predict(X_train)
X_val_features = model.predict(X_val)
X_test_features = model.predict(X_test)



In [259]:
# Flattening the features to be given to xgboost as input
X_train_flatten = X_train_features.reshape(X_train_features.shape[0], -1)
X_val_flatten = X_val_features.reshape(X_val_features.shape[0], -1)
X_test_flatten = X_test_features.reshape(X_test_features.shape[0], -1)

In [260]:
# Training XGBoost model
xgb_model = XGBClassifier()
xgb_model.fit(X_train_flatten, y_train_encoded)

In [261]:
#Validation dataset
val_predictions = xgb_model.predict(X_val_flatten)
val_accuracy = accuracy_score(y_val_encoded, val_predictions)
print(f"Validation Accuracy: {val_accuracy}")

Validation Accuracy: 0.6666666666666666


In [262]:
#Test dataset
test_predictions = xgb_model.predict(X_test_flatten)
test_accuracy = accuracy_score(y_test_encoded, test_predictions)
print(f"Test Accuracy: {test_accuracy}")

Test Accuracy: 0.6476190476190476
