In [None]:
from tensorflow.keras.applications import EfficientNetB3
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import BatchNormalization, Dense, Dropout, GlobalAveragePooling2D
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adamax
import tensorflow as tf
import numpy as np
from PIL import Image
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Import and label images
# Primary dataset location; a second machine-specific location is used below
# when this one cannot be listed.
data_dir = 'L:/!school/!uni/!classes/sem2-2023/software technology/assignments/assignment 2/lung_colon_image_set'
try:
    os.listdir(data_dir)
except OSError:
    # Only filesystem errors (missing drive/folder, no permission) should
    # trigger the fallback; any other exception is a real bug and must surface.
    data_dir = 'C:/Users/dylan/OneDrive/school/sem2-2023/software technology/assignments/assignment 2/lung_colon_image_set'

# Maps each class folder name to the human-readable label stored in `df`.
labels_mapping = {
    'colon_aca': 'Colon Adenocarcinoma',
    'colon_n': 'Colon Benign Tissue',
    'lung_aca': 'Lung Adenocarcinoma',
    'lung_n': 'Lung Benign Tissue',
    'lung_scc': 'Lung Squamous Cell Carcinoma'
}

filepaths = []
labels = []

# Walks data_dir/<image set>/<class folder>/<image file>.
# Listings are sorted so the row order of `df` (and therefore any seeded
# split of it) is identical on every run and platform.
for fold in sorted(os.listdir(data_dir)):
    foldpath = os.path.join(data_dir, fold)
    if not os.path.isdir(foldpath):
        # Stray files next to the image sets would make os.listdir raise.
        continue

    for f in sorted(os.listdir(foldpath)):
        f_path = os.path.join(foldpath, f)
        if not os.path.isdir(f_path):
            continue

        for file in sorted(os.listdir(f_path)):
            fpath = os.path.join(f_path, file)
            filepaths.append(fpath)

            # Class folders missing from labels_mapping get an empty-string label.
            labels.append(labels_mapping.get(f, ''))

# One row per image: its full path and its human-readable class label.
df = pd.DataFrame({'filepaths': filepaths, 'labels': labels})

In [None]:
# Perform EDA on the dataset
# Column dtypes and non-null counts (printed by pandas).
df.info()

# Bar chart of how many images each class contributes.
df['labels'].value_counts().plot(kind='bar')
plt.title('Number of Images per Class')
plt.xlabel('Class')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()

# Kept as the LAST expression of the cell: a notebook only renders the value
# of the final expression, so a bare `df.head()` in the middle of the cell
# is evaluated and silently discarded.
df.head()

In [None]:
# Preprocess images for dimensionality reduction
# Every image becomes one flat vector of 128 * 128 grayscale pixel values.
images = []
for i in df['filepaths']:
    # The context manager closes the underlying file as soon as the pixels
    # have been read, instead of relying on Pillow's lazy loading to do so
    # for every file in the dataset.
    with Image.open(i) as source_image:
        img = source_image.convert('L').resize((128, 128), Image.LANCZOS)
    img = np.array(img).flatten()
    images.append(img)

# Standardise each pixel position (column) across all images.
scaler = StandardScaler()
scaled_images = scaler.fit_transform(images)

In [None]:
# Perform PCA
# Reduce the standardised pixel vectors to 50 components before t-SNE.
# Seeded so the projection (and every plot derived from it) is reproducible;
# 42 matches the seed used for the train/test split.
pca = PCA(n_components=50, random_state=42)
pca_result = pca.fit_transform(scaled_images)

# Perform t-SNE
# t-SNE is stochastic: without a fixed seed the 2-D layout changes on every run.
tsne = TSNE(n_components=2, random_state=42)
tsne_result = tsne.fit_transform(pca_result)

In [None]:
# Pair each 2-D t-SNE coordinate with its class label for plotting
df_tsne = pd.DataFrame(
    {
        'X': tsne_result[:, 0],
        'Y': tsne_result[:, 1],
        'labels': df['labels'],
    }
)

# Scatter plot of the embedding, one hue per class
class_palette = sns.color_palette("hsv", 5)
fig, ax = plt.subplots(figsize=(16, 10))
sns.scatterplot(
    data=df_tsne,
    x="X",
    y="Y",
    hue="labels",
    palette=class_palette,
    legend="full",
    alpha=0.8,
    ax=ax,
)
plt.show()

In [None]:
# Pairwise correlation between the PCA components, drawn as a heatmap
scaled_pca_df = pd.DataFrame(pca_result)
correlations = scaled_pca_df.corr()

# Hide the upper triangle (diagonal included): the matrix is symmetric,
# so the lower triangle already carries every pair.
mask = np.triu(np.ones(correlations.shape, dtype=bool))

cmap = sns.diverging_palette(230, 20, as_cmap=True)
f, ax = plt.subplots(figsize=(11, 9))
sns.heatmap(
    correlations,
    mask=mask,
    cmap=cmap,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    ax=ax,
)
plt.title('Heatmap of PCA Components')
plt.show()

In [None]:
# Distribution of the first two principal components per class
df_plot = pd.DataFrame(pca_result[:, :2], columns=['PC1', 'PC2'])
df_plot['Label'] = df['labels']

# One figure per component; each entry is (column to plot, figure title).
violin_specs = (
    ('PC1', 'Violin plot of the first principal component'),
    ('PC2', 'Violin plot of the second principal component'),
)
for component, title in violin_specs:
    plt.figure(figsize=(15, 10))
    sns.violinplot(x='Label', y=component, data=df_plot)
    plt.title(title)
    plt.show()

## Train

In [7]:
# Split the data into train and test sets
# Stratifying on the label keeps every class at the same proportion in both
# splits, so held-out metrics are not skewed by an uneven random draw.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['labels'])

# NOTE(review): Keras documents its EfficientNet models as containing their
# own input rescaling and expecting pixel values in [0, 255] - confirm that
# rescale=1./255 is intended. If it is changed, the training generator and
# the inference preprocessing must be changed together with it.
test_datagen = ImageDataGenerator(rescale=1./255)

batch_size = 16
img_size = (224, 224)

# Create the EfficientNetB3 model
# input_shape is derived from img_size so the backbone and the generators'
# target_size cannot drift apart. The ImageNet backbone is frozen; only
# layers stacked on top of it are trained.
base_model = EfficientNetB3(include_top=False, weights='imagenet', input_shape=img_size + (3,))
base_model.trainable = False

In [8]:
# Create data generators with data augmentation
# NOTE(review): Keras documents its EfficientNet models as containing their
# own input rescaling and expecting pixel values in [0, 255] - confirm that
# rescale=1./255 is intended, and keep train/test/inference preprocessing
# identical whichever way it is resolved.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    zoom_range=0.2,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True
)


train_gen = train_datagen.flow_from_dataframe(
    train_df,
    x_col='filepaths',
    y_col='labels',
    target_size=img_size,
    class_mode='categorical',
    color_mode='rgb',
    shuffle=True,
    batch_size=batch_size
)

# Reuse the training generator's class order so a label maps to the same
# one-hot index in both generators, even if a class were absent from test_df.
class_names = list(train_gen.class_indices)

test_gen = test_datagen.flow_from_dataframe(
    test_df,
    x_col='filepaths',
    y_col='labels',
    classes=class_names,
    target_size=img_size,
    class_mode='categorical',
    color_mode='rgb',
    shuffle=False,  # keep a stable order so predictions line up with test_df rows
    batch_size=batch_size
)

# Classification head on top of the frozen backbone. The output width follows
# the number of classes actually found in the data instead of a hard-coded 5.
num_classes = len(class_names)

model = Sequential([
    base_model,
    BatchNormalization(),
    GlobalAveragePooling2D(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Compile the model
model.compile(optimizer=Adamax(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

Found 20000 validated image filenames belonging to 5 classes.
Found 5000 validated image filenames belonging to 5 classes.


In [9]:
# Fit the classifier on the augmented training batches; test_gen is supplied
# as the validation data, and the returned History object is kept in `history`.
fit_options = {
    'epochs': 30,
    'validation_data': test_gen,
}
history = model.fit(train_gen, **fit_options)

Epoch 1/30

In [None]:
# Persist the trained model to a file in the current working directory.
# NOTE(review): the inference cell loads 'model.h5', not this file - confirm
# which filename is intended.
saved_model_path = 'testmodel.h5'
model.save(saved_model_path)

In [None]:
# Load the model
# NOTE(review): the save cell writes 'testmodel.h5' while 'model.h5' is loaded
# here - confirm this is the intended checkpoint.
loaded_model = tf.keras.models.load_model('model.h5', compile=False)
loaded_model.compile(Adamax(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Load and preprocess the image
# Built from data_dir so the sample is found on whichever dataset location was
# selected when the data was loaded, instead of a single hard-coded drive.
image_path = os.path.join(data_dir, 'lung_image_sets', 'lung_scc', 'lungscc4.jpeg')
with Image.open(image_path) as image:
    # Force 3 channels: a grayscale, palette or RGBA file would otherwise
    # produce an array that does not fit a (224, 224, 3) model input.
    img = image.convert('RGB').resize((224, 224))
img_array = tf.keras.preprocessing.image.img_to_array(img)
img_array = np.expand_dims(img_array, 0)  # add the batch dimension
# Same 1/255 scaling as the generators use.
# NOTE(review): must stay identical to the preprocessing the loaded model was
# trained with - verify for 'model.h5'.
img_array = img_array / 255.0

# Make a prediction
predictions = loaded_model.predict(img_array)
# Index -> label lookup, in alphabetical label order.
# NOTE(review): presumably matches the class index order used in training
# (train_gen.class_indices) - confirm for the loaded model.
class_labels = {0: 'Colon Adenocarcinoma',
                1: 'Colon Benign Tissue',
                2: 'Lung Adenocarcinoma',
                3: 'Lung Benign Tissue',
                4: 'Lung Squamous Cell Carcinoma'}
# Index of the highest softmax score for the single image in the batch.
predicted_class = int(np.argmax(predictions[0]))

print(class_labels[predicted_class])