<a href="https://colab.research.google.com/github/zacharyzimm/mids-281-final-project/blob/main/compiled_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Required Packages

In [2]:
import numpy as np
import cv2 as cv

from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.manifold import TSNE
from sklearn.svm import SVC

from utils import *

ModuleNotFoundError: ignored

In [None]:
# Mount Google Drive at /content/drive/ so the "/content/drive/MyDrive/..." paths
# used by later cells resolve. google.colab is only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive/')

Initial EDA

In [None]:
# One hand-picked training image per class: normal (N), adenocarcinoma (A),
# large cell carcinoma (L) and squamous cell carcinoma (S).
classN_img_path = "Data/train/normal/n8 - Copy.jpg"
classA_img_path = "Data/train/adenocarcinoma_left.lower.lobe_T2_N0_M0_Ib/000000 (6).png"
classL_img_path = "Data/train/large.cell.carcinoma_left.hilum_T2_N2_M0_IIIa/000002.png"
classS_img_path = "Data/train/squamous.cell.carcinoma_left.hilum_T1_N2_M0_IIIa/000002 (4).png"

# Read the four samples in one pass; the order matches the unpacking targets.
im_N, im_A, im_L, im_S = [
    plt.imread(sample_path)
    for sample_path in (classN_img_path, classA_img_path, classL_img_path, classS_img_path)
]

In [None]:

def display_img_colorbar(img):
    """Show one image in grayscale with a colorbar attached to its right edge.

    Args:
        img: Image data accepted by ``Axes.imshow``.
    """
    # A single Axes is required here: make_axes_locatable() takes one Axes,
    # not the 2x2 array that plt.subplots(nrows=2, ncols=2) would return.
    fig, ax = plt.subplots(figsize=(15, 10))
    # Draw on this Axes explicitly instead of relying on pyplot's "current axes".
    im_ax = ax.imshow(img, cmap='gray')
    # Create an axes on the right side of ax. The width of cax will be 5%
    # of ax and the padding between cax and ax will be fixed at 0.05 inch.
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    fig.colorbar(im_ax, cax=cax)
    plt.show()

def plot_imgs(imN, imA, imL, imS):
    """Draw the four class images on a 2x2 grayscale grid and show the figure.

    Panels are laid out row by row: Normal, Adenocarcinoma,
    Large cell carcinoma, Squamous cell carcinoma.
    """
    panels = (
        (imN, "Normal (N)"),
        (imA, "Adenocarcinoma (A)"),
        (imL, "Large cell carcinoma (L)"),
        (imS, "Squamous cell carcinoma (S)"),
    )
    fig, ax = plt.subplots(figsize=(15, 10), nrows=2, ncols=2)
    # ax.flat walks the grid in row-major order, matching the panel order above.
    for axis, (image, title) in zip(ax.flat, panels):
        axis.imshow(image, cmap='gray')
        axis.set_title(title)
    plt.show()

In [None]:
def generate_edges(img, ksize=21, canny_low=0, canny_high=100):
    """Compute a normalized Sobel gradient magnitude and a Canny edge map.

    Args:
        img: Image array, either 2-D (already grayscale) or 3-D with the
            channels on the last axis. If any value exceeds 1 the image is
            treated as 0-255 data and divided by 255.
        ksize: Kernel size passed to ``cv.Sobel`` for both gradients.
        canny_low: ``threshold1`` passed to ``cv.Canny``.
        canny_high: ``threshold2`` passed to ``cv.Canny``.

    Returns:
        Tuple ``(magnitude, im_threshold)``: the gradient magnitude divided by
        its maximum, and the Canny output divided by its maximum. Either one
        is left as all zeros when its maximum is zero (flat image / no edges)
        instead of being turned into NaNs by a division by zero.

    Raises:
        ValueError: If ``img`` is empty or is not 2-D or 3-D.
    """
    # Always work in float32 so cv.Sobel receives the same depth it is asked
    # to produce, whatever dtype the loader returned.
    img = np.asarray(img, dtype=np.float32)
    if img.size == 0 or img.ndim not in (2, 3):
        raise ValueError(f"expected a non-empty 2-D or 3-D image, got shape {img.shape}")

    # convert to grayscale
    if np.max(img) > 1:
        img = img / 255.0
    # A 2-D input is already single-channel; only 3-D input has a channel axis to average.
    im_gray = img if img.ndim == 2 else np.mean(img, axis=2)

    # compute edges of the image
    sobelx = cv.Sobel(im_gray, cv.CV_32F, 1, 0, ksize=ksize)  # Find x and y gradients
    sobely = cv.Sobel(im_gray, cv.CV_32F, 0, 1, ksize=ksize)
    magnitude = np.sqrt(sobelx**2.0 + sobely**2.0)
    peak = np.max(magnitude)
    if peak > 0:
        magnitude = magnitude / peak  # normalize

    # threshold the image and get the interesting points
    im_threshold = cv.Canny(image=(magnitude * 255).astype(np.uint8),
                            threshold1=canny_low, threshold2=canny_high)  # Canny Edge
    # Divide by the maximum, or by 1 when no edge pixel was produced.
    im_threshold = im_threshold / max(np.max(im_threshold), 1)  # normalize

    return magnitude, im_threshold

In [None]:
# Run the edge extractor on each class sample; every call yields a
# (gradient magnitude, Canny edge map) pair.
(edgesN, canny_edgesN), (edgesA, canny_edgesA), (edgesL, canny_edgesL), (edgesS, canny_edgesS) = [
    generate_edges(sample) for sample in (im_N, im_A, im_L, im_S)
]

In [None]:
# Original sample images, one per class.
plot_imgs(imN=im_N, imA=im_A, imL=im_L, imS=im_S)

In [None]:
# Normalized gradient magnitudes of the same four samples.
plot_imgs(imN=edgesN, imA=edgesA, imL=edgesL, imS=edgesS)

In [None]:
# Canny edge maps of the same four samples.
plot_imgs(imN=canny_edgesN, imA=canny_edgesA, imL=canny_edgesL, imS=canny_edgesS)


Feature Extraction

In [None]:
# Integer class id -> class name, numbered in the order listed.
class_mappings = dict(enumerate((
    "normal",
    "adenocarcinoma",
    "large.cell.carcinoma",
    "squamous.cell.carcinoma",
)))

In [None]:

# Training split of the cropped/resized data set on the mounted Drive.
# NOTE(review): train_path is reassigned to the raw "Data/train" folder by a later
# cell, so re-running this cell out of order would not use this value by accident
# only because it is set right here — keep the assignment and the calls together.
train_path = "/content/drive/MyDrive/W281/Final Project/Data_Cropped_and_Resized/train"

# extract_features, detect_edges_sobel and apply_hounsfield_units are not defined in
# this notebook (presumably provided by `from utils import *` — verify).
# The first call keeps all three returned values; the second keeps only the middle one.
train_imgs, train_sobel_edges, train_labels = extract_features(train_path, detect_edges_sobel, class_mappings)
_, train_hounsfield_edges, _ = extract_features(train_path, apply_hounsfield_units, class_mappings)

In [None]:
# Validation split of the cropped/resized data set on the mounted Drive.
valid_path = "/content/drive/MyDrive/W281/Final Project/Data_Cropped_and_Resized/valid"

# Same two extraction passes as for the training split: the first call keeps all
# three returned values, the second keeps only the middle one.
valid_imgs, valid_sobel_edges, valid_labels = extract_features(valid_path, detect_edges_sobel, class_mappings)
_, valid_hounsfield_edges, _ = extract_features(valid_path, apply_hounsfield_units, class_mappings)

In [None]:
# Visualize the Sobel features next to their source images.
# NOTE(review): the literal 0 is presumably an image index or class id — confirm
# against the plot_features helper, which is not defined in this notebook.
plot_features(train_imgs, train_sobel_edges, train_labels, 0, 'Sobel Edge', class_mappings)


In [None]:
# Visualize the Hounsfield-unit features next to their source images.
# NOTE(review): the literal 0 is presumably an image index or class id — confirm
# against the plot_features helper, which is not defined in this notebook.
plot_features(train_imgs, train_hounsfield_edges, train_labels, 0, "Hounsfield Unit", class_mappings)


In [None]:
# Root folders of the raw train / validation images on the mounted Drive.
train_path = "/content/drive/MyDrive/W281/Final Project/Data/train"
val_path = "/content/drive/MyDrive/W281/Final Project/Data/valid"

# Each class lives in a sub-folder with the same name under both roots.
_dir_N = "/normal/"
_dir_A = "/adenocarcinoma_left.lower.lobe_T2_N0_M0_Ib/"
_dir_L = "/large.cell.carcinoma_left.hilum_T2_N2_M0_IIIa/"
_dir_S = "/squamous.cell.carcinoma_left.hilum_T1_N2_M0_IIIa/"

classN_train_path, classN_valid_path = train_path + _dir_N, val_path + _dir_N
classA_train_path, classA_valid_path = train_path + _dir_A, val_path + _dir_A
classL_train_path, classL_valid_path = train_path + _dir_L, val_path + _dir_L
classS_train_path, classS_valid_path = train_path + _dir_S, val_path + _dir_S

In [None]:
# Collect the first element returned by get_average_image_size for every
# training class folder, printing a heading before each folder is processed.
mean_sizes = []

for heading, class_dir in (
    ("CLASS: NORMAL", classN_train_path),
    ("\nCLASS: A", classA_train_path),
    ("\nCLASS: L", classL_train_path),
    ("\nCLASS: S", classS_train_path),
):
    print(heading)
    mean_sizes.append(get_average_image_size(class_dir)[0])

In [None]:

# Same survey for the validation class folders. Results are appended to the
# existing mean_sizes list, so running this cell twice adds the entries twice.
for heading, class_dir in (
    ("CLASS: NORMAL", classN_valid_path),
    ("\nCLASS: A", classA_valid_path),
    ("\nCLASS: L", classL_valid_path),
    ("\nCLASS: S", classS_valid_path),
):
    print(heading)
    mean_sizes.append(get_average_image_size(class_dir)[0])

In [None]:
# Average the per-folder mean sizes column by column and round to whole numbers.
mean_sizes = np.array(mean_sizes)
first_dim = int(np.round(mean_sizes[:, 0].mean()))
second_dim = int(np.round(mean_sizes[:, 1].mean()))
out_img_size = (first_dim, second_dim)

print(f"Mean of all images: {out_img_size}")

In [None]:
# Destination folder for the processed images and the fixed target size.
# NOTE(review): the target size is hard-coded to (256, 256); the out_img_size value
# computed from the mean image sizes is not used here — confirm that is intended.
out_img_dir = "/content/drive/MyDrive/W281/Final Project/Data_Cropped_and_Resized"
output_img_size = (256, 256)

# crop_and_resize_images is not defined in this notebook (presumably from utils);
# it is called with the raw training folder, the target size and the output folder.
crop_and_resize_images(train_path, output_img_size, out_img_dir)

In [None]:
# Same processing for the raw validation folder, with the same size and output folder.
crop_and_resize_images(val_path, output_img_size, out_img_dir)


In [None]:
# Preview one raw "normal" training image (no colormap specified).
img_name = "/content/drive/MyDrive/W281/Final Project/Data/train/normal/n8.jpg"
img = plt.imread(img_name)
plt.imshow(img)

In [None]:
# Preview the same file name from the resized data set, drawn in grayscale.
# NOTE(review): this reads from "Data_Resized", whereas the crop/resize cells write
# to "Data_Cropped_and_Resized" — confirm this folder exists and is the intended one.
img_name = "/content/drive/MyDrive/W281/Final Project/Data_Resized/train/normal/n8.jpg"
img = plt.imread(img_name)
plt.imshow(img, cmap="gray")

In [None]:
def show_slice_window(slice, level, window):
    """
    Clip an image slice to an intensity window and show it beside a binary threshold.

    Input is a numpy 2D array. Values are clipped to the range
    ``level - window/2`` .. ``level + window/2``. The clipped slice is drawn in
    the left panel; the right panel shows the output of ``cv.threshold`` with
    ``thresh=50``, ``maxval=255`` and ``cv.THRESH_BINARY`` applied to it.

    Returns the thresholded array.
    """
    half_width = window/2
    upper = level + half_width
    lower = level - half_width
    windowed = slice.clip(lower, upper)

    _, (ax_windowed, ax_binary) = plt.subplots(nrows=1, ncols=2, figsize=(20,10))
    ax_windowed.imshow(windowed, cmap="gray")

    # cv.threshold returns (retval, image); only the image is needed.
    thresh_gray = cv.threshold(windowed, thresh=50, maxval=255, type=cv.THRESH_BINARY)[1]
    ax_binary.imshow(thresh_gray, cmap='gray', vmin=0, vmax=255)

    return thresh_gray

In [None]:
# Load one resized "normal" image as grayscale and equalize its histogram.
# The folder is "Final Project" (with a space), as in every other Drive path in
# this notebook; the previous "FinalProject" spelling did not match them.
img_N_path = "/content/drive/MyDrive/W281/Final Project/Data_Resized/train/normal/n9.jpg"
img_N = cv.imread(img_N_path, cv.IMREAD_GRAYSCALE)
# cv.imread reports a missing or unreadable file by returning None rather than
# raising, so fail here with a clear message instead of inside equalizeHist.
if img_N is None:
    raise FileNotFoundError(f"Could not read image: {img_N_path}")
img_N = cv.equalizeHist(img_N)

# Calculate the histogram: 256 bins over the range [0, 256)
hist, bins = np.histogram(img_N.ravel(), bins=256, range=(0, 256))

# Plot the histogram
plt.figure(figsize=(8, 6))
plt.bar(bins[:-1], hist, width=1, color='gray')
plt.title("Grayscale Histogram")
plt.xlabel("Pixel Value")
plt.ylabel("Frequency")
plt.show()

In [None]:
  def get_PCA(X_list, n_components=2):
  pca_list = []
  xpca_list = []
  for X in X_list:
    pca = PCA(n_components=n_components, svd_solver="randomized", whiten=True).fit(X)
    X_pca = pca.transform(X)
    pca_list.append(pca)
    xpca_list.append(X_pca)
  return pca_list, xpca_list

def plot_PCA(X_list, labels, n_components=2):
  """Plot cumulative explained variance against the number of PCA components.

  One curve is drawn per feature matrix in ``X_list``.

  Args:
    X_list: Feature matrices, passed on to ``get_PCA``.
    labels: Legend label for each entry of ``X_list``, in the same order.
    n_components: Number of principal components to fit and plot.
  """
  pca_list, _ = get_PCA(X_list, n_components=n_components)

  plt.figure(figsize=(15,5))
  colors = ['b-', 'm-']
  for i, pca in enumerate(pca_list):
    cumulative = np.cumsum(pca.explained_variance_ratio_)
    # Plot against 1..k so the point at x=k is the variance explained by the
    # first k components and lines up with the tick labels set below.
    component_counts = np.arange(1, len(cumulative) + 1)
    # Cycle through the line styles so more than two feature sets do not fail.
    plt.plot(component_counts, cumulative, colors[i % len(colors)], label=labels[i])
  plt.xticks(np.arange(n_components)+1)
  plt.yticks(np.linspace(0, 1, 8))
  plt.grid(True)
  plt.xlabel('Number of components')
  plt.ylabel('Explained Variances')
  plt.legend()
  plt.show()

def get_tsne(X_list, n_components=2):
  """Embed every feature matrix in ``X_list`` with its own t-SNE (random_state=0).

  Returns the list of embeddings, in the same order as ``X_list``.
  """
  return [
      TSNE(n_components=n_components, random_state=0).fit_transform(features)
      for features in X_list
  ]

In [None]:
# Legend labels, in the same order as the feature sets below
# ("Hounsfield" was previously misspelled in the legend text).
labels = ['sobel edges', 'hounsfield edges']

# Each feature set becomes a list of flattened (1-D) arrays, one per entry.
training_features = [[img.flatten() for img in train_sobel_edges],
            [img.flatten() for img in train_hounsfield_edges]]

# Cumulative explained variance for up to 50 principal components per feature set.
plot_PCA(training_features, labels, n_components=50)

LDA with Sobel Edges

In [None]:
# get_PCA returns (fitted PCA objects, projected data); only the projections
# are needed here, one per feature set.
_, (X_sobel_pca, X_hounsfield_pca) = get_PCA(training_features, n_components=48)

In [None]:
# Fit LDA on the PCA projection of the Sobel features, then project the same
# data onto the discriminant axes.
lda = LinearDiscriminantAnalysis()
lda.fit(X_sobel_pca, train_labels)

X_lda = lda.transform(X_sobel_pca)

# First row of the fitted coefficients and the first intercept.
# NOTE(review): per the scikit-learn docs, coef_ is expressed in the *input* feature
# space (here the PCA components), not in the LD1/LD2 space plotted below. Confirm
# that a line built from coef_lda[0] and coef_lda[1] is meaningful on these axes.
coef_lda = lda.coef_[0]
intercept_lda = lda.intercept_[0]

plt.figure(figsize=(8, 6))

# One scatter series per class, using the first two columns of the projection.
# assumes train_labels is a NumPy array so the comparison yields a boolean mask — TODO confirm
for label in np.unique(train_labels):
    plt.scatter(X_lda[train_labels == label, 0], X_lda[train_labels == label, 1], label=label)

# Line endpoints: one unit beyond the LD1 range on each side, with y solved from
# coef_lda[0] * x + coef_lda[1] * y + intercept_lda = 0.
line_x = np.array([X_lda[:, 0].min() - 1, X_lda[:, 0].max() + 1])
line_y = -(line_x * coef_lda[0] + intercept_lda) / coef_lda[1]

plt.plot(line_x, line_y, c='black', linewidth=2, label='Fitted Line')

plt.xlabel('LD1')
plt.ylabel('LD2')
plt.title('Fitted Line from Linear Discriminant Analysis')
plt.legend()
plt.grid(True)
plt.show()

LDA with Hounsfield Units

In [None]:
# Refit the existing `lda` object (created in the Sobel LDA cell, which must have
# been run first) on the PCA projection of the Hounsfield features, then project
# the same data onto the discriminant axes.
lda.fit(X_hounsfield_pca, train_labels)

X_lda = lda.transform(X_hounsfield_pca)

# First row of the fitted coefficients and the first intercept.
# NOTE(review): per the scikit-learn docs, coef_ is expressed in the *input* feature
# space (here the PCA components), not in the LD1/LD2 space plotted below. Confirm
# that a line built from coef_lda[0] and coef_lda[1] is meaningful on these axes.
coef_lda = lda.coef_[0]
intercept_lda = lda.intercept_[0]

plt.figure(figsize=(8, 6))

# One scatter series per class, using the first two columns of the projection.
# assumes train_labels is a NumPy array so the comparison yields a boolean mask — TODO confirm
for label in np.unique(train_labels):
    plt.scatter(X_lda[train_labels == label, 0], X_lda[train_labels == label, 1], label=label)

# Line endpoints: one unit beyond the LD1 range on each side, with y solved from
# coef_lda[0] * x + coef_lda[1] * y + intercept_lda = 0.
line_x = np.array([X_lda[:, 0].min() - 1, X_lda[:, 0].max() + 1])
line_y = -(line_x * coef_lda[0] + intercept_lda) / coef_lda[1]

plt.plot(line_x, line_y, c='black', linewidth=2, label='Fitted Line')

plt.xlabel('LD1')
plt.ylabel('LD2')
plt.title('Fitted Line from Linear Discriminant Analysis')
plt.legend()
plt.grid(True)
plt.show()