### 1. Exploratory Data Analysis (EDA)

* Inspect class balance (number of "yes" vs. "no" images)
* Inspect image sizes and standardize them if necessary (e.g., 128x128 or 224x224)
* Check pixel intensity distribution (to check if normalization is necessary)
* Normalize pixel values to [0,1]


# Week 1: Setup + EDA

#### 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import os
import cv2
import matplotlib.pyplot as plt
import seaborn as sns

#### 2. Import Data

In [None]:
# Root folder of the dataset, given as a path relative to the current working
# directory. The cells below read its "yes" and "no" subfolders.
data_dir = "../../../../env/brain_tumor_dataset"

In [None]:
# Tally the entries in each class folder. Every directory entry is counted,
# whatever its type or extension.
class_counts = {
    label: len(os.listdir(os.path.join(data_dir, label)))
    for label in ("yes", "no")
}
number_files_yes = class_counts["yes"]
number_files_no = class_counts["no"]

# Report the per-class counts and the overall total.
print(f"Tumor images:    {number_files_yes}")
print(f"No tumor images:  {number_files_no}")
print(f"Total images:    {number_files_yes + number_files_no}")

In [None]:
# Ratio of "yes" entries to "no" entries; a value of 1.00 would mean the two
# classes are the same size.
Imbalance_Ratio = number_files_yes / number_files_no

ratio_summary = f"Class Imbalance Ratio (yes:no) = {Imbalance_Ratio:.2f}:1"
print(ratio_summary)

In [None]:
# Accumulator for per-image metadata: the loading loop below appends one dict
# (filename, width, height, aspect_ratio) per file.
images = []

In [None]:
# Collect the dimensions of every file in both class folders into `df`.
#
# The list is (re)initialised here so that re-running this cell starts from a
# clean slate instead of appending duplicate rows to a previous run's list.
images = []

# A tuple (not a set) keeps the class order fixed, and sorting the directory
# listing makes the row order reproducible: set iteration order and
# os.listdir order are both unspecified.
for label in ("yes", "no"):
    class_dir = os.path.join(data_dir, label)
    for filename in sorted(os.listdir(class_dir)):
        path = os.path.join(class_dir, filename)
        img = cv2.imread(path)

        if img is None:
            # cv2.imread signals a failed read by returning None; keep a row
            # with empty dimensions so the unreadable file stays visible.
            images.append({'filename': filename, 'width': None, 'height': None, 'aspect_ratio': None})
            continue

        height, width = img.shape[:2]
        images.append({
            'filename': filename,
            'width': width,
            'height': height,
            # Guard against a zero height to avoid a division error.
            'aspect_ratio': width / height if height > 0 else 0,
        })

# Convert to DataFrame: one row per file, columns as in the dicts above.
df = pd.DataFrame(images)

In [None]:
# Preview the first rows of the image-metadata table.
df.head()

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the numeric columns.
df.describe()

In [None]:
# Visualise the image dimensions collected in `df`: histograms of width,
# height and aspect ratio, followed by a width-vs-height scatter plot.

# Candidate resize target, drawn as a reference line on the size histograms.
# NOTE(review): the line was previously drawn at 244; 224 matches the 224x224
# target named in the EDA plan at the top of this notebook -- confirm.
TARGET_SIZE = 224

# Rows for unreadable images carry missing dimensions; drop them so they
# cannot affect the means or the plots.
dims = df[['width', 'height', 'aspect_ratio']].dropna()

plt.figure(figsize=(15, 5))

# (column, bar colour, title, x-axis label) for the two size histograms.
size_histograms = [
    ('width', 'skyblue', 'Distribution of Image Widths', 'Width (pixels)'),
    ('height', 'lightcoral', 'Distribution of Image Heights', 'Height (pixels)'),
]
for position, (column, color, title, xlabel) in enumerate(size_histograms, start=1):
    # Compute the mean from the data instead of hard-coding it, so each
    # panel shows its own, current value.
    mean_value = dims[column].mean()

    plt.subplot(1, 3, position)
    plt.hist(dims[column], bins=30, color=color, edgecolor='black')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Frequency')
    plt.axvline(TARGET_SIZE, color='red', linestyle='--', label=f'{TARGET_SIZE} Px')
    plt.axvline(mean_value, color='blue', linestyle='--', label=f'mean = {mean_value:.2f} Px')
    plt.legend()

# Histogram of Aspect Ratios, with a marker at 1.0 (square images)
plt.subplot(1, 3, 3)
plt.hist(dims['aspect_ratio'], bins=30, color='lightgreen', edgecolor='black')
plt.title('Distribution of Aspect Ratios')
plt.xlabel('Aspect Ratio (Width/Height)')
plt.ylabel('Frequency')
plt.axvline(1.0, color='red', linestyle='--', label='Aspect Ratio = 1.0')
plt.legend()

plt.tight_layout()
plt.show()

# Scatter plot of Width vs Height
plt.figure(figsize=(8, 6))
plt.scatter(dims['width'], dims['height'], alpha=0.5)
plt.title('Image Width vs. Height')
plt.xlabel('Width (pixels)')
plt.ylabel('Height (pixels)')
plt.grid(True)
plt.show()

#### Pixel Intensity

In [None]:
# Inspect the raw pixel values of one sample from the "yes" class.
ex_img = os.path.join(data_dir, "yes", "Y1.jpg")

# Load as a single-channel grayscale image (IMREAD_GRAYSCALE is the named
# form of the flag value 0).
img = cv2.imread(ex_img, cv2.IMREAD_GRAYSCALE)
if img is None:
    # cv2.imread returns None instead of raising when the file cannot be
    # read; fail here with a clear message rather than deep inside imshow.
    raise FileNotFoundError(f"Could not read image: {ex_img}")

plt.imshow(img, cmap='gray')
plt.show()
print(img)

# Cell output: the intensity range actually present in this image.
img.min(), img.max()

In [None]:
# Inspect the raw pixel values of one sample from the "no" class.
ex_img = os.path.join(data_dir, "no", "1 no.jpeg")

# Load as a single-channel grayscale image (IMREAD_GRAYSCALE is the named
# form of the flag value 0).
img = cv2.imread(ex_img, cv2.IMREAD_GRAYSCALE)
if img is None:
    # cv2.imread returns None instead of raising when the file cannot be
    # read; fail here with a clear message rather than deep inside imshow.
    raise FileNotFoundError(f"Could not read image: {ex_img}")

print(img)
plt.imshow(img, cmap='gray')
plt.show()

# Cell output: the intensity range actually present in this image.
img.min(), img.max()

### Resize Images