#Can ignore this, this is to check how many features we are learning: should be two, Hit/Miss for this one

import os

train_dir = r"C:\Users\MHuang\OneDrive - Everstream\Desktop\MikeDHuang_2024\Machine Learning\basedata\T108\Train"
validation_dir = r"C:\Users\MHuang\OneDrive - Everstream\Desktop\MikeDHuang_2024\Machine Learning\basedata\T108\Valid"

#List the subdirectories in the training and validation directories
print("Training subdirectories:", os.listdir(train_dir))
print("Validation subdirectories:", os.listdir(validation_dir))


#You will need this to install all the package(s)

!pip install tensorflow

In [4]:
#Actual learning start from here

from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.preprocessing import image
from tensorflow.keras.optimizers import RMSprop
from datetime import datetime
import tensorflow as tf

In [9]:
# Initialize ImageDataGenerator
train = ImageDataGenerator(rescale=1/255)          #This turn pixels into number, think of it as a big excel sheet, don't touch
validation = ImageDataGenerator(rescale=1/255)     #Computer see everything as number (0,1)

# We teach the model what is hit and miss, feedback after each iterator
train_iterator = train.flow_from_directory(
    r"C:\Users\MHuang\OneDrive - Everstream\Desktop\MikeDHuang_2024\Machine Learning\basedata\T108\Train",#TODO: change the path
    target_size=(720, 720),    #This just mean we are training with 720x720p pictures, model only identify 720x720 aferward
    batch_size=16,             #This is the number of samples we feed into the model at each iteration of the training process.
    class_mode='binary'        #Binary: tw classes: Yes/No - 1/0. For this process 0 is Hit(No Miss), 1 is Miss (Yes Miss)
)

# We give them model a mock test to see how it did, it does learn from each iterator
validation_iterator = validation.flow_from_directory(
    r"C:\Users\MHuang\OneDrive - Everstream\Desktop\MikeDHuang_2024\Machine Learning\basedata\T108\Valid",#TODO: change the path
    target_size=(720, 720),    #Resize all images to 720x720 pixels to match the training data
    batch_size=16,             #Number of samples to process in each batch during validation
    class_mode='binary'
)

Found 15149 images belonging to 2 classes.
Found 5100 images belonging to 2 classes.


#Skip this as well, this is to check if 0 is hit or 1 is hit

#Print class indices for training data
print("Class indices for training data:", train_iterator.class_indices)

#Print class indices for validation data
print("Class indices for validation data:", validation_iterator.class_indices)


In [6]:
# Convert the DirectoryIterator to a tf.data.Dataset for training, because previously it keep running out of data???
train_dataset = tf.data.Dataset.from_generator(
    lambda: train_iterator,                                           # Using a lambda function to call the train_iterator
    output_signature=(
        tf.TensorSpec(shape=(None, 720, 720, 3), dtype=tf.float32),   # Defines the shape and type of the image tensors
                                                                      # ex. float number: 3.14, 0.001, and -2.5
                                                                      # 3 is the color channel (RGB)
        tf.TensorSpec(shape=(None,), dtype=tf.float32)                # Defines the shape and type of the labels (binary)
    )
).repeat()                                                            # Repeats the dataset indefinitely, fixed the out of range

# Convert the DirectoryIterator to a tf.data.Dataset for validation
validation_dataset = tf.data.Dataset.from_generator(
    lambda: validation_iterator,  # Using a lambda function to call the validation_iterator
    output_signature=(
        tf.TensorSpec(shape=(None, 720, 720, 3), dtype=tf.float32),  # Defines the shape and type of the image tensors 
        tf.TensorSpec(shape=(None,), dtype=tf.float32)               # Defines the shape and type of the labels (binary)
    )
).repeat()                                                           # Repeats the dataset indefinitely, fixed the out of range

# Calculate the number of steps per epoch (the process of going through the entire dataset) for training
steps_per_epoch = train_iterator.samples // train_iterator.batch_size
# The total number of training samples divided by the batch size is the number of steps per epoch. ex. 1000 // 100 = 10

# Calculate the number of steps per epoch for validation
validation_steps = validation_iterator.samples // validation_iterator.batch_size
# The total number of validation samples divided by the batch size is the number of validation steps


In [8]:
# Define the model architecture

#Filters extract features from the input image.
#Kernel Size determines the size of the region in the input image that the filter looks at to compute value in the feature map.
#Pool Size reduces the spatial dimensions of the feature maps, retaining important information to improve model efficiency

model = tf.keras.models.Sequential([  # Sequential model, where layers are added sequentially
    tf.keras.layers.Conv2D(16, (3, 3), activation='relu', input_shape=(720, 720, 3)),
    # First convolutional layer with 16 filters, 3x3 kernel size, and ReLU activation function.
    # input_shape is defined as 720x720 with 3 color channels (RGB).
    
    tf.keras.layers.MaxPooling2D(2, 2),
    # Max pooling layer with a 2x2 pool size to downsample the feature maps.

    tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
    # Second convolutional layer with 32 filters and 3x3 kernel size.
    
    tf.keras.layers.MaxPooling2D(2, 2),
    # Another max pooling layer to further reduce the spatial dimensions.
    
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    # Third convolutional layer with 64 filters and 3x3 kernel size.
    
    tf.keras.layers.MaxPooling2D(2, 2),
    # Max pooling layer to downsample the feature maps again.
    
    tf.keras.layers.Flatten(),
    # Flatten the 3D output from the convolutional layers into a 1D vector for the dense layers.

    tf.keras.layers.Dense(512, activation='relu'),
    # Fully connected layer with 512 neurons and ReLU activation.

    tf.keras.layers.Dense(1, activation='sigmoid')
    # Output layer with 1 neuron for binary classification (Hit/Miss), using sigmoid activation.
])

# Compile the model
model.compile(optimizer=RMSprop(learning_rate=0.001),
              # RMSprop optimizer with a learning rate of 0.001, think of it as one step at a time, we could up it to 0.002,
              # but we will increase the risk of the model became unstable, two step or more at a time vs one step at a time
              loss='binary_crossentropy',
              # Binary cross-entropy loss function, appropriate for binary classification.
              metrics=['accuracy'])
              # Tracking accuracy as a metric during training and validation.

# Define callbacks to improve training process
early_stopping = EarlyStopping(monitor='val_loss', patience=6, restore_best_weights=True)
# EarlyStopping callback to stop training if the validation loss does not improve for 6 epochs (change this if needed).
# restore_best_weights=True ensures that the model reverts to the best weights when stopping early.

checkpoint = ModelCheckpoint('best_T108_v1_window.keras', monitor='val_loss', save_best_only=True)
# ModelCheckpoint to save the best model (based on validation loss) during the training process

# Fit the model to the training data
model_fit = model.fit(
    train_dataset,
    steps_per_epoch=steps_per_epoch,     # Number of steps (batches) to run in each training epoch.
    epochs=40,                           # Maximum number of epochs to train the model. (bigger doesn't mean better, as I have
                                         # set up early stopping (see above) to prevent overfitting; study the answer instead
                                         # of the knowledge/process)
    validation_data=validation_dataset,  # Validation data to evaluate the model at the end of each epoch.
    validation_steps=validation_steps,   # Number of validation steps (batches) to run after each epoch.
    callbacks=[early_stopping, checkpoint]
    # Callbacks to control early stopping and save the best model during training.
)


Epoch 1/40
[1m946/946[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2047s[0m 2s/step - accuracy: 0.7937 - loss: 2.0837 - val_accuracy: 0.9951 - val_loss: 0.0213
Epoch 2/40
[1m946/946[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2028s[0m 2s/step - accuracy: 0.9968 - loss: 0.0160 - val_accuracy: 0.9969 - val_loss: 0.0166
Epoch 3/40
[1m946/946[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2029s[0m 2s/step - accuracy: 0.9965 - loss: 0.0211 - val_accuracy: 0.7288 - val_loss: 3.1581
Epoch 4/40
[1m946/946[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2034s[0m 2s/step - accuracy: 0.9945 - loss: 0.0476 - val_accuracy: 0.9851 - val_loss: 0.1547
Epoch 5/40
[1m946/946[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2040s[0m 2s/step - accuracy: 0.9982 - loss: 0.0097 - val_accuracy: 0.9943 - val_loss: 0.0734
Epoch 6/40
[1m946/946[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2053s[0m 2s/step - accuracy: 0.9975 - loss: 0.0117 - val_accuracy: 0.9854 - val_loss: 0.4924
Epoch 7/40
[1m9