<a href="https://colab.research.google.com/github/tylaar1/PICAR-autopilot/blob/main/BA_cleaned_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [14]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# 1) DATA PRE-PROCESSING

a) Load in labels + image file paths

b) combine them into one dataframe

c) EDA - spotted and removed erroneous label (speed = 1.42...)

## `cleaned_df` is the final df with all of this completed

### 1a) load in labels + image file paths

In [40]:
# Location of the training labels CSV on Google Drive.
# NOTE(review): assumes Google Drive is already mounted at /content/drive - no mount call is visible in this notebook.
labels_file_path = '/content/drive/MyDrive/0. MSc MLiS/google SPRING SEMESTER/1. PHYS4036 MLiS2/MLiS2 Project/KAGGLEDATAmachine-learning-in-science-ii-2025/training_norm.csv'

# Read the whole table, then promote `image_id` to the index so rows can be matched to image files by id.
labels_df = pd.read_csv(labels_file_path).set_index('image_id')

In [57]:
image_folder_path = '/content/drive/MyDrive/0. MSc MLiS/google SPRING SEMESTER/1. PHYS4036 MLiS2/MLiS2 Project/KAGGLEDATAmachine-learning-in-science-ii-2025/training_data/training_data'


def _image_id(path):
    """Return the integer file stem of `path` (e.g. '.../12.png' -> 12)."""
    stem, _extension = os.path.splitext(os.path.basename(path))
    return int(stem)


image_extensions = ('.png', '.jpg', '.jpeg')

# Full paths of every image file in the folder, ordered numerically by file stem
# (1.png, 2.png, 3.png, ...) rather than in the arbitrary order os.listdir returns.
image_file_paths = sorted(
    (
        os.path.join(image_folder_path, file_name)
        for file_name in os.listdir(image_folder_path)
        if file_name.lower().endswith(image_extensions)
    ),
    key=_image_id,
)

# One row per image, indexed by the numeric id parsed from the file name so the
# table can be aligned with the labels on `image_id`.
imagefilepaths_df = pd.DataFrame(
    {'image_file_paths': image_file_paths},
    index=pd.Index([_image_id(p) for p in image_file_paths], name='image_id'),
)

Checking labels dataframe

In [58]:
# Preview the first rows of the labels table (indexed by image_id).
labels_df.head()

Unnamed: 0_level_0,angle,speed
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.4375,0.0
2,0.8125,1.0
3,0.4375,1.0
4,0.625,1.0
5,0.5,0.0


Checking image file paths dataframe - as you can see the file paths are ordered correctly (1.png, 2.png, 3.png, ...)

In [59]:
# Preview the first rows of the image-path table to confirm the numeric ordering by image_id.
imagefilepaths_df.head()

Unnamed: 0_level_0,image_file_paths
image_id,Unnamed: 1_level_1
1,/content/drive/MyDrive/0. MSc MLiS/google SPRING SEMESTER/1. PHYS4036 MLiS2/MLiS2 Project/KAGGLEDATAmachine-learning-in-science-ii-2025/training_data/training_data/1.png
2,/content/drive/MyDrive/0. MSc MLiS/google SPRING SEMESTER/1. PHYS4036 MLiS2/MLiS2 Project/KAGGLEDATAmachine-learning-in-science-ii-2025/training_data/training_data/2.png
3,/content/drive/MyDrive/0. MSc MLiS/google SPRING SEMESTER/1. PHYS4036 MLiS2/MLiS2 Project/KAGGLEDATAmachine-learning-in-science-ii-2025/training_data/training_data/3.png
4,/content/drive/MyDrive/0. MSc MLiS/google SPRING SEMESTER/1. PHYS4036 MLiS2/MLiS2 Project/KAGGLEDATAmachine-learning-in-science-ii-2025/training_data/training_data/4.png
5,/content/drive/MyDrive/0. MSc MLiS/google SPRING SEMESTER/1. PHYS4036 MLiS2/MLiS2 Project/KAGGLEDATAmachine-learning-in-science-ii-2025/training_data/training_data/5.png


### 1b) Combine labels and image file paths into one dataframe

In [78]:
# Inner-join labels and image paths on `image_id`, so only ids present in both tables are kept.
# `speed` is then rounded to 6 decimal places to get rid of floating point errors.
merged_df = (
    pd.merge(labels_df, imagefilepaths_df, on='image_id', how='inner')
    .assign(speed=lambda frame: frame['speed'].round(6))
)

In [79]:
# Preview the merged table: angle, speed and image path for each image_id.
merged_df.head()

Unnamed: 0_level_0,angle,speed,image_file_paths
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.4375,0.0,/content/drive/MyDrive/0. MSc MLiS/google SPRING SEMESTER/1. PHYS4036 MLiS2/MLiS2 Project/KAGGLEDATAmachine-learning-in-science-ii-2025/training_data/training_data/1.png
2,0.8125,1.0,/content/drive/MyDrive/0. MSc MLiS/google SPRING SEMESTER/1. PHYS4036 MLiS2/MLiS2 Project/KAGGLEDATAmachine-learning-in-science-ii-2025/training_data/training_data/2.png
3,0.4375,1.0,/content/drive/MyDrive/0. MSc MLiS/google SPRING SEMESTER/1. PHYS4036 MLiS2/MLiS2 Project/KAGGLEDATAmachine-learning-in-science-ii-2025/training_data/training_data/3.png
4,0.625,1.0,/content/drive/MyDrive/0. MSc MLiS/google SPRING SEMESTER/1. PHYS4036 MLiS2/MLiS2 Project/KAGGLEDATAmachine-learning-in-science-ii-2025/training_data/training_data/4.png
5,0.5,0.0,/content/drive/MyDrive/0. MSc MLiS/google SPRING SEMESTER/1. PHYS4036 MLiS2/MLiS2 Project/KAGGLEDATAmachine-learning-in-science-ii-2025/training_data/training_data/5.png


In [80]:
# Label-based slice around image_id 3141; `.loc` slices include both endpoints, so an id
# that is absent from the merged table shows up as a gap in the displayed rows.
merged_df.loc[3139:3143]

Unnamed: 0_level_0,angle,speed,image_file_paths
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3139,0.75,1.0,/content/drive/MyDrive/0. MSc MLiS/google SPRING SEMESTER/1. PHYS4036 MLiS2/MLiS2 Project/KAGGLEDATAmachine-learning-in-science-ii-2025/training_data/training_data/3139.png
3140,0.875,1.0,/content/drive/MyDrive/0. MSc MLiS/google SPRING SEMESTER/1. PHYS4036 MLiS2/MLiS2 Project/KAGGLEDATAmachine-learning-in-science-ii-2025/training_data/training_data/3140.png
3142,0.625,0.0,/content/drive/MyDrive/0. MSc MLiS/google SPRING SEMESTER/1. PHYS4036 MLiS2/MLiS2 Project/KAGGLEDATAmachine-learning-in-science-ii-2025/training_data/training_data/3142.png
3143,0.625,1.0,/content/drive/MyDrive/0. MSc MLiS/google SPRING SEMESTER/1. PHYS4036 MLiS2/MLiS2 Project/KAGGLEDATAmachine-learning-in-science-ii-2025/training_data/training_data/3143.png


The above cell shows that:

 1) the image files and labels match (see image_id and the number at the end of the file path)

 2) the rows that are missing from labels_df (image_id: 3141, 3999, 4895, 8285, 10171) have been taken care of: the inner merge drops any image_id that does not appear in both dataframes

### 1c) EDA

In [81]:
# Count how many rows carry each distinct speed value, to check class balance and spot invalid labels.
merged_df.value_counts('speed')

Unnamed: 0_level_0,count
speed,Unnamed: 1_level_1
1.0,10402
0.0,3390
1.428571,1


Note: the dataset is imbalanced - there are roughly three times as many speed = 1 samples as speed = 0 samples.

Identifying the row with the erroneous speed value:

In [82]:
# Select the row(s) whose speed equals the outlier value seen in the value counts.
# The exact float comparison relies on `speed` having been rounded to 6 decimals when merged_df was built.
merged_df[merged_df['speed'] == 1.428571]

Unnamed: 0_level_0,angle,speed,image_file_paths
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3884,0.4375,1.428571,/content/drive/MyDrive/0. MSc MLiS/google SPRING SEMESTER/1. PHYS4036 MLiS2/MLiS2 Project/KAGGLEDATAmachine-learning-in-science-ii-2025/training_data/training_data/3884.png


We want to remove this row, since every other speed label in the dataset is either 0 or 1.

In [87]:
# The only legitimate speed labels are 0 and 1 (see the value counts above), so keep just those rows.
# This drops the erroneous speed = 1.428571 row (image_id 3884) without depending on an exact float
# comparison against a rounded magic number. Note that rows with a missing speed would be dropped too.
# `.copy()` makes cleaned_df an independent frame rather than a slice of merged_df, so later column
# assignments on it do not trigger SettingWithCopyWarning.
VALID_SPEEDS = [0.0, 1.0]
cleaned_df = merged_df[merged_df['speed'].isin(VALID_SPEEDS)].copy()

# Show the neighbourhood of the removed id to confirm that 3884 is gone.
cleaned_df.loc[3882:3886]

Unnamed: 0_level_0,angle,speed,image_file_paths
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3882,0.5625,1.0,/content/drive/MyDrive/0. MSc MLiS/google SPRING SEMESTER/1. PHYS4036 MLiS2/MLiS2 Project/KAGGLEDATAmachine-learning-in-science-ii-2025/training_data/training_data/3882.png
3883,0.375,0.0,/content/drive/MyDrive/0. MSc MLiS/google SPRING SEMESTER/1. PHYS4036 MLiS2/MLiS2 Project/KAGGLEDATAmachine-learning-in-science-ii-2025/training_data/training_data/3883.png
3885,0.0,1.0,/content/drive/MyDrive/0. MSc MLiS/google SPRING SEMESTER/1. PHYS4036 MLiS2/MLiS2 Project/KAGGLEDATAmachine-learning-in-science-ii-2025/training_data/training_data/3885.png
3886,0.75,1.0,/content/drive/MyDrive/0. MSc MLiS/google SPRING SEMESTER/1. PHYS4036 MLiS2/MLiS2 Project/KAGGLEDATAmachine-learning-in-science-ii-2025/training_data/training_data/3886.png


# WORK IN PROGRESS - the cells below are left over from when I was editing this notebook last week, so feel free to ignore them

# resizing images

In [None]:
# Use a df with just the first 100 rows (instead of ~13.8k) so computation is quicker while
# figuring out what works.
# `concat_df` is never defined in this notebook, so this cell raised NameError on a fresh kernel;
# the subset is now built from `cleaned_df`. reset_index() turns `image_id` back into an ordinary
# column with a 0-based integer index, which is the layout this cell's saved output shows.
trial_df = cleaned_df.reset_index().iloc[:100]
trial_df.head(5)

Unnamed: 0,image_id,angle,speed,image_file_paths
0,1,0.4375,0.0,/content/drive/MyDrive/0. MSc MLiS/google SPRI...
1,2,0.8125,1.0,/content/drive/MyDrive/0. MSc MLiS/google SPRI...
2,3,0.4375,1.0,/content/drive/MyDrive/0. MSc MLiS/google SPRI...
3,4,0.625,1.0,/content/drive/MyDrive/0. MSc MLiS/google SPRI...
4,5,0.5,0.0,/content/drive/MyDrive/0. MSc MLiS/google SPRI...


In [None]:
import cv2

im_size = 220  # target height and width, in pixels, of every resized image

images = []

for path in trial_df['image_file_paths']:
  img = cv2.imread(path)
  if img is None:
    # cv2.imread returns None (it does not raise) when a file is missing or unreadable;
    # fail here with the offending path instead of with an opaque error inside cv2.resize.
    raise FileNotFoundError(f"Could not read image: {path}")
  # OpenCV decodes colour images in BGR channel order; convert to RGB, the order
  # ImageNet-pretrained Keras models are trained on.
  img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
  img = cv2.resize(img, (im_size, im_size)) / 255.0 # div by 255 so numbers are all [0,1]
  images.append(img)

In [None]:
# Sanity check: shape of the first resized image.
images[0].shape

(220, 220, 3)

# train, test split

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Inputs are the image file paths and targets are the (angle, speed) pairs.
# NOTE: this rebinds `images` from the list of resized arrays to the array of path strings.
images = trial_df['image_file_paths'].to_numpy()
y = trial_df[['angle', 'speed']].to_numpy()

# Apply one shared permutation to paths and labels so each path stays paired with its label
images, y = shuffle(images, y, random_state=1)

# Hold out 20% of the rows as the test set
train_x, test_x, train_y, test_y = train_test_split(
    images,
    y,
    test_size=0.2,
    random_state=1,
)

# Report the size of each split
print(f"Training data (X) shape: {train_x.shape}")
print(f"Testing data (X) shape: {test_x.shape}")
print(f"Training labels (y) shape: {train_y.shape}")
print(f"Testing labels (y) shape: {test_y.shape}")

Training data (X) shape: (80,)
Testing data (X) shape: (20,)
Training labels (y) shape: (80, 2)
Testing labels (y) shape: (20, 2)


In [None]:
# Inspect the first training label: a two-element array in the column order [angle, speed].
train_y[0]

array([0.875, 0.   ])

# 2) THE MODEL

In [None]:
import tensorflow as tf

In [None]:
# Dimensions of one example
img_size = 220
num_channels = 3 # RGB
num_outputs = 2 # angle and speed

# Symbolic Keras placeholders: `x` for an image batch, `y` for the matching label batch.
# NOTE: `y` rebinds the label array created in the train/test split cell.
image_shape = (img_size, img_size, num_channels)
x = tf.keras.Input(shape=image_shape, dtype=tf.float32)
y = tf.keras.Input(shape=(num_outputs,), dtype=tf.float32)

In [None]:
dropoutrate = 0.2
num_outputs = 2 # angle and speed
input_shape = [224,224,3]
# NOTE(review): the resize cell uses im_size = 220 while this model expects 224x224 inputs -
# reconcile the two sizes before training.

# Backbone: MobileNetV2 without its classification head, loading ImageNet weights instead of
# starting from randomized weights.
mbnet = tf.keras.applications.MobileNetV2(input_shape=input_shape, include_top=False, weights='imagenet')
mbnet.trainable = False # freeze the backbone at the imagenet weights; only the new head will train

model = tf.keras.Sequential([
  mbnet,
  tf.keras.layers.GlobalAveragePooling2D(),
  tf.keras.layers.Dropout(dropoutrate),
  # 'mse' is a loss function, not an activation, and is rejected by Dense. A regression head
  # uses a linear output; pass the MSE loss to model.compile(loss='mse') instead.
  tf.keras.layers.Dense(num_outputs, activation='linear')
])
# Give build() an explicit batch shape so the model is fully built (and summary() can report
# output shapes and parameter counts) regardless of Keras version.
model.build(input_shape=(None, *input_shape))

model.summary() # print the model