In [1]:
import pandas as pd
import os
from PIL import Image
import numpy as np


### Mappings

In [2]:
file_path = os.path.abspath('data_processing.ipynb')

# Extract the directory part from the absolute path
directory = os.path.dirname(file_path)


In [3]:
data_dir = "../data/Released_Data/"

sub_classes_mapping_df = pd.read_csv(data_dir+'sub_classes_mapping.csv')
sub_classes_mapping_df = sub_classes_mapping_df.rename(columns={"index": "subclass_index", "class": "sub_class"})

super_classes_mapping_df = pd.read_csv(data_dir+'super_classes_mapping.csv')
super_classes_mapping_df = super_classes_mapping_df.rename(columns={"index": "superclass_index", "class": "super_class"})

train_data_mapping_df = pd.read_csv(data_dir+'train_data.csv')

print("Num sub-class mappings: ", len(sub_classes_mapping_df))
print("Num super-class mappings: ", len(super_classes_mapping_df))
print("Num training data mappings: ", len(train_data_mapping_df))

Num sub-class mappings:  90
Num super-class mappings:  3
Num training data mappings:  6472


In [4]:
sub_classes_mapping_df.head()

Unnamed: 0,subclass_index,sub_class
0,0,"Scotch terrier, Scottish terrier, Scottie"
1,1,"African chameleon, Chamaeleo chamaeleon"
2,2,standard schnauzer
3,3,terrapin
4,4,"great grey owl, great gray owl, Strix nebulosa"


In [5]:
super_classes_mapping_df.head()

Unnamed: 0,superclass_index,super_class
0,0,bird
1,1,dog
2,2,reptile


In [6]:
train_data_mapping_df.head()

Unnamed: 0,image,superclass_index,subclass_index
0,0.jpg,1,63
1,1.jpg,0,24
2,2.jpg,0,74
3,3.jpg,2,72
4,4.jpg,2,43


### Shuffled Splits

In [7]:
# Directories containing the images
train_dir = data_dir + "train_shuffle"
test_dir = data_dir + "test_shuffle"

# Desired size for CNN input
input_size = (8, 8)  # Adjust as needed

# Function to preprocess the image
def preprocess_image(image_path):
    # Read the image
    image = Image.open(image_path)

    if image.size != (8, 8):
        raise ValueError("Image is not 8x8 pixels in size.")
        
     # Convert to a NumPy array and normalize
    image_array = np.array(image).astype('float32') / 255.0

    return image_array

# Function to create a DataFrame from a directory of images
def create_dataframe_from_images(directory):
    # List all files in the directory
    filenames = [f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))]

    # Preprocess images and store in a list
    images = [preprocess_image(os.path.join(directory, f)) for f in filenames]

    # Create a DataFrame
    df = pd.DataFrame({'image': filenames, 'image_data': images})
    return df

# Create DataFrames for training and testing images
train_df = create_dataframe_from_images(train_dir)
test_df = create_dataframe_from_images(test_dir)

In [8]:
train_df.head()

Unnamed: 0,image,image_data
0,63.jpg,"[[[0.4862745, 0.42745098, 0.27450982], [0.4588..."
1,6400.jpg,"[[[0.50980395, 0.50980395, 0.5411765], [0.5450..."
2,823.jpg,"[[[0.4392157, 0.41960785, 0.34117648], [0.5019..."
3,4217.jpg,"[[[0.5411765, 0.5882353, 0.5411765], [0.231372..."
4,3578.jpg,"[[[0.45882353, 0.54509807, 0.7372549], [0.4196..."


In [9]:
full_train_df = pd.merge(train_df, train_data_mapping_df, on='image', how='inner')
full_train_df = pd.merge(full_train_df, super_classes_mapping_df, on='superclass_index', how='inner')
full_train_df = pd.merge(full_train_df, sub_classes_mapping_df, on='subclass_index', how='inner')
full_train_df = full_train_df.rename(columns={"image": "image_name"})


In [10]:
full_train_df.head()

Unnamed: 0,image_name,image_data,superclass_index,subclass_index,super_class,sub_class
0,63.jpg,"[[[0.4862745, 0.42745098, 0.27450982], [0.4588...",2,75,reptile,"whiptail, whiptail lizard"
1,4820.jpg,"[[[0.36078432, 0.3372549, 0.28235295], [0.3294...",2,75,reptile,"whiptail, whiptail lizard"
2,2935.jpg,"[[[0.81960785, 0.7921569, 0.85490197], [0.7254...",2,75,reptile,"whiptail, whiptail lizard"
3,3343.jpg,"[[[0.42352942, 0.35686275, 0.28627452], [0.443...",2,75,reptile,"whiptail, whiptail lizard"
4,5295.jpg,"[[[0.31764707, 0.5372549, 0.72156864], [0.4901...",2,75,reptile,"whiptail, whiptail lizard"


In [11]:
print(len(full_train_df))

6472
