This notebook pulls the data from the Features.parquet file and is used to validate that we preseve the data accross saves to parquet format. Data should compare with the develop_simple_features.ipynb

In [1]:
from io import BytesIO
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import numpy as np
import polars as pl


from feature_utils import normalize_histogram
from PIL import Image
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

import keras,os
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPool2D , Flatten, Conv1D, MaxPooling1D, Dropout, BatchNormalization, LeakyReLU, Activation
from keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf
from keras.optimizers import Adam, RMSprop

In [2]:
# Read the parquet file, this takes a while. Leave it here
features_file = Path("data/train_features.parquet")
if not features_file.exists():
    print("No features file found. Please run the create_features_table first")
    exit(1)

meta_file = Path("data/meta_full.parquet")
if not meta_file.exists():
    print("No features file found. Please run the create_features_table first")
    exit(1)

dft = pl.read_parquet(features_file, memory_map=True)
dfm = pl.read_parquet(meta_file, memory_map=True)
dft = dft.join(dfm, on="ClassId")
# del dfm

In [3]:
dft.head(1)

ClassId,Width,Height,Image,Resolution,Hue_Hist,Saturation_Hist,Value_Hist,LBP_Image,LBP_Hist,HOG_Features,HOG_Image,SIFT_Features,Path,ShapeId,ColorId,SignId,Description,Meta_Image
i64,i64,i64,binary,i64,list[i64],list[i64],list[i64],binary,list[i64],list[f32],binary,binary,str,i64,i64,str,str,binary
20,64,64,[binary data],4096,"[247, 0, … 0]","[30, 0, … 0]","[0, 0, … 0]",[binary data],"[127, 95, … 760]","[0.318327, 0.160083, … 0.28708]",[binary data],[binary data],"""C:\Users\lisaw…",0,0,"""1.1""","""Right curve""",[binary data]


In [4]:
samples_per_class = 200
random_seed = 42
train_sampled_df = pl.concat([x.sample(samples_per_class, with_replacement=False, seed=random_seed) for x in dft.partition_by("ClassId")])

In [5]:
train_sampled_df.columns

['ClassId',
 'Width',
 'Height',
 'Image',
 'Resolution',
 'Hue_Hist',
 'Saturation_Hist',
 'Value_Hist',
 'LBP_Image',
 'LBP_Hist',
 'HOG_Features',
 'HOG_Image',
 'SIFT_Features',
 'Path',
 'ShapeId',
 'ColorId',
 'SignId',
 'Description',
 'Meta_Image']

In [6]:
columns = ['ClassId', 'Hue_Hist', 'Saturation_Hist', 'Value_Hist', 'LBP_Hist', 'HOG_Features']
features = train_sampled_df[columns]
features.head(1)

ClassId,Hue_Hist,Saturation_Hist,Value_Hist,LBP_Hist,HOG_Features
i64,list[i64],list[i64],list[i64],list[i64],list[f32]
20,"[972, 151, … 16]","[247, 0, … 0]","[0, 0, … 0]","[80, 151, … 994]","[0.25923, 0.184951, … 0.0]"


In [7]:
features_hog = features['HOG_Features'].to_numpy()
features_hog = np.stack(features_hog)
features_hog.shape

# split the data into train, validation and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features_hog, features['ClassId'].to_numpy(), test_size=0.2, random_state=42)

# normalize the data
X_train = X_train / 255
X_test = X_test / 255



In [35]:
features_lbp = features['LBP_Hist'].to_numpy()
features_lbp = np.stack(features_lbp)
features_lbp.shape

# split the data into train, validation and test
X_train_lbp, X_test_lbp, y_train_lbp, y_test_lbp = train_test_split(features_lbp, features['ClassId'].to_numpy(), test_size=0.2, random_state=42)

# normalize the data
X_train_lbp = X_train_lbp / 255
X_test_lbp = X_test_lbp / 255

features_hue = features['Hue_Hist'].to_numpy()
features_hue = np.stack(features_hue)
features_hue.shape

# split the data into train, validation and test
X_train_hue, X_test_hue, y_train_hue, y_test_hue = train_test_split(features_hue, features['ClassId'].to_numpy(), test_size=0.2, random_state=42)

# normalize the data
X_train_hue = X_train_hue / 255
X_test_hue = X_test_hue / 255

# create files for saturation
features_sat = features['Saturation_Hist'].to_numpy()
features_sat = np.stack(features_sat)

# split the data into train, validation and test
X_train_sat, X_test_sat, y_train_sat, y_test_sat = train_test_split(features_sat, features['ClassId'].to_numpy(), test_size=0.2, random_state=42)

# normalize the data
X_train_sat = X_train_sat / 255
X_test_sat = X_test_sat / 255


In [9]:
X_train.shape[1]

2916

### CNN Model

In [37]:
ms_input_shape_hog = (X_train.shape[1], 1)
ms_input_shape_lbp = (X_train_lbp.shape[1], 1)
ms_input_shape_hue = (X_train_hue.shape[1], 1)
ms_input_shape_sat = (X_train_sat.shape[1], 1)
# define the model 
def create_cnn_model(padding='same',               
                    input_shape= ms_input_shape_hog):
    tf.keras.backend.clear_session()
    np.random.seed(0)
    tf.random.set_seed(0)

    model = Sequential()
    model.add(Conv1D(filters=6, kernel_size=21, strides=1, padding=padding, activation='relu', 
                    input_shape= input_shape,kernel_initializer=keras.initializers.he_normal()))
    model.add(BatchNormalization()) 
    model.add(MaxPooling1D(pool_size=2, strides=2, padding=padding))
    model.add(Conv1D(filters=16, kernel_size=5, strides=1, padding=padding,activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(pool_size=2, strides=2, padding=padding))
    model.add(Flatten())
    model.add(Dense(120, activation='relu'))
    model.add(Dense(84))
    model.add(Dropout(rate=0.5, name='Dropout'))
    model.add(Dense(43, activation='softmax', name='Output'))
    return model

model_hog = create_cnn_model(input_shape=ms_input_shape_hog)
model_hog.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 2916, 6)           132       
                                                                 
 batch_normalization (Batch  (None, 2916, 6)           24        
 Normalization)                                                  
                                                                 
 max_pooling1d (MaxPooling1  (None, 1458, 6)           0         
 D)                                                              
                                                                 
 conv1d_1 (Conv1D)           (None, 1458, 16)          496       
                                                                 
 batch_normalization_1 (Bat  (None, 1458, 16)          64        
 chNormalization)                                                
                                                        

In [11]:
import tensorflow as tf
model_hog.compile(optimizer=tf.keras.optimizers.Adam(),loss='sparse_categorical_crossentropy',metrics=['acc'])

callbacks_list = [
    keras.callbacks.ModelCheckpoint(
        filepath='best_model.{epoch:02d}-{val_loss:.2f}.h5',
        monitor='val_loss', save_best_only=True),
    keras.callbacks.EarlyStopping(monitor='acc', patience=1)
    ]

BATCH_SIZE = 32 
EPOCHS = 10

history = model_hog.fit(X_train,
                    y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    callbacks=callbacks_list,
                    validation_split=0.2,
                    verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


In [14]:
# Evaluate the model on the test data using `evaluate`
print("Evaluate on test data")
results = model_hog.evaluate(X_test, y_test, batch_size=32)
print("test loss, test acc:", results)


Evaluate on test data
test loss, test acc: [0.24882948398590088, 0.9436046481132507]


In [15]:
#apply CNN to LBP features
model_lbp = create_cnn_model(input_shape=ms_input_shape_lbp)
model_lbp.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 18, 6)             132       
                                                                 
 batch_normalization (Batch  (None, 18, 6)             24        
 Normalization)                                                  
                                                                 
 max_pooling1d (MaxPooling1  (None, 9, 6)              0         
 D)                                                              
                                                                 
 conv1d_1 (Conv1D)           (None, 9, 16)             496       
                                                                 
 batch_normalization_1 (Bat  (None, 9, 16)             64        
 chNormalization)                                                
                                                        

In [23]:
#compile the model
model_lbp.compile(optimizer=tf.keras.optimizers.Adam(),
                loss='sparse_categorical_crossentropy',
                metrics=['acc'])

#train the model
callbacks_list = [
    keras.callbacks.ModelCheckpoint(
        filepath='best_model.{epoch:02d}-{val_loss:.2f}.h5',
        monitor='val_loss', save_best_only=True),
    keras.callbacks.EarlyStopping(monitor='acc', patience=1)
    ]

BATCH_SIZE = 32
EPOCHS = 20

history_2 = model_lbp.fit(X_train_lbp,
                    y_train_lbp,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    callbacks=callbacks_list,
                    validation_split=0.2,
                    verbose=1)





Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


In [32]:
# evaluate the model on the test data using `evaluate`
print("Evaluate on test data")
results_lbp = model_lbp.evaluate(X_test_lbp, y_test_lbp, batch_size=32)
print("LBP test loss, LBP test acc:", results_lbp)

Evaluate on test data
LBP test loss, LBP test acc: [2.267068862915039, 0.3593023121356964]


In [30]:
# compile the model
model_hue = create_cnn_model(input_shape=ms_input_shape_hue)
model_hue.summary()

model_hue.compile(optimizer=tf.keras.optimizers.Adam(),
                loss='sparse_categorical_crossentropy',
                metrics=['acc'])

#train the model
callbacks_list = [
    keras.callbacks.ModelCheckpoint(
        filepath='best_model.{epoch:02d}-{val_loss:.2f}.h5',
        monitor='val_loss', save_best_only=True),
    keras.callbacks.EarlyStopping(monitor='acc', patience=1)
    ]   

BATCH_SIZE = 32
EPOCHS = 50

history_3 = model_hue.fit(X_train_hue,
                    y_train_hue,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    callbacks=callbacks_list,
                    validation_split=0.2,
                    verbose=1)




Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 180, 6)            132       
                                                                 
 batch_normalization (Batch  (None, 180, 6)            24        
 Normalization)                                                  
                                                                 
 max_pooling1d (MaxPooling1  (None, 90, 6)             0         
 D)                                                              
                                                                 
 conv1d_1 (Conv1D)           (None, 90, 16)            496       
                                                                 
 batch_normalization_1 (Bat  (None, 90, 16)            64        
 chNormalization)                                                
                                                        

In [34]:
# evaluate the model on the test data using `evaluate`
print("Evaluate on test data")
results_hue = model_hue.evaluate(X_test_hue, y_test_hue, batch_size=32)
print("HUE test loss, HUE test acc:", results_hue)


Evaluate on test data
HUE test loss, HUE test acc: [1.8206799030303955, 0.5656976699829102]


In [39]:
# create model for saturation
model_sat = create_cnn_model(input_shape=ms_input_shape_sat)

model_sat.compile(optimizer=tf.keras.optimizers.Adam(),
                loss='sparse_categorical_crossentropy',
                metrics=['acc'])

#train the model
callbacks_list = [
    keras.callbacks.ModelCheckpoint(
        filepath='best_model.{epoch:02d}-{val_loss:.2f}.h5',
        monitor='val_loss', save_best_only=True),
    keras.callbacks.EarlyStopping(monitor='acc', patience=1)
    ]

BATCH_SIZE = 32
EPOCHS = 50

history_4 = model_sat.fit(X_train_sat,
                    y_train_sat,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    callbacks=callbacks_list,
                    validation_split=0.2,
                    verbose=1)

                
# evaluate the model on the test data using `evaluate`
print("Evaluate on test data")
results_sat = model_sat.evaluate(X_test_sat, y_test_sat, batch_size=32)
print("SAT test loss, SAT test acc:", results_sat)




Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Evaluate on test data
SAT test loss, SAT test acc: [3.967654228210449, 0.34593021869659424]
