In [1]:
import os
import csv
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array # type: ignore
from tensorflow.keras.applications import VGG16, MobileNetV2 # type: ignore
from tensorflow.keras import layers, models # type: ignore
from tensorflow.keras.models import Model, Sequential # type: ignore
from tensorflow.keras.layers import GlobalAveragePooling2D # type: ignore
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input # type: ignore
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Define constants
IMG_SIZE = (224, 224)
BATCH_SIZE = 32
NUM_CLASSES = 5  # Adjust this based on your actual number of classes
dataset_path = './data'

# Define the class names
class_names = {
    'Others': 0,
    'Honda': 1,
    'Suzuki': 2,
    'Yamaha': 3,
    'VinFast': 4
}

2024-07-04 12:22:09.048128: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Initialize lists to hold image data and labels
X = []
y = []

# Load images and labels
for class_name in class_names:
    class_path = os.path.join(dataset_path, class_name)
    for img_name in os.listdir(class_path):
        img_path = os.path.join(class_path, img_name)
        if img_path.split('.')[-1].lower() in ['png', 'jpg', 'jpeg']:
            X.append(img_path[len(dataset_path)+1:])
            y.append(class_names[class_name])

# Define KFold cross-validator with 5 splits, shuffling data, and fixed random state
kf = KFold(n_splits=5, shuffle=True, random_state=1337)


In [5]:
import random

# Tỉ lệ dữ liệu được ghi vào file
sampling_ratio = 0.1

# Loop through each fold and perform split
for i, (train_index, test_index) in enumerate(kf.split(X)):
    # Prepare training and testing data for CSV
    train_data = [(X[idx], str(y[idx])) for idx in train_index]
    test_data = [(X[idx], str(y[idx])) for idx in test_index]
    
    # Giảm dữ liệu ghi vào file bằng cách lấy mẫu ngẫu nhiên
    reduced_train_data = random.sample(train_data, int(len(train_data) * sampling_ratio))
    reduced_test_data = random.sample(test_data, int(len(test_data) * sampling_ratio))
    
    # Write reduced training data to CSV
    with open(f'splits/MotocycleDataset-Splits-{i+1}-Train.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['filename', 'label'])
        writer.writerows(reduced_train_data)
    
    # Write reduced testing data to CSV
    with open(f'splits/MotocycleDataset-Splits-{i+1}-Test.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['filename', 'label'])
        writer.writerows(reduced_test_data)


In [3]:
# Loop through each fold and perform split
for i, (train_index, test_index) in enumerate(kf.split(X)):
    # Prepare training and testing data for CSV
    train_data = [(X[idx], y[idx]) for idx in train_index]
    test_data = [(X[idx], y[idx]) for idx in test_index]
    
    # Write training data to CSV
    with open(f'splits/MotocycleDataset-Splits-{i+1}-Train.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['filename', 'label'])
        writer.writerows(train_data)
    
    # Write testing data to CSV
    with open(f'splits/MotocycleDataset-Splits-{i+1}-Test.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['filename', 'label'])
        writer.writerows(test_data)


In [8]:
def create_datagen(train_csv, test_csv, img_dir):
    train_data = pd.read_csv(train_csv)
    test_data = pd.read_csv(test_csv)
    
    # Data augmentation for training data
    train_datagen = ImageDataGenerator(
        rescale=1./255,
        rotation_range=40,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest'
    )
    
    # Only rescaling for test data
    test_datagen = ImageDataGenerator(rescale=1./255)
    
    train_generator = train_datagen.flow_from_dataframe(
        train_data,
        directory=img_dir,
        x_col='filename',
        y_col='label',
        target_size=IMG_SIZE,
        batch_size=BATCH_SIZE,
        class_mode='categorical'
    )
    
    validation_generator = test_datagen.flow_from_dataframe(
        test_data,
        directory=img_dir,
        x_col='filename',
        y_col='label',
        target_size=IMG_SIZE,
        batch_size=BATCH_SIZE,
        class_mode='categorical'
    )
    
    return train_generator, validation_generator

In [5]:
def build_model():
    base_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
    base_model.trainable = False  # Freeze the base model

    model = models.Sequential([
        base_model,
        layers.Flatten(),
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(NUM_CLASSES, activation='softmax')
    ])

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


In [6]:
def train_and_evaluate(train_csv, test_csv, img_dir):
    train_generator, validation_generator = create_datagen(train_csv, test_csv, img_dir)
    
    model = build_model()

    history = model.fit(
        train_generator,
        steps_per_epoch=len(train_generator),
        epochs=30,  # Adjust epochs as needed
        validation_data=validation_generator,
        validation_steps=len(validation_generator)
    )

    # Visualize training process
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']

    epochs = range(len(acc))

    plt.figure()
    plt.plot(epochs, acc, 'bo', label='Training acc')
    plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()

    plt.figure()
    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

    plt.show()

    # Evaluate the model
    Y_pred = model.predict(validation_generator)
    y_pred = np.argmax(Y_pred, axis=1)
    print('Confusion Matrix')
    print(confusion_matrix(validation_generator.classes, y_pred))
    print('Classification Report')
    target_names = list(class_names.keys())
    print(classification_report(validation_generator.classes, y_pred, target_names=target_names))


In [10]:
print(dataset_path)

./data


In [11]:
splits = [
    ("splits/MotocycleDataset-Splits-1-Train.csv", "splits/MotocycleDataset-Splits-1-Test.csv"),
    ("splits/MotocycleDataset-Splits-2-Train.csv", "splits/MotocycleDataset-Splits-2-Test.csv"),
    ("splits/MotocycleDataset-Splits-3-Train.csv", "splits/MotocycleDataset-Splits-3-Test.csv"),
    ("splits/MotocycleDataset-Splits-4-Train.csv", "splits/MotocycleDataset-Splits-4-Test.csv"),
    ("splits/MotocycleDataset-Splits-5-Train.csv", "splits/MotocycleDataset-Splits-5-Test.csv"),
]

for train_csv, test_csv in splits:
    print(f"Training and evaluating on {train_csv} and {test_csv}")
    train_and_evaluate(train_csv, test_csv, dataset_path)

Training and evaluating on splits/MotocycleDataset-Splits-1-Train.csv and splits/MotocycleDataset-Splits-1-Test.csv
Found 27917 validated image filenames belonging to 5 classes.
Found 6980 validated image filenames belonging to 5 classes.
Epoch 1/30


  self._warn_if_super_not_called()


[1m 75/873[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m32:09[0m 2s/step - accuracy: 0.2338 - loss: 4.2018

2024-07-03 21:07:43.021387: W tensorflow/core/framework/op_kernel.cc:1827] UNKNOWN: UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x7dae24f7f150>
Traceback (most recent call last):

  File "/home/tony/.local/lib/python3.10/site-packages/tensorflow/python/ops/script_ops.py", line 270, in __call__
    ret = func(*args)

  File "/home/tony/.local/lib/python3.10/site-packages/tensorflow/python/autograph/impl/api.py", line 643, in wrapper
    return func(*args, **kwargs)

  File "/home/tony/.local/lib/python3.10/site-packages/tensorflow/python/data/ops/from_generator_op.py", line 198, in generator_py_func
    values = next(generator_state.get_iterator(iterator_id))

  File "/home/tony/.local/lib/python3.10/site-packages/keras/src/trainers/data_adapters/py_dataset_adapter.py", line 260, in _get_iterator
    for i, batch in enumerate(gen_fn()):

  File "/home/tony/.local/lib/python3.10/site-packages/keras/src/trainers/data_adapters/py_dataset_adapter.py", line 253,

[1m 76/873[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m32:06[0m 2s/step - accuracy: 0.2341 - loss: 4.1797

2024-07-03 21:07:44.815193: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: UNKNOWN: UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x7dae24f7f150>
Traceback (most recent call last):

  File "/home/tony/.local/lib/python3.10/site-packages/tensorflow/python/ops/script_ops.py", line 270, in __call__
    ret = func(*args)

  File "/home/tony/.local/lib/python3.10/site-packages/tensorflow/python/autograph/impl/api.py", line 643, in wrapper
    return func(*args, **kwargs)

  File "/home/tony/.local/lib/python3.10/site-packages/tensorflow/python/data/ops/from_generator_op.py", line 198, in generator_py_func
    values = next(generator_state.get_iterator(iterator_id))

  File "/home/tony/.local/lib/python3.10/site-packages/keras/src/trainers/data_adapters/py_dataset_adapter.py", line 260, in _get_iterator
    for i, batch in enumerate(gen_fn()):

  File "/home/tony/.local/lib/python3.10/site-packages/keras/src/trainers

UnknownError: Graph execution error:

Detected at node PyFunc defined at (most recent call last):
<stack traces unavailable>
UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x7dae24f7f150>
Traceback (most recent call last):

  File "/home/tony/.local/lib/python3.10/site-packages/tensorflow/python/ops/script_ops.py", line 270, in __call__
    ret = func(*args)

  File "/home/tony/.local/lib/python3.10/site-packages/tensorflow/python/autograph/impl/api.py", line 643, in wrapper
    return func(*args, **kwargs)

  File "/home/tony/.local/lib/python3.10/site-packages/tensorflow/python/data/ops/from_generator_op.py", line 198, in generator_py_func
    values = next(generator_state.get_iterator(iterator_id))

  File "/home/tony/.local/lib/python3.10/site-packages/keras/src/trainers/data_adapters/py_dataset_adapter.py", line 260, in _get_iterator
    for i, batch in enumerate(gen_fn()):

  File "/home/tony/.local/lib/python3.10/site-packages/keras/src/trainers/data_adapters/py_dataset_adapter.py", line 253, in generator_fn
    yield self.py_dataset[i]

  File "/home/tony/.local/lib/python3.10/site-packages/keras/src/legacy/preprocessing/image.py", line 68, in __getitem__
    return self._get_batches_of_transformed_samples(index_array)

  File "/home/tony/.local/lib/python3.10/site-packages/keras/src/legacy/preprocessing/image.py", line 313, in _get_batches_of_transformed_samples
    img = image_utils.load_img(

  File "/home/tony/.local/lib/python3.10/site-packages/keras/src/utils/image_utils.py", line 236, in load_img
    img = pil_image.open(io.BytesIO(f.read()))

  File "/home/tony/.local/lib/python3.10/site-packages/PIL/Image.py", line 3339, in open
    raise UnidentifiedImageError(msg)

PIL.UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x7dae24f7f150>


	 [[{{node PyFunc}}]]
	 [[IteratorGetNext]] [Op:__inference_one_step_on_iterator_2095]

To solve the problem, I would create a new cell in the Jupyter Notebook at index 6 and execute the code block provided, which iterates through the splits and calls the `train_and_evaluate()` function for each split. 