<a href="https://colab.research.google.com/github/ttekcor/neurontasksfefu/blob/main/dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [29]:
import opendatasets as od
from google.colab import files
import os
import cv2
import numpy as np
import pandas as pd
import transformers
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


In [5]:
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"ttekcor","key":"dff966458268ba7e3b25ceba3a132cf2"}'}

In [6]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [7]:
od.download(
    "https://www.kaggle.com/datasets/gpiosenka/100-bird-species")

Downloading 100-bird-species.zip to ./100-bird-species


100%|██████████| 1.96G/1.96G [00:24<00:00, 86.1MB/s]





In [79]:
gpus = tf.config.list_physical_devices('GPU')
print(gpus)
if gpus:
    try:
        tf.config.experimental.set_memory_growth(gpus[0], True)
        print('GPU enable')
    except Exception as e:
        print(e)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU enable


In [9]:
path = '/content/100-bird-species/'
train_path = f'{path}train/'
test_path = f'{path}test/'
valid_path = f'{path}valid/'

In [10]:
class_to_id = {}

def pd_from_path(directory: str) -> pd.DataFrame:
    global class_to_id
    data = []
    for path, _, files in os.walk(directory):
        for file in files:
            if file.endswith(('.jpg', '.png', '.jpeg', '.bmp', '.gif')):
                class_name = os.path.basename(path)
                if class_name not in class_to_id:
                    class_to_id[class_name] = len(class_to_id)
                data.append((f'{path}/{file}', class_to_id[class_name]))

    return pd.DataFrame(data, columns=['imagepath', 'label'])

In [11]:
deleted = []
for file in pd_from_path(path).imagepath:
    try:
      img_bytes = tf.io.read_file(file)
      decoded_img = tf.io.decode_image(img_bytes)
    except Exception:
      deleted.append(file)
      os.remove(file)

In [89]:
def load_and_preprocess_image(image_path: str, crop_size: tuple[int, int] = (256, 256)):
    img = tf.io.read_file(image_path)
    img = tf.image.decode_image(tf.io.read_file(image_path), channels=3)
    image = tf.image.convert_image_dtype(image, dtype=tf.float32)
    img = tf.image.resize(img, crop_size)
    img = img / 255.0  # Normalize to [0, 1]
    return img

def tf_load_image(image_path, crop_size):
    return tf.py_function(load_and_preprocess_image, [image_path, crop_size], tf.float32)

def extract_and_zip(df: pd.DataFrame, crop_size: tuple[int, int] = (256, 256), batch_size: int = 32):
    image_dataset = tf.data.Dataset.from_tensor_slices(df.imagepath)
    label_dataset = tf.data.Dataset.from_tensor_slices(df.label)

    dataset = tf.data.Dataset.zip((image_dataset.map(lambda x: tf_load_image(x, crop_size)), label_dataset))

    return dataset.batch(batch_size)

In [13]:
def load_dataset_from_directory(
        directory: str = './',
        batch_size: int = 32,
        image_size: tuple[int, int] = (256, 256),
        shuffle: bool = True,
        seed: int = None,
        validation_split: float = None,
        subset: str = None
) -> tf.data.Dataset:

    df = pd_from_path(directory)

    if shuffle:
        df = df.sample(frac=1, random_state=seed)

    if validation_split:
        index = round(len(df) * validation_split)
        train_dataset = df[index:]
        valid_dataset = df[:index]

        if subset == 'training':
            df = train_dataset
        elif subset == 'validation':
            df = valid_dataset
        elif subset == 'both':
            return extract_and_zip(train_dataset), extract_and_zip(valid_dataset)


    return extract_and_zip(df, image_size, batch_size)


In [14]:
dataset = load_dataset_from_directory(path)

In [15]:
dataset

<_BatchDataset element_spec=(TensorSpec(shape=<unknown>, dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [16]:
inputs = tf.keras.layers.Input((256, 256, 3), name='input')
x = tf.keras.layers.Conv2D(32, 3, padding='same')(inputs)
x = tf.keras.layers.PReLU()(x)
x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dense(64)(x)

In [27]:
model = tf.keras.Sequential([
  tf.keras.layers.Rescaling(1./255, input_shape=(256, 256, 3)),
  tf.keras.layers.Conv2D(16, 3, padding='same', activation='relu'),
  tf.keras.layers.MaxPooling2D(),
  tf.keras.layers.Conv2D(32, 3, padding='same', activation='relu'),
  tf.keras.layers.MaxPooling2D(),
  tf.keras.layers.Conv2D(64, 3, padding='same', activation='relu'),
  tf.keras.layers.MaxPooling2D(),
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(128, activation='relu'),

])
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])


In [28]:
model.fit(dataset, epochs=1)



KeyboardInterrupt: 

In [90]:
def load_dataset(path: str, batch_size: int, image_size: tuple[int, int], shuffle: bool, split: str) -> tuple[tf.data.Dataset, dict[int, str]]:

    data = pd.read_csv(path)
    data.filepaths = [os.path.join(os.path.dirname(path), file).replace('\\', '/') for file in data.filepaths.tolist()]
    classes = pd.unique(data.labels)
    class_to_id = {name: i for i, name in enumerate(classes)}
    data.insert(3, 'labels_id', [class_to_id[name] for name in data.labels], True)

    if shuffle:
        data = data.sample(frac=1)

    if split:
        data = data[data['data set'] == split]

    image_dataset = tf.data.Dataset.from_tensor_slices(data.filepaths)
    label_dataset = tf.data.Dataset.from_tensor_slices(data.labels_id)

    dataset = tf.data.Dataset.zip((image_dataset.map(lambda x: tf_load_image(x, image_size)), label_dataset))

    return dataset.batch(batch_size), {i: name for i, name in enumerate(classes)}
# def load_dataset(path: str, batch_size: int, image_size: tuple[int, int], shuffle: bool, split: str) -> tuple[tf.data.Dataset, dict[int, str]]:
#     # Load the CSV index file
#     df = pd.read_csv(path)

#     # Filter the dataframe based on the specified split
#     df_split = df[df['dataset'] == split]

#     # Get the filepaths and labels
#     filepaths = df_split['filepaths'].values
#     labels = df_split['labels'].values

#     # Create a dictionary mapping class indices to class names
#     class_mapping = {idx: label for idx, label in enumerate(df['labels'].unique())}

#     # Create a list of class indices corresponding to the labels
#     class_indices = [list(class_mapping.keys())[list(class_mapping.values()).index(label)] for label in labels]

#     # Create a dataset from the filepaths and labels
#     dataset = tf.data.Dataset.from_tensor_slices((filepaths, class_indices))

#     # Function to load and preprocess images
#     def load_and_preprocess_image1(filepath, label):
#       img = tf.io.read_file(tf.strings.as_string(filepath))  # Ensure filepath is a string
#       img = tf.image.decode_image(img, channels=3)
#       img = tf.image.resize(load_image(img, image_size))
#       img = img / 255.0  # Normalize pixel values to the range [0, 1]
#       return img, label


#     # Map the load_and_preprocess_image function to the dataset
#     dataset = dataset.map(load_and_preprocess_image1)

#     # Shuffle the dataset if required
#     if shuffle:
#         dataset = dataset.shuffle(buffer_size=len(filepaths))

#     # Batch and prefetch the dataset
#     dataset = dataset.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

#     return dataset, class_mapping

In [91]:
new_dataset, class_index = load_dataset('/content/100-bird-species/birds.csv', 32, (256, 256), False, None)

In [92]:
inputs_x = tf.keras.layers.Input((256, 256, 3), name='input')
x2 = tf.keras.layers.Conv2D(32, 3, padding='same')(inputs_x)
x2 = tf.keras.layers.PReLU()(x2)
x2 = tf.keras.layers.Flatten()(x2)
x2 = tf.keras.layers.Dense(64)(x2)

In [93]:
model2 = tf.keras.Model(inputs=inputs_x, outputs=x2, name='want2sleep')

In [94]:
model2.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics='accuracy', optimizer='adam')

In [95]:
model2.fit(new_dataset, epochs=1)



NotFoundError: Graph execution error:

Detected at node ReadFile defined at (most recent call last):
<stack traces unavailable>
Detected at node ReadFile defined at (most recent call last):
<stack traces unavailable>
2 root error(s) found.
  (0) NOT_FOUND:  /content/100-bird-species/train/PARAKETT  AKULET/038.jpg; No such file or directory
	 [[{{node ReadFile}}]]
	 [[IteratorGetNext]]
	 [[IteratorGetNext/_7]]
  (1) NOT_FOUND:  /content/100-bird-species/train/PARAKETT  AKULET/038.jpg; No such file or directory
	 [[{{node ReadFile}}]]
	 [[IteratorGetNext]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_1186489]