# Gather data
```
https://data.caltech.edu/records/65de6-vp158
```


# Prepare the Data

In [None]:
# Authenticate this Colab session with Google Cloud (later cells call gsutil)
from google.colab import auth
auth.authenticate_user()

In [None]:
# Download dataset and unzip the file
!gsutil -m cp -r gs://bucket/CUB_200_2011.tgz /content/bird-classification

In [None]:
import pandas as pd

# Load metadata
# NOTE(review): this directory matches the location used for classes.txt and the
# images elsewhere in this notebook; adjust it if the archive was extracted elsewhere.
cub_dir = 'bird-classification/CUB_200_2011'


def read_cub_table(file_name, column_names):
    """Read one space-separated, header-less annotation file from ``cub_dir``.

    Args:
        file_name: Name of the annotation file inside ``cub_dir``.
        column_names: Column names to assign, in file order.

    Returns:
        A DataFrame with one row per line of the file.
    """
    return pd.read_csv(f'{cub_dir}/{file_name}',
                       sep=' ',
                       header=None,
                       names=column_names)


images_df = read_cub_table('images.txt', ['Image_ID', 'File_Path'])
labels_df = read_cub_table('image_class_labels.txt', ['Image_ID', 'Class_ID'])
split_df = read_cub_table('train_test_split.txt', ['Image_ID', 'Is_Training'])
bbox_df = read_cub_table('bounding_boxes.txt', ['Image_ID', 'X', 'Y', 'Width', 'Height'])

# Merge metadata into a single data frame, keyed on Image_ID
metadata = pd.merge(images_df,
                    labels_df,
                    on='Image_ID')
metadata = pd.merge(metadata,
                    split_df,
                    on='Image_ID')
# Left join so an image without a bounding-box row is kept (its box columns become NaN)
metadata = pd.merge(metadata,
                    bbox_df,
                    on='Image_ID',
                    how='left')

# The first two merges are inner joins and can drop rows, and duplicate IDs would
# add rows; fail loudly if the result no longer has one row per image.
assert len(metadata) == len(images_df), 'merge changed the number of image rows'

# Inspect metadata
metadata.head()

In [None]:
# Persist the merged metadata as JSON Lines (one record per line)
metadata_path = 'CUB_200_2011_metadata'
metadata.to_json(metadata_path, orient='records', lines=True)

In [None]:
!gsutil cp CUB_200_2011_metadata gs://bucket/bird-classification

In [None]:
import json

# Build the Class_ID -> Class_Name lookup from classes.txt
classes_df = pd.read_csv(
    'bird-classification/CUB_200_2011/classes.txt',
    sep=' ',
    header=None,
    names=['Class_ID', 'Class_Name'],
)
# Pair the two columns position by position; each row contributes one entry
class_mapping = dict(zip(classes_df['Class_ID'], classes_df['Class_Name']))

In [None]:
# Write the class mapping to disk as indented JSON
with open('CUB_200_2011_label_mapping', 'w') as mapping_file:
    mapping_file.write(json.dumps(class_mapping, indent=4))

In [None]:
# Inspect class mapping: show its size and the first few entries instead of
# printing every entry on one very long line
print(f"{len(class_mapping)} classes")
for class_id, class_name in list(class_mapping.items())[:5]:
    print(class_id, class_name)

In [None]:
# Upload the class mapping file to Cloud Storage
# NOTE(review): this targets gs://colab_data_bucket while the other gsutil commands in
# this notebook use gs://bucket - confirm which bucket is intended.
!gsutil cp CUB_200_2011_label_mapping gs://colab_data_bucket/bird-classification/

## Split data

In [None]:
# Partition the metadata by its Is_Training flag (1 -> train_data, 0 -> test_data)
training_flag = metadata['Is_Training']
train_data = metadata.loc[training_flag == 1]
test_data = metadata.loc[training_flag == 0]

In [None]:
# Using metadata prepare dataset for yolov7
import os
import shutil
import cv2
import pandas as pd

# Destination folders for the dataset layout: one images/ and one labels/ folder per split
train_image_dir = '/content/dataset/train/images/'
train_label_dir = '/content/dataset/train/labels/'
val_image_dir = '/content/dataset/val/images/'
val_label_dir = '/content/dataset/val/labels/'

# Create every folder up front; exist_ok makes re-running this cell harmless
for target_dir in (train_image_dir, val_image_dir, train_label_dir, val_label_dir):
    os.makedirs(target_dir, exist_ok=True)

# Reload the class mapping that was saved to disk earlier
with open('CUB_200_2011_label_mapping', 'r') as mapping_file:
    class_mapping = json.load(mapping_file)

# Converts bounding box data to YOLO format
def bbox_to_yolo(row, img_width, img_height):
    """Build one YOLO label line from a metadata row.

    Args:
        row: Mapping with 'X', 'Y', 'Width', 'Height' and 'Class_ID' entries,
            where X/Y are the box's left/top edge in the same units as the image size.
        img_width: Image width used to normalise the horizontal values.
        img_height: Image height used to normalise the vertical values.

    Returns:
        The string "<class> <x_center> <y_center> <width> <height>" followed by a
        newline, with the four box values divided by the image size.
    """
    box_w = row['Width']
    box_h = row['Height']
    fields = (
        # CUB_200_2011 class IDs start at 1; YOLO expects them to start at 0
        row['Class_ID'] - 1,
        (row['X'] + box_w / 2) / img_width,
        (row['Y'] + box_h / 2) / img_height,
        box_w / img_width,
        box_h / img_height,
    )
    return ' '.join(str(field) for field in fields) + '\n'

# Process each image: copy it into its split folder and write a one-line YOLO label file
for _, row in metadata.iterrows():
    img_path = os.path.join('bird-classification/CUB_200_2011/images/', row['File_Path'])

    # Read the image first so an unreadable file is reported before anything is copied.
    # cv2.imread returns None (it does not raise) when the file is missing or undecodable.
    img = cv2.imread(img_path)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {img_path}")
    img_height, img_width = img.shape[:2]

    # Extract the image base name (file name without directory or extension)
    img_base_name = os.path.splitext(os.path.basename(row['File_Path']))[0]

    # Is_Training == 1 goes to the train split, everything else to the validation split
    if row['Is_Training'] == 1:
        target_image_dir = train_image_dir
        target_label_dir = train_label_dir
    else:
        target_image_dir = val_image_dir
        target_label_dir = val_label_dir

    # Copy image to corresponding directory
    shutil.copy(img_path, target_image_dir)

    # Create corresponding label file with the same base name as the image
    label_file = os.path.join(target_label_dir, f"{img_base_name}.txt")

    # Each metadata row carries a single bounding box, so the label file has one line
    with open(label_file, 'w') as f:
        f.write(bbox_to_yolo(row, img_width, img_height))

In [None]:
# Save dataset
!gsutil cp dataset.zip gs://bucket/bird-classification

## Prepare dataset.yaml config file

In [None]:
# Create dataset.yaml file
import yaml
import json

# Read the saved class mapping back from disk
with open('/content/CUB_200_2011_label_mapping', 'r') as mapping_file:
    class_names = json.load(mapping_file)

# Keep only the text after the first '.' of every class name (drops the numeric prefix)
names = []
for full_name in class_names.values():
    names.append(full_name.split('.', 1)[1])

# Assemble the dataset description: image folders, class count and class names
yaml_content = dict(
    train='./dataset/train/images/',
    val='./dataset/val/images/',
    nc=len(names),
    names=names,
)

with open('dataset.yaml', 'w') as yaml_file:
    yaml.dump(yaml_content, yaml_file)


In [None]:
# Read the generated dataset.yaml back and print it for a visual check
with open('dataset.yaml', 'r') as yaml_file:
    yaml_text = yaml_file.read()
print(yaml_text)

In [None]:
!gsutil cp -r dataset.yaml gs://bucket/bird-classification

# Move the data to the virtual machine

In [None]:
!scp /content/dataset.yaml timon_l_tukei@00.00.000.000:/bird-classification/yolov7/dataset.yaml
!scp /content/dataset timon_l_tukei@00.00.000.000:/bird-classification/yolov7/dataset

## Prepare environment

```
mkdir bird-classification && cd bird-classification
python3 -m venv venv
source venv/bin/activate
git clone https://github.com/WongKinYiu/yolov7.git
cd yolov7
pip install -r requirements.txt
```



## Train the model

```
nohup python train.py \
  --workers 2 \
  --device 0 \
  --batch-size 16 \
  --data dataset.yaml \
  --img 640 640 \
  --cfg cfg/training/yolov7.yaml \
  --weights '' \
  --name yolov7 \
  --hyp data/hyp.scratch.p5.yaml \
  > train.log 2>&1 &
```

# Evaluate model

Please refer to model_evaluation_report.md