In [5]:
import subprocess
import os
import zipfile

def is_dataset_downloaded():
    """Report whether the extracted training split is already on disk.

    Returns:
        bool: True when ``./data/train`` exists and is a directory,
        False otherwise.
    """
    train_split_dir = "./data/train"
    return os.path.isdir(train_split_dir)

def is_kaggle_installed():
    """Check whether the ``kaggle`` executable can be launched.

    Runs ``kaggle --version`` with its output captured. Only the ability to
    start the process is tested; the command's exit status is ignored.

    Returns:
        bool: False when launching the command raises FileNotFoundError,
        True otherwise.
    """
    version_probe = ["kaggle", "--version"]
    try:
        subprocess.run(version_probe, capture_output=True, text=True)
    except FileNotFoundError:
        return False
    else:
        return True

def install_kaggle():
    """Make sure the Kaggle CLI is available, installing it with pip if needed.

    The installation is run as ``<current interpreter> -m pip`` so the package
    lands in the environment of the interpreter executing this code, rather
    than in whichever ``pip`` happens to be first on PATH.

    Returns:
        bool: True when the CLI was already present or pip exited with
        return code 0; False when the pip installation failed.
    """
    # Local import: ``sys`` is only needed here and the notebook's import
    # cell does not provide it.
    import sys

    if is_kaggle_installed():
        return True

    print("Kaggle CLI is not installed. Installing...")
    # sys.executable can be empty in embedded interpreters; fall back to the
    # plain ``pip`` command in that case.
    pip_prefix = [sys.executable, "-m", "pip"] if sys.executable else ["pip"]
    install_kaggle_command = pip_prefix + ["install", "kaggle"]
    result = subprocess.run(install_kaggle_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    if result.returncode != 0:
        print(f"Kaggle CLI installation failed with return code {result.returncode}")
        # Surface pip's own error text so the failure can be diagnosed.
        error_text = (result.stderr or "").strip()
        if error_text:
            print(error_text)
        return False

    print("Kaggle CLI installed successfully")
    return True

def download_and_unzip_dataset():
    """Download the bird-species dataset with the Kaggle CLI and extract it.

    Runs ``kaggle datasets download`` for ``gpiosenka/100-bird-species`` into
    ``./data`` and, when the command succeeds, extracts
    ``./data/100-bird-species.zip`` into ``./data``.

    Returns:
        bool: True when the download command exited with return code 0 and
        the archive was extracted; False when the download command failed.

    Raises:
        FileNotFoundError: if the ``kaggle`` executable cannot be launched, or
            the expected zip file is missing after a successful download.
        zipfile.BadZipFile: if the downloaded file is not a valid zip archive.
    """
    # Define the command to download the dataset
    download_command = ["kaggle", "datasets", "download", "-d", "gpiosenka/100-bird-species", "-p", "./data"]

    # Run the download command, capturing its output instead of echoing it
    result = subprocess.run(download_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    if result.returncode != 0:
        print(f"Download failed with return code {result.returncode}")
        # The CLI's message is the only clue to why the download failed, so
        # show it instead of discarding the captured output.
        details = (result.stderr or result.stdout or "").strip()
        if details:
            print(details)
        return False

    print("Download executed successfully")

    # Path of the archive the download command is expected to have written
    zip_file_path = "./data/100-bird-species.zip"

    # Unzip the downloaded file next to it
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall("./data")

    print(f"File '{zip_file_path}' has been successfully unzipped.")
    return True

# Fetch the dataset only when the training split is missing; the download is
# attempted only if the Kaggle CLI is (or can be made) available.
if is_dataset_downloaded():
    print("Dataset is already downloaded.")
elif install_kaggle():
    download_and_unzip_dataset()


Dataset is already downloaded.


In [6]:
import os
import pandas as pd
import numpy as np
from PIL import Image

In [7]:
import os
from PIL import Image
import numpy as np

# Root folder that holds the dataset splits. Named ``data_dir`` rather than
# ``dir`` so the builtin ``dir()`` is not shadowed for the rest of the notebook.
data_dir = './data'


def _is_valid_image(file_path):
    """Return True when ``file_path`` is a '.jpg' file that decodes to 224x224x3.

    The image is opened in a ``with`` block so its file handle is released
    before the caller decides whether to delete the file.
    """
    if not file_path.endswith('.jpg'):
        return False
    with Image.open(file_path) as image:
        return np.array(image).shape == (224, 224, 3)


# WARNING: destructive step. Walks <data_dir>/<folder>/<sub-folder>/<file> and
# permanently deletes every file that is not a 224x224x3 '.jpg' image, so the
# later loading cells can assume a fixed image shape.
for split_name in os.listdir(data_dir):
    split_path = os.path.join(data_dir, split_name)
    if not os.path.isdir(split_path):
        continue
    for class_name in os.listdir(split_path):
        class_path = os.path.join(split_path, class_name)
        if not os.path.isdir(class_path):
            continue
        for file_name in os.listdir(class_path):
            file_path = os.path.join(class_path, file_name)
            if not _is_valid_image(file_path):
                print('Deleting ' + file_path)
                os.remove(file_path)


Deleting ./data/valid/PLUSH CRESTED JAY/4.jpg
Deleting ./data/valid/PLUSH CRESTED JAY/5.jpg
Deleting ./data/valid/PLUSH CRESTED JAY/2.jpg
Deleting ./data/valid/PLUSH CRESTED JAY/3.jpg
Deleting ./data/valid/PLUSH CRESTED JAY/1.jpg
Deleting ./data/test/PLUSH CRESTED JAY/4.jpg
Deleting ./data/test/PLUSH CRESTED JAY/5.jpg
Deleting ./data/test/PLUSH CRESTED JAY/2.jpg
Deleting ./data/test/PLUSH CRESTED JAY/3.jpg
Deleting ./data/test/PLUSH CRESTED JAY/1.jpg


In [None]:
# Read the dataset index file
birds_data = pd.read_csv('./data/birds.csv')

# Map each class label (the key later looked up with the class folder name)
# to its integer class ID
labels_dict = {
    label: class_id
    for label, class_id in zip(birds_data['labels'], birds_data['class id'].astype(int))
}

# Show the resulting label -> class ID mapping
print(labels_dict)

In [None]:
train_dir = './data/train'
filepaths = []
labels = []

# Each sub-directory of the split is one class; its name is used as the label
# of every file inside it.
folds = os.listdir(train_dir)
for class_name in folds:
    class_dir = os.path.join(train_dir, class_name)
    if not os.path.isdir(class_dir):  # skip stray files next to the class folders
        continue
    for image_name in os.listdir(class_dir):
        filepaths.append(os.path.join(class_dir, image_name))
        labels.append(class_name)

# One row per image: the 224*224*3 pixel values flattened and divided by 255,
# plus the class ID looked up from the folder name.
n_images = len(filepaths)
train_pics_array = np.zeros((n_images, 224*224*3))
train_labels_int = np.zeros(n_images)
for row, (image_path, class_name) in enumerate(zip(filepaths, labels)):
    train_pics_array[row] = np.array(Image.open(image_path)).reshape(-1) / 255
    train_labels_int[row] = labels_dict[class_name]

In [None]:
import matplotlib.pyplot as plt
# Sanity check: show the first training image (restored to 224x224x3),
# its class ID, and the shape of its flattened row.
first_image = train_pics_array[0]
plt.imshow(first_image.reshape(224, 224, 3))
print(train_labels_int[0])
print(first_image.shape)

In [None]:
valid_dir = './data/valid'
filepaths = []
labels = []

# Each sub-directory of the split is one class; its name is used as the label
# of every file inside it.
folds = os.listdir(valid_dir)
for class_name in folds:
    class_dir = os.path.join(valid_dir, class_name)
    if not os.path.isdir(class_dir):  # skip stray files next to the class folders
        continue
    for image_name in os.listdir(class_dir):
        filepaths.append(os.path.join(class_dir, image_name))
        labels.append(class_name)

# One row per image: the 224*224*3 pixel values flattened and divided by 255,
# plus the class ID looked up from the folder name.
n_images = len(filepaths)
valid_pics_array = np.zeros((n_images, 224*224*3))
valid_labels_int = np.zeros(n_images)
for row, (image_path, class_name) in enumerate(zip(filepaths, labels)):
    valid_pics_array[row] = np.array(Image.open(image_path)).reshape(-1) / 255
    valid_labels_int[row] = labels_dict[class_name]

In [None]:
# Collect the test image paths together with their labels
test_dir = './data/test'
filepaths = []
labels = []

# Each sub-directory of the split is one class; its name is used as the label
# of every file inside it.
folds = os.listdir(test_dir)
for class_name in folds:
    class_dir = os.path.join(test_dir, class_name)
    if not os.path.isdir(class_dir):  # skip stray files next to the class folders
        continue
    for image_name in os.listdir(class_dir):
        filepaths.append(os.path.join(class_dir, image_name))
        labels.append(class_name)

# One row per image: the 224*224*3 pixel values flattened and divided by 255,
# plus the class ID looked up from the folder name.
n_images = len(filepaths)
test_pics_array = np.zeros((n_images, 224*224*3))
test_labels_int = np.zeros(n_images)
for row, (image_path, class_name) in enumerate(zip(filepaths, labels)):
    test_pics_array[row] = np.array(Image.open(image_path)).reshape(-1) / 255
    test_labels_int[row] = labels_dict[class_name]

In [None]:
def compute_cost(x, y, w, b):
    """Compute the halved mean squared error cost for linear regression.

    Args:
        x: input data, one example per row.
        y: labels, one per example.
        w: weights.
        b: bias.

    Returns:
        The cost ``sum((x.w + b - y)**2) / (2*m)`` with ``m = len(y)``.
    """
    m = len(y)
    predictions = np.dot(x, w) + b
    # A column-vector ``w`` of shape (d, 1) yields predictions of shape (m, 1),
    # while labels are often 1-D with shape (m,). Subtracting those directly
    # would broadcast to an (m, m) matrix and silently give a wrong cost, so
    # labels holding the same number of values are aligned to the predictions.
    targets = np.asarray(y)
    if targets.shape != np.shape(predictions) and targets.size == np.size(predictions):
        targets = targets.reshape(np.shape(predictions))
    errors = predictions - targets
    cost = (1/(2*m)) * np.sum(errors**2)
    return cost

def compute_gradient(x, y, w, b):
    """Compute the gradient of the linear regression cost w.r.t. ``w`` and ``b``.

    Args:
        x: input data, one example per row.
        y: labels, one per example.
        w: weights.
        b: bias.

    Returns:
        tuple: ``(dw, db)`` where ``dw = x.T . (x.w + b - y) / m`` and
        ``db = sum(x.w + b - y) / m`` with ``m = len(y)``.
    """
    m = len(y)
    predictions = np.dot(x, w) + b
    # A column-vector ``w`` of shape (d, 1) yields predictions of shape (m, 1),
    # while labels are often 1-D with shape (m,). Subtracting those directly
    # would broadcast to an (m, m) matrix and produce a (d, m) gradient, so
    # labels holding the same number of values are aligned to the predictions.
    targets = np.asarray(y)
    if targets.shape != np.shape(predictions) and targets.size == np.size(predictions):
        targets = targets.reshape(np.shape(predictions))
    # The residual is shared by both partial derivatives; compute it once.
    errors = predictions - targets
    dw = (1/m) * np.dot(x.T, errors)
    db = (1/m) * np.sum(errors)
    return dw, db

In [None]:
def train(x, y, w, b, learning_rate, num_iterations):
    """Fit the linear regression model with batch gradient descent.

    Args:
        x: input data.
        y: labels.
        w: initial weights.
        b: initial bias.
        learning_rate: step size applied to each gradient.
        num_iterations: number of gradient descent updates to perform.

    Returns:
        tuple: ``(w, b, costs)`` with the final weights, the final bias, and a
        list holding the cost measured after every update.
    """
    costs = []
    for _ in range(num_iterations):
        grad_w, grad_b = compute_gradient(x, y, w, b)
        # Step both parameters against their gradients
        w, b = w - learning_rate * grad_w, b - learning_rate * grad_b
        costs.append(compute_cost(x, y, w, b))
    return w, b, costs

In [None]:
# Start from all-zero parameters: a scalar bias and a column vector holding
# one weight per flattened pixel value (224 * 224 * 3 entries).
b = 0
w = np.zeros(shape=(224*224*3, 1))

In [None]:
# Run 100 gradient descent updates on the training split with a step size of
# 0.0001. Note: this overwrites w and b, so re-running the cell continues
# from the previous result instead of starting from zero.
w, b, costs = train(
    x=train_pics_array,
    y=train_labels_int,
    w=w,
    b=b,
    learning_rate=0.0001,
    num_iterations=100,
)

In [None]:
# Draw the cost recorded after each training iteration on the current axes
ax = plt.gca()
ax.plot(costs)
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
ax.set_title('Cost Function')
plt.show()

In [None]:
# Score every validation image with the trained linear model and flatten the
# result to one value per image
predictions = (np.dot(valid_pics_array, w) + b).reshape(-1)

In [None]:
# Compute the accuracy
# The linear model outputs real numbers, so each prediction is rounded to the
# nearest integer class ID before it is compared with the label. Testing the
# raw floats for exact equality would almost never match.
valid_labels_int = valid_labels_int.reshape(-1)
accuracy = np.sum(valid_labels_int == np.rint(predictions))/len(valid_labels_int)
print('Accuracy: ' + str(accuracy))


In [None]:
# Score every test image with the trained linear model and flatten the
# result to one value per image
predictions = (np.dot(test_pics_array, w) + b).reshape(-1)


In [None]:
# Compute the accuracy
# The linear model outputs real numbers, so each prediction is rounded to the
# nearest integer class ID before it is compared with the label. Testing the
# raw floats for exact equality would almost never match.
test_labels_int = test_labels_int.reshape(-1)
accuracy = np.sum(test_labels_int == np.rint(predictions))/len(test_labels_int)
print('Accuracy: ' + str(accuracy))