## Import Dependencies

In [None]:
# Install the Kaggle CLI used by the download cell below.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q kaggle



In [None]:
# -p keeps this cell re-runnable: no error when ~/.kaggle already exists.
! mkdir -p ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [None]:
# Copy kaggle.json from the current working directory into ~/.kaggle/.
! cp kaggle.json ~/.kaggle/

In [None]:
# Mode 600: only the owner may read or write the copied kaggle.json.
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
import os
import time
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb #remove when done
import pandas as pd
import seaborn as sns
# from clean import createImageDataset, getFeaturesAndLabels, getDigitGnd, getDigitRange
from sklearn.metrics import accuracy_score
    
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'    # removes a warning when using tensorflow


## Parameters to Tune

In [None]:
    """DATA EXTRACTION PARAMETERS"""
    # Seed handed to the dataset loader.
    seed_val = 42

    # Each batch size equals the number of images in its split,
    # so a single batch holds the whole split.
    train_batch_size = 47332    # training images
    valid_batch_size = 1625     # validation images
    test_batch_size = 1625      # testing images

    # False -> color_mode = "grayscale" (far fewer features per image).
    isColored = False
    # Pixel brightness range: False -> [0,255], True -> [0,1].
    isNormalized = False

    """MODEL PARAMETERS"""
    # Earlier xgboost configuration, kept for reference:
    # num_trees = 3
    # param = {
    #     'objective': 'multi:softmax',
    #     # 'num_parallel_tree': 3,
    #     'subsample': 0.8,
    #     'num_class': 7,
    #     'max_depth': 3,
    #     'tree_method': 'hist',
    #     'random_state': 42,
    #     'eval_metric': 'mlogloss'
    # }
    n_estimators = 500
    learning_rate = 1
    

## Getting the Data

In [None]:
# Download the gpiosenka/100-bird-species dataset archive with the Kaggle CLI.
! kaggle datasets download -d gpiosenka/100-bird-species

Downloading 100-bird-species.zip to /content
100% 1.17G/1.17G [00:15<00:00, 58.5MB/s]
100% 1.17G/1.17G [00:15<00:00, 82.7MB/s]


In [None]:
# -q: do not print one line per extracted image (tens of thousands of files).
# -n: never overwrite existing files, so re-running the cell does not prompt.
! unzip -q -n 100-bird-species.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: train/TURQUOISE MOTMOT/043.jpg  
  inflating: train/TURQUOISE MOTMOT/044.jpg  
  inflating: train/TURQUOISE MOTMOT/045.jpg  
  inflating: train/TURQUOISE MOTMOT/046.jpg  
  inflating: train/TURQUOISE MOTMOT/047.jpg  
  inflating: train/TURQUOISE MOTMOT/048.jpg  
  inflating: train/TURQUOISE MOTMOT/049.jpg  
  inflating: train/TURQUOISE MOTMOT/050.jpg  
  inflating: train/TURQUOISE MOTMOT/051.jpg  
  inflating: train/TURQUOISE MOTMOT/052.jpg  
  inflating: train/TURQUOISE MOTMOT/053.jpg  
  inflating: train/TURQUOISE MOTMOT/054.jpg  
  inflating: train/TURQUOISE MOTMOT/055.jpg  
  inflating: train/TURQUOISE MOTMOT/056.jpg  
  inflating: train/TURQUOISE MOTMOT/057.jpg  
  inflating: train/TURQUOISE MOTMOT/058.jpg  
  inflating: train/TURQUOISE MOTMOT/059.jpg  
  inflating: train/TURQUOISE MOTMOT/060.jpg  
  inflating: train/TURQUOISE MOTMOT/061.jpg  
  inflating: train/TURQUOISE MOTMOT/062.jpg  
  inflating: tr

In [None]:
from tensorflow.keras.preprocessing import image_dataset_from_directory
from tensorflow.keras import layers
from tensorflow_datasets import as_numpy

def createImageDataset(path, color, seedVal, batchSize):
    """Load a directory of images as a dataset with labels inferred from it.

    Args:
        path: Root directory handed to ``image_dataset_from_directory``.
        color: Exactly ``True`` selects ``"rgb"``; any other value selects
            ``"grayscale"``.
        seedVal: Forwarded as ``seed``.
        batchSize: Forwarded as ``batch_size``.

    Returns:
        The dataset returned by ``image_dataset_from_directory`` with images
        resized to 32x32 and shuffling disabled.
    """
    colorMode = "rgb" if color is True else "grayscale"

    return image_dataset_from_directory(
        path,
        labels="inferred",
        color_mode=colorMode,
        image_size=(32, 32),
        seed=seedVal,
        batch_size=batchSize,
        shuffle=False,
    )

def getFeaturesAndLabels(norm, batch):
    """Turn the first batch of a dataset into flat NumPy features and labels.

    Args:
        norm: When exactly ``True``, pixel values are rescaled by 1/255 before
            extraction; otherwise they are left unchanged.
        batch: Dataset yielding ``(images, labels)`` pairs.

    Returns:
        ``(npImages, npLabels)``: the images reshaped to one row per image, and
        the matching labels. Only the first element produced by iterating
        ``batch`` is read.
    """
    if norm is True:
        rescale = layers.Rescaling(1./255)
        batch = batch.map(lambda imgs, lbls: (rescale(imgs), lbls))

    firstImages, firstLabels = next(iter(batch))

    npLabels = as_numpy(firstLabels)
    rawImages = as_numpy(firstImages)
    # Collapse every axis after the first into a single feature axis.
    npImages = rawImages.reshape(rawImages.shape[0], -1)

    return npImages, npLabels


In [None]:
    
# Split directories, relative to the working directory (where the archive was unzipped).
train_data_dir = "./train"
valid_data_dir = "./valid"
test_data_dir = "./test"

# Build one dataset per split; every split shares the colour mode and seed.
train_set = createImageDataset(
    path=train_data_dir, color=isColored, seedVal=seed_val, batchSize=train_batch_size
)
valid_set = createImageDataset(
    path=valid_data_dir, color=isColored, seedVal=seed_val, batchSize=valid_batch_size
)
test_set = createImageDataset(
    path=test_data_dir, color=isColored, seedVal=seed_val, batchSize=test_batch_size
)

# Convert each split into (flattened features, labels) NumPy arrays.
train_features, train_labels = getFeaturesAndLabels(batch=train_set, norm=isNormalized)
valid_features, valid_labels = getFeaturesAndLabels(batch=valid_set, norm=isNormalized)
test_features, test_labels = getFeaturesAndLabels(batch=test_set, norm=isNormalized)



Found 47332 files belonging to 325 classes.
Found 1625 files belonging to 325 classes.
Found 1625 files belonging to 325 classes.


## Creating and training a model

In [None]:
import pickle
# -p keeps this cell re-runnable: no error when ./models already exists.
! mkdir -p ./models

def saveModel(model):
    """Pickle ``model`` to the next unused ``./models/ada-clf-<n>.sav`` file.

    Version numbers start at 1; the first number whose file does not exist
    yet is used, so earlier saves are never overwritten.

    Args:
        model: Any picklable object (here, the fitted classifier).

    Returns:
        None. The file is written as a side effect.
    """
    model_dir = "./models/"
    prefix = "ada-clf-"
    suffix = ".sav"

    # Create the target directory here so saving does not depend on a shell
    # cell having been run first.
    os.makedirs(model_dir, exist_ok=True)

    i = 1
    model_version = model_dir + prefix + str(i) + suffix
    while os.path.exists(model_version):
        i += 1
        # Every candidate keeps the directory prefix; without it, versions
        # after the first would be checked and written in the working directory.
        model_version = model_dir + prefix + str(i) + suffix

    # The context manager closes the file even if pickling fails.
    with open(model_version, 'wb') as model_file:
        pickle.dump(model, model_file)

In [None]:
# Train one AdaBoost classifier on the training split, save it, and score it
# on the validation split.
print("Creating model...")
model = AdaBoostClassifier(
    n_estimators=150,
    learning_rate=0.05,
    random_state=seed_val,    # fixed seed so repeated runs train the same ensemble
)
print("Training model...")
start = time.time()
gbm = model.fit(train_features, train_labels)
end = time.time()    # stop the clock before saving so disk I/O is not timed
saveModel(gbm)
print("Training complete. Elapsed time in seconds: " + str(end-start))

print("Predicting...")
# predict() already returns class labels, so no per-element rounding is needed.
y_pred = model.predict(valid_features)
pred = y_pred    # same predictions, kept under the name this cell has always defined
accuracy = accuracy_score(valid_labels, pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Creating model...
Training model...
Training complete. Elapsed time in seconds: 580.3369171619415
Predicting...
Accuracy: 40.00%


## Cross Validation

In [None]:
# Grid over n_estimators x learning_rate, scoring each pair with k-fold
# cross-validation on the validation split.
from sklearn.model_selection import cross_val_score

print("Cross Validating...")

# Flat list with one mean error per (n_estimators, learning_rate) pair, in loop
# order: all learning rates for the first n_estimators, then the next, ...
mean_err=[]
NE_range = np.arange(150,500,150)
LR_range = np.arange(0.01,0.05,0.01)

parameter1="n_estimators"
parameter2="learning_rate"

# NOTE(review): the folds are drawn from the validation split only (1625 images
# over 325 classes), so each fold trains on very few images per class. Consider
# cross-validating on the training data instead - confirm intent.
for i in NE_range:
  for j in LR_range:
    # Only non-default arguments are passed; the explicit base_estimator /
    # algorithm defaults are deprecated or removed in newer scikit-learn.
    ada = AdaBoostClassifier(
        n_estimators=i,
        learning_rate=j,
        random_state=seed_val)
    # cross_val_score clones the estimator and fits it on each fold itself, so
    # no separate fit on the training split is done here. This also leaves the
    # `model` trained in the previous cell untouched.
    cv_score = cross_val_score(ada, valid_features, valid_labels, n_jobs=2)
    cv_error = 1 - cv_score
    mean_err.append(cv_error.mean())

    # Both values are fractions in [0, 1]; format them as percentages.
    print(f'{parameter1} = {i} | {parameter2} = {j} \t | \t Average Error: {cv_error.mean():.2%} | Average Accuracy: {cv_score.mean():.2%}')


Cross Validating...
n_estimators = 150 | learning_rate = 0.01 	 | 	 Average Error: % 0.8400000000000001 | Average Accuracy: 0.16
n_estimators = 150 | learning_rate = 0.02 	 | 	 Average Error: % 0.8400000000000001 | Average Accuracy: 0.16
n_estimators = 150 | learning_rate = 0.03 	 | 	 Average Error: % 0.8400000000000001 | Average Accuracy: 0.16
n_estimators = 150 | learning_rate = 0.04 	 | 	 Average Error: % 0.8400000000000001 | Average Accuracy: 0.16


KeyboardInterrupt: ignored

## Plot Model

In [None]:
import matplotlib.pyplot as plt

print("Plotting model...")

# mean_err is a flat list with one entry per (n_estimators, learning_rate)
# pair, filled one n_estimators value at a time. Reshape it into a grid with one
# row per n_estimators so each row can be drawn against LR_range. Only complete
# rows are used, so an interrupted cross-validation run can still be plotted.
n_rates = len(LR_range)
n_complete = len(mean_err) // n_rates
err_grid = np.asarray(mean_err[:n_complete * n_rates], dtype=float).reshape(n_complete, n_rates)

fig, ax = plt.subplots()
for n_trees, err_row in zip(NE_range, err_grid):
    ax.plot(LR_range, err_row, marker="o", label=f"n_estimators = {n_trees}")

ax.set_xlabel("Learning Rate")
ax.set_ylabel("Mean Cross-Validation Error")
ax.set_title("AdaBoost cross-validation error")
if n_complete:
    ax.legend()
plt.show()