# Libraries

### For CNN

In [8]:
from keras.layers import Rescaling
from keras.models import Sequential
from keras.utils import image_dataset_from_directory
from keras.layers import Conv2D, MaxPooling2D, Dense, Flatten

### For KNN

In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

### Utilities

In [10]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from data_cleanup import clean_data
from sklearn.preprocessing import LabelEncoder

# Setup

In [11]:
# Dataset root, relative to the notebook's working directory. Each sub-folder
# is treated as one class: its name becomes the label in the KNN loader below
# and the class name for the Keras dataset loader.
data_dir = 'pizza types'

In [12]:
# Converting all files to .jpg and resizing them to 256x256
# NOTE(review): clean_data comes from the local data_cleanup module; presumably
# it rewrites the files under data_dir in place — confirm before re-running.
# The later flattening step (np.reshape to one row per image) relies on every
# image having the same size.
clean_data(data_dir)

# KNN

In [13]:
def train_test_split_from_directory(data_dir):
    """Load every image under ``data_dir`` and split each class 80/20 into train/test.

    ``data_dir`` is expected to contain one sub-directory per class; the
    sub-directory name is used as the (string) label of every image in it.
    The split is performed per class, so every class contributes the same
    train/test proportion.

    Entries of ``data_dir`` that are not directories, files OpenCV cannot
    decode, and classes without any readable image are skipped instead of
    raising. A class with a single readable image cannot be split and is kept
    entirely in the training set.

    Args:
        data_dir: Path of the dataset root directory.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as plain Python lists. The ``X``
        lists hold RGB image arrays, the ``y`` lists the matching class names.
    """
    X_train, X_test, y_train, y_test = [], [], [], []

    # Sorted so the traversal order (and therefore the seeded split) does not
    # depend on the file system's arbitrary listing order.
    for subdir in sorted(os.listdir(data_dir)):
        subdir_path = os.path.join(data_dir, subdir)
        if not os.path.isdir(subdir_path):
            # Stray files (e.g. .DS_Store) are not classes.
            continue

        X_subdir, y_subdir = [], []
        for filename in sorted(os.listdir(subdir_path)):
            bgr = cv2.imread(os.path.join(subdir_path, filename))
            if bgr is None:
                # cv2.imread signals an unreadable file by returning None.
                continue
            X_subdir.append(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
            y_subdir.append(subdir)

        if not X_subdir:
            # Nothing usable in this class folder.
            continue
        if len(X_subdir) < 2:
            # A single sample cannot be divided into two non-empty parts.
            X_train.extend(X_subdir)
            y_train.extend(y_subdir)
            continue

        X_train_subdir, X_test_subdir, y_train_subdir, y_test_subdir = train_test_split(
            X_subdir, y_subdir, test_size=0.2, random_state=42
        )

        X_train.extend(X_train_subdir)
        X_test.extend(X_test_subdir)
        y_train.extend(y_train_subdir)
        y_test.extend(y_test_subdir)

    return X_train, X_test, y_train, y_test

In [14]:
# Load all images and split them per class into train/test lists
# (test_size=0.2 inside train_test_split_from_directory). At this point the
# samples are still grouped by class folder; they are shuffled in a later cell.
X_train, X_test, y_train, y_test = train_test_split_from_directory(data_dir)

In [15]:
def shuffle_data(data, seed):
    """Return the items of ``data`` as a new list in a seeded random order.

    The order is a permutation drawn from ``np.random.default_rng(seed)``, so
    two sequences of equal length shuffled with the same seed are reordered
    identically (element ``i`` of one stays paired with element ``i`` of the
    other). ``data`` itself is not modified.

    Args:
        data: Indexable sequence supporting ``len()``.
        seed: Seed passed to ``np.random.default_rng``.

    Returns:
        A list containing the same items as ``data`` in permuted order.
    """
    order = np.random.default_rng(seed).permutation(len(data))
    shuffled = []
    for position in order:
        shuffled.append(data[position])
    return shuffled

In [16]:
# Features and labels are shuffled with the SAME seed: shuffle_data draws an
# identical permutation for equal-length inputs, so every image stays paired
# with its label.
seed = 42
X_train, y_train = shuffle_data(X_train, seed), shuffle_data(y_train, seed)
X_test, y_test = shuffle_data(X_test, seed), shuffle_data(y_test, seed)


In [17]:
# Flatten every image into a single row so the data becomes a 2-D
# (n_samples, n_features) array; this requires all images to share one shape.
X_train = np.asarray(X_train).reshape(len(y_train), -1)
X_test = np.asarray(X_test).reshape(len(y_test), -1)

In [18]:
# Learn the class-name -> integer mapping from the training labels only and
# reuse that same mapping for the test labels. Fitting a second time on the
# test labels would silently produce a different mapping whenever the two sets
# do not contain exactly the same classes; transform() instead raises on a
# label the encoder has never seen.
label_encoder = LabelEncoder()

y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [19]:
# Sweep k for a k-nearest-neighbours classifier on the flattened pixel vectors
# and print train/test accuracy for every k tried.
# NOTE(review): the best k is picked on the test set, so the reported maximum
# is an optimistic estimate — consider selecting k on a separate validation split.
max_testing_accuracy = 0
best_k = None

# n_neighbors may not exceed the number of fitted samples, so cap the original
# upper bound (393) by the size of the training set.
max_k = min(393, len(X_train))

for k in range(1, max_k + 1, 5):
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Score each split once and reuse the values below.
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    if test_accuracy > max_testing_accuracy:
        max_testing_accuracy = test_accuracy
        best_k = k

    print('For k =', k)
    print('Train Data Accuracy:', train_accuracy)
    print()
    print('Test Data Accuracy:', test_accuracy)
    print()
print(max_testing_accuracy)
print('Best k:', best_k)

For k = 1
Train Data Accuracy: 1.0

Test Data Accuracy: 0.40594059405940597

For k = 6
Train Data Accuracy: 0.44472361809045224

Test Data Accuracy: 0.31683168316831684

For k = 11
Train Data Accuracy: 0.42462311557788945

Test Data Accuracy: 0.2871287128712871

For k = 16
Train Data Accuracy: 0.40954773869346733

Test Data Accuracy: 0.33663366336633666

For k = 21
Train Data Accuracy: 0.3869346733668342

Test Data Accuracy: 0.33663366336633666

For k = 26
Train Data Accuracy: 0.38944723618090454

Test Data Accuracy: 0.297029702970297

For k = 31
Train Data Accuracy: 0.3768844221105528

Test Data Accuracy: 0.297029702970297

For k = 36
Train Data Accuracy: 0.3743718592964824

Test Data Accuracy: 0.33663366336633666

For k = 41
Train Data Accuracy: 0.3592964824120603

Test Data Accuracy: 0.37623762376237624

For k = 46
Train Data Accuracy: 0.3592964824120603

Test Data Accuracy: 0.32673267326732675

For k = 51
Train Data Accuracy: 0.35678391959798994

Test Data Accuracy: 0.3168316831683

# CNN

In [20]:
# Training portion (80 %) of the images under data_dir, batched 32 at a time.
# validation_split and seed must stay identical to the validation loader so
# that the two subsets do not overlap.
train_data = image_dataset_from_directory(
    directory=data_dir,
    validation_split=0.2,
    subset='training',
    seed=42,
    image_size=(256, 256),
    batch_size=32,
)

Found 499 files belonging to 5 classes.
Using 400 files for training.


In [21]:
# Validation portion (20 %) of the images under data_dir, batched 32 at a time.
# validation_split and seed must stay identical to the training loader so
# that the two subsets do not overlap.
val_data = image_dataset_from_directory(
    directory=data_dir,
    validation_split=0.2,
    subset='validation',
    seed=42,
    image_size=(256, 256),
    batch_size=32,
)

Found 499 files belonging to 5 classes.
Using 99 files for validation.


In [22]:
# Small CNN classifier: pixel rescaling, three conv/max-pool stages, then a
# dense head with one output unit per class.
# One output unit per class folder found by the dataset loader.
num_classes = len(train_data.class_names)

model = Sequential()

# The input shape belongs on the FIRST layer of a Sequential model; on a later
# layer it has no effect on how the model is built.
model.add(Rescaling(1./255, input_shape=(256,256,3)))
model.add(Conv2D(16, (3,3), activation='relu'))
model.add(MaxPooling2D())
model.add(Conv2D(32, (3,3), activation='relu'))
model.add(MaxPooling2D())
model.add(Conv2D(16, (3,3), activation='relu'))
model.add(MaxPooling2D())
model.add(Flatten())
model.add(Dense(256, activation='relu'))
# softmax is required here: the string loss 'sparse_categorical_crossentropy'
# uses from_logits=False, i.e. it expects class probabilities, not raw logits.
model.add(Dense(num_classes, activation='softmax'))

model.compile('adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 20 epochs on the training batches; the validation batches are
# evaluated after every epoch.
model.fit(train_data, epochs=20, validation_data=val_data)

Epoch 1/20


In [None]:
# Load one image for inference as 256x256 RGB, matching the model's input size.
image_path = 'bestpizza.jpg'
bgr = cv2.imread(image_path)
if bgr is None:
    # cv2.imread returns None instead of raising when the file is missing or unreadable.
    raise FileNotFoundError('Could not read image: ' + image_path)

img = cv2.cvtColor(cv2.resize(bgr, (256, 256)), cv2.COLOR_BGR2RGB)
plt.imshow(img)
plt.show()
# Keep the raw 0-255 pixel values: the model's first layer is Rescaling(1./255),
# so dividing by 255 here as well would scale the input twice.
img = np.expand_dims(img.astype('float32'), axis=0)


In [None]:
# img is a batch of one image (np.expand_dims added the leading axis in the
# previous cell), so predictions holds a single row of per-class scores.
predictions = model.predict(img)

In [None]:
# Translate the highest-scoring output unit back into its class name.
predicted_class_index = predictions.argmax()
class_labels = train_data.class_names
class_labels[predicted_class_index]