In [None]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow import keras 
from tensorflow.keras import layers
import os
from PIL import Image
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from keras.layers import Conv2D, MaxPooling2D, Dropout, GlobalAveragePooling2D, Dense, BatchNormalization, AveragePooling2D, Flatten
from sklearn import svm 
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report
import aspose.words as aw

In [None]:
"""Getting df with all filenames and images (missing labels at this stage) - shape = (166866,2)"""
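# only take first ten photos in each brand folder -- NOTE(review): not implemented; the loader below reads every image it finds
# timing, (150,150) grayscale --> 2m 28.9s
# timing, (150,150) RGB with pixels as columns --> (not recorded)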

def load_images_from_folder(folder_path, image_size=(150, 150)):
    """Recursively load every image below ``folder_path`` into a DataFrame.

    Each image is resized to ``image_size`` and converted to RGB. The name of
    the directory that directly contains a file is used as that file's label.

    Parameters
    ----------
    folder_path : str or path-like
        Root directory that is walked recursively.
    image_size : tuple of int, default (150, 150)
        Target size handed to ``Image.resize``.

    Returns
    -------
    pandas.DataFrame
        One row per successfully loaded image with the columns ``Label``,
        ``Filename`` and ``Image`` (a PIL image). The three columns are
        present even when no image was found, so downstream column access
        does not fail on an empty result.

    Notes
    -----
    Files that cannot be opened or decoded are reported on stdout and skipped.
    """
    # str.endswith accepts a tuple; the check is done on the lower-cased name
    # so that files such as "LOGO.PNG" or "photo.JPG" are not skipped.
    image_extensions = ('.jpg', '.jpeg', '.png', '.gif')
    columns = ['Label', 'Filename', 'Image']
    records = []

    # Recursively traverse the folder structure
    for root, _dirs, files in os.walk(folder_path):
        # The containing folder's name is the label for all files in it
        label = os.path.basename(root)

        for filename in files:
            if not filename.lower().endswith(image_extensions):
                continue

            file_path = os.path.join(root, filename)
            try:
                with Image.open(file_path) as img:
                    img = img.resize(image_size)  # resize to a common size
                    img = img.convert('RGB')      # force three colour channels
                    records.append({'Label': label, 'Filename': filename, 'Image': img})
            except Exception as e:
                # Best effort: report the unreadable file and carry on
                print(f"Error loading image '{file_path}': {e}")

    # Explicit columns keep the schema stable for an empty result
    return pd.DataFrame(records, columns=columns)

# Loading
# Default dataset location; set the LOGO_DATA_DIR environment variable to
# point the notebook at a different folder without editing this cell.
png_folder_path = os.environ.get(
    "LOGO_DATA_DIR",
    r"C:\Users\emili\OneDrive\Dokumente\03_Master_CBS\05_Summerterm24\01_ML and DL\Project\png logo findder",
)

# os.walk silently yields nothing for a missing directory, which would only
# show up later as a confusing error on the empty frame -- fail early instead.
if not os.path.isdir(png_folder_path):
    raise FileNotFoundError(f"Image folder not found: {png_folder_path}")

image_df = load_images_from_folder(png_folder_path)
print('Image Dataframe: \n \n', image_df.head())

In [None]:
# Map each folder-name label to an integer class id and store it next to the
# original label; the fitted encoder is kept so ids can be mapped back later.
label_encoder = LabelEncoder().fit(image_df['Label'])
image_df['Label coded'] = label_encoder.transform(image_df['Label'])
image_df

In [None]:
# Separate the frame into features (everything except file name and labels)
# and the target (the integer-coded label, kept as a one-column frame)
images = image_df.drop(columns=['Filename', 'Label coded', 'Label'])
labels = image_df.loc[:, ['Label coded']]

In [None]:
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    images,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Perform PCA on X to reduce the number of features:
# keep as many principal components as needed to explain 80% of the variance
from sklearn.decomposition import PCA


def _images_to_matrix(images):
    """Return a 2-D ``(n_samples, n_features)`` array built from ``images``.

    ``images`` is either a DataFrame whose first column holds the images (as
    produced by the train/test split) or an array that was already converted.
    Accepting both makes this cell safe to re-run: the names ``X_train`` /
    ``X_test`` are overwritten with arrays below, and a second run would
    otherwise fail on ``.iloc`` unless the split cell was executed again.
    """
    if hasattr(images, 'iloc'):
        # Convert every PIL image/tensor in the first column to a NumPy array
        images = np.array([np.array(img) for img in images.iloc[:, 0]])
    else:
        images = np.asarray(images)
    # Flatten each sample to one feature row, as the SVM expects
    return images.reshape((images.shape[0], -1))


X_train = _images_to_matrix(X_train)
X_test = _images_to_matrix(X_test)

# A float n_components asks PCA for the explained-variance fraction to keep
pca = PCA(n_components=0.8, random_state=42)
X_pca_train = pca.fit_transform(X_train)

# check how many principal components are needed
# (a bare expression in the middle of a cell is not displayed, so print it)
print('Number of principal components kept:', pca.n_components_)

# apply the PCA fitted on the training set to the test set as well
X_pca_test = pca.transform(X_test)

# check that PCA worked
print(X_pca_train.shape)
print(X_pca_test.shape)

In [None]:
# Define the linear SVM model and the grid search over its regularisation strength C
from sklearn.svm import SVC
from sklearn.multioutput import ClassifierChain
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, roc_auc_score, get_scorer
from sklearn.metrics import classification_report, multilabel_confusion_matrix


# NOTE: this name shadows the `sklearn.svm` module imported in the first cell;
# after this cell runs, `svm` refers to the estimator below.
# probability=True is required so the model exposes predict_proba for ROC AUC.
svm = SVC(kernel='linear', class_weight='balanced', probability=True, random_state=42)

# The target is a single integer-coded label column, i.e. a multi-class
# (single-label) problem. ClassifierChain is meant for multi-label indicator
# targets, so the SVC is tuned directly; the name `multi_svm` is kept so that
# code referring to it keeps working.
multi_svm = svm

# define the hyperparameter grid
param_grid = {
    # the selection of the hyperparameters was in the beginning larger and reduced after some tries
    # restricting the model with smaller values made the best results
    'C': [2**-5, 2**-4, 2**-3, 2**-1],
}

# define the scoring metric: macro-averaged one-vs-rest ROC AUC.
# roc_auc_score needs per-class probabilities for multi-class targets; a scorer
# built with make_scorer(roc_auc_score, ...) alone would feed it hard
# predictions. The predefined 'roc_auc_ovr' scorer uses predict_proba.
scorer = get_scorer('roc_auc_ovr')

# define the grid search
# NOTE(review): fitting with a one-column label frame works but SVC may emit a
# DataConversionWarning; a 1-D label array avoids it.
grid = GridSearchCV(
    multi_svm,
    param_grid,
    cv=5,
    scoring=scorer,
    refit=True,
    return_train_score=True,
    verbose=3)

In [None]:
# Run the cross-validated grid search on the PCA-reduced training data;
# with refit enabled the best configuration is refitted on the full training set
grid.fit(X=X_pca_train, y=y_train)