##  Task-03
##   Implement a support vector machine (SVM) to classify images of cats and dogs from the Kaggle dataset.



In [1]:
import os
import pandas as pd
from PIL import Image
import numpy as np

def prepare_image_dataframe(folder_path):
    """
    Prepares a labelled Pandas DataFrame from images in a folder.

    The label is the part of the file name before the first dot, compared
    case-insensitively (e.g. ``Cat.0.jpg`` -> ``cat``). Files whose label is
    not 'cat' or 'dog' are skipped with a warning; files that cannot be
    opened or decoded are skipped with an error message.

    Args:
        folder_path: Path to the folder containing images.

    Returns:
        A Pandas DataFrame with columns 'image' (NumPy array of the decoded
        image), 'label' ('cat' or 'dog') and 'filepath', with rows ordered
        by file name.
        Returns None if the folder doesn't exist, is not a directory, is
        empty, or contains no valid images.
    """
    # isdir (rather than exists) also rejects a path that points at a regular
    # file, which would otherwise make os.listdir raise NotADirectoryError.
    if not os.path.isdir(folder_path):
        print(f"Error: Folder '{folder_path}' not found.")
        return None

    # os.listdir returns entries in arbitrary order; sort so that the row
    # order of the resulting DataFrame is reproducible across runs/platforms.
    image_files = sorted(
        f for f in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, f))
    )
    if not image_files:
        print(f"Error: Folder '{folder_path}' is empty.")
        return None

    valid_labels = ('cat', 'dog')  # add more labels if needed

    data = []
    for filename in image_files:
        filepath = os.path.join(folder_path, filename)
        # Label is the text before the first dot. Adjust if needed.
        label = filename.lower().split('.')[0]

        if label not in valid_labels:
            print(f"Warning: Skipping file '{filename}' due to unknown label.")
            continue

        try:
            # The context manager closes the underlying file as soon as the
            # pixels have been copied into the NumPy array.
            with Image.open(filepath) as img:
                img_array = np.array(img)  # convert to numpy array
        except (IOError, OSError) as e:
            print(f"Error processing file '{filename}': {e}")
            continue  # Skip files that cause errors

        data.append({'image': img_array, 'label': label, 'filepath': filepath})

    if not data:
        print(f"Error: No valid images found in '{folder_path}'.")
        return None

    return pd.DataFrame(data)


# Example usage: load the labelled training images into a DataFrame.
# NOTE: prepare_image_dataframe returns None when the folder is missing or
# holds no usable images, so df_train should be checked before it is used.
folder_path = "./train_data"  # Replace with your folder path
df_train = prepare_image_dataframe(folder_path)

In [2]:
import os
import pandas as pd
from PIL import Image
import numpy as np

def prepare_test_image_dataframe(folder_path):
    """
    Prepares a Pandas DataFrame from test images (no labels) in a folder.

    Every regular file in the folder is treated as an image; files that
    cannot be opened or decoded are skipped with an error message.

    Args:
        folder_path: Path to the folder containing test images.

    Returns:
        A Pandas DataFrame with columns 'image' (NumPy array of the decoded
        image) and 'filepath', with rows ordered by file name.
        Returns None if the folder doesn't exist, is not a directory, is
        empty, or contains no valid images.
    """
    # isdir (rather than exists) also rejects a path that points at a regular
    # file, which would otherwise make os.listdir raise NotADirectoryError.
    if not os.path.isdir(folder_path):
        print(f"Error: Folder '{folder_path}' not found.")
        return None

    # os.listdir returns entries in arbitrary order; sort so that the row
    # order of the resulting DataFrame is reproducible across runs/platforms.
    image_files = sorted(
        f for f in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, f))
    )
    if not image_files:
        print(f"Error: Folder '{folder_path}' is empty.")
        return None

    data = []
    for filename in image_files:
        filepath = os.path.join(folder_path, filename)
        try:
            # The context manager closes the underlying file as soon as the
            # pixels have been copied into the NumPy array.
            with Image.open(filepath) as img:
                img_array = np.array(img)
        except (IOError, OSError) as e:
            print(f"Error processing file '{filename}': {e}")
            continue  # Skip files that cause errors

        data.append({'image': img_array, 'filepath': filepath})

    if not data:
        print(f"Error: No valid images found in '{folder_path}'.")
        return None

    return pd.DataFrame(data)

# Example usage: load the unlabelled test images into a DataFrame.
# NOTE: prepare_test_image_dataframe returns None when the folder is missing
# or holds no usable images, so test_df should be checked before it is used.
test_folder_path = "./test_data"  # Replace with your test folder path
test_df = prepare_test_image_dataframe(test_folder_path)

In [3]:
# Preview the first rows of the test DataFrame ('image' and 'filepath' columns).
test_df.head()

Unnamed: 0,image,filepath
0,"[[[38, 67, 101], [36, 65, 99], [39, 65, 102], ...",./test_data\1.jpg
1,"[[[10, 10, 0], [9, 11, 0], [9, 10, 4], [9, 9, ...",./test_data\10.jpg
2,"[[[4, 4, 4], [3, 3, 3], [2, 2, 2], [3, 3, 3], ...",./test_data\100.jpg
3,"[[[221, 222, 216], [221, 222, 216], [222, 223,...",./test_data\11.jpg
4,"[[[141, 127, 114], [176, 162, 149], [163, 149,...",./test_data\12.jpg


In [4]:
# Preview the first rows of the training DataFrame ('image', 'label', 'filepath').
df_train.head()

Unnamed: 0,image,label,filepath
0,"[[[202, 164, 89], [205, 167, 92], [208, 170, 9...",cat,./train_data\cat.0.jpg
1,"[[[39, 44, 38], [40, 45, 41], [41, 45, 44], [4...",cat,./train_data\cat.1.jpg
2,"[[[25, 32, 40], [16, 23, 33], [7, 11, 22], [11...",cat,./train_data\cat.10.jpg
3,"[[[223, 224, 219], [223, 224, 219], [223, 224,...",cat,./train_data\cat.100.jpg
4,"[[[71, 253, 254], [72, 254, 255], [72, 254, 25...",cat,./train_data\cat.101.jpg


In [5]:
# Count the training images per label to see how balanced the classes are.
df_train['label'].value_counts()

label
cat    251
dog    251
Name: count, dtype: int64

In [6]:
# Encode the string labels as integers for the classifier: cat -> 1, dog -> 0.
# The guard makes this cell safe to re-run: once the column is numeric,
# `df_train['label'] == 'cat'` is False for every row, so re-encoding would
# silently overwrite every label with 0.
if not pd.api.types.is_numeric_dtype(df_train['label']):
    df_train['label'] = np.where(df_train['label'] == 'cat', 1, 0)

In [7]:
# Preview the training DataFrame again to confirm the labels are now 1 (cat) / 0 (dog).
df_train.head()

Unnamed: 0,image,label,filepath
0,"[[[202, 164, 89], [205, 167, 92], [208, 170, 9...",1,./train_data\cat.0.jpg
1,"[[[39, 44, 38], [40, 45, 41], [41, 45, 44], [4...",1,./train_data\cat.1.jpg
2,"[[[25, 32, 40], [16, 23, 33], [7, 11, 22], [11...",1,./train_data\cat.10.jpg
3,"[[[223, 224, 219], [223, 224, 219], [223, 224,...",1,./train_data\cat.100.jpg
4,"[[[71, 253, 254], [72, 254, 255], [72, 254, 25...",1,./train_data\cat.101.jpg


In [8]:
from sklearn import svm

# Support vector classifier created with scikit-learn's default
# hyperparameters (no arguments are passed to SVC).
# NOTE(review): the features fed to it later are raw flattened pixel values;
# feature scaling and explicit kernel/C choices may be worth considering —
# confirm against the scikit-learn SVC documentation.
clf = svm.SVC()

In [9]:
# Turn each image array into a single flat feature row, then stack the rows
# into the training matrix.
# NOTE(review): this presumably relies on every image having the same shape,
# otherwise the rows differ in length — confirm against the dataset.
flattened_images = [pixels.flatten() for pixels in df_train['image']]
x_train = np.array(flattened_images)

# Training targets: the 'label' column of the training DataFrame.
y_train = df_train["label"]

In [10]:
# Train the classifier on the flattened training images and their labels.
clf.fit(x_train, y_train)

In [17]:
# Predict on the training images themselves (flattened the same way as for
# fitting). NOTE(review): this shows how well the model fits data it has
# already seen; it is not a measure of performance on unseen images.
train_features = np.array([pixels.flatten() for pixels in df_train['image']])
clf.predict(train_features)

array([1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [18]:
# Predict a class for every unlabelled test image, flattening each image
# into one feature row first.
test_features = np.array([pixels.flatten() for pixels in test_df['image']])
clf.predict(test_features)

array([0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1])