In [8]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import cv2
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.datasets import fetch_openml            # common data set access
from sklearn.preprocessing import StandardScaler     # scaling transform
from sklearn.model_selection import train_test_split # validation tools
from sklearn.metrics import zero_one_loss
from sklearn.inspection import DecisionBoundaryDisplay

from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import SGDClassifier       # Used in 2D data problems
from sklearn.linear_model import LogisticRegression  # Used in MNIST data problem
from sklearn.linear_model import LogisticRegressionCV
import torch

# Single seed shared by every random number generator used in this notebook,
# so that a "Restart Kernel -> Run All" reproduces the same results.
seed = 1234
np.random.seed(seed)     # NumPy global RNG (also used by scikit-learn when random_state is None)
torch.manual_seed(seed)  # PyTorch default generator (weight init, shuffling, dropout, ...)

from scipy.ndimage import rotate

In [9]:
# --- Locations of the image folder and of the CSV that labels each image ---
base_path = "."
images_folder = os.path.join(base_path, "images")
csv_file = os.path.join(base_path, "legend.csv")

# --- Label table: the 'image' column holds file names, 'emotion' the class ---
df = pd.read_csv(csv_file)

# Accumulators: one flattened image per entry in `data` (becomes X) and the
# matching emotion in `labels` (becomes y).
data = []
labels = []

# Every image is resized to IMG_SIZE x IMG_SIZE so all feature vectors have
# the same length (MNIST uses 28x28; here we use 64x64).
IMG_SIZE = 64

print("Loading images... this might take a moment.")

# --- Walk the label table and load each referenced image ---
for index, row in df.iterrows():
    img_name, emotion = row['image'], row['emotion']
    img_path = os.path.join(images_folder, img_name)

    # Rows whose file is missing on disk are skipped.
    if not os.path.exists(img_path):
        continue

    # Read as single-channel grayscale; drop the flag to load in color.
    img_array = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)

    # Rows for which imread returned None are skipped as well.
    if img_array is None:
        continue

    # Bring the image to the common size, then unroll the 2D (64x64) grid
    # into a 1D vector of 4096 pixel values.
    new_array = cv2.resize(img_array, (IMG_SIZE, IMG_SIZE))
    flat_array = new_array.flatten()

    data.append(flat_array)
    labels.append(emotion)

# --- Stack the per-image lists into the arrays used downstream ---
X = np.array(data)
y = np.array(labels)

print("Data loaded successfully!")
print(f"Features (X) shape: {X.shape}") # expected: (number of images, 4096)
print(f"Labels (y) shape: {y.shape}")    # expected: (number of images,)


Loading images... this might take a moment.
Data loaded successfully!
Features (X) shape: (13690, 4096)
Labels (y) shape: (13690,)


### **Augmentation**
- Apply augmentation to every sample whose label is not "happiness" or "neutral": each such image gets a horizontally flipped copy and a copy rotated by 10°.

In [20]:
def augment_data(X, y, img_shape=(64, 64), angle=10,
                 skip_labels=('happiness', 'neutral')):
    """Append a flipped and a rotated copy of every sample not in ``skip_labels``.

    Parameters
    ----------
    X : array-like
        Samples, either flattened ``(n_samples, height * width)`` or already
        shaped as images ``(n_samples, height, width)``.
    y : array-like of shape (n_samples,)
        Label of each sample in ``X``.
    img_shape : tuple of int, default (64, 64)
        ``(height, width)`` used to un-flatten 1-D samples. Ignored for
        samples that are already 2-D.
    angle : float, default 10
        Rotation angle in degrees handed to ``scipy.ndimage.rotate``.
    skip_labels : iterable, default ('happiness', 'neutral')
        Labels whose samples are left un-augmented.

    Returns
    -------
    final_X, final_y
        The original samples followed by the augmented ones, in the same
        layout (flat or image-shaped) as ``X``. For each augmented sample the
        flipped copy comes first, then the rotated copy. If no sample
        qualifies, ``X`` and ``y`` are returned without any additions.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not contain the same number of samples.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same length, got {len(X)} and {len(y)}."
        )
    skip_labels = tuple(skip_labels)

    new_X = []
    new_y = []

    print(f"Starting augmentation on {len(X)} images...")

    for sample, label in zip(X, y):
        if label in skip_labels:
            continue

        # Flattened samples must be turned back into a 2-D image before any
        # geometric transform; image-shaped samples are used as they are.
        image = sample.reshape(img_shape) if sample.ndim == 1 else sample

        # --- Augmentation 1: Horizontal Flip ---
        flipped_img = np.fliplr(image)

        # --- Augmentation 2: Rotation ---
        # reshape=False keeps the output the same size as the input image.
        rotated_img = rotate(image, angle=angle, reshape=False)

        # Store each copy in the layout of the incoming sample so the final
        # stacking works for flat and image-shaped X alike.
        new_X.append(flipped_img.reshape(sample.shape))
        new_y.append(label)

        new_X.append(rotated_img.reshape(sample.shape))
        new_y.append(label)

    if not new_X:
        print("No augmentation needed based on criteria.")
        return X, y

    new_X = np.array(new_X)
    new_y = np.array(new_y)

    print(f"Generated {len(new_X)} new augmented samples.")

    # Originals first, augmented samples appended as extra rows.
    final_X = np.concatenate((X, new_X), axis=0)
    final_y = np.concatenate((y, new_y))

    return final_X, final_y

In [21]:
# Build the augmented training arrays from the loaded images, then show how
# many rows the augmentation added by comparing the two shapes.
X_aug, y_aug = augment_data(X, y)
print(
    f"Original shape: {X.shape}",
    f"Augmented shape: {X_aug.shape}",
    sep="\n",
)

Starting augmentation on 13690 images...
Generated 2252 new augmented samples.
Original shape: (13690, 4096)
Augmented shape: (15942, 4096)
