# 01_Preprocess.ipynb
### Facial Emotion Recognition — Data Preprocessing (Clean Version)
This notebook:
- Loads `fer2013.csv`
- Converts pixel strings → cropped & resized images
- Uses MTCNN (optional) for face detection
- Displays a single clean tqdm progress bar
- Suppresses all TensorFlow noise for a neat output

In [8]:
# Install dependencies (if not already installed)
!pip install mtcnn opencv-python pandas numpy tqdm pillow scikit-learn tensorflow --quiet

In [2]:
# Standard library first: `os` is needed before anything else so the
# TensorFlow log level can be configured ahead of the TensorFlow import.
import os
import sys
import time

# TF_CPP_MIN_LOG_LEVEL controls TensorFlow's native (C++) logging and is only
# reliably honoured when it is already set at the moment TensorFlow is first
# imported. Setting it after `import tensorflow` is too late to silence the
# start-up messages.
# NOTE(review): `mtcnn` presumably imports TensorFlow as well, so this line
# is kept ahead of every third-party import below.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import cv2
import numpy as np
import pandas as pd
import tensorflow as tf
from mtcnn.mtcnn import MTCNN
from PIL import Image
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Python-side TensorFlow logging: keep errors only.
tf.get_logger().setLevel('ERROR')
tf.autograph.set_verbosity(0)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

In [3]:
# --- Configuration -----------------------------------------------------------
# Everything a reader might want to tune lives here: where the raw CSV is,
# where processed images go, the output edge length in pixels, and whether
# face detection is applied before resizing.
DATA_PATH = '../data/fer2013.csv'
OUTPUT_DIR = '../data/cropped_faces'
TARGET_SIZE = 224
USE_MTCNN = True
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Load --------------------------------------------------------------------
# Read the CSV and discard rows that carry no pixel string.
print(f"Reading dataset from: {DATA_PATH}")
df = pd.read_csv(DATA_PATH).dropna(subset=['pixels'])
print(f"Loaded {df.shape[0]} samples.")

Reading dataset from: ../data/fer2013.csv
Loaded 35887 samples.


In [4]:
def pixels_to_image(pixels_str, size=48):
    """Decode a space-separated pixel string into a 3-channel uint8 image.

    Args:
        pixels_str: String of ``size * size`` integer grey levels separated
            by spaces, in row-major order.
        size: Edge length of the square image encoded in ``pixels_str``.
            Defaults to 48, the value this notebook was written for.

    Returns:
        A ``(size, size, 3)`` ``uint8`` array produced by
        ``cv2.cvtColor(..., cv2.COLOR_GRAY2BGR)``.

    Raises:
        ValueError: If the string does not contain exactly ``size * size``
            values.
    """
    values = np.fromstring(pixels_str, dtype=int, sep=' ')

    # Fail with a message that names the offending count instead of the
    # generic "cannot reshape" error numpy would raise further down.
    expected = size * size
    if values.size != expected:
        raise ValueError(
            f"expected {expected} pixel values for a {size}x{size} image, "
            f"got {values.size}"
        )

    gray = values.reshape(size, size).astype('uint8')
    return cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)

def detect_and_crop(img, target_size=224, use_mtcnn=True):
    """Crop the first detected face from ``img`` and resize it to a square.

    Falls back to resizing the whole image when detection is disabled, when
    no face is found, or when the reported box has no usable area inside the
    image.

    Args:
        img: Image array indexed as ``img[row, col]`` (as returned by
            ``cv2.imread``).
        target_size: Edge length, in pixels, of the returned square image.
        use_mtcnn: When False, skip detection and just resize ``img``.

    Returns:
        The result of ``cv2.resize`` at ``(target_size, target_size)``.
    """
    out_size = (target_size, target_size)
    if not use_mtcnn:
        return cv2.resize(img, out_size)

    # Build the detector lazily and cache it as a module-level global so it
    # is constructed at most once per kernel session.
    global detector
    if 'detector' not in globals():
        detector = MTCNN()

    # NOTE(review): `img` arrives in OpenCV channel order (BGR). That makes no
    # difference for the grey images used here (all channels equal) — confirm
    # the expected channel order before feeding colour images.
    results = detector.detect_faces(img)
    if not results:
        return cv2.resize(img, out_size)

    x, y, w, h = results[0]['box']
    img_h, img_w = img.shape[:2]

    # Work out the far edges from the *unclamped* origin. Clamping x/y first
    # and then adding w/h would push the right/bottom edge outwards whenever
    # the detector reports a negative coordinate, widening the crop.
    x1, y1 = max(0, x), max(0, y)
    x2, y2 = min(img_w, x + w), min(img_h, y + h)

    # A zero-area or fully out-of-frame box would give an empty slice, which
    # cv2.resize rejects; use the whole image instead.
    if x2 <= x1 or y2 <= y1:
        return cv2.resize(img, out_size)

    return cv2.resize(img[y1:y2, x1:x2], out_size)

In [5]:
start_time = time.time()

# Stage 1: materialise every CSV row as a JPEG under <OUTPUT_DIR>/temp_raw.
# File names follow "<row index>_<emotion label>.jpg".
temp_dir = os.path.join(OUTPUT_DIR, 'temp_raw')
os.makedirs(temp_dir, exist_ok=True)
X_paths, y_labels = [], []

# Walk the two needed columns side by side with the row index.
records = zip(df.index, df['pixels'], df['emotion'])
for idx, pixels, emotion in tqdm(records, total=len(df), desc='Converting pixel data', dynamic_ncols=True):
    img = pixels_to_image(pixels)
    label = str(int(emotion))
    out_path = os.path.join(temp_dir, f'{idx}_{label}.jpg')
    Image.fromarray(img).save(out_path)
    X_paths.append(out_path)
    y_labels.append(label)

# Stage 2: stratified 85/15 train/validation split over the saved paths,
# with a fixed seed so the partition is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_paths,
    y_labels,
    test_size=0.15,
    stratify=y_labels,
    random_state=42,
)
splits = [
    ('train', X_train, y_train),
    ('val', X_val, y_val),
]

Converting pixel data: 100%|██████████| 35887/35887 [00:08<00:00, 4452.43it/s]


In [6]:
# Re-apply the quiet-logging settings in case this cell is run on its own.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
tf.get_logger().setLevel("ERROR")
tf.autograph.set_verbosity(0)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
try:
    # Best effort: deliberately ignored if this call fails for any reason.
    tf.keras.utils.disable_interactive_logging()
except Exception:
    pass

# Flatten the splits into one work list so a single progress bar covers
# everything: (source path, emotion label, split name).
all_images = [
    (img_path, label, split_name)
    for split_name, X_split, y_split in splits
    for img_path, label in zip(X_split, y_split)
]

failed = 0
start_time = time.time()
created_folders = set()  # output folders already ensured to exist

with tqdm(total=len(all_images), desc="Processing all images", dynamic_ncols=True, file=sys.stdout, position=0, leave=True) as pbar:
    for img_path, label, split_name in all_images:
        # try/finally guarantees exactly one progress tick per image, no
        # matter which branch below ends the iteration.
        try:
            img = cv2.imread(img_path)
            if img is None:
                failed += 1
                continue

            # Face detection & crop. An OpenCV failure on one image is
            # recorded instead of aborting the whole (hours-long) run.
            try:
                cropped = detect_and_crop(img, target_size=TARGET_SIZE, use_mtcnn=USE_MTCNN)
            except cv2.error:
                failed += 1
                continue

            # Save output under <OUTPUT_DIR>/<split>/<label>/<original name>.
            out_folder = os.path.join(OUTPUT_DIR, split_name, label)
            if out_folder not in created_folders:
                os.makedirs(out_folder, exist_ok=True)
                created_folders.add(out_folder)
            out_file = os.path.join(out_folder, os.path.basename(img_path))

            # cv2.imwrite signals failure through its return value rather
            # than an exception, so it has to be checked explicitly.
            if not cv2.imwrite(out_file, cropped):
                failed += 1
        finally:
            pbar.update(1)

elapsed = time.time() - start_time
print(f"\n✅ Completed in {elapsed/60:.2f} minutes  •  Failed images: {failed}")

Processing all images: 100%|██████████| 35887/35887 [1:55:12<00:00,  5.19it/s]     

✅ Completed in 115.21 minutes  •  Failed images: 0


In [7]:
# Final recap: how many images made it through, how many did not, how long
# the processing cell took, and where the results were written.
total_processed = len(all_images) - failed
summary_lines = [
    "\n✅ Preprocessing completed successfully!",
    f"Total processed: {total_processed}",
    f"Failed: {failed}",
    f"Time taken: {elapsed/60:.2f} minutes",
    f"Processed data saved in: {OUTPUT_DIR}",
]
print("\n".join(summary_lines))


✅ Preprocessing completed successfully!
Total processed: 35887
Failed: 0
Time taken: 115.21 minutes
Processed data saved in: ../data/cropped_faces
