In [None]:
import pandas as pd
import numpy as np
import os
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import tensorflow as tf
from keras.models import Model
from keras.layers import Dense, Conv2D, Dropout, Flatten, MaxPooling2D, Input
from PIL import Image
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler

BASE_DIR = '../data/imdb_crop'

# Parallel lists: image_paths[i] is the file whose label is age_labels[i].
image_paths = []
age_labels = []

# The parser below assumes file names with at least four '_'-separated fields,
# where the third field starts with the birth year ('YYYY-...') and the fourth
# starts with the year the photo was taken ('YYYY.<ext>').
for root, dirs, files in tqdm(os.walk(BASE_DIR)):
    # os.walk yields entries in filesystem order. Sorting the directories in
    # place (which also fixes the order os.walk descends into them) and the
    # file names makes the sample order identical on every machine, so the
    # fixed-seed train/test split further down is reproducible.
    dirs.sort()
    for filename in sorted(files):
        parts = filename.split('_')
        if len(parts) < 4:
            # Not a file that follows the expected naming scheme.
            continue
        try:
            photo_year = int(parts[3].split('.')[0])
            birth_year = int(parts[2].split('-')[0])
        except ValueError:
            # A year field that is not an integer: skip this file.
            continue
        age = photo_year - birth_year
        # Keep only ages strictly between 0 and 100.
        if 0 < age < 100:
            image_paths.append(os.path.join(root, filename))
            age_labels.append(age)


# Collect the scanned paths and ages into one table and preview the first rows.
df = pd.DataFrame(dict(image=image_paths, age=age_labels))
print(df.head())


# Show the age histogram with a KDE overlay, or explain why there is nothing to plot.
if df.empty:
    print("DataFrame is empty. Please check dataset paths and structure.")
else:
    sns.displot(df['age'], kde=True, bins=30)
    plt.xlabel('Age')
    plt.ylabel('Frequency')
    plt.title('Age Distribution')
    plt.show()



def extract_feature(images, return_mask=False):
    """Load images as 128x128 single-channel uint8 arrays.

    Each entry of ``images`` is opened with PIL, converted to mode 'L'
    (8-bit grayscale) and resized to 128x128 with bilinear resampling.

    Entries that cannot be loaded are skipped, so the result can have fewer
    rows than ``images``. When that happens a ``RuntimeWarning`` is emitted;
    pass ``return_mask=True`` to find out which entries survived and realign
    any per-image labels.

    Args:
        images: Iterable of paths (or file objects) accepted by ``Image.open``.
        return_mask: When True, also return a boolean array with one entry per
            input, True where the image was loaded.

    Returns:
        A uint8 array of shape ``(n_loaded, 128, 128, 1)``; or the tuple
        ``(features, mask)`` when ``return_mask`` is True.
    """
    import warnings

    # Materialise the input so generators work and the progress bar has a total.
    paths = list(images)
    if not paths:
        # Nothing to load: skip the progress bar and return typed empty results.
        empty = np.empty((0, 128, 128, 1), dtype=np.uint8)
        return (empty, np.zeros(0, dtype=bool)) if return_mask else empty

    features = []
    loaded = []
    for image in tqdm(paths):
        try:
            # The context manager closes the underlying file as soon as the
            # pixel data has been converted.
            with Image.open(image) as handle:
                img = handle.convert('L').resize((128, 128), Image.BILINEAR)
            features.append(np.array(img))
            loaded.append(True)
        except Exception:
            # Best-effort loading: skip unreadable entries, but let
            # KeyboardInterrupt / SystemExit propagate so a long run can be stopped.
            loaded.append(False)

    skipped = len(paths) - len(features)
    if skipped:
        warnings.warn(
            f"extract_feature skipped {skipped} of {len(paths)} images that could "
            "not be loaded; the result has fewer rows than the input.",
            RuntimeWarning,
        )

    # The explicit dtype keeps the result uint8 even when every image failed.
    features = np.array(features, dtype=np.uint8).reshape(len(features), 128, 128, 1)
    if return_mask:
        return features, np.array(loaded, dtype=bool)
    return features



In [None]:
X = extract_feature(df['image'])

# extract_feature skips images it cannot load, so a shorter X means its rows no
# longer correspond one-to-one with df['age']. Fail here with an actionable
# message rather than later with a generic shape error.
if len(X) != len(df):
    raise ValueError(
        f"Loaded {len(X)} of {len(df)} images, so X is no longer aligned with "
        "df['age']. Remove the unreadable files from df and re-run this cell."
    )

In [None]:
# Fail fast when the extraction step produced nothing to train on.
if len(X) == 0:
    raise ValueError("No valid image data found. Check dataset loading.")

# Convert the integer pixel values to float32 and divide by 255. The dtype
# guard keeps this cell idempotent: once X is float32, re-running the cell
# must not divide by 255 a second time.
if np.issubdtype(X.dtype, np.integer):
    X = X.astype('float32') / 255.0
y = np.array(df['age'])


# Hold out 20% for testing, then 20% of the remainder for validation
# (64% train / 16% validation / 20% test overall).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


# Three Conv2D + MaxPooling2D stages followed by a dense head with one output.
input_shape = (128, 128, 1)
inputs = Input(shape=input_shape)
x = Conv2D(32, (3, 3), activation='relu')(inputs)
x = MaxPooling2D((2, 2))(x)
x = Conv2D(64, (3, 3), activation='relu')(x)
x = MaxPooling2D((2, 2))(x)
x = Conv2D(128, (3, 3), activation='relu')(x)
x = MaxPooling2D((2, 2))(x)
x = Flatten()(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
# NOTE(review): a ReLU output passes no gradient while its pre-activation is
# negative; if training stalls at a constant prediction, consider a linear
# output for this regression head.
age_output = Dense(1, activation='relu', name='age_output')(x)

model = Model(inputs=inputs, outputs=age_output)
model.compile(optimizer='adam', loss='mae', metrics=['mae'])
model.summary()


# Keep the weights of the epoch with the lowest validation MAE on disk.
model_path = 'age_model.keras'
checkpointer = ModelCheckpoint(filepath=model_path, monitor='val_mae', mode='min',
                               save_best_only=True, verbose=1)

# Learning rate starts at 1e-3 and is multiplied by 0.9 every epoch.
annealer = LearningRateScheduler(lambda epoch: 1e-3 * 0.9 ** epoch)


history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=64,
    epochs=50,
    callbacks=[annealer, checkpointer]
)


# Training vs. validation MAE per epoch.
plt.plot(history.history['mae'], label='Train MAE')
plt.plot(history.history['val_mae'], label='Val MAE')
plt.title('Model Mean Absolute Error')
plt.ylabel('MAE')
plt.xlabel('Epoch')
plt.legend()
plt.show()


# NOTE(review): this evaluates (and, below, saves) the model as it stands after
# the last epoch, not the best checkpoint written to model_path.
loss, mae = model.evaluate(X_test, y_test)
print(f"Test MAE: {mae:.2f} years")


# Show one test sample with its prediction. The index is clamped so that a
# test set with 25 or fewer images does not raise IndexError.
sample_idx = min(25, len(X_test) - 1)
img = X_test[sample_idx].reshape(1, 128, 128, 1)
pred_age = model.predict(img)[0][0]
true_age = y_test[sample_idx]

plt.title(f"Predicted Age: {round(pred_age)}, True Age: {true_age}")
plt.imshow(X_test[sample_idx].reshape(128, 128), cmap='gray')
plt.axis('off')
plt.show()

model.save("age_model.h5")