In [None]:
import os
import re
import numpy as np
import pandas as pd
import torch
from PIL import Image
import cv2

In [None]:
import kagglehub

# Download the latest version of the "alfredhhw/adiencegender" dataset via
# kagglehub; `path` holds whatever location the download call returns.
path = kagglehub.dataset_download("alfredhhw/adiencegender")

# Echo the location so the reader can see where the files were placed.
print("Path to dataset files:", path)

In [None]:
# Directory with the aligned face crops inside the downloaded dataset.
# The trailing slash is intentional: image paths are appended to this string.
IMAGE_DIR = f"{path}/AdienceGender/aligned/"

In [None]:
# Collect one record per labelled face from every tab-separated ".txt"
# metadata file found under the downloaded dataset directory.
adience_dict = {"image_path": [], "age": [], "gender": []}

# Walk the directory returned by kagglehub rather than a hard-coded
# '/kaggle/input': this keeps the notebook runnable outside Kaggle and avoids
# parsing ".txt" files that belong to other attached datasets.
for dirname, dirnames, filenames in os.walk(path):
    dirnames.sort()  # fixed traversal order -> reproducible row order
    for filename in sorted(filenames):
        if os.path.splitext(filename)[1] != ".txt":
            continue
        with open(os.path.join(dirname, filename), 'r') as f:
            next(f, None)  # the first line of each file is a header row
            for row in f:
                fields = row.strip().split('\t')
                # Skip blank or truncated rows instead of failing with IndexError.
                if len(fields) < 5:
                    continue
                # NOTE(review): columns appear to be user folder, original image
                # name, face id, age, gender - confirm against the dataset docs.
                folder, image_name, face_id, age, gender = fields[:5]
                if age == "None":
                    continue  # unlabelled age: not usable
                adience_dict['image_path'].append(
                    folder + "/landmark_aligned_face." + face_id + "." + image_name
                )
                adience_dict['age'].append(age)
                adience_dict['gender'].append(gender)

In [None]:
# Turn the column-oriented record lists into a table (one row per face image).
adience_df = pd.DataFrame.from_dict(adience_dict)

In [None]:
def calculateVariance(image_path):
    """Return the variance of the Laplacian of an image.

    The image is loaded from ``IMAGE_DIR + image_path``, converted to
    grayscale, filtered with a 64-bit float Laplacian, and the variance of the
    filter response is returned.

    Args:
        image_path: Path of the image relative to ``IMAGE_DIR``.

    Returns:
        The variance of the Laplacian response as a float.

    Raises:
        FileNotFoundError: If OpenCV cannot load the image (missing file,
            unreadable file, or unsupported format).
    """
    full_path = IMAGE_DIR + image_path

    # cv2.imread does not raise on failure; it returns None, which would
    # otherwise surface as an opaque cv2.error inside cvtColor.
    image = cv2.imread(full_path)
    if image is None:
        raise FileNotFoundError(f"Could not read image: {full_path}")

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # CV_64F keeps negative filter responses instead of clipping them to uint8.
    return cv2.Laplacian(gray, cv2.CV_64F).var()

In [None]:
# Laplacian-variance score for every image listed in the table.
variance = adience_df['image_path'].apply(calculateVariance)

In [None]:
# Descriptive statistics of the Laplacian-variance scores, keyed by a
# human-readable label (insertion order is the display order).
stats = dict(
    [
        ('Mean', variance.mean()),
        ('Standard Deviation', variance.std()),
        ('Minimum', variance.min()),
        ('Maximum', variance.max()),
        ('Median', variance.median()),
        ('25th Percentile (Q1)', variance.quantile(0.25)),
        ('75th Percentile (Q3)', variance.quantile(0.75)),
        ('Count', variance.count()),
    ]
)
stats

In [None]:
# Quartile cut points of the Laplacian-variance scores.
Q1, Q2, Q3 = (variance.quantile(q) for q in (0.25, 0.50, 0.75))

# Number of scores falling in each quarter; every bucket is closed on the
# left and open on the right, except the last one which is unbounded above.
count_Q1 = variance.lt(Q1).sum()
count_Q2 = (variance.ge(Q1) & variance.lt(Q2)).sum()
count_Q3 = (variance.ge(Q2) & variance.lt(Q3)).sum()
count_above_Q3 = variance.ge(Q3).sum()
print(count_Q1, count_Q2, count_Q3, count_above_Q3)

In [None]:
# Attach the per-image score as a column so rows can be filtered on it.
adience_df = adience_df.assign(variance=variance)

In [None]:
# Keep only images whose Laplacian variance reaches the first quartile of all
# scores, i.e. discard the lowest-scoring quarter.
variance_threshold = variance.quantile(0.25)
adience_df = (
    adience_df[adience_df['variance'] >= variance_threshold]
    # The helper column is no longer needed once the filter is applied.
    # DataFrame.drop returns a new frame, so the result must be assigned;
    # the original call discarded it and the column silently stayed.
    .drop(columns='variance')
)
adience_df.head()

In [None]:
# Renumber the remaining rows 0..n-1. drop=True discards the old labels
# instead of inserting them as a stray "index" column, which also keeps this
# cell safe to re-run (without it a re-run adds "level_0" and then fails).
adience_df = adience_df.reset_index(drop=True)

In [None]:
# Non-standard range labels that are not kept as target buckets.
excluded_age_ranges = {(27, 32), (38, 42), (38, 43)}

# Parse every "(low, high)" label into an int tuple and keep the ones that
# are valid targets. Filtering (instead of list.remove) means a label that
# happens to be absent from the data no longer raises ValueError, so the cell
# also survives being re-run after the labels have been normalised.
age_ranges = [
    bounds
    for bounds in (
        tuple(map(int, label[1:-1].split(", ")))
        for label in adience_df["age"].unique()
        if ',' in label
    )
    if bounds not in excluded_age_ranges
]

In [None]:
# Show the parsed (low, high) age ranges in ascending order.
sorted(age_ranges)

In [None]:
# Distinct raw age labels currently present in the table.
pd.unique(adience_df["age"])

In [None]:
# Non-standard range labels and the bucket each one is folded into.
age_label_aliases = {
    "(27, 32)": "(25, 32)",
    "(38, 42)": "(38, 48)",
    "(38, 43)": "(38, 48)",
}


def _distance_to_range(age, bounds):
    """Return how far `age` lies from the closed interval `bounds` (0 if inside)."""
    low, high = bounds
    # An age inside the interval must count as distance 0. Measuring only to
    # the endpoints lets it tie with a neighbouring range: with ranges
    # (38, 48) and (48, 53), an age of 46 would be 2 away from both.
    if low <= age <= high:
        return 0
    return min(abs(age - low), abs(age - high))


def _canonical_age_label(label):
    """Map one raw age label to the range label it should be stored under."""
    if label in age_label_aliases:
        return age_label_aliases[label]
    if "(" in label:
        return label  # already a range label: keep as is
    # Bare number: assign the closest known range. min() returns the first
    # minimal element, so exact ties still go to the range listed first.
    age = int(label)
    return str(min(age_ranges, key=lambda bounds: _distance_to_range(age, bounds)))


# Series.map rewrites the whole column in one pass and does not depend on the
# frame having a 0..n-1 index, unlike row-by-row .loc[i, ...] access.
adience_df["age"] = adience_df["age"].map(_canonical_age_label)

In [None]:
# Display the table to inspect the rewritten age labels.
adience_df

In [None]:
# Distinct age labels left in the table, in lexicographic order.
sorted(set(adience_df["age"]))

In [None]:
# Write only the image path and age label columns to disk, without the index.
adience_df[["image_path", "age"]].to_csv("adience.csv", index=False)