## Import Libraries

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
import torch.nn as nn
import torch.optim as optim
import time
from torch.autograd import Variable
from PIL import Image
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

In [None]:
!pip install torch



## Data Preparation

In [None]:
# Root folder of the dataset on the mounted Google Drive. Kept absolute so it
# resolves regardless of the notebook's current working directory, matching the
# absolute Drive paths used by the read_csv calls in this notebook.
data_path = '/content/drive/MyDrive/Dataset'

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
os.listdir(data_path)

['gender_classification.xlsx',
 'gender_classification.csv',
 'list_attribute.txt',
 'class_identity.txt',
 'Images']

In [None]:
images_list = os.listdir(data_path+'/Images')

In [None]:
# load the data
# The attribute file is whitespace-delimited and header=1 takes the column
# names from its second line. The separator is a raw string so that "\s" is
# read as a regex token instead of an invalid string escape.
data = pd.read_csv("/content/drive/MyDrive/Dataset/list_attribute.txt", sep=r"\s+", header=1)

In [None]:
data.shape

(202599, 40)

In [None]:
data.head()

Unnamed: 0,5_o_Clock_Shadow,Arched_Eyebrows,Attractive,Bags_Under_Eyes,Bald,Bangs,Big_Lips,Big_Nose,Black_Hair,Blond_Hair,...,Sideburns,Smiling,Straight_Hair,Wavy_Hair,Wearing_Earrings,Wearing_Hat,Wearing_Lipstick,Wearing_Necklace,Wearing_Necktie,Young
000001.jpg,-1,1,1,-1,-1,-1,-1,-1,-1,-1,...,-1,1,1,-1,1,-1,1,-1,-1,1
000002.jpg,-1,-1,-1,1,-1,-1,-1,1,-1,-1,...,-1,1,-1,-1,-1,-1,-1,-1,-1,1
000003.jpg,-1,-1,-1,-1,-1,-1,1,-1,-1,-1,...,-1,-1,-1,1,-1,-1,-1,-1,-1,1
000004.jpg,-1,-1,1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,1,-1,1,-1,1,1,-1,1
000005.jpg,-1,1,1,-1,-1,-1,1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,1,-1,-1,1


In [None]:
# Recode every -1 in the frame to 0 (in place), so the attribute flags become 0/1.
data.replace(-1, 0, inplace=True)

In [None]:
data.head()

Unnamed: 0,5_o_Clock_Shadow,Arched_Eyebrows,Attractive,Bags_Under_Eyes,Bald,Bangs,Big_Lips,Big_Nose,Black_Hair,Blond_Hair,...,Sideburns,Smiling,Straight_Hair,Wavy_Hair,Wearing_Earrings,Wearing_Hat,Wearing_Lipstick,Wearing_Necklace,Wearing_Necktie,Young
000001.jpg,0,1,1,0,0,0,0,0,0,0,...,0,1,1,0,1,0,1,0,0,1
000002.jpg,0,0,0,1,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
000003.jpg,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1
000004.jpg,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,1,1,0,1
000005.jpg,0,1,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,1


In [None]:
# The file names were read in as the index; move them into a regular column
# (reset_index names it 'index') and give that column a descriptive name.
data = data.reset_index()
data.rename(columns={'index': 'Img_filename'}, inplace=True)

In [None]:
data.head()

Unnamed: 0,Img_filename,5_o_Clock_Shadow,Arched_Eyebrows,Attractive,Bags_Under_Eyes,Bald,Bangs,Big_Lips,Big_Nose,Black_Hair,...,Sideburns,Smiling,Straight_Hair,Wavy_Hair,Wearing_Earrings,Wearing_Hat,Wearing_Lipstick,Wearing_Necklace,Wearing_Necktie,Young
0,000001.jpg,0,1,1,0,0,0,0,0,0,...,0,1,1,0,1,0,1,0,0,1
1,000002.jpg,0,0,0,1,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
2,000003.jpg,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,1
3,000004.jpg,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,1,1,0,1
4,000005.jpg,0,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1


In [None]:
import re

In [None]:
# Strip copy suffixes such as "(1)" from the file names. The result is written
# back to the column: rebinding a loop variable would leave the frame unchanged.
data['Img_filename'] = data['Img_filename'].str.replace(r'\(\d+\)', '', regex=True)

In [None]:
data['Img_filename'].duplicated().sum()

0

In [None]:
gender_col = pd.read_csv('/content/drive/MyDrive/Dataset/gender_classification.csv')

In [None]:
gender_col.head()

Unnamed: 0,Male
0,1
1,1
2,1
3,0
4,0


In [None]:
female_rows = gender_col[gender_col['Male'] == 0]

In [None]:
gender_col['Female'] = female_rows

In [None]:
gender_col.head()

Unnamed: 0,Male,Female
0,1,
1,1,
2,1,
3,0,0.0
4,0,0.0


In [None]:
gender_col.loc[gender_col['Male'] == 0, 'Female'] = 1

In [None]:
gender_col.head()

Unnamed: 0,Male,Female
0,1,
1,1,
2,1,
3,0,1.0
4,0,1.0


In [None]:
# Rows that were never flagged as female are still NaN at this point; mark them 0.
gender_col.fillna(0, inplace=True)
# astype returns a new Series, so assign it back to actually store integers.
gender_col['Female'] = gender_col['Female'].astype(int)
gender_col['Female']

Unnamed: 0,Female
0,0
1,0
2,0
3,1
4,1
...,...
4995,1
4996,0
4997,0
4998,1


In [None]:
gender_col.shape

(5000, 2)

In [None]:
# Keep only the entries of the directory listing that carry an image
# extension, and store them in sorted order.
image_extensions = ('.png', '.jpg', '.jpeg')
image_filenames = sorted(
    name for name in images_list if name.endswith(image_extensions)
)

In [None]:
data_filenames = pd.DataFrame(image_filenames, columns=['Img_filenames'])

In [None]:
data_filenames.head()

Unnamed: 0,Img_filenames
0,000051.jpg
1,000052.jpg
2,000065.jpg
3,000166.jpg
4,000198.jpg


In [None]:
data_filenames.shape

(5017, 1)

In [None]:
data_filenames['Cleaned_filenames'] =  data_filenames['Img_filenames'].str.replace(r'\(\d+\)', '', regex=True)

In [None]:
data_filenames['Cleaned_filenames'].duplicated().sum()

17

In [None]:
# Keep one row per cleaned name. The names were sorted, and "(" sorts before
# ".", so a copy like "x(1).jpg" precedes "x.jpg"; keep='last' therefore
# retains the un-suffixed file of each duplicate group.
data_filenames = data_filenames.drop_duplicates(subset='Cleaned_filenames', keep='last')

In [None]:
data_filenames['Cleaned_filenames'].duplicated().sum()

0

In [None]:
data_filenames.shape

(5000, 2)

In [None]:
data_filenames.drop('Cleaned_filenames', axis=1, inplace=True)

In [None]:
data_filenames.head()

Unnamed: 0,Img_filenames
0,000051.jpg
1,000052.jpg
2,000065.jpg
3,000166.jpg
4,000198.jpg


In [None]:
data_filenames.reset_index(drop=True, inplace=True)

In [None]:
data_filenames.shape

(5000, 1)

In [None]:
# NOTE(review): the labels are attached purely by index position, i.e. this
# assumes row i of gender_classification.csv describes the i-th file name in
# sorted order. Neither frame carries a shared key here, so confirm that the
# CSV row order really matches the sorted image list.
data_filenames['Male'] = gender_col['Male']

In [None]:
data_filenames.shape

(5000, 2)

In [None]:
data_filenames['Male'].isna().sum()

0

In [None]:
# astype returns a new Series; assign it back so the label column is stored as int.
data_filenames['Male'] = data_filenames['Male'].astype(int)
data_filenames['Male']

Unnamed: 0,Male
0,1
1,1
2,1
3,0
4,0
...,...
4995,0
4996,1
4997,1
4998,0


In [None]:
# split the data into train and test sets
# test_size=0.2 holds out 20% of the rows; the fixed random_state makes the
# split reproducible across runs.
train_data, test_data = train_test_split(data_filenames, test_size=0.2, random_state=42)

## Preprocessing

In [None]:
import cv2
from skimage import io

class GenderDataset(Dataset):
    """Dataset of face images paired with an integer gender label.

    Args:
        data: DataFrame whose first column holds the image file names and
            whose second column holds the label. Columns are read by
            position, not by name.
        image_folder_path: Directory that contains the image files.
        transform: Optional callable applied to the loaded image array.
    """

    def __init__(self, data, image_folder_path, transform=None):
        self.data = data
        self.image_folder_path = image_folder_path
        self.transform = transform

    def __len__(self):
        """Return the number of samples, i.e. the number of rows in ``data``."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``.

        The image is an RGB array (or whatever ``transform`` returns for it);
        the label is a 0-dim ``torch.long`` tensor.
        """
        image_path = os.path.join(self.image_folder_path, self.data.iloc[idx, 0])
        # skimage's imread already yields channels in RGB order, so no BGR->RGB
        # swap is applied: doing so would reverse the channels and mismatch the
        # per-channel normalisation statistics.
        image = io.imread(image_path)
        # Normalise unusual inputs to 3 channels so every sample has the same shape.
        if image.ndim == 2:
            image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
        elif image.shape[2] == 4:
            image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
        gender = self.data.iloc[idx, 1]
        if self.transform:
            image = self.transform(image)
        return image, torch.tensor(gender, dtype=torch.long)

In [None]:
# Per-channel mean / std passed to Normalize (these are the values the
# torchvision docs list for ImageNet-style normalisation).
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

# Convert each image to a tensor, then normalise it channel by channel.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
])

In [None]:
# Both splits read from the same image folder and share the same transform;
# only the training loader shuffles its samples.
images_dir = os.path.join(data_path, "Images")

train_set = GenderDataset(train_data, image_folder_path=images_dir, transform=transform)
test_set = GenderDataset(test_data, images_dir, transform=transform)

train_loader = DataLoader(train_set, batch_size=32, shuffle=True, num_workers=2)
test_loader = DataLoader(test_set, batch_size=32, shuffle=False, num_workers=2)

## Architecture

In [None]:
# model optimizer and loss function
# GoogLeNet whose final fully connected layer is replaced by a single-output
# head, i.e. one score per image for the binary label.
model = models.googlenet()
feature_num = model.fc.in_features
model.fc = nn.Linear(feature_num, 1)
optimizer = optim.Adam(model.parameters(), lr=0.001)
# The new head emits an unbounded logit (there is no sigmoid layer), so use the
# loss that applies the sigmoid itself. nn.BCELoss expects probabilities
# already in [0, 1] and rejects raw logits outside that range.
criterion = nn.BCEWithLogitsLoss()



## Modeling

In [None]:
import time
from torch.autograd import Variable

def train_model(model, dataloaders, dataset_sizes, criterion, optimizer, use_gpu=torch.cuda.is_available(), num_epochs=10,
                checkpoint_path='/content/drive/MyDrive/Dataset/best_model.pth'):
    """Train a single-logit binary classifier and return it with its best weights.

    Every epoch runs a 'train' phase (with weight updates) followed by a
    'test' phase (no gradients). The weights with the highest test accuracy
    are remembered, optionally checkpointed, and loaded back before returning.

    Args:
        model: Network expected to return one raw logit per sample, either as
            a tensor or as an object with a ``logits`` attribute (as
            torchvision's GoogLeNet does in training mode).
        dataloaders: Mapping with 'train' and 'test' iterables that yield
            ``(inputs, labels)`` batches; labels are 0/1.
        dataset_sizes: Mapping with the number of samples per phase, used to
            average the loss and accuracy.
        criterion: Binary loss. ``nn.BCELoss`` receives sigmoid probabilities;
            any other criterion receives the raw logits.
        optimizer: Optimizer over ``model``'s parameters.
        use_gpu: Move each batch to CUDA when true. The model itself must
            already be on the matching device.
        num_epochs: Number of epochs to run.
        checkpoint_path: File that receives the best model/optimizer state;
            pass ``None`` to skip writing a checkpoint.

    Returns:
        ``model``, with the best-scoring weights loaded.
    """
    import copy  # local import: only needed to snapshot the best weights

    since = time.time()

    # state_dict() hands back references to the live tensors, so a deep copy is
    # required to keep a snapshot that later optimizer steps do not overwrite.
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        for phase in ['train', 'test']:
            is_train = phase == 'train'
            model.train(is_train)

            running_loss = 0.0
            running_corrects = 0

            for inputs, labels in dataloaders[phase]:
                if use_gpu:
                    inputs, labels = inputs.cuda(), labels.cuda()

                optimizer.zero_grad()

                # Track gradients only while training.
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    logits = outputs.logits if hasattr(outputs, 'logits') else outputs
                    # Binary losses need float targets shaped like the logits.
                    targets = labels.float().reshape(logits.shape)
                    if isinstance(criterion, nn.BCELoss):
                        loss = criterion(torch.sigmoid(logits), targets)
                    else:
                        loss = criterion(logits, targets)

                    if is_train:
                        loss.backward()
                        optimizer.step()

                # logit > 0 is equivalent to sigmoid(logit) > 0.5.
                preds = (logits.detach() > 0).reshape(labels.shape).long()
                running_loss += loss.item() * inputs.size(0)
                running_corrects += (preds == labels).sum().item()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            if phase == 'test' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                if checkpoint_path is not None:
                    state = {'model': model.state_dict(), 'optimizer': optimizer.state_dict()}
                    torch.save(state, checkpoint_path)

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best test Acc: {:.4f}'.format(best_acc))

    model.load_state_dict(best_model_wts)
    return model

In [None]:
# Per-phase look-up tables: the loader to iterate and the number of samples
# behind it, both keyed by the phase name.
dataloaders = dict(train=train_loader, test=test_loader)
dataset_sizes = dict(train=len(train_set), test=len(test_set))

In [None]:
# Move the model to the GPU whenever CUDA is available; otherwise leave it as is.
use_gpu = torch.cuda.is_available()
model = model.to("cuda") if use_gpu else model

In [None]:
# Train for 10 epochs; train_model returns the model with the best weights
# loaded, so rebind the name to its result.
model = train_model(model, dataloaders, dataset_sizes, criterion, optimizer, use_gpu, 10)

## Evaluation

In [None]:
def evaluate_model(model, test_loader, target_labels):
    """Print a classification report and confusion matrix for ``model``.

    Args:
        model: Network returning either one raw logit per sample (thresholded
            at 0) or one score per class (argmax is taken).
        test_loader: Iterable of ``(inputs, labels)`` batches with integer labels.
        target_labels: Class names ordered by label id, e.g. ``["female", "male"]``.

    Returns:
        dict with the keys ``"report"`` (the text report) and
        ``"confusion_matrix"`` (rows are true classes, columns predictions).
    """
    # Run the batches on whatever device the model's weights live on.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Evaluate in inference mode, then restore the mode the caller had set.
    was_training = model.training
    model.eval()

    y_true, y_pred = [], []
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs.to(device))
            outputs = getattr(outputs, "logits", outputs)
            if outputs.dim() > 1 and outputs.size(-1) > 1:
                preds = outputs.argmax(dim=-1)
            else:
                # Single logit: logit > 0 is equivalent to sigmoid(logit) > 0.5.
                preds = (outputs.reshape(-1) > 0).long()
            y_true.extend(labels.reshape(-1).tolist())
            y_pred.extend(preds.cpu().tolist())

    model.train(was_training)

    # Pass the label ids explicitly so classes missing from this split still
    # get a row, and the names line up with the ids.
    class_ids = list(range(len(target_labels)))
    report = classification_report(
        y_true, y_pred, labels=class_ids, target_names=target_labels, zero_division=0
    )
    matrix = confusion_matrix(y_true, y_pred, labels=class_ids)

    print(report)
    print(matrix)
    return {"report": report, "confusion_matrix": matrix}

In [None]:
# evaluate_model(model, dataloaders['test'], ["female", "male"])