In [1]:
# This code is adapted from Sagawa and Hino's work: https://github.com/ssgw320/gdacnf

In [2]:
import numpy as np
import pandas as pd
from PIL import Image
from pathlib import Path

In [3]:
data_dir = "../../../../workspace/dataset/" # your dataset path (the directory that contains "portraits/")

In [4]:
# Collect every portrait image. The parent directory name ('M' / 'F') carries the label.
# Joining with the `/` operator works whether or not data_dir ends with a separator.
portraits_dir = Path(data_dir) / 'portraits'
male_path = portraits_dir / 'M'
female_path = portraits_dir / 'F'
male_list = list(male_path.glob("*.png"))
female_list = list(female_path.glob("*.png"))

# One row per image, males first, then females.
df = pd.DataFrame({'img_path': male_list + female_list})
# The file stem starts with the year, followed by '_' (parsed as an int below).
df['year'] = df['img_path'].apply(lambda p: p.stem.split('_')[0]).astype(int)
# Path.parent.name is the directory name on every OS, so no manual separator split is needed.
df['sex'] = df['img_path'].apply(lambda p: 0 if p.parent.name == 'M' else 1)
# Order chronologically; 'year' is only a sort key and is dropped afterwards.
# NOTE(review): the order among rows that share a year is not pinned here (glob order is
# filesystem-dependent and the default sort kind is not stable) - confirm whether
# reproducible domain splits are required.
df = df.sort_values(by='year').reset_index(drop=True).drop('year', axis=1)

In [5]:
# Preview the first rows: image path and sex label (0 = 'M', 1 = otherwise), ordered by year.
df.head()

Unnamed: 0,img_path,sex
0,..\..\..\..\workspace\dataset\portraits\M\1905...,0
1,..\..\..\..\workspace\dataset\portraits\F\1905...,1
2,..\..\..\..\workspace\dataset\portraits\F\1905...,1
3,..\..\..\..\workspace\dataset\portraits\M\1905...,0
4,..\..\..\..\workspace\dataset\portraits\M\1905...,0


In [6]:
# One "portraits/<parent dir>/<file stem>" entry for each of the first 18000 rows of df
# (18000 equals the total of the per-domain sample counts used when splitting).
paths = [
    "portraits/%s/%s" % (img.parent.stem.split('portraits\\')[-1], img.stem)
    for img in df['img_path'].iloc[:18000]
]

In [7]:
def make_split_data(df: pd.DataFrame, target: str, num_inter_domain: int, num_domain_samples: dict):
    """Cut a row-ordered frame into consecutive domains (used for Portraits, Gas Sensor, Cover Type).

    Rows are taken by position, in the order of ``num_domain_samples``: the first
    ``n`` rows go to the first key, the next ``m`` rows to the second key, and so on.
    Rows beyond the total of all counts are ignored.

    Parameters
    ----------
    df : pd.DataFrame
        Data already sorted in domain order. Any index is accepted; rows are
        selected positionally.
    target : str
        Name of the label column.
    num_inter_domain : int
        Number of equal parts the block keyed ``'inter'`` is divided into.
        The row count of that block must be divisible by this number
        (``np.vsplit`` raises ``ValueError`` otherwise).
    num_domain_samples : dict
        Ordered mapping ``domain name -> number of rows``. The key ``'inter'``
        is special-cased as described above.

    Returns
    -------
    (x_all, y_all) : tuple of lists
        ``x_all[i]`` is a 2-D array of the non-target columns of domain ``i``;
        ``y_all[i]`` is the matching 1-D array of ``target`` values.
    """
    boundaries = np.cumsum(list(num_domain_samples.values()))
    # np.split also yields a trailing chunk with the leftover rows; zip() below drops it.
    split_index = np.split(np.arange(df.shape[0]), boundaries)
    # Loop-invariant, so drop the target column once instead of once per domain.
    features = df.drop(target, axis=1)
    labels = df[target]
    x_all, y_all = list(), list()
    for idx, key in zip(split_index, num_domain_samples.keys()):
        # idx holds positions, so use .iloc; .loc would treat them as index labels
        # and fail (or pick the wrong rows) on a non-default index.
        x = features.iloc[idx].values
        y = labels.iloc[idx].values
        if key == 'inter':
            x_all += np.vsplit(x, num_inter_domain)
            y_all += np.hsplit(y, num_inter_domain)
        else:
            x_all.append(x)
            y_all.append(y)
    return x_all, y_all

In [8]:
def convert_portraits(p: Path):
    """Load the image at ``p`` as a 32x32 grayscale float32 array scaled by 1/255."""
    grayscale = Image.open(p).convert('L')
    thumbnail = grayscale.resize((32, 32))
    pixels = np.array(thumbnail, dtype=np.float32)
    return pixels / 255

In [9]:
# Rows per domain, in split order; the 'inter' block is further divided into 7 parts.
num_domain_samples = {'source': 2000, 'inter': 14000, 'target': 2000}
# split to each domain
x_all, y_all = make_split_data(df, 'sex', 7, num_domain_samples)

# Replace each domain's column of image paths by a stack of (1, 32, 32) image arrays.
x_all = [
    np.array([convert_portraits(img_path) for img_path in domain.flatten()]).reshape(-1, 1, 32, 32)
    for domain in x_all
]

In [10]:
# Persist the per-domain image arrays together with their labels.
obj = dict(data=x_all, label=y_all)
pd.to_pickle(obj, 'portraits_original.pkl')

In [11]:
# Persist the list of relative image paths built earlier.
pd.to_pickle(paths, 'portraits_path.pkl')

In [12]:
import umap

In [13]:
def fit_umap(x_all, y_all, **umap_kwargs) -> tuple:
    """Embed all given domains jointly with UMAP, supervised only by the first domain's labels.

    Parameters
    ----------
    x_all : sequence of np.ndarray
        One array per domain. The domains are stacked and every sample is
        flattened to a 1-D feature vector before fitting.
    y_all : sequence of np.ndarray
        One label array per domain. Only ``y_all[0]`` is handed to UMAP; the
        labels of every other domain are replaced by -1.
    **umap_kwargs
        Overrides for the default settings ``n_components=2``,
        ``n_neighbors=15``, ``metric='cosine'`` and ``random_state=1234``.

    Returns
    -------
    (z_all, encoder) : tuple
        ``z_all`` is a list with one embedding array per domain, in the order
        of ``x_all``; ``encoder`` is the fitted ``umap.UMAP`` instance.
    """
    # random_state sits with the other defaults so a caller can override it;
    # passing it separately to UMAP() would clash with a caller-supplied value.
    umap_settings = dict(n_components=2, n_neighbors=15, metric='cosine', random_state=1234)
    umap_settings.update(umap_kwargs)
    X = np.vstack(x_all)
    X = X.reshape(X.shape[0], -1)
    # use source label as semi-supervised UMAP: every other domain is masked with -1
    Y_semi_supervised = [np.full(shape=y.shape[0], fill_value=-1) for y in y_all]
    Y_semi_supervised[0] = y_all[0].copy()
    Y_semi_supervised = np.hstack(Y_semi_supervised)
    # fit UMAP
    encoder = umap.UMAP(**umap_settings)
    Z = encoder.fit_transform(X, Y_semi_supervised)
    # Cut the joint embedding back into per-domain pieces. The last cumulative
    # boundary equals len(Z), so vsplit yields an empty trailing chunk to drop.
    z_idx = np.cumsum([i.shape[0] for i in x_all])
    z_all = np.vsplit(Z, z_idx)[:-1]
    return z_all, encoder

In [14]:
# Fit on the first (source) and last (target) domains only. Picking the two entries
# straight from the lists avoids copying every domain into one big array and also
# works when the domains do not all have the same number of samples.
z_all, encoder = fit_umap([x_all[0], x_all[-1]], [y_all[0], y_all[-1]], n_components=8)

In [16]:
# Save the UMAP embeddings together with the labels of the first and last domain.
endpoint_labels = np.array(y_all)[[0, -1]]
obj = dict(data=z_all, label=endpoint_labels)
pd.to_pickle(obj, 'portraits.pkl')