In [1]:
import numpy as np
import os

In [2]:
def celeba_attr_dataset(root_dir, train_or_test, restricted_degree=0, target_attr_idx=20):
    """Load one split of the CelebA attribute table as ``(features, labels)``.

    Reads ``celeba_attr.npz`` from ``root_dir``, picks the rows of its
    ``attributes`` array that belong to the requested split, and separates
    the column ``target_attr_idx`` (the label) from the remaining columns
    (the features).

    Args:
        root_dir: Directory that contains ``celeba_attr.npz``.
        train_or_test: Split to load: ``'train'``, ``'val'`` or ``'test'``.
        restricted_degree: Accepted for backward compatibility but currently
            ignored. An earlier revision kept only rows whose sum over a fixed
            set of attribute columns was ``>= restricted_degree``; that filter
            was already disabled (commented out) and is not applied here.
        target_attr_idx: Column of ``attributes`` returned as the label and
            removed from the feature matrix.

    Returns:
        Tuple ``(data, label)``: ``data`` holds every column except
        ``target_attr_idx`` cast to ``float32``; ``label`` is the
        ``target_attr_idx`` column for the same rows, in its stored dtype.

    Raises:
        ValueError: If ``train_or_test`` is not a known split name.
    """
    split_keys = {'train': 'train_idxs', 'val': 'val_idxs', 'test': 'test_idxs'}
    if train_or_test not in split_keys:
        # Previously an unknown split fell through both branches and surfaced
        # as an UnboundLocalError at the return statement.
        raise ValueError(
            "train_or_test must be one of {}, got {!r}".format(
                sorted(split_keys), train_or_test))

    # The archive object keeps the file open until it is closed, so read the
    # arrays that are needed inside a context manager.
    with np.load(os.path.join(root_dir, 'celeba_attr.npz')) as info_pak:
        idxs = info_pak[split_keys[train_or_test]]
        attributes = info_pak['attributes']

    rows = attributes[idxs]
    data = np.delete(rows, [target_attr_idx], 1)
    label = rows[:, target_attr_idx]
    return data.astype('float32'), label

# Training split with the default target column; the third positional
# argument (20) is `restricted_degree`.
train_data, train_label = celeba_attr_dataset('./data/toy_celeba', 'train', 20)


# Build a name -> column-index lookup and list every attribute for reference.
attributes_names = np.load('./data/toy_celeba/celeba_attr.npz')['attribute_names']
name2idx = {attr_name: col for col, attr_name in enumerate(attributes_names)}
for col, attr_name in enumerate(attributes_names):
    print('{}: {}'.format(col, attr_name))

0: 5_o_Clock_Shadow
1: Arched_Eyebrows
2: Attractive
3: Bags_Under_Eyes
4: Bald
5: Bangs
6: Big_Lips
7: Big_Nose
8: Black_Hair
9: Blond_Hair
10: Blurry
11: Brown_Hair
12: Bushy_Eyebrows
13: Chubby
14: Double_Chin
15: Eyeglasses
16: Goatee
17: Gray_Hair
18: Heavy_Makeup
19: High_Cheekbones
20: Male
21: Mouth_Slightly_Open
22: Mustache
23: Narrow_Eyes
24: No_Beard
25: Oval_Face
26: Pale_Skin
27: Pointy_Nose
28: Receding_Hairline
29: Rosy_Cheeks
30: Sideburns
31: Smiling
32: Straight_Hair
33: Wavy_Hair
34: Wearing_Earrings
35: Wearing_Hat
36: Wearing_Lipstick
37: Wearing_Necklace
38: Wearing_Necktie
39: Young


In [3]:
# Shape check: rows of the training split x attribute columns left after the
# target column was removed.
train_data.shape

(162770, 39)

In [4]:
# Shape check: one label per training row.
train_label.shape

(162770,)

## CART
The code in this section is taken from the DrKwint/sounds-deep repository.

In [5]:
# Sum of the training labels for the default target column (index 20).
# NOTE(review): equals the number of positive samples only if labels are
# encoded as 0/1 — confirm the encoding.
sum(train_label)

68261

In [11]:
import sklearn.tree

# Hyper-parameters shared by every per-attribute tree (loop-invariant).
max_leaf_nodes = 20
max_depth = 10

# One entry per attribute, appended in attribute-index order.
result_list = []
for i in range(len(attributes_names)):
    target_attr_idx = i

    # Define the decision tree. A fixed random_state makes the fit
    # repeatable between runs (scikit-learn permutes the features at each
    # split, so an unseeded tree can differ when candidate splits tie).
    decision_tree = sklearn.tree.DecisionTreeClassifier(
        max_depth=max_depth,
        min_weight_fraction_leaf=0.01,
        max_leaf_nodes=max_leaf_nodes,
        random_state=0)

    # NOTE: these assignments overwrite the notebook-level train/test
    # variables; after the loop they hold the split for the LAST attribute.
    train_data, train_label = celeba_attr_dataset('./data/toy_celeba', 'train', target_attr_idx=target_attr_idx)
    test_data, test_label = celeba_attr_dataset('./data/toy_celeba', 'test', target_attr_idx=target_attr_idx)
    decision_tree.fit(train_data, train_label)

    accu = decision_tree.score(test_data, test_label)
    result_list.append({
        'idx': i,
        'name': attributes_names[i],
        'test_accu': accu,
        # Mean label value per split; np.mean replaces the much slower
        # element-by-element Python sum()/len().
        'skew_train': np.mean(train_label),
        'skew_test': np.mean(test_label),
        'tree': decision_tree
    })

In [12]:
# Order the per-attribute results by their 'skew_train' value, largest first,
# then print them as a fixed-width table.
sorted_result_list = sorted(result_list, key=lambda entry: entry['skew_train'], reverse=True)

header_fmt = '{:6}, {:20}, {:10}, {:7}, {:17}, {:18}'
row_fmt = '{:6}, {:20}, {:10.4f}, {:7}, {:16.2f}%, {:17.2f}%'

print(header_fmt.format('[Rank]', '[AttrName]', '[Accuracy]', '[Index]', '[PositivesInTest]', '[PositivesInTrain]'))
for rank, entry in enumerate(sorted_result_list):
    print(row_fmt.format(
        rank,
        entry['name'],
        entry['test_accu'],
        entry['idx'],
        entry['skew_test'] * 100,
        entry['skew_train'] * 100))

[Rank], [AttrName]          , [Accuracy], [Index], [PositivesInTest], [PositivesInTrain]
     0, No_Beard            ,     0.9399,      24,            85.37%,             83.42%
     1, Young               ,     0.8246,      39,            75.71%,             77.89%
     2, Attractive          ,     0.7627,       2,            49.58%,             51.36%
     3, Mouth_Slightly_Open ,     0.7698,      21,            49.51%,             48.22%
     4, Smiling             ,     0.8516,      31,            50.03%,             47.97%
     5, Wearing_Lipstick    ,     0.9270,      36,            52.19%,             46.96%
     6, High_Cheekbones     ,     0.8430,      19,            48.18%,             45.24%
     7, Male                ,     0.9281,      20,            38.65%,             41.94%
     8, Heavy_Makeup        ,     0.8807,      18,            40.50%,             38.43%
     9, Wavy_Hair           ,     0.7520,      33,            36.40%,             31.94%
    10, Oval_Face    

In [None]:
# Slice of ranks to inspect. The variable is deliberately NOT called `range`:
# assigning to that name shadows the builtin and breaks every later
# `range(...)` call in the kernel.
rank_window = (1, 15)

# Rebuild the per-attribute lists from `result_list`, which holds one entry
# per attribute in attribute-index order.
accu_list = [entry['test_accu'] for entry in result_list]
skew_list_train = [entry['skew_train'] for entry in result_list]

# NOTE(review): the original construction of `accu_rank` is not visible in
# this notebook. It is assumed here to be the attribute indices ordered by
# test accuracy, best first — confirm the intended direction.
accu_rank = sorted(range(len(accu_list)), key=lambda idx: accu_list[idx], reverse=True)

selected = accu_rank[rank_window[0]:rank_window[1]]
print(selected)
print([skew_list_train[i] for i in selected])

In [45]:
def tree2dot(tree, path, target_attr_idx):
    """Export a fitted tree to ``path`` via ``sklearn.tree.export_graphviz``.

    Feature names are the notebook-level ``attributes_names`` with the entry
    at ``target_attr_idx`` removed, mirroring how the target column is removed
    from the feature matrix. The two classes are labelled 'negative' and
    'positive'.
    """
    feature_names = np.delete(attributes_names, target_attr_idx)
    export_options = dict(
        out_file=path,
        feature_names=feature_names,
        class_names=['negative', 'positive'],
        filled=True,
        rounded=True,
        proportion=True,
    )
    sklearn.tree.export_graphviz(tree, **export_options)

In [46]:
import subprocess

# Render one PNG per attribute, in the order of `sorted_result_list`.
for i, x in enumerate(sorted_result_list):
    # os.path.join keeps the paths valid on every OS (the previous
    # backslash-joined strings only formed a directory tree on Windows).
    out_dir = os.path.join('.', 'output', 'SkewRank{:02d}_{}'.format(i, x['name']))
    # exist_ok makes the cell re-runnable and also creates `output` itself.
    os.makedirs(out_dir, exist_ok=True)

    dot_path = os.path.join(out_dir, 'CART.dot')
    png_path = os.path.join(out_dir, 'decision_tree_CART.png')
    tree2dot(x['tree'], dot_path, target_attr_idx=x['idx'])

    try:
        # Argument list instead of a shell string: no quoting problems if a
        # path ever contains spaces or shell metacharacters.
        subprocess.run(['dot', '-Tpng', dot_path, '-o', png_path], check=True)
    except (OSError, subprocess.CalledProcessError) as err:
        # Graphviz missing or failed: report it and keep the .dot file so the
        # tree can still be rendered later.
        print('Could not render {}: {}'.format(dot_path, err))
    else:
        os.remove(dot_path)

In [None]:
# List the attributes whose tree scored above 0.9 on the test split.
# Reads `result_list` (the structure the CART cell populates) and prints the
# same three fields as before: attribute index, attribute name, accuracy.
for entry in result_list:
    if entry['test_accu'] > 0.9:
        print(entry['idx'], entry['name'], entry['test_accu'])

## Naive Bayes (NB)
From scikit-learn documentation

In [None]:
import sklearn.naive_bayes

# Load the splits here so the cell does not depend on whatever
# `train_data` / `test_data` an earlier cell last assigned (the CART loop
# reassigns them for every attribute).
# NOTE(review): uses the loader's default target column (index 20, listed as
# 'Male' above) — confirm this is the intended target for this model.
train_data, train_label = celeba_attr_dataset('./data/toy_celeba', 'train')
test_data, test_label = celeba_attr_dataset('./data/toy_celeba', 'test')

# Bernoulli naive Bayes fitted on the training split, scored on the test split.
model_nb = sklearn.naive_bayes.BernoulliNB()
model_nb.fit(train_data, train_label)
accu = model_nb.score(test_data, test_label)
print('Accuracy: {:.4f}'.format(accu))

## SVM
From scikit-learn documentation

In [None]:
import sklearn.svm

# Load the splits here so the cell does not depend on whatever
# `train_data` / `test_data` an earlier cell last assigned (the CART loop
# reassigns them for every attribute).
# NOTE(review): uses the loader's default target column (index 20, listed as
# 'Male' above) — confirm this is the intended target for this model.
train_data, train_label = celeba_attr_dataset('./data/toy_celeba', 'train')
test_data, test_label = celeba_attr_dataset('./data/toy_celeba', 'test')

# Support-vector classifier fitted on the training split, scored on the test split.
model_svm = sklearn.svm.SVC(gamma='scale')
model_svm.fit(train_data, train_label)
accu = model_svm.score(test_data, test_label)
print('Accuracy: {:.4f}'.format(accu))

### Ratio of Male to Not-male samples

In [None]:
def print_gender_ratio(degree):
    """Print the share of positive labels in the train and test splits.

    Labels come from ``celeba_attr_dataset`` with its default target column
    (index 20, listed as 'Male' in the attribute listing), so the printed
    percentage is the mean label value of each split.

    Args:
        degree: Forwarded to ``celeba_attr_dataset`` as ``restricted_degree``.
            NOTE: the filter driven by that argument is disabled inside
            ``celeba_attr_dataset``, so at present every ``degree`` prints the
            same ratios.
    """
    _, train_label = celeba_attr_dataset('./data/toy_celeba', 'train', restricted_degree=degree)
    _, test_label = celeba_attr_dataset('./data/toy_celeba', 'test', restricted_degree=degree)
    # np.mean replaces the element-by-element Python sum() over the label array.
    train_ratio = np.mean(train_label)
    test_ratio = np.mean(test_label)
    print('Training set, Male: {:.2f}%, Not male: {:.2f}%'.format(train_ratio*100, 100-train_ratio*100))
    # Label the first figure as 'Male', matching the training-set line above.
    print('Test set, Male: {:.2f}%, Not male: {:.2f}%'.format(test_ratio*100, 100-test_ratio*100))


In [None]:
# Report the ratios for restriction degrees 0 through 3, one blank line apart.
for degree in range(4):
    print('Degree {}:'.format(degree))
    print_gender_ratio(degree)
    print('')

## Miscellaneous

In [None]:
# Display the attribute names loaded from celeba_attr.npz.
attributes_names

In [None]:
# NOTE(review): relies on `accu_rank` having been defined by an earlier cell;
# on a kernel where that has not happened this raises NameError.
accu_rank