In [1]:
import sys
import pandas as pd
import imghdr
import hashlib
import os
import numpy as np

In [2]:
def remove_missing_files(df, data_path):
    """Drop rows of ``df`` whose face image is missing or unreadable on disk.

    For every row the expected image location is
    ``<data_path>/<name with spaces as underscores>/face/<sha1(url)>.jpg``.
    A row is kept only if that path is an existing file and ``imghdr.what``
    recognises its image type.

    Side effects:
        * prints one line per corrupt image;
        * appends two relative paths per corrupt image
          (``<folder>/<name>/<file>`` and ``<folder>/<name>/face/<file>``)
          to ``corrupt_files.txt`` in the current working directory
          (the file is opened in append mode even if nothing is written);
        * prints a confirmation line.

    Args:
        df: DataFrame with at least ``name`` and ``url`` string columns.
        data_path: Directory containing one sub-directory per person. If it
            contains the substring ``"actor"`` the logged paths are prefixed
            with ``actor``, otherwise with ``actress``.

    Returns:
        ``(filtered_df, count)``: a new frame without the rejected rows
        (``df`` itself is not modified) and the number of rejected rows
        (missing + corrupt).
    """
    # Loop-invariant: the logged prefix depends only on data_path.
    folder = "actor" if "actor" in data_path else "actress"

    rejected = []       # index labels of rows whose image is missing/corrupt
    corrupt_paths = []  # relative paths written to corrupt_files.txt

    for index, row in df.iterrows():
        name = row['name'].replace(' ', '_')
        # Files on disk are named after the SHA-1 of the source URL.
        img_name = hashlib.sha1(
            row['url'].encode('utf-8')).hexdigest() + '.jpg'
        img_path = os.path.join(data_path, name, 'face', img_name)

        if os.path.isfile(img_path):
            if imghdr.what(img_path) is not None:
                continue  # present and recognised as an image: keep the row
            print('Image is corrupt {}'.format(img_path))
            corrupt_paths.append(os.path.join(folder, name, img_name))
            corrupt_paths.append(os.path.join(folder, name, 'face', img_name))
        # Reached for both missing and corrupt files.
        rejected.append(index)

    with open('corrupt_files.txt', 'a') as output:
        for fname in corrupt_paths:
            output.write(fname)
            output.write('\n')
    print('corrupt file names written to file')

    # Drop by the labels collected from *this* frame. The previous version
    # looked the labels up positionally in the global ``actors_frame``, which
    # fails on a fresh kernel and filters the wrong rows for any other frame.
    return df.drop(rejected), len(rejected)

In [3]:
def add_name_id(df):
    """Attach a 1-based categorical ``person_id`` column derived from ``name``.

    Each distinct value of ``df.name`` gets an integer id in order of first
    appearance, starting at 1. The column is added to ``df`` in place and the
    same frame object is returned.
    """
    # factorize yields 0-based codes; shift so the first name gets id 1.
    zero_based_codes, _uniques = pd.factorize(df.name)
    one_based_ids = zero_based_codes + 1
    df['person_id'] = pd.Categorical(one_based_ids)
    return df


def data_split(low=1, class_size=265, holdout_frac=0.2, val_test_split=0.5, seed=1791387):
    """Split person ids into disjoint train / validation / test id arrays.

    Ids are the integers ``low .. low + class_size`` (inclusive). A holdout
    set of ``int(class_size * holdout_frac)`` distinct ids is drawn, of which
    ``int(class_size * holdout_frac * val_test_split)`` become validation ids
    and the remainder test ids. All other ids are training ids.

    Args:
        low: Smallest person id.
        class_size: Number of classes used to size the id range and holdout.
        holdout_frac: Fraction of ``class_size`` held out (val + test).
        val_test_split: Fraction of the holdout assigned to validation.
        seed: Seed for the private ``RandomState``; the same seed always
            yields the same split.

    Returns:
        ``(train_pids, val_pids, test_pids)`` as numpy integer arrays.
        ``train_pids`` and ``test_pids`` are sorted (``np.setdiff1d``);
        ``val_pids`` is in sampling order.
    """
    random_state = np.random.RandomState(seed)

    # NOTE(review): the range holds class_size + 1 ids (upper bound inclusive).
    # Kept as-is for compatibility; confirm whether ``low + class_size`` is a
    # real person id or an off-by-one.
    all_pids = np.arange(low, low + class_size + 1)

    n_holdout = int(class_size * holdout_frac)
    n_val = int(class_size * holdout_frac * val_test_split)

    # Sample WITHOUT replacement. The previous ``randint`` draw could repeat
    # ids, shrinking the effective holdout and letting the same id appear
    # twice in the validation set.
    holdout_pids = random_state.choice(all_pids, size=n_holdout, replace=False)
    val_pids = random_state.choice(holdout_pids, size=n_val, replace=False)

    test_pids = np.setdiff1d(holdout_pids, val_pids)
    train_pids = np.setdiff1d(all_pids, holdout_pids)
    return train_pids, val_pids, test_pids

In [4]:
# FaceScrub annotation files, one per gender.
_ANNOT_DIR = "/home/var/facescrub/"
ANNOT_ACTORS_PATH = _ANNOT_DIR + "facescrub_actors.txt"
ANNOT_ACTRESS_PATH = _ANNOT_DIR + "facescrub_actresses.txt"

# Image data root and its per-gender sub-directories (trailing slash kept).
DATA_PATH = "/home/var/final-fs-data/"
DATA_ACTORS_PATH = DATA_PATH + "actor/"
DATA_ACTRESS_PATH = DATA_PATH + "actress/"

# Save location is the same directory as the data root.
SAVE_PATH = DATA_PATH

In [5]:
# Load the tab-separated actor annotations and tag every row as male.
actors_frame = pd.read_csv(ANNOT_ACTORS_PATH, sep='\t').assign(gender='male')

# Snapshot of the frame before any rows are removed.
print('Before deletion, actors frame')
print(actors_frame.head())
print('Shape:', actors_frame.shape)
print('Number of entries: ', actors_frame.shape[0])

# Filter out rows whose face image is missing or corrupt on disk;
# actors_count is the number of rows removed.
updated_actors_frame, actors_count = remove_missing_files(
    df=actors_frame, data_path=DATA_ACTORS_PATH)

Before deletion, actors frame
            name  image_id  face_id  \
0  Aaron Eckhart         1        1   
1  Aaron Eckhart         2        2   
2  Aaron Eckhart         3        3   
3  Aaron Eckhart         4        4   
4  Aaron Eckhart         5        5   

                                                 url              bbox  \
0  http://upload.wikimedia.org/wikipedia/commons/...    53,177,418,542   
1  http://movies.dosthana.com/sites/default/files...    80,102,260,282   
2  http://upload.wikimedia.org/wikipedia/commons/...  203,802,975,1574   
3  http://25.media.tumblr.com/nJ2vga5sae9o2ks4Flt...     62,90,231,259   
4  http://upload.wikimedia.org/wikipedia/commons/...   276,120,492,336   

                                              sha256 gender  
0  dec996994cf1eec33b53c203cff0e8f25638829fa2ad71...   male  
1  f84d0c3b1b854a51e6bc031bc353e801834e81df795e85...   male  
2  8548658ef00f2ac4c384fbfff9d3ae225b4b9e0c2aa45e...   male  
3  658d83f35859d2f313ff660c1900427c21eae1c

In [10]:
# Rows per actor name, divided by 10.
# NOTE(review): the divisor 10 is not explained in this cell — confirm intent.
actors_frame['name'].value_counts().div(10)

Steve Carell              29.3
Alec Baldwin              27.3
Bill Hader                27.2
Simon Pegg                27.1
Ben Stiller               26.9
Hugh Grant                26.8
Colin Farrell             26.6
George Clooney            26.5
Ethan Hawke               26.5
Colin Firth               26.5
Matt Damon                26.4
Ben Affleck               26.4
James Marsden             26.2
Andy Garcia               26.1
Josh Brolin               26.0
Jon Voight                26.0
Victor Garber             25.8
Jonah Hill                25.8
Nicolas Cage              25.8
Mark Ruffalo              25.7
Daniel Radcliffe          25.7
Bradley Cooper            25.6
Taylor Lautner            25.5
Matthew Perry             25.5
Christian Slater          25.4
Charlie Sheen             25.4
Emile Hirsch              25.3
Billy Burke               25.3
Tobey Maguire             25.2
Daniel Day-Lewis          25.2
                          ... 
Philip Seymour Hoffman    16.3
Patrick 