A first-pass trial notebook experimenting with creating a CNN for our FakeFaces dataset

In [1]:
# imports 
import pandas as pd # to read the csv files 
import os.path
import pickle
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from tensorflow.keras.layers.experimental import preprocessing

In [2]:
# Let TensorFlow grow its GPU memory allocation on demand instead of reserving
# everything up front (see https://www.tensorflow.org/guide/gpu).
# This has to run before any GPU has been initialized by TensorFlow.
gpu = tf.config.experimental.list_physical_devices('GPU')
# Looping (rather than indexing gpu[0]) keeps this cell working on CPU-only
# machines, where the list is empty, and applies the same setting to every GPU.
for physical_gpu in gpu:
    tf.config.experimental.set_memory_growth(physical_gpu, True)

In [3]:
# helper function
def _parse_image_from_filepath(filename, label):
    """Read one JPEG file and return it as a float32 RGB image with its label.

    Args:
        filename: scalar string tensor holding the path of a JPEG file.
        label: label associated with the image; passed through unchanged.

    Returns:
        A tuple ``(img, label)`` where ``img`` is a float32 tensor with three
        channels, produced by ``tf.image.convert_image_dtype``.
    """
    raw_bytes = tf.io.read_file(filename)
    # channels=3 makes the channel dimension statically known and expands any
    # grayscale JPEG to RGB, so every image matches a 3-channel model input.
    img = tf.image.decode_jpeg(raw_bytes, channels=3)
    img = tf.image.convert_image_dtype(img, tf.float32)
    return img, label

# Helpers that turn the name of a dataset split ('train', 'valid', ...) into a batched tf.data pipeline.
def _collect_paths_and_labels(setName):
    """Return ``(filepaths, labels)`` for the images of ``setName`` found on disk.

    The result is cached in ``<setName>_paths_labels_lists.pkl`` in the current
    working directory; when that file exists it is loaded and the CSV is not
    read at all. Delete the pickle to force the lists to be rebuilt.
    """
    cache_path = setName+'_paths_labels_lists.pkl'
    if os.path.isfile(cache_path):
        # NOTE(security): pickle.load can execute arbitrary code. Only load
        # cache files that this notebook wrote itself.
        with open(cache_path, 'rb') as f:
            cached_lists = pickle.load(f)
        return list(cached_lists[0]), list(cached_lists[1])

    # Only 'id' and 'label' are needed. Reading 'id' as a string keeps it
    # exactly as written in the CSV (e.g. any leading zeros) when it is used
    # to build the file name below.
    split_table = pd.read_csv('data/archive/'+setName+'.csv',
                              usecols=['id', 'label'], dtype={'id': str})

    filenames = []
    labels = []
    for img_id, img_label in zip(split_table['id'], split_table['label']):
        img_label_string = 'real' if (img_label == 1) else 'fake'
        img_filepath = 'data/archive/real_vs_fake/'+setName+'/'+img_label_string+'/'+str(img_id)+'.jpg'
        if not os.path.isfile(img_filepath):  # ignore CSV rows whose image is not on disk
            continue
        filenames.append(img_filepath)
        labels.append(int(img_label))

    # Do not cache an empty result: it would hide the real problem (missing
    # or misplaced data) on every later run.
    if filenames:
        with open(cache_path, 'wb') as f:
            pickle.dump((filenames, labels), f)
    return filenames, labels


def load_dataset_from_csv(setName, batch_size=32, shuffle_buffer_size=1000, seed=42, cache=True):
    """Build a shuffled, batched ``tf.data.Dataset`` of ``(image, label)`` pairs.

    Args:
        setName: name of the split; ``data/archive/<setName>.csv`` lists the
            images and ``data/archive/real_vs_fake/<setName>/`` holds them.
        batch_size: number of examples per batch (the last batch may be smaller).
        shuffle_buffer_size: size of the per-epoch shuffle buffer applied to
            the decoded images.
        seed: random seed used for both shuffles.
        cache: if True, keep the decoded images in memory after the first
            pass. Decoded float32 images are large, so set this to False if
            the split does not fit in RAM.

    Returns:
        A ``tf.data.Dataset`` yielding batches of ``(images, labels)``, where
        the labels are the integer values of the CSV ``label`` column.

    Raises:
        ValueError: if no image file for this split could be found on disk.
    """
    set_filenames, set_labels = _collect_paths_and_labels(setName)
    if not set_filenames:
        raise ValueError(
            "No image files found for split '"+setName+"' under "
            "data/archive/real_vs_fake/"+setName+"/ (if a stale "
            +setName+"_paths_labels_lists.pkl exists, delete it)")

    AUTOTUNE = tf.data.experimental.AUTOTUNE

    # turn the string paths and integer labels into a tf dataset
    tf_dataset = tf.data.Dataset.from_tensor_slices(
        (tf.constant(set_filenames), tf.constant(set_labels)))

    # One-time shuffle over the whole (path, label) list. This is cheap because
    # only strings and ints are buffered, and it guarantees mixed batches even
    # if the CSV rows are grouped by label, which a small buffer could not undo.
    tf_dataset = tf_dataset.shuffle(buffer_size=len(set_filenames), seed=seed,
                                    reshuffle_each_iteration=False)

    # decode images in parallel; element order is still deterministic
    tf_dataset = tf_dataset.map(_parse_image_from_filepath, num_parallel_calls=AUTOTUNE)
    if cache:
        tf_dataset = tf_dataset.cache()
    # per-epoch reshuffle of the decoded images
    tf_dataset = tf_dataset.shuffle(buffer_size=shuffle_buffer_size, seed=seed,
                                    reshuffle_each_iteration=True)
    tf_dataset = tf_dataset.batch(batch_size, drop_remainder=False)
    tf_dataset = tf_dataset.prefetch(buffer_size=AUTOTUNE)

    return tf_dataset
        

In [5]:
# Build the input pipelines for the two splits used below:
# 'train' is fed to model.fit, 'valid' is used as its validation data.
train_tf_dataset = load_dataset_from_csv(setName='train')
valid_tf_dataset = load_dataset_from_csv(setName='valid')

Let's remind ourselves that in the labels, a 1 is real, while a 0 is fake.

In [None]:
# Preview up to nine training images with their labels (1 = real, 0 = fake).
# The images come from a single batch of train_tf_dataset, so this shows exactly
# what the model is fed (decoded float32 images).
preview_images, preview_labels = next(iter(train_tf_dataset.take(1)))
preview_images = preview_images.numpy()[:9]
preview_labels = preview_labels.numpy()[:9]

plt.figure(figsize=(10, 10))
for position, (img, img_label_value) in enumerate(zip(preview_images, preview_labels), start=1):
    ax = plt.subplot(3, 3, position)
    plt.imshow(img)
    img_label = 'real' if (int(img_label_value) == 1) else 'fake'
    plt.title('img ' + str(position) + ': ' + img_label)
    plt.axis('off')
    
    

In [6]:
# Practice network adapted from the CNN tutorial: three 3x3 convolutions
# (32, 64, 64 filters) with 2x2 max-pooling after the first two, followed by a
# 64-unit dense layer and a 2-unit output layer with no activation.
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_uniform', input_shape=(256, 256, 3)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu', kernel_initializer='he_uniform'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu', kernel_initializer='he_uniform'),
    layers.Flatten(),
    layers.Dense(64, activation='relu', kernel_initializer='he_uniform'),
    layers.Dense(2),
])

In [None]:
# Print a summary of the layers of the model defined above.
model.summary()

In [None]:
# The output layer has no activation, so the loss is told to expect logits;
# labels are integer class ids, hence the sparse categorical cross-entropy.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 10 epochs, evaluating on the validation split after each one.
history = model.fit(
    x=train_tf_dataset,
    validation_data=valid_tf_dataset,
    epochs=10,
)

Epoch 1/10