In [113]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow import keras
import sys
import glob
import json
sys.path.append('../scripts/')
from get_csv_from_metadata import make_csv_from_metadata
from IPython.display import Video
sys.path.append('../scripts/helper_functions_cv/tensorflow_helpers/')
from gpu_starter_mirror_strategy import start_gpus
import cv2, PIL
from PIL import Image
import multiprocessing as mp
import collections
from sklearn.model_selection import train_test_split
from save_weights_every_epoch import CallbackForSavingModelWeights
import tensorflow.keras.backend as K

In [2]:
train_videos = glob.glob('../../ml-data-training/deep_fake_data/comp_data/train_sample_videos/*.mp4')

In [3]:
csv = make_csv_from_metadata('../../ml-data-training/deep_fake_data/comp_data/train_sample_videos/metadata.json')

In [7]:
csv['updated_file_name'] = csv['file_name'].apply(lambda x: '../../ml-data-training/deep_fake_data/comp_data/train_sample_videos/' + x)

In [11]:
def apply_binary_labels(label):
    if label == 'FAKE':
        return 1
    else:
        return 0

In [12]:
csv['binary_labels'] = csv['labels'].apply(apply_binary_labels)

In [24]:
strategy, REPLICAS, AUTO = start_gpus([0, 1, 2])

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2')
Returning objects as strategy, replicas and auto in same order.


In [21]:
# Video(csv['updated_file_name'].values[0])

In [31]:
files = csv['updated_file_name'].values.tolist()

In [57]:
def save_imgs(path):
    try:
        folder_name = path.split('/')[-1].split('.')[0]
        folder_path = f'../../imgs_files/{folder_name}/'
        os.makedirs(folder_path)
        vidcap = cv2.VideoCapture(path)
        success, image = vidcap.read()
        count = 0
        while success:
            cv2.imwrite(f'{folder_path}{count}.jpg', image)
            success, image = vidcap.read()
            count += 1
        return 'DONE'
    except:
        return path

In [58]:
with mp.Pool(25) as p:
    results = p.map(save_imgs, files)

In [59]:
img_files = glob.glob('../../imgs_files/*/*.jpg')

In [85]:
file_labels = csv[['file_name', 'labels']].values.tolist()
file_labels = [[x[0].split('.')[0], x[1]] for x in file_labels]

In [86]:
label_dict = dict(zip([x[0] for x in file_labels], [1 if x[1] == 'FAKE' else 0 for x in file_labels]))

In [94]:
labels = []
for x in img_files:
    labels.append(label_dict[x.split('/')[-2]]) 

In [98]:
final_tuples = [(img_files[x], labels[x]) for x in range(len(labels))]

In [99]:
def split_datasets(generator_, test_size = 0.01):
    train, test = train_test_split(generator_, test_size = 0.01, random_state=42)
    train, val = train_test_split(train, test_size=0.01, random_state=42)
    return train, val, test

In [100]:
train, val, test = split_datasets(final_tuples)

In [115]:
def read_train_imgs(img, label):
    img = tf.io.read_file(img)
    img = tf.image.decode_jpeg(img, channels = 3)
    img = tf.image.resize(img, (128, 128))
    img = img / 255
    return img, label

In [116]:
def get_data(data, shape = (256, 256), repeat = True, shuffle = True, batch = True, prefetch = True, batch_size = 64):
    imgs = [x[0] for x in data]
    labels = [x[1] for x in data]
    tensor = tf.data.Dataset.from_tensor_slices((imgs, labels))
    tensor = tensor.cache()
    if repeat:
        tensor = tensor.repeat()
    if shuffle:
        tensor = tensor.shuffle(1024 * REPLICAS)
        opt = tf.data.Options()
        opt.experimental_deterministic = False
        tensor = tensor.with_options(opt)
    tensor = tensor.map(read_train_imgs, num_parallel_calls = AUTO)
    if batch:
        tensor = tensor.batch(batch_size * REPLICAS)
    if prefetch:
        tensor = tensor.prefetch(AUTO)
    return tensor

In [117]:
def create_model(model_name, shape):
    with strategy.scope():
        input_layer = tf.keras.Input(shape = shape)
        construct = getattr(keras.applications, model_name)
        mid_layer = construct(include_top = False, 
            weights = None, 
            pooling = 'avg')(input_layer)
        last_layer = keras.layers.Dense(1, activation = 'sigmoid')(mid_layer)
        model = keras.Model(input_layer, last_layer)
    return model
def compile_new_model(model):
    with strategy.scope():
        loss = keras.losses.BinaryCrossentropy()
        optimizer = keras.optimizers.SGD()
        prec = keras.metrics.Precision(name = 'prec')
        rec = keras.metrics.Recall(name = 'rec')
        model.compile(
            loss = loss,
            optimizer = optimizer,
            metrics = [prec, rec]
        )
    return model

In [118]:
train_dataset = get_data(train)
val_dataset = get_data(val, repeat=False, shuffle=False)

In [120]:
K.clear_session()
log_dir = '../TB/res_baseline_256/'
if os.path.exists(log_dir) == False:
    os.makedirs(log_dir)
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir = log_dir)
weights_path = '../../ml-data-training/deepfake_weights/'
weights_save = CallbackForSavingModelWeights(weights_path)
train_dataset = get_data(train, shape = (128, 128), batch_size=128)
val_dataset = get_data(val, shape = (128, 128), batch_size=128, shuffle=False, repeat=False)
with strategy.scope():
    model = create_model('ResNet50', (128, 128, 3))
    model = compile_new_model(model)
    model_hist = model.fit(
        train_dataset,
        steps_per_epoch = len(train) // (128 * REPLICAS),
        epochs = 50,
        verbose = 1,
        validation_data = val_dataset,
        callbacks = [tensorboard_callback, weights_save]
    )

Epoch 1/50
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2').
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2').
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2').
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2').
INFO:tensorflow:Reduce to /job:localhost/replica:

In [121]:
model.load_weights('../../ml-data-training/deepfake_weights/37.h5')

In [122]:
train_dataset = get_data(train, shape = (128, 128), batch_size=128, shuffle=False, repeat=False)
val_dataset = get_data(val, shape = (128, 128), batch_size=128, shuffle=False, repeat=False)
test_dataset = get_data(test, shape = (128, 128), batch_size=128, shuffle=False, repeat=False)

In [124]:
model.evaluate(test_dataset, verbose = 1)



[0.10647476464509964, 0.9612159132957458, 0.9642481803894043]

In [125]:
model.evaluate(train_dataset, verbose = 1)



[0.06027475744485855, 0.977334201335907, 0.9820473194122314]

In [126]:
model.evaluate(val_dataset, verbose = 1)



[0.08309068530797958, 0.9660144448280334, 0.9801462888717651]

In [127]:
train_dataset = get_data(train, shape = (128, 128), batch_size=128, shuffle=False, repeat=False)
train_imgs = train_dataset.map(lambda img, label: img)
val_dataset = get_data(val, shape = (128, 128), batch_size=128, shuffle=False, repeat=False)
val_imgs = val_dataset.map(lambda img, label: img)
test_dataset = get_data(test, shape = (128, 128), batch_size=128, shuffle=False, repeat=False)
test_imgs = test_dataset.map(lambda img, label: img)

In [129]:
train_preds = model.predict(train_imgs, verbose = 1)
val_preds = model.predict(val_imgs, verbose = 1)
test_preds = model.predict(test_imgs, verbose = 1)



In [148]:
dict_data = {}
for x in range(len(train)):
    folder_name = train[x][0].split('/')[-2]
    if folder_name not in dict_data:
        dict_data[folder_name] = [x]
    else:
        dict_data[folder_name].append(x)

In [152]:
updated_preds = np.where(train_preds.flatten() < 0.5, 0, 1)

In [158]:
np.unique(updated_preds[dict_data['ekkdjkirzq']], return_counts  = True)

(array([1]), array([295]))

In [159]:
for key, val in dict_data.items():
    

ekkdjkirzq
dwediigjit
esxrvsgpvb
ensyyivobf
cpjxareypw
dgzklxjmix
cnilkgvfei
cferslmfwh
ebkzwjgjhq
dakqwktlbi
dboxtiehng
avywawptfc
akzbnazxtz
ebebgmtlcu
ehfiekigla
degpbqvcay
byqzyxifza
aybumesmpk
eprybmbpba
dkwjwbwgey
asaxgevnnp
avgiuextiz
bzmdrafeex
afoovlsmtx
dofusvhnib
egbbcxcuqy
ehieahnhte
dbzpcjntve
epymyyiblu
ciyoudyhly
bqhtpqmmqp
ddjggcasdw
ddpvuimigj
bwhlgysghg
byfenovjnf
dxbqjxrhin
cettndmvzl
axwovszumc
aytzyidmgs
bhbdugnurr
eqvuznuwsa
atkdltyyen
axntxmycwd
dnhvalzvrt
diomeixhrg
btunxncpjh
bpxckdzddv
ehccixxzoe
ddqccgmtka
avfitoutyn
bggsurpgpr
dptrzdvwpg
bmjzrlszhi
cqrskwiqng
cwqlvzefpg
aklqzsddfl
asdpeebotb
bbvgxeczei
bhaaboftbc
czkdanyadc
ahbweevwpv
eqjscdagiv
dbhoxkblzx
cwsbspfzck
amaivqofda
alvgwypubw
bqnymlsayl
ckjaibzfxa
cepxysienc
aufmsmnoye
dlpoieqvfb
dtbpmdqvao
bvgwelbeof
emgjphonqb
dkdwxmtpuo
byunigvnay
bqeiblbxtl
asvcrfdpnq
dhoqofwoxa
eejswgycjc
elvvackpjh
atyntldecu
chzieimrwu
emfbhytfhc
cbbibzcoih
bntlodcfeg
dvumqqhoac
dfbpceeaox
arkroixhey
eepezmygaq
ddhfabwpuz