# Identifying Deepfake Videos with Convolutional Neural Networks

## Data Preprocessing

Our team initially worked in Google Colab to extract random frames from the videos, and then to extract faces from those frames. 

### i. Extract Random Frames and Labels

In [None]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
import matplotlib.pyplot as plt    # for plotting the images
%matplotlib inline
import cv2 as cv    # for capturing videos
import math   # for mathematical operations
import pandas as pd
from keras.preprocessing import image   # for preprocessing the images
import numpy as np    # for mathematical operations
from keras.utils import np_utils
from skimage.transform import resize   # for resizing images
import seaborn as sns
import os
import random

In [None]:
# Our data was first hosted on Google Drive; mount the drive so that Colab can
# read it through the filesystem under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Root of the project data on the mounted drive, and the DFDC part being processed.
DATA_FOLDER = '/content/drive/My Drive/Summer 2020/CS542 Deepfake Project/data/'
TRAIN_SAMPLE_FOLDER = 'dfdc_train_part_47'
#TEST_FOLDER = 'test_videos'
# The folder also contains metadata.json (the labels), which is not a video,
# so it is left out of the reported sample count.
_video_names = [
    name
    for name in os.listdir(os.path.join(DATA_FOLDER, TRAIN_SAMPLE_FOLDER))
    if name != 'metadata.json'
]
print(f"Train samples: {len(_video_names)}")

In [None]:
# Build the list of video file names in the sample folder.
# - metadata.json holds the labels and is not a video, so it is filtered out
#   (filtering, unlike list.remove, does not fail if the file is absent).
# - os.listdir returns entries in arbitrary order; sorting keeps the chunk
#   numbering derived from this list the same from run to run.
train_list = sorted(
    name
    for name in os.listdir(os.path.join(DATA_FOLDER, TRAIN_SAMPLE_FOLDER))
    if name != 'metadata.json'
)
len(train_list)

In [None]:
# metadata.json carries the labels that are saved (as npy) next to the frames we extract
def get_meta_from_json(path):
    """Read ``metadata.json`` from ``DATA_FOLDER/<path>`` into a DataFrame.

    The JSON is parsed with ``pandas.read_json`` and then transposed, so the
    entries that pandas read as columns become the rows of the result.

    Args:
        path: Name of the sub-folder of ``DATA_FOLDER`` that holds the file.

    Returns:
        The transposed ``pandas.DataFrame``.
    """
    metadata_file = os.path.join(DATA_FOLDER, path, 'metadata.json')
    return pd.read_json(metadata_file).T

meta_train_df = get_meta_from_json(TRAIN_SAMPLE_FOLDER)
meta_train_df.head()

In [None]:
# Takes a list of video file names and appends random frames and their
# corresponding labels to the two global lists train_images / train_labels
def images_from_video(train_list, frames_per_video=5, max_frame_index=50, max_attempts=50):
    """Sample random frames from each video and record them with their label.

    For every file name in ``train_list`` the video is opened from
    ``DATA_FOLDER/TRAIN_SAMPLE_FOLDER``, and frames at random positions in
    ``[1, max_frame_index]`` are read until ``frames_per_video`` frames have
    been collected. Positions are drawn independently, so the same frame can
    be picked more than once. Each frame is appended to the global
    ``train_images`` and the video's label (the ``label`` column of
    ``meta_train_df``) to the global ``train_labels``.

    Args:
        train_list: Video file names, used as row keys of ``meta_train_df``.
        frames_per_video: Number of frames to collect per video.
        max_frame_index: Highest frame position that may be sampled.
        max_attempts: Upper bound on read attempts per video. Without it a
            corrupt or too-short video would keep the loop spinning forever.

    Returns:
        None. Results are delivered through the two global lists.

    Raises:
        KeyError: If a file name has no row in ``meta_train_df``.
    """
    for video_name in train_list:
        video_path = os.path.join(DATA_FOLDER, TRAIN_SAMPLE_FOLDER, video_name)
        # The label depends only on the video, so look it up once.
        label = meta_train_df.loc[video_name].label
        capture_image = cv.VideoCapture(video_path)
        try:
            if not capture_image.isOpened():
                print(f"Skipping video that could not be opened: {video_path}")
                continue
            remaining = frames_per_video
            attempts = 0
            while remaining > 0 and attempts < max_attempts:
                attempts += 1
                frame_index = random.randint(1, max_frame_index)
                capture_image.set(cv.CAP_PROP_POS_FRAMES, frame_index)
                ret, frame = capture_image.read()
                if not ret:
                    continue  # unreadable position, draw another one
                train_labels.append(label)
                train_images.append(frame)
                remaining -= 1
            if remaining > 0:
                print(f"Only {frames_per_video - remaining} of {frames_per_video} "
                      f"frames could be read from {video_path}")
        finally:
            # Always free the decoder, including on the early `continue`.
            capture_image.release()
    return None

In [None]:
# The frames of a whole data folder do not fit into a single numpy array,
# so the video list is split into fixed-size batches that are processed and
# saved one at a time.
CHUNK_SIZE = 101  # number of videos per batch
chunks = [train_list[start:start + CHUNK_SIZE] for start in range(0, len(train_list), CHUNK_SIZE)]

In [None]:
# For each chunk, perform frame extraction and save image and label npy files
# into the "npy" folder inside the data folder.
# The folder number used in the file names is taken from TRAIN_SAMPLE_FOLDER
# ('dfdc_train_part_47' -> '47'), so it follows the folder being processed.
FOLDER_ID = TRAIN_SAMPLE_FOLDER.rsplit('_', 1)[-1]
NPY_DIR = os.path.join(DATA_FOLDER, "npy")
os.makedirs(NPY_DIR, exist_ok=True)  # np.save does not create missing folders

for c, video_chunk in enumerate(chunks):
    # images_from_video appends to these two module-level lists.
    train_images = []
    train_labels = []
    images_from_video(video_chunk)

    if len({frame.shape for frame in train_images}) > 1:
        # Frames of different resolutions cannot form one regular array, and
        # current NumPy refuses to build a ragged array implicitly. Store them
        # in an explicit 1-D object array (one frame per element) instead.
        t_i = np.empty(len(train_images), dtype=object)
        for i, frame in enumerate(train_images):
            t_i[i] = frame
    else:
        t_i = np.asarray(train_images)
    t_l = np.asarray(train_labels)

    np.save(os.path.join(NPY_DIR, f'train_images_{FOLDER_ID}_{c}.npy'), t_i)
    np.save(os.path.join(NPY_DIR, f'train_labels_{FOLDER_ID}_{c}.npy'), t_l)


### ii. Extract faces

Taking only a few frames from each video still produced a sizable dataset that was expensive to process and slow to transfer. We therefore decided to extract just the faces from the frames, since most of the background would not present any artifacts of the GANs. 

In [None]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
import matplotlib.pyplot as plt    # for plotting the images
%matplotlib inline
import cv2 as cv    # for capturing videos
import math   # for mathematical operations
import pandas as pd
from keras.preprocessing import image   # for preprocessing the images
import numpy as np    # for mathematical operations
from keras.utils import np_utils
from skimage.transform import resize   # for resizing images
import seaborn as sns
import os

In [None]:
# For working on Google Drive: mount the drive under /content/drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Path to the npy files that were created in the previous part of the code
NPY_FOLDER = '/content/drive/My Drive/Summer 2020/CS542 Deepfake Project/data/npy'
# Report the contents of NPY_FOLDER itself. DATA_FOLDER and TRAIN_SAMPLE_FOLDER
# are not defined in this part of the notebook, so they cannot be used here.
print(f"npy files: {len(os.listdir(NPY_FOLDER))}")

In [None]:
#Install and import the module for face extraction 
pip install git+https://github.com/rcmalli/keras-vggface.git
pip install mtcnn
import mtcnn

In [None]:
# Create the MTCNN face detector once; face_extraxtor reuses this instance for every frame.
detector = mtcnn.MTCNN()

Here is a sample output of the face extractor:

[{'box': [525, 510, 129, 163],       \
  'confidence': 0.9989194869995117, \
  'keypoints': {'left_eye': (557, 569),\
   'mouth_left': (554, 635),\
   'mouth_right': (604, 639),\
   'nose': (577, 597),\
   'right_eye': (617, 574)}}]

In [None]:
# Takes an image as a numpy array and returns the first detected face as a
# numpy array, or an empty list if no usable face is found
def face_extraxtor(frame):
    """Crop the first face that ``detector`` finds in ``frame``.

    Args:
        frame: Image as a numpy array, indexed as ``frame[row, column]``.
            NOTE(review): frames read with OpenCV are BGR while MTCNN is
            usually fed RGB images; confirm whether a conversion is wanted.

    Returns:
        The sub-array of ``frame`` covered by the ``'box'`` of the first
        detection, or ``[]`` when nothing is detected or the box does not
        overlap the image.
    """
    results = detector.detect_faces(frame)
    if not results:
        return []

    x1, y1, width, height = results[0]['box']
    x2, y2 = x1 + width, y1 + height
    # The box may extend past the image border. A negative index would make
    # numpy count from the opposite edge and return the wrong region (or
    # nothing), so clamp all corners to the image origin. Indices beyond the
    # far edge are already truncated by slicing.
    x1, y1 = max(x1, 0), max(y1, 0)
    x2, y2 = max(x2, 0), max(y2, 0)

    face = frame[y1:y2, x1:x2]
    if face.size == 0:
        # Covers zero-width crops too, which a plain len() check would miss.
        return []
    return face

In [None]:
# Cycle through each chunk of saved data, extract the faces, and save them to
# a numpy file along with the corresponding label (REAL -> 1.0, FAKE -> 0.0)
FOLDER_13 = '/content/drive/My Drive/Machine Learning/Deepfakes/folder_13'
LABEL_TO_TARGET = {'REAL': 1.0, 'FAKE': 0.0}

for I in range(37):
    # allow_pickle=True lets numpy load image chunks that were stored as object
    # arrays. Pickle can execute code on load, so only use it on files you
    # generated yourself.
    chunk_dat = np.load(os.path.join(FOLDER_13, 'npy_13', f'train_images_13_{I}.npy'), allow_pickle=True)
    chunk_lab = np.load(os.path.join(FOLDER_13, 'npy_13', f'train_labels_13_{I}.npy'))
    face_set = []
    label_set = []
    for frame, raw_label in zip(chunk_dat, chunk_lab):
        target = LABEL_TO_TARGET.get(str(raw_label))
        if target is None:
            # Unknown label: skip the frame entirely so that faces and labels
            # stay aligned index by index.
            continue
        face = face_extraxtor(frame)
        if np.size(face) == 0:
            # No face found, or an empty crop that cv.resize cannot handle.
            continue
        face_set.append(cv.resize(face, (252, 252), interpolation=cv.INTER_AREA))
        label_set.append(target)
    face_set = np.asarray(face_set) / 255  # scale pixel values to [0, 1]
    label_set = np.asarray(label_set)
    np.save(os.path.join(FOLDER_13, 'faces_13', f'tr_faces_13_{I}.npy'), face_set)
    np.save(os.path.join(FOLDER_13, 'faces_13', f'tr_labs_13_{I}.npy'), label_set)

Note that the above code only loads data from one folder. We ran this code repeatedly on different machines to maximize the size of our dataset. 

### iii. Balancing the Data

We uploaded the face and label npy arrays to SCC, and we balanced the data evenly between reals and fakes, shuffled their order, and then randomly divided them into folders MIX_1 MIX_2 and MIX_3 in roughly a 70-15-15 ratio, to be used as the training, validation and test sets respectively.

In [None]:
import pandas as pd
import numpy as np    
import os
import random
from keras.preprocessing.image import save_img
import cv2 

In [None]:
# Folder to balance and the number of face chunks saved for it.
# Pairs used across runs (folder number -> chunks): 27 -> 24, 13 -> 36, 47 -> 24.
dataset = 47 # folder number
chunks = 24 # number of chunks

In [None]:
# Local data root and the folder holding the face npy files of the selected dataset
DATA_FOLDER = 'data'
face_FOLDER = f'data/dfdc_{dataset}'
face_npy_count = len(os.listdir(face_FOLDER))
print(f"face npy: {face_npy_count}")

In [None]:
# For each chunk of the face data, create new balanced arrays that hold
# 50:50 real and fake data, interleaved as fake, real, fake, real, ...
BALANCED_FOLDER = os.path.join(DATA_FOLDER, "balanced")
os.makedirs(BALANCED_FOLDER, exist_ok=True)  # np.save does not create missing folders

for n in range(chunks):
    faces_in = np.load(os.path.join(face_FOLDER, f'tr_faces_{dataset}_{n}.npy'))
    labels_in = np.load(os.path.join(face_FOLDER, f'tr_labs_{dataset}_{n}.npy'), allow_pickle=True)

    real_idx = np.flatnonzero(labels_in == 1.)
    fake_idx = np.flatnonzero(labels_in == 0.)

    # Keep as many pairs as the smaller class allows. Sampling "number of
    # reals" fakes would raise ValueError whenever a chunk has more reals
    # than fakes.
    pair_count = min(len(real_idx), len(fake_idx))
    if len(real_idx) > pair_count:
        real_idx = np.asarray(random.sample(real_idx.tolist(), pair_count), dtype=int)
    # Select a random set of fakes of the same size for balance.
    fake_pick = np.asarray(random.sample(fake_idx.tolist(), pair_count), dtype=int)

    real = faces_in[real_idx]
    fake = faces_in[fake_pick]

    # Interleave: even positions hold fakes (label 0.), odd positions reals (label 1.).
    faces = np.empty((2 * pair_count,) + faces_in.shape[1:], dtype=faces_in.dtype)
    faces[0::2] = fake
    faces[1::2] = real
    labs = np.tile([0., 1.], pair_count)

    np.save(os.path.join(BALANCED_FOLDER, f'tr_faces_{dataset}_{n}.npy'), faces)
    np.save(os.path.join(BALANCED_FOLDER, f'tr_labs_{dataset}_{n}.npy'), labs)

### iv. Saving Data as images for Keras datagen.flow_from_directory method

As we began training our neural network, we realized that although the data was balanced, it was still not shuffled. The segmented data could lead the network to fit irrelevant features, such as learning the few actors who appear in each data chunk. We therefore switched to the `flow_from_directory` method of the Keras preprocessing library's image data generator, which reads pictures stored in a directory hierarchy where the folder names are the labels.  

In [None]:
# Move through every file in the balanced folder and export each face as a PNG.

# for dfdc_27, there were 24 chunks
# for dfdc_14, there were 25 chunks
# for dfdc_46, there were 22 chunks
# for dfdc_13, there were 36 chunks
# for dfdc_47, there were 24 chunks

# We use data/aug/segregated_data/MIX_1 for train, MIX_2 for val, MIX_3 for test
# The ratio is roughly 70:15:15 (10 : 2 : 2 draws out of 14)

A = 47                  # folder number
for B in range(24):     # number of chunks
    # allow_pickle=True can execute code on load; only use it on files you
    # generated yourself.
    chunk = np.load(f'data/balanced/tr_faces_{A}_{B}.npy', allow_pickle=True)
    label = np.load(f'data/balanced/tr_labs_{A}_{B}.npy', allow_pickle=True)
    # For each face: randomly choose a train/val/test folder and save it as an image.
    for c in range(len(chunk)):
        face = np.asarray(chunk[c])
        if np.issubdtype(face.dtype, np.floating) and face.size and face.max() <= 1.0:
            # The faces were stored scaled to [0, 1]. cv2.imwrite converts
            # float input to 8-bit by saturation, which would turn such data
            # into an almost black image, so restore the 0-255 range first.
            face = np.clip(np.rint(face * 255.0), 0, 255).astype(np.uint8)
        choice = np.random.choice([1] * 10 + [2] * 2 + [3] * 2)
        out_dir = os.path.join('data/aug/segregated_data', f'MIX_{choice}', str(int(label[c])))
        # cv2.imwrite does not create folders and only returns False on failure.
        os.makedirs(out_dir, exist_ok=True)
        out_path = os.path.join(out_dir, f'image_{A}_{B}_{c}.png')
        if not cv2.imwrite(out_path, face):
            raise OSError(f"cv2.imwrite could not write {out_path}")