In [1]:
#Libraries Needed
import cv2 
import sys #using for face recognition
import os
from pathlib import Path
import glob
import random
from skimage.util import random_noise ## --> to add noise
import shutil
import numpy as np
import pandas as pd
import zipfile
import keras

#Data Processing
from skimage import io
import os
import glob
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.image import img_to_array
from keras.preprocessing.image import load_img
from keras.preprocessing.image import array_to_img
from keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing import image
from keras.preprocessing.image import ImageDataGenerator
import random
from PIL import Image
import matplotlib.pyplot as plt
from collections import Counter

#Neural Networks
from tensorflow.python import keras
import tensorflow as tf

#Models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model

#Evaluation
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from keras.callbacks import ModelCheckpoint,Callback
from sklearn import metrics
from tensorflow.keras.models import load_model

In [2]:
#! pip install keras

In [3]:
#!pip install tensorflow==2.1.0

### Prepare Dataset

In [4]:
test_set = pd.read_csv('./Celeb-DF-v2/List_of_testing_videos.txt',sep=' ')
test_set.columns = ['label','filepath']

In [5]:
test_set.head()

Unnamed: 0,label,filepath
0,1,YouTube-real/00208.mp4
1,1,YouTube-real/00063.mp4
2,1,YouTube-real/00024.mp4
3,1,YouTube-real/00021.mp4
4,1,YouTube-real/00036.mp4


In [6]:
test_set.label.value_counts()

0    340
1    177
Name: label, dtype: int64

In [7]:
test_set.filepath = [file.split('/')[-2]+'-'+file.split('/')[-1] for file in test_set.filepath]

In [8]:
test_set.head()

Unnamed: 0,label,filepath
0,1,YouTube-real-00208.mp4
1,1,YouTube-real-00063.mp4
2,1,YouTube-real-00024.mp4
3,1,YouTube-real-00021.mp4
4,1,YouTube-real-00036.mp4


In [9]:
test_set.label.value_counts()

0    340
1    177
Name: label, dtype: int64

In [10]:
for index, row in test_set.iterrows():
    if row[0] == 0:
        test_set.label[index] = 1
    else:
        test_set.label[index] = 0
    test_set.filepath[index] = test_set.filepath[index].replace('.mp4','.jpg')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set.label[index] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set.filepath[index] = test_set.filepath[index].replace('.mp4','.jpg')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set.label[index] = 1


In [11]:
test_set.label.value_counts()

1    340
0    177
Name: label, dtype: int64

In [12]:
test_set.head()

Unnamed: 0,label,filepath
0,0,YouTube-real-00208.jpg
1,0,YouTube-real-00063.jpg
2,0,YouTube-real-00024.jpg
3,0,YouTube-real-00021.jpg
4,0,YouTube-real-00036.jpg


### Data Pre-Processing

In [13]:
'''
func: prepare and process batch wise data for training/ablation. 
The image path is converted to image data while the classes are label encoded (0,1,2)
input:
    i.image_list: X data: array of image paths
    ii:classes: y data: dictionary of image paths and their corresponding classes
output:
    i. DataGenerator
'''
class DataGenerator(tf.keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, image_list, classes, batch_size=32, dim=(224,224), n_channels=3,
                 n_classes=3,shuffle=True):
        'Initialization'
        self.dim = dim
        self.batch_size = batch_size
        self.classes = classes
        self.image_list = image_list
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.image_list) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        list_img_temp = [self.image_list[k] for k in indexes]

        # Generate data
        X, y = self.__data_generation(list_img_temp)
        return X, y


    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.image_list))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_img_temp):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        # Initialization of Y
        y = np.empty((self.batch_size), dtype=int)
        X = np.empty((self.batch_size, *self.dim, self.n_channels))
        #X = np.random.rand(self.batch_size, *self.dim, self.n_channels)

        #For each image path in the batch: load it, convert it to array, normalize values and pre-process.
        #Append X and respective y values to the empty X and y variables
        for i, ID in enumerate(list_img_temp):
            img = load_img(ID, target_size=self.dim)
            img = img_to_array(img)
            #img = img/255
            img = preprocess_input(img)
            X[i,] = img
            filename = ID.split('/')[-1]
            y[i] = self.classes[filename]
            
        #class encoding to 0,1,2
#         le = LabelEncoder()
#         self.y_value = le.fit_transform(y)
        return X,tf.keras.utils.to_categorical(y, num_classes=self.n_classes)

In [14]:
files = glob.glob('./Celeb-DF-v2/test/frames/*.jpg', recursive = True)

In [15]:
frame_list = []
for file in files:
    name = file.split('/')[-1]
    frame_list.append(name)

In [16]:
y_test = test_set[test_set['filepath'].isin(frame_list)]
y_test_set = y_test.set_index('filepath')['label'].to_dict()

### Evaluation - Base Model

In [101]:
base_test_generator = DataGenerator(files, y_test_set,n_classes=2,shuffle=False)

In [102]:
base_eval_base_model = load_model('./base_binary_best_model.hdf5')



In [103]:
base_predictions = base_eval_base_model.predict_generator(base_test_generator)

In [104]:
base_predictions

array([[3.0761282e-24, 2.5846998e-12],
       [0.0000000e+00, 6.6093747e-37],
       [9.5028400e-01, 2.0582025e-21],
       ...,
       [3.1306454e-15, 0.0000000e+00],
       [1.0000000e+00, 0.0000000e+00],
       [1.0000000e+00, 9.9817413e-01]], dtype=float32)

In [109]:
base_y_class = np.array(list(base_test_generator.classes.values()))[:len(base_predictions)]
base_y = tf.keras.utils.to_categorical(base_y_class, num_classes=2)

In [110]:
base_prob_max = base_predictions.max(axis=1).reshape(-1, 1)
base_y_pred = np.where(base_predictions == base_prob_max, 1, 0)

In [111]:
base_pred = np.argmax(base_predictions, axis=-1)

In [112]:
len(base_y_pred)

512

In [113]:
len(base_y)

512

In [114]:
Counter(base_pred)

Counter({1: 156, 0: 356})

In [96]:
base_score_test = metrics.accuracy_score(base_y, base_y_pred)
print('Base Model Test Score ',base_score_test)

Base Model Test Score  0.40625


In [115]:
base_cm = confusion_matrix(base_y_class, base_pred)
print(base_cm)

[[235 100]
 [121  56]]


In [116]:
base_report = classification_report(base_y,base_y_pred)
print(base_report)

              precision    recall  f1-score   support

           0       0.66      0.70      0.68       335
           1       0.33      0.50      0.40       177

   micro avg       0.52      0.63      0.57       512
   macro avg       0.49      0.60      0.54       512
weighted avg       0.55      0.63      0.58       512
 samples avg       0.52      0.63      0.56       512



In [117]:
roc_auc_score(base_y, base_y_pred)

0.4936082300362594

### Evaluation - ResNet Model

In [17]:
res_test_generator = DataGenerator(files, y_test_set,n_classes=2,shuffle=False)

In [18]:
res_eval_base_model = load_model('res_binary_best_modelv2.hdf5')

In [19]:
res_predictions = res_eval_base_model.predict_generator(res_test_generator)

Instructions for updating:
Please use Model.predict, which supports generators.


In [20]:
res_predictions

array([[0.6366977 , 0.37497282],
       [0.80993176, 0.19774905],
       [0.5721438 , 0.56675065],
       ...,
       [0.09291667, 0.8699349 ],
       [0.2593543 , 0.70544374],
       [0.21030086, 0.7821566 ]], dtype=float32)

In [21]:
res_y_class = np.array(list(res_test_generator.classes.values()))[:len(res_predictions)]
res_y = tf.keras.utils.to_categorical(res_y_class, num_classes=2)

In [22]:
res_prob_max = res_predictions.max(axis=1).reshape(-1, 1)
res_y_pred = np.where(res_predictions == res_prob_max, 1, 0)

In [23]:
res_pred = np.argmax(res_predictions, axis=-1)

In [24]:
len(res_y_pred)

512

In [25]:
len(res_y)

512

In [26]:
Counter(res_pred)

Counter({0: 296, 1: 216})

In [27]:
res_score_test = metrics.accuracy_score(res_y, res_y_pred)
print('Base Model Test Score ',res_score_test)

Base Model Test Score  0.498046875


In [29]:
res_cm = confusion_matrix(res_y_class, res_pred)
print(res_cm)

[[108  69]
 [188 147]]


In [30]:
res_report = classification_report(res_y,res_y_pred)
print(res_report)

              precision    recall  f1-score   support

           0       0.36      0.61      0.46       177
           1       0.68      0.44      0.53       335

   micro avg       0.50      0.50      0.50       512
   macro avg       0.52      0.52      0.50       512
weighted avg       0.57      0.50      0.51       512
 samples avg       0.50      0.50      0.50       512



In [31]:
roc_auc_score(res_y, res_y_pred)

0.5244877308373388

### Evaluation - VGG Model

In [17]:
res_test_generator = DataGenerator(files, y_test_set,n_classes=2,shuffle=False)

In [18]:
res_eval_base_model = load_model('vgg_binary_best_modelv2.hdf5')

In [19]:
res_predictions = res_eval_base_model.predict_generator(res_test_generator)

Instructions for updating:
Please use Model.predict, which supports generators.


In [20]:
res_predictions

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [21]:
res_y_class = np.array(list(res_test_generator.classes.values()))[:len(res_predictions)]
res_y = tf.keras.utils.to_categorical(res_y_class, num_classes=2)

In [22]:
res_prob_max = res_predictions.max(axis=1).reshape(-1, 1)
res_y_pred = np.where(res_predictions == res_prob_max, 1, 0)

In [23]:
res_pred = np.argmax(res_predictions, axis=-1)

In [24]:
len(res_y_pred)

512

In [25]:
len(res_y)

512

In [26]:
Counter(res_pred)

Counter({0: 512})

In [27]:
res_score_test = metrics.accuracy_score(res_y, res_y_pred)
print('Base Model Test Score ',res_score_test)

Base Model Test Score  0.345703125


In [28]:
res_cm = confusion_matrix(res_y_class, res_pred)
print(res_cm)

[[177   0]
 [335   0]]


In [29]:
res_report = classification_report(res_y,res_y_pred)
print(res_report)

              precision    recall  f1-score   support

           0       0.35      1.00      0.51       177
           1       0.00      0.00      0.00       335

   micro avg       0.35      0.35      0.35       512
   macro avg       0.17      0.50      0.26       512
weighted avg       0.12      0.35      0.18       512
 samples avg       0.35      0.35      0.35       512



  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
roc_auc_score(res_y, res_y_pred)

0.5