In [32]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import torchvision.transforms as transforms
import torch.utils.data as data
import torchvision
from torch.autograd import Variable
import matplotlib.pyplot as plt
%run function.ipynb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score
import pickle
from facenet_pytorch import MTCNN, InceptionResnetV1, extract_face
import pandas as pd
import json
import glob
import time
import cv2




# set path
data_path = "./jpegs_256/"    # define UCF-101 RGB data path
save_model_path = "./CRNN_ckpt/"
DATA_FOLDER = './deepfake-detection-challenge/'
TRAIN_SAMPLE_FOLDER = 'train_sample_videos'
TEST_FOLDER = 'test_videos'

# EncoderCNN architecture
CNN_fc_hidden1, CNN_fc_hidden2 = 1024, 768
CNN_embed_dim = 512      # latent dim extracted by 2D CNN
img_x, img_y = 256, 342  # resize video 2d frame size
dropout_p = 0.0          # dropout probability

# DecoderRNN architecture
RNN_hidden_layers = 3
RNN_hidden_nodes = 512
RNN_FC_dim = 256

# training parameters
k = 101             # number of target category
epochs = 120        # training epochs
batch_size = 30  
learning_rate = 1e-4
log_interval = 10   # interval for displaying training info

# Select which frame to begin & end in videos
begin_frame, end_frame, skip_frame = 1, 29, 1


def train(log_interval, model, device, train_loader, optimizer, epoch):
    # set model as training mode
    cnn_encoder, rnn_decoder = model
    cnn_encoder.train()
    rnn_decoder.train()

    losses = []
    scores = []
    N_count = 0   # counting total trained sample in one epoch
    for batch_idx, (X, y) in enumerate(train_loader):
        # distribute data to device
        X, y = X.to(device), y.to(device).view(-1, )

        N_count += X.size(0)

        optimizer.zero_grad()
        output = rnn_decoder(cnn_encoder(X))   # output has dim = (batch, number of classes)

        loss = F.cross_entropy(output, y)
        losses.append(loss.item())

        # to compute accuracy
        y_pred = torch.max(output, 1)[1]  # y_pred != output
        step_score = accuracy_score(y.cpu().data.squeeze().numpy(), y_pred.cpu().data.squeeze().numpy())
        scores.append(step_score)         # computed on CPU

        loss.backward()
        optimizer.step()

        # show information
        if (batch_idx + 1) % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}, Accu: {:.2f}%'.format(
                epoch + 1, N_count, len(train_loader.dataset), 100. * (batch_idx + 1) / len(train_loader), loss.item(), 100 * step_score))

    return losses, scores


def validation(model, device, optimizer, test_loader):
    # set model as testing mode
    cnn_encoder, rnn_decoder = model
    cnn_encoder.eval()
    rnn_decoder.eval()

    test_loss = 0
    all_y = []
    all_y_pred = []
    with torch.no_grad():
        for X, y in test_loader:
            # distribute data to device
            X, y = X.to(device), y.to(device).view(-1, )

            output = rnn_decoder(cnn_encoder(X))

            loss = F.cross_entropy(output, y, reduction='sum')
            test_loss += loss.item()                 # sum up batch loss
            y_pred = output.max(1, keepdim=True)[1]  # (y_pred != output) get the index of the max log-probability

            # collect all y and y_pred in all batches
            all_y.extend(y)
            all_y_pred.extend(y_pred)

    test_loss /= len(test_loader.dataset)

    # compute accuracy
    all_y = torch.stack(all_y, dim=0)
    all_y_pred = torch.stack(all_y_pred, dim=0)
    test_score = accuracy_score(all_y.cpu().data.squeeze().numpy(), all_y_pred.cpu().data.squeeze().numpy())

    # show information
    print('\nTest set ({:d} samples): Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(len(all_y), test_loss, 100* test_score))

    # save Pytorch models of best record
    torch.save(cnn_encoder.state_dict(), os.path.join(save_model_path, 'cnn_encoder_epoch{}.pth'.format(epoch + 1)))  # save spatial_encoder
    torch.save(rnn_decoder.state_dict(), os.path.join(save_model_path, 'rnn_decoder_epoch{}.pth'.format(epoch + 1)))  # save motion_encoder
    torch.save(optimizer.state_dict(), os.path.join(save_model_path, 'optimizer_epoch{}.pth'.format(epoch + 1)))      # save optimizer
    print("Epoch {} model saved!".format(epoch + 1))

    return test_loss, test_score


# Detect devices
use_cuda = torch.cuda.is_available()                   # check if GPU exists
device = torch.device("cuda" if use_cuda else "cpu")   # use CPU or GPU


# Data loading parameters
params = {'batch_size': batch_size, 'shuffle': True, 'num_workers': 4, 'pin_memory': True} if use_cuda else {}

train_list = list(os.listdir(os.path.join(DATA_FOLDER, TRAIN_SAMPLE_FOLDER)))

json_file = [file for file in train_list if  file.endswith('json')][0]
print(f"JSON file: {json_file}")

def get_meta_from_json(path):
    df = pd.read_json(os.path.join(DATA_FOLDER, path, json_file))
    df = df.T
    return df

meta_train_df = get_meta_from_json(TRAIN_SAMPLE_FOLDER)
meta_train_df.head()
filenames = glob.glob('./deepfake-detection-challenge/train_sample_videos/*.mp4')
print(filenames)

labels=[]
for fn in meta_train_df.index[:]:
    label = meta_train_df.loc[fn]['label']
    labels.append(label) 
    
    

action_names=["REAL","FAKE"] 
# convert labels -> category
le = LabelEncoder()
le.fit(action_names)

# show how many classes there are
list(le.classes_)

# convert category -> 1-hot
action_category = le.transform(action_names).reshape(-1, 1)
enc = OneHotEncoder()
enc.fit(action_category)

# # example
# y = ['HorseRace', 'YoYo', 'WalkingWithDog']
# y_onehot = labels2onehot(enc, le, y)
# y2 = onehot2labels(le, y_onehot)
                
all_y_list = labels2cat(le, labels)    # all video labels
#Y=all_y_list.type(torch.LongTensor)


transform = transforms.Compose([transforms.Resize([img_x, img_y]),
                                transforms.ToTensor(),
                                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

selected_frames = np.arange(begin_frame, end_frame, skip_frame).tolist()
# Load face detector
mtcnn = MTCNN(margin=14, keep_all=True, factor=0.5, device=device).eval()
# Define face detection pipeline
detection_pipeline = DetectionPipeline(detector=mtcnn, batch_size=60, resize=0.25)

# Load facial recognition model
resnet = InceptionResnetV1(pretrained='vggface2', device=device).eval()

X = []
start = time.time()
n_processed = 0
with torch.no_grad():
    for i, filename in tqdm(enumerate(filenames), total=len(filenames)):
        try:
            # Load frames and find faces
            faces = detection_pipeline(filename)
            
            # Calculate embeddings
            X.append(process_faces(faces, resnet))

            X.append(faces)

        except KeyboardInterrupt:
            print('\nStopped.')
            break

        except Exception as e:
            print(e)
            X.append(None)


                

print(X)
train_set= X,all_y_list
print(train_set)
                       
#valid_set=Dataset_CRNN(data_path, test_list, test_label, selected_frames, transform=transform)
train_loader = data.DataLoader(train_set, **params)
print(train_loader)
#valid_loader = data.DataLoader(valid_set, **params)

# Create model
cnn_encoder = EncoderCNN(img_x=img_x, img_y=img_y, fc_hidden1=CNN_fc_hidden1, fc_hidden2=CNN_fc_hidden2,
                         drop_p=dropout_p, CNN_embed_dim=CNN_embed_dim).to(device)

rnn_decoder = DecoderRNN(CNN_embed_dim=CNN_embed_dim, h_RNN_layers=RNN_hidden_layers, h_RNN=RNN_hidden_nodes, 
                         h_FC_dim=RNN_FC_dim, drop_p=dropout_p, num_classes=k).to(device)

# Parallelize model to multiple GPUs
if torch.cuda.device_count() > 1:
    print("Using", torch.cuda.device_count(), "GPUs!")
    cnn_encoder = nn.DataParallel(cnn_encoder)
    rnn_decoder = nn.DataParallel(rnn_decoder)

crnn_params = list(cnn_encoder.parameters()) + list(rnn_decoder.parameters())
optimizer = torch.optim.Adam(crnn_params, lr=learning_rate)


# record training process
epoch_train_losses = []
epoch_train_scores = []
epoch_test_losses = []
epoch_test_scores = []

# start training
for epoch in range(epochs):
    # train, test model
    train_losses, train_scores = train(log_interval, [cnn_encoder, rnn_decoder], device, train_loader, optimizer, epoch)

    # save results
    epoch_train_losses.append(train_losses)
    epoch_train_scores.append(train_scores)
    
    # save all train test results
    A = np.array(epoch_train_losses)
    B = np.array(epoch_train_scores)

    np.save('./CRNN_epoch_training_losses.npy', A)
    np.save('./CRNN_epoch_training_scores.npy', B)
    

# plot
fig = plt.figure(figsize=(10, 4))
plt.subplot(121)
plt.plot(np.arange(1, epochs + 1), A[:, -1])  # train loss (on epoch end)
plt.title("model loss")
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend(['train', 'test'], loc="upper left")
# 2nd figure
plt.subplot(122)
plt.plot(np.arange(1, epochs + 1), B[:, -1])  # train accuracy (on epoch end)
plt.title("training scores")
plt.xlabel('epochs')
plt.ylabel('accuracy')
plt.legend(['train', 'test'], loc="upper left")
title = "./fig_UCF101_CRNN.png"
plt.savefig(title, dpi=600)
# plt.close(fig)
plt.show()


JSON file: metadata.json
['./deepfake-detection-challenge/train_sample_videos\\aagfhgtpmv.mp4', './deepfake-detection-challenge/train_sample_videos\\aapnvogymq.mp4', './deepfake-detection-challenge/train_sample_videos\\abarnvbtwb.mp4', './deepfake-detection-challenge/train_sample_videos\\abofeumbvv.mp4', './deepfake-detection-challenge/train_sample_videos\\abqwwspghj.mp4', './deepfake-detection-challenge/train_sample_videos\\acifjvzvpm.mp4', './deepfake-detection-challenge/train_sample_videos\\acqfdwsrhi.mp4', './deepfake-detection-challenge/train_sample_videos\\acxnxvbsxk.mp4', './deepfake-detection-challenge/train_sample_videos\\acxwigylke.mp4', './deepfake-detection-challenge/train_sample_videos\\aczrgyricp.mp4', './deepfake-detection-challenge/train_sample_videos\\adhsbajydo.mp4', './deepfake-detection-challenge/train_sample_videos\\adohikbdaz.mp4', './deepfake-detection-challenge/train_sample_videos\\adylbeequz.mp4', './deepfake-detection-challenge/train_sample_videos\\aelfnikyqj.

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.

  0%|                                                                                       | 0/400 [00:00<?, ?it/s]
  0%|▏                                                                            | 1/400 [01:04<7:09:07, 64.53s/it]


Stopped.
[array([0.600036  , 0.5997267 , 0.6134522 , 0.6299431 , 0.64046466,
       0.58557016, 0.5759526 , 0.4620227 , 0.44577545, 0.41024807,
       0.39507225, 0.43269485, 0.44035462, 0.3733154 , 0.36733487,
       0.41327313, 0.4221604 , 0.41183898, 0.48589796, 0.46804324,
       0.438835  , 0.48392117, 0.42420158, 0.43891016, 0.40966797,
       0.34922755, 0.38838512, 0.47490966, 0.41274762, 0.38346058,
       0.5971088 , 0.62296635, 0.58201635, 0.55773145, 0.41367593,
       0.53967655, 0.52106047, 0.5210022 , 0.40457585, 0.48501182,
       0.4795957 , 0.68119836, 0.58124864, 0.5329173 , 0.49983686,
       0.66787726, 0.6468714 , 0.66486585, 0.6121424 , 0.6633865 ,
       0.67285883, 0.48987886, 0.48987558, 0.5060136 , 0.42283303,
       0.44687322, 0.40436846, 0.36412618, 0.36563927, 0.39034522,
       0.38438526, 0.35067093, 0.39197776, 0.42705113, 0.48129532,
       0.5072479 , 0.5116049 , 0.48724765, 0.53315854, 0.5201533 ,
       0.50239795, 0.46593863, 0.47776124, 0.505811

          [-0.5898, -0.5898, -0.5898,  ...,  0.1133,  0.1211,  0.1211]]]])]]
([array([0.600036  , 0.5997267 , 0.6134522 , 0.6299431 , 0.64046466,
       0.58557016, 0.5759526 , 0.4620227 , 0.44577545, 0.41024807,
       0.39507225, 0.43269485, 0.44035462, 0.3733154 , 0.36733487,
       0.41327313, 0.4221604 , 0.41183898, 0.48589796, 0.46804324,
       0.438835  , 0.48392117, 0.42420158, 0.43891016, 0.40966797,
       0.34922755, 0.38838512, 0.47490966, 0.41274762, 0.38346058,
       0.5971088 , 0.62296635, 0.58201635, 0.55773145, 0.41367593,
       0.53967655, 0.52106047, 0.5210022 , 0.40457585, 0.48501182,
       0.4795957 , 0.68119836, 0.58124864, 0.5329173 , 0.49983686,
       0.66787726, 0.6468714 , 0.66486585, 0.6121424 , 0.6633865 ,
       0.67285883, 0.48987886, 0.48987558, 0.5060136 , 0.42283303,
       0.44687322, 0.40436846, 0.36412618, 0.36563927, 0.39034522,
       0.38438526, 0.35067093, 0.39197776, 0.42705113, 0.48129532,
       0.5072479 , 0.5116049 , 0.48724765, 0.53315

       0, 0, 1, 0], dtype=int64))
<torch.utils.data.dataloader.DataLoader object at 0x000001F4B2080E48>


AttributeError: 'list' object has no attribute 'to'