In [54]:
import os
import torch
import glob
import time
import numpy as np
import pandas as pd
import mmcv, cv2
from facenet_pytorch import MTCNN, InceptionResnetV1, extract_face
from PIL import Image, ImageDraw, ImageFont, ImageEnhance
from IPython import display
from tqdm import tqdm
from matplotlib import pyplot as plt
Image.__version__
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Running on device: {}'.format(device))
# Load face detector
mtcnn = MTCNN(margin=14, keep_all=True, post_process=False, thresholds = [0.9, 0.9, 0.9], device=device).eval()
# Load facial recognition model, but I didn't want to use it yet
resnet = InceptionResnetV1(pretrained='vggface2', device=device).eval()
class DetectionPipeline:
    """Pipeline class for detecting faces in the frames of a video file."""
    
    def __init__(self, detector, n_frames=None, batch_size=60, resize=None):
        """Constructor for DetectionPipeline class.
        
        Keyword Arguments:
            n_frames {int} -- Total number of frames to load. These will be evenly spaced
                throughout the video. If not specified (i.e., None), all frames will be loaded.
                (default: {None})
            batch_size {int} -- Batch size to use with MTCNN face detector. (default: {32})
            resize {float} -- Fraction by which to resize frames from original prior to face
                detection. A value less than 1 results in downsampling and a value greater than
                1 result in upsampling. (default: {None})
        """
        self.detector = detector
        self.n_frames = n_frames
        self.batch_size = batch_size
        self.resize = resize
    
    def __call__(self, filename):
        """Load frames from an MP4 video and detect faces.

        Arguments:
            filename {str} -- Path to video.
        """
        # Create video reader and find length
        v_cap = cv2.VideoCapture(filename)
        v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Pick 'n_frames' evenly spaced frames to sample
        if self.n_frames is None:
            sample = np.arange(1, v_len)
        else:
            sample = np.linspace(1, v_len - 1, self.n_frames).astype(int)

        # Loop through frames
        faces = []
        frames = []
        for j in range(v_len):
            success = v_cap.grab()
            if j in sample:
                # Load frame
                success, frame = v_cap.retrieve()
                if not success:
                    continue
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                max_p = frame.max()
                frame = Image.fromarray(frame)
                if max_p < 150:
                    enhancer = ImageEnhance.Brightness(frame)
                    frame = enhancer.enhance(255/max_p)
                    
                # Resize frame to desired size
                if self.resize is not None:
                    frame = frame.resize([int(d * self.resize) for d in frame.size])
                
                frames.append(frame)

                # When batch is full, detect faces and reset frame list
                if len(frames) % self.batch_size == 0 or j == sample[-1]:
                    faces.extend(self.detector(frames))
                    frames = []

        v_cap.release()

        return faces    
def process_faces(faces, resnet):
    # Filter out frames without faces
    faces = [f for f in faces if f is not None]
    faces = torch.cat(faces).to(device)

    # Generate facial feature vectors using a pretrained model
    embeddings = resnet(faces)

    # Calculate centroid for video and distance of each face's feature vector from centroid
#     centroid = embeddings.mean(dim=0)
#     x = (embeddings - centroid).norm(dim=1).cpu().numpy()
    
    return embeddings

Running on device: cuda:0


In [56]:
# Define face detection pipeline
detection_pipeline = DetectionPipeline(detector=mtcnn, batch_size=60, resize=None, n_frames=45)
start = time.time()
n_processed = 0
with torch.no_grad():
    f = 1
#     for f in tqdm(np.arange(13, 50, 2), total = len(np.arange(13, 50, 2))):
        # Get all videos
    filenames = glob.glob('data/dfdc_train_part_' + str(f) + '/*.mp4')
    metadata = pd.read_json('data/dfdc_train_part_' + str(f) + '/metadata.json').T
    print('data/dfdc_train_part_' + str(f) + '/*.mp4 | '+ str(len(filenames)) + ' files')
    X1 = []
    X1_encoded = []
    Y1 = []
    X2 = []
    X2_encoded = []
    Y2 = []
    X3 = []
    X3_encoded = []
    Y3 = []
    start = time.time()
    n_processed = 0
    for i, filename in tqdm(enumerate(filenames[0:5]), total= len(filenames[0:5])):
        try:
            # Load frames and find faces
            faces = detection_pipeline(filename)
            y = int((metadata.label['data/dfdc_train_part_' + str(f) + '/' + metadata.index == filename] == 'REAL') * 1)
            n_faces = [x.shape[0] if x is not None else 0 for x in faces ]
            faces = [x for x in faces if x is not None]
            if n_faces.count(3) >= 30:
                f_faces = [x for x in faces if x.shape[0] == 3]
                f_faces = [f_faces[i] for i in np.linspace(0, len(f_faces)-1, 30).astype(int)]
                X3.append(torch.cat(f_faces))
                X3_encoded.append(process_faces(f_faces, resnet))
                Y3.append(y)
            elif n_faces.count(2) >= 30:
                f_faces = [x for x in faces if x.shape[0] == 2]
                f_faces = [f_faces[i] for i in np.linspace(0, len(f_faces)-1, 30).astype(int)]
                X2.append(torch.cat(f_faces))
                X2_encoded.append(process_faces(f_faces, resnet))
                Y2.append(y)
            elif n_faces.count(1) >= 30:
                f_faces = [x for x in faces if x.shape[0] == 1]
                f_faces = [f_faces[i] for i in np.linspace(0, len(f_faces)-1, 30).astype(int)]
                X1.append(torch.cat(f_faces))
                X1_encoded.append(process_faces(f_faces, resnet))
                Y1.append(y)
        except KeyboardInterrupt:
            print('\nStopped.')
            break

        except Exception as e:
            print(e)

    n_processed += len(faces)
    print(f'Frames per second (load+detect+embed): {n_processed / (time.time() - start):6.3}\r', end='')
#     torch.save(X1_encoded, 'data_processed/1face_X_part' + str(f) + '.pt')
#     torch.save(Y1, 'data_processed/1face_Y_part' + str(f) + '.pt')
#     torch.save(X2_encoded, 'data_processed/2face_X_part' + str(f) + '.pt')
#     torch.save(Y2, 'data_processed/2face_Y_part' + str(f) + '.pt')
#     torch.save(X3_encoded, 'data_processed/3face_X_part' + str(f) + '.pt')
#     torch.save(Y3, 'data_processed/3face_Y_part' + str(f) + '.pt')

  0%|          | 0/5 [00:00<?, ?it/s]

data/dfdc_train_part_1/*.mp4 | 1699 files


100%|██████████| 5/5 [00:14<00:00,  2.88s/it]

Frames per second (load+detect+embed):   3.07




In [57]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from tqdm import tqdm
torch.manual_seed(1)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Running on device: {}'.format(device))

torch.cuda.empty_cache() 

Running on device: cuda:0


In [58]:
X = []
Y = []
for p in tqdm(np.arange(0, 1), total = 1):
    X_p = torch.load('data_images/1face_X_part' + str(p) + '.pt', map_location = device)
    Y_p = torch.load('data_images/1face_Y_part' + str(p) + '.pt', map_location = device)
    X = X + [torch.stack(X_p[0])]
    Y = Y + Y_p
    del X_p

In [59]:
torch.cuda.empty_cache() 

In [68]:
X = torch.stack(X1[0:5])
Y = Y1

In [69]:
X.shape

torch.Size([5, 30, 3, 160, 160])

In [70]:
len(Y)

5

In [71]:
from torch.utils.data import TensorDataset, DataLoader
dataset = TensorDataset(X[0:5], torch.from_numpy(np.array(Y[0:5])))
train_data, val_data = torch.utils.data.random_split(dataset, [4, 1])
train_batch_size = 1
val_batch_size = 1
train_loader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
val_loader = DataLoader(val_data, shuffle=True, batch_size=val_batch_size)

In [72]:
from torch import nn
class Identity(nn.Module):
    def __init__(self):
        super(Identity, self).__init__()
    def forward(self, x):
        return x
import torch
import timm
model_ft = timm.create_model("mixnet_m", pretrained=True)
model_ft.classifier = Identity()
batch, seqlen, c, h, w =  X.shape
X = X.reshape(batch*seqlen, c, h, w) # .shape
out = model_ft(X)

In [73]:
out.shape

torch.Size([150, 1536])

In [74]:
out = out.reshape(batch, seqlen, out.shape[1])


In [75]:
out.shape

torch.Size([5, 30, 1536])

In [91]:
import torch.nn as nn

class DFDCNet(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, n_layers, drop_prob=0.5):
        super(DFDCNet, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        self.mixnet = timm.create_model("mixnet_m", pretrained=True)
        for param in self.mixnet.parameters():
                param.requires_grad = True
            
        self.lstm = nn.LSTM(1000, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(0.75)
        self.batchnorm = nn.BatchNorm1d(hidden_dim)
        self.elu = nn.ELU()
        self.fc1 = nn.Linear(hidden_dim, 32)
#         self.fc2 = nn.Linear(64, 32)
#         self.fc3 = nn.Linear(32, 16)
        self.fc4 = nn.Linear(32, output_size)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x, hidden):
        batch_size, seqlen, c, h, w = x.size()
        x = x.reshape(batch_size*seqlen, c, h, w).float()
        x = self.mixnet(x)
        x = x.reshape(batch_size, seqlen, x.shape[1])
        lstm_out, hidden = self.lstm(x, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        out = self.dropout(lstm_out)
#         out = self.batchnorm(out)
        out = self.fc1(out)
        out = self.elu(out)
#         out = self.fc2(out)
#         out = self.elu(out)
#         out = self.fc3(out)
#         out = self.elu(out)
        out = self.fc4(out)
        out = self.sigmoid(out)
        
        out = out.view(batch_size, -1)
        out = out[:,-1]
        return out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device))
        return hidden

In [92]:
input_size = 512
output_size = 1
hidden_dim = 64
n_layers = 5

model = DFDCNet(input_size, output_size, hidden_dim, n_layers)
model.to(device)
train_criterion = nn.BCELoss()
val_criterion = nn.BCELoss()

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
print(model)

DFDCNet(
  (mixnet): EfficientNet(
    (conv_stem): Conv2d(3, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act1): ReLU(inplace=True)
    (blocks): Sequential(
      (0): Sequential(
        (0): DepthwiseSeparableConv(
          (conv_dw): Conv2d(24, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=24, bias=False)
          (bn1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act1): ReLU(inplace=True)
          (conv_pw): Conv2d(24, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn2): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act2): Identity()
        )
      )
      (1): Sequential(
        (0): InvertedResidual(
          (conv_pw): MixedConv2d(
            (0): Conv2d(12, 72, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): Conv2d(1

In [93]:
# scheduler
from torch.optim.lr_scheduler import ReduceLROnPlateau
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=500, verbose=True)


In [94]:
epochs = 5
counter = 0
print_every = 1
clip = .5
valid_loss_min = np.Inf
val_loss = torch.tensor(np.Inf)
model.train()
for i in range(epochs):
    h = model.init_hidden(train_batch_size)
    
    for inputs, labels in train_loader:
        counter += 1
        h = tuple([e.data for e in h])
        inputs, labels = inputs.to(device), labels.to(device)
        model.zero_grad()
        output, h = model(inputs, h)
        loss = train_criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        if counter%print_every == 0:
            val_h = model.init_hidden(val_batch_size)
            val_losses = []
            model.eval()
            for inp, lab in val_loader:
                val_h = tuple([each.data for each in val_h])
                inp, lab = inp.to(device), lab.to(device)
                out, val_h = model(inp, val_h)
                val_loss = val_criterion(out.squeeze(), lab.float())
                val_losses.append(val_loss.item())
                
            model.train()
            print("Epoch: {}/{}...".format(i+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))
            if np.mean(val_losses) <= valid_loss_min:
                torch.save(model.state_dict(), './model_1face_unfroze.pt')
                print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,np.mean(val_losses)))
                valid_loss_min = np.mean(val_losses)
    scheduler.step(val_loss.item())


  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


Epoch: 1/5... Step: 1... Loss: 0.651372... Val Loss: 0.657190
Validation loss decreased (inf --> 0.657190).  Saving model ...
Epoch: 1/5... Step: 2... Loss: 0.684561... Val Loss: 0.651209
Validation loss decreased (0.657190 --> 0.651209).  Saving model ...
Epoch: 1/5... Step: 3... Loss: 0.645299... Val Loss: 0.644558
Validation loss decreased (0.651209 --> 0.644558).  Saving model ...
Epoch: 1/5... Step: 4... Loss: 0.620451... Val Loss: 0.637743
Validation loss decreased (0.644558 --> 0.637743).  Saving model ...
Epoch: 2/5... Step: 5... Loss: 0.661193... Val Loss: 0.630168
Validation loss decreased (0.637743 --> 0.630168).  Saving model ...
Epoch: 2/5... Step: 6... Loss: 0.621610... Val Loss: 0.622214
Validation loss decreased (0.630168 --> 0.622214).  Saving model ...
Epoch: 2/5... Step: 7... Loss: 0.632420... Val Loss: 0.613557
Validation loss decreased (0.622214 --> 0.613557).  Saving model ...
Epoch: 2/5... Step: 8... Loss: 0.593267... Val Loss: 0.604560
Validation loss decreased 

# Reference
* https://github.com/ronghanghu/pytorch-gve-lrcn/blob/master/models/pretrained_models.py