<a href="https://colab.research.google.com/github/vijjus/CV/blob/master/direct_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch.autograd import Variable
import numpy as np
import torch.nn.functional as F
import torchvision
from torchvision import transforms, datasets
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch import nn
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import urllib
import urllib.request
import cv2
import re
import os
import random
from PIL import Image
import timm
import datetime

In [None]:
from retinaface import create_retinaface, retina_scan

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [None]:
# Build the RetinaFace face detector and load its pretrained weights.
return_layers = {'layer2':1,'layer3':2,'layer4':3}
RetinaFace = create_retinaface(return_layers)

retina_dict = RetinaFace.state_dict()
# `device` is already a torch.device, so it can be handed to map_location as is.
pre_state_dict = torch.load("/s3/pretrained_weights/model.pt", map_location=device)

# Checkpoint keys carrying a 7-character "module." prefix (presumably written by
# an nn.DataParallel wrapper -- confirm) are renamed to the bare parameter name.
# The prefix is only removed when it is actually present, so a checkpoint saved
# without it is no longer mangled by a blind 7-character slice.
pretrained_dict = {
    (k[len('module.'):] if k.startswith('module.') else k): v
    for k, v in pre_state_dict.items()
}
# Keep only the entries the freshly built model knows about.
pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in retina_dict}
RetinaFace.load_state_dict(pretrained_dict)

RetinaFace = RetinaFace.to(device)
# The detector is only used for inference; eval() is also the cell's displayed value.
RetinaFace.eval()

RetinaFace(
  (body): IntermediateLayerGetter(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Seque

In [None]:
def extract_hsv(scans):
    """Average the per-channel means of a collection of 3-channel images.

    The three channels are reported as hue, saturation and value, in that
    order. No colour-space conversion happens here, so the images must already
    be HSV for the names to be meaningful
    (NOTE(review): confirm the colour space of the frames passed in).

    Args:
        scans: non-empty sequence of H x W x 3 image arrays.

    Returns:
        [hue, saturation, value]: each channel's mean, averaged over all scans
        and rounded to two decimals.

    Raises:
        ValueError: if `scans` is empty (previously an opaque
            ZeroDivisionError) or a scan is not an H x W x 3 array.
    """
    if len(scans) == 0:
        raise ValueError("extract_hsv() needs at least one scan")

    totals = np.zeros(3, dtype=np.float64)
    for scan in scans:
        pixels = np.asarray(scan)
        if pixels.ndim != 3 or pixels.shape[2] != 3:
            raise ValueError(
                "extract_hsv() expects H x W x 3 images, got shape {}".format(pixels.shape)
            )
        # One mean per channel; equivalent to splitting the channels and
        # averaging each one separately.
        totals += pixels.reshape(-1, 3).mean(axis=0, dtype=np.float64)

    count = len(scans)
    return [round(totals[0]/count, 2), round(totals[1]/count, 2), round(totals[2]/count, 2)]

In [None]:
#base_dir = '/s3/DFDC/dfdc_train_part_0/'

In [None]:
#files = [f for f in os.listdir(base_dir)]

In [None]:
#for f in files:
#  scans = retina_scan(RetinaFace, base_dir + f)
#  if len(scans) > 0:
#    hsv = extract_hsv(scans)
#    print("[{}]: Hue: {}, Saturation: {}, Value: {}".format(f, hsv[0], hsv[1], hsv[2]))

In [None]:
#scans = retina_scan(RetinaFace, '/s3/DFDC/dfdc_train_part_0/acdkfksyev.mp4')
#plt.imshow(scans[0])

In [None]:
#scans = retina_scan(RetinaFace, '/s3/DFDC/dfdc_train_part_0/abhggqdift.mp4')
#plt.imshow(scans[0])

In [None]:
# Phase names; their order also fixes each phase's index in `dataset` and `dataloader`.
phases = ['train', 'val']

In [None]:
# Per-phase image pipelines applied to each PIL frame before batching.
# Both phases end with 224x224 tensors normalised with the same channel
# mean/std (the values conventionally used with ImageNet-pretrained
# torchvision models).
data_transforms = {
    # Training: random scale/aspect crop plus horizontal flip as augmentation.
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    # Validation: deterministic resize followed by a centre crop.
    'val': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

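Dataset: takes a dictionary keyed by video name, whose values carry the video's label and folder. Each item returned by the dataset is a single frame (stacking several frames per item is the alternative design), so the size of the dataset is the number of videos multiplied by the number of frames sampled per video.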

In [None]:
import pandas as pd

In [None]:
# Directory that holds the .mp4 videos and their metadata.json.
base_dir = '/s3/DFDC/dfdc_train_part_0/'

In [None]:
def get_meta_from_json(path):
    """Read `metadata.json` from the directory `path` and return it transposed.

    The transpose turns the top-level JSON keys into the row index of the
    returned DataFrame.
    """
    metadata_file = os.path.join(path, "metadata.json")
    return pd.read_json(metadata_file).T

In [None]:
def get_label(df, vid):
    """Return the integer class of video `vid`: 0 for 'FAKE', 1 for anything else.

    `df` is indexed by video name and exposes a `label` column.
    """
    return 0 if df.loc[vid].label == 'FAKE' else 1

In [None]:
# List the name of every .mp4 file in base_dir (matched on the text after the last dot).
files = [f for f in os.listdir(base_dir) if f.split('.')[-1] == 'mp4']

In [None]:
# Partition a directory's videos into real and fake lists; the train/val split
# is done separately on the two returned lists.
def split_by_label(video_dir, video_files=None):
    """Group the videos of `video_dir` by their label in its metadata.json.

    Args:
        video_dir: directory containing the videos and `metadata.json`.
        video_files: optional iterable of video file names to classify. When
            omitted, every `.mp4` file found in `video_dir` is used, so the
            function no longer depends on a notebook-global file list that may
            belong to a different directory.

    Returns:
        (real_videos, fake_videos): two lists of `(video_dir, file_name, label)`
        tuples, with label 1 for real and 0 for fake videos.
    """
    if video_files is None:
        video_files = [f for f in os.listdir(video_dir) if f.split('.')[-1] == 'mp4']
    df = get_meta_from_json(video_dir)
    real_videos = []
    fake_videos = []
    for f in video_files:
        label = get_label(df, f)
        if label == 1:
            real_videos.append((video_dir, f, label))
        else:
            fake_videos.append((video_dir, f, label))
    return real_videos, fake_videos

In [None]:
# Partition the videos into real (label 1) and fake (label 0) entries.
reals, fakes = split_by_label(base_dir)

In [None]:
len(reals), len(fakes)

(72, 1041)

In [None]:
from random import shuffle

In [None]:
# Training split: the first 70% of each class, sliced in list order before any
# shuffling, so both classes contribute the same fraction.
train_videos = reals[:int(0.7*len(reals))] + fakes[:int(0.7*len(fakes))]
print(len(train_videos))
# In-place shuffle that only mixes the order of real and fake entries. No
# seeding of `random` is visible before this cell, so the order may differ
# between runs.
shuffle(train_videos)
train_videos[:10]

778


[('/s3/DFDC/dfdc_train_part_0/', 'npneqyjgiq.mp4', 0),
 ('/s3/DFDC/dfdc_train_part_0/', 'ojayvjcdna.mp4', 0),
 ('/s3/DFDC/dfdc_train_part_0/', 'gbatfzbsjs.mp4', 0),
 ('/s3/DFDC/dfdc_train_part_0/', 'fmeptfderu.mp4', 0),
 ('/s3/DFDC/dfdc_train_part_0/', 'lrpjtzwsdt.mp4', 0),
 ('/s3/DFDC/dfdc_train_part_0/', 'oobmorozgk.mp4', 0),
 ('/s3/DFDC/dfdc_train_part_0/', 'jqlsethxvz.mp4', 0),
 ('/s3/DFDC/dfdc_train_part_0/', 'dibwmyrpql.mp4', 0),
 ('/s3/DFDC/dfdc_train_part_0/', 'avoqheikrk.mp4', 0),
 ('/s3/DFDC/dfdc_train_part_0/', 'gfoozygynw.mp4', 0)]

In [None]:
# Validation split: the remaining 30% of each class (the complement of the
# slices used for training).
val_videos = reals[int(0.7*len(reals)):] + fakes[int(0.7*len(fakes)):]
print(len(val_videos))
# In-place shuffle that only changes the order of the entries.
shuffle(val_videos)
val_videos[:10]

335


[('/s3/DFDC/dfdc_train_part_0/', 'vopokawkip.mp4', 0),
 ('/s3/DFDC/dfdc_train_part_0/', 'wcqvzujamg.mp4', 1),
 ('/s3/DFDC/dfdc_train_part_0/', 'uvfkppqsjy.mp4', 0),
 ('/s3/DFDC/dfdc_train_part_0/', 'rqlpmeyhqh.mp4', 0),
 ('/s3/DFDC/dfdc_train_part_0/', 'qusnfjluuh.mp4', 0),
 ('/s3/DFDC/dfdc_train_part_0/', 'rhvsxaewcr.mp4', 0),
 ('/s3/DFDC/dfdc_train_part_0/', 'ujvrrbmmye.mp4', 0),
 ('/s3/DFDC/dfdc_train_part_0/', 'rtkhyvbcjg.mp4', 0),
 ('/s3/DFDC/dfdc_train_part_0/', 'voawxrmqyl.mp4', 0),
 ('/s3/DFDC/dfdc_train_part_0/', 'vtunvalyji.mp4', 1)]

In [None]:
# Build the {video name: (label, directory)} mapping that seeds the Dataset.
def create_ds_dict(videos):
    """Turn `(directory, file_name, label)` tuples into a dict keyed by file name.

    Each value is the pair `(label, directory)`. When a file name occurs more
    than once, the last occurrence wins.
    """
    return {name: (label, folder) for folder, name, label in videos}

In [None]:
# Per-phase {video name: (label, directory)} dictionaries, keyed by phase name.
ds_dict = {
    'train': create_ds_dict(train_videos),
    'val': create_ds_dict(val_videos)
}

In [None]:
class DFDC(Dataset):
    """Frame-level dataset over face scans extracted from videos.

    Every video contributes `fpv` items: item `i` is frame `i % fpv` of video
    `i // fpv`, so the dataset length is `len(video_dict) * fpv`.

    Args:
        video_dict: mapping of video file name -> `(label, directory)`.
        fcmodel: face-detection model handed to `retina_scan`.
        fpv: frames per video; also forwarded to `retina_scan` as its third
            argument (presumably the number of frames to extract -- confirm).
        transform: optional callable applied to the PIL image of each frame.
        cache_scans: when True, the scans of a video are kept in memory after
            the first extraction, so the other frames of that video and later
            epochs do not re-run face extraction on the whole video. Off by
            default, which keeps the original behaviour of re-scanning the
            video on every item. Only enable it if `retina_scan` returns the
            same frames on every call and the scans fit in memory.
    """

    def __init__(self, video_dict, fcmodel, fpv, transform = None, cache_scans = False):
        self.vdict = video_dict
        self.fcmodel = fcmodel
        self.transform = transform
        self.fpv = fpv
        self.videos = list(self.vdict.keys())
        # None disables caching; otherwise maps video name -> list of scans.
        self._scan_cache = {} if cache_scans else None

    def _get_scans(self, vid, directory):
        """Return the face scans of `vid`, reusing cached ones when enabled."""
        if self._scan_cache is not None and vid in self._scan_cache:
            return self._scan_cache[vid]
        # os.path.join also works when `directory` has no trailing separator.
        vid_path = os.path.join(directory, vid)
        scans = retina_scan(self.fcmodel, vid_path, self.fpv)
        if self._scan_cache is not None:
            self._scan_cache[vid] = scans
        return scans

    def __getitem__(self, i):
        """Return `(frame, label)` for flat index `i`; label is a long tensor."""
        j, k = divmod(i, self.fpv)
        vid = self.videos[j]
        label = self.vdict[vid][0]
        d = self.vdict[vid][1]
        scans = self._get_scans(vid, d)

        try:
            frame = scans[k]
        except IndexError:
            # Fewer than k+1 scans came back for this video: log it and fall
            # back to a random-noise image so the batch keeps its shape.
            # NOTE(review): the noise frame still carries the video's label.
            print(vid, k)
            frame = np.random.randint(0, 256, (256,256,3))

        frame = Image.fromarray(np.uint8(frame)).convert('RGB')
        if self.transform:
            frame = self.transform(frame)
        return frame, torch.tensor(label, dtype=torch.long)

    def __len__(self):
        return len(self.vdict)*self.fpv

In [None]:
# Hyperparameters. The first four share their names with DFDCNet's constructor
# arguments, but no visible cell passes them to it.
input_size = 512
output_size = 1
hidden_dim = 512
n_layers = 2
batchsize = 8   # DataLoader batch size; train() skips batches of any other size
fpv = 2         # frames taken per video by the DFDC dataset

In [None]:
def make_dataset(phase):
    """Build the DFDC dataset for `phase` ('train' or 'val').

    Prints the number of videos in the phase and the transform pipeline that
    will be applied, then returns the dataset.
    """
    phase_videos = ds_dict[phase]
    print(len(phase_videos))
    phase_transform = data_transforms[phase]
    print(phase_transform)
    return DFDC(
        video_dict=phase_videos,
        fcmodel=RetinaFace,
        fpv=fpv,
        transform=phase_transform,
    )

In [None]:
# One dataset per phase, in the same order as `phases` (index 0 = train, 1 = val).
dataset = [make_dataset(x) for x in phases]

778
Compose(
    RandomResizedCrop(size=(224, 224), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=PIL.Image.BILINEAR)
    RandomHorizontalFlip(p=0.5)
    ToTensor()
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
)
335
Compose(
    Resize(size=256, interpolation=PIL.Image.BILINEAR)
    CenterCrop(size=(224, 224))
    ToTensor()
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
)


In [None]:
len(dataset[0]), len(dataset[1])

(1556, 670)

In [None]:
# One loader per phase, index-aligned with `dataset`; both loaders shuffle,
# including the validation one.
dataloader = [DataLoader(d, batch_size = batchsize, shuffle = True) for d in dataset]

In [None]:
dataloader

[<torch.utils.data.dataloader.DataLoader at 0x7f6c42113f60>,
 <torch.utils.data.dataloader.DataLoader at 0x7f6c42113470>]

In [None]:
def set_seed_everywhere(seed, cuda):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: integer seed shared by all generators.
        cuda: whether to also seed every CUDA device. Accepts a boolean or a
            `torch.device`; a device only counts when its type is 'cuda'
            (a device object on its own is always truthy, even for the CPU).
    """
    # The standard-library generator drives `random.shuffle` and friends.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if isinstance(cuda, torch.device):
        cuda = cuda.type == 'cuda'
    if cuda:
        torch.cuda.manual_seed_all(seed)

In [None]:
# The second argument is a boolean "seed CUDA too" flag. Passing the device
# object itself would always be truthy, so derive the flag from its type.
set_seed_everywhere(1337, device.type == 'cuda')

In [None]:
# DenseNet-121 backbone initialised with torchvision's pretrained weights.
net = torchvision.models.densenet121(pretrained=True)

In [None]:
# Replace the classifier head with a 2-way linear layer (class 0 = FAKE,
# class 1 = REAL, following get_label).
num_ftrs = net.classifier.in_features
net.classifier = nn.Linear(num_ftrs, 2)

In [None]:
# Move the model's parameters and buffers to the selected device.
net = net.to(device)

In [None]:
class Identity(nn.Module):
    """Pass-through layer: `forward` hands its input back untouched.

    Handy as a drop-in replacement for a layer that should be disabled.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        return x

In [None]:
class DFDCNet(nn.Module):
    """Per-frame CNN features fed through an LSTM and a small classifier head.

    Args:
        input_size: accepted but not used; the LSTM input width is fixed at
            1536 (the feature width noted for the MixNet backbone below).
        output_size: width of the final linear layer.
        hidden_dim: LSTM hidden size.
        n_layers: number of stacked LSTM layers.
        drop_prob: dropout applied between LSTM layers. The dropout in front
            of `fc1` is fixed at 0.5 regardless of this value.
    """

    def __init__(self, input_size, output_size, hidden_dim, n_layers, drop_prob=0.5):
        super().__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        # use mixnet_s, remove the final Linear layer (1536 -> 1000)
        self.mixnet = timm.create_model("mixnet_s", pretrained=True)
        self.mixnet.classifier = Identity()

        self.lstm = nn.LSTM(1536, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(0.5)
        # Registered but not applied in forward(); kept so the module's
        # state_dict keys stay unchanged.
        self.batchnorm = nn.BatchNorm1d(hidden_dim)
        self.elu = nn.ELU()
        self.fc1 = nn.Linear(hidden_dim, 32)
        self.fc4 = nn.Linear(32, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, hidden):
        """Score a batch of frame sequences.

        Args:
            x: 5-D tensor unpacked as (batch, seqlen, channels, height, width).
            hidden: LSTM state, e.g. from `init_hidden(batch)`.

        Returns:
            (out, hidden): `out` has one sigmoid score per sequence (shape
            `(batch,)`), taken from the last column after the per-step outputs
            are flattened per sequence; `hidden` is the updated LSTM state.
        """
        batch_size, seqlen, c, h, w = x.size()
        # Fold time into the batch axis so the CNN sees plain image batches.
        x = x.reshape(batch_size*seqlen, c, h, w).float()
        x = self.mixnet(x)
        x = x.reshape(batch_size, seqlen, x.shape[1])
        lstm_out, hidden = self.lstm(x, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)

        out = self.dropout(lstm_out)
        out = self.fc1(out)
        out = self.elu(out)
        out = self.fc4(out)
        out = self.sigmoid(out)

        out = out.view(batch_size, -1)
        out = out[:,-1]
        return out, hidden

    def init_hidden(self, batch_size):
        """Return zero-filled `(h0, c0)` LSTM states for `batch_size` sequences.

        The tensors take the dtype and device of the model's own parameters,
        so they always match the LSTM even if the model lives on a different
        device than the notebook-level `device` variable.
        """
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_dim)
        return (weight.new_zeros(shape), weight.new_zeros(shape))

In [None]:
# Switch the network to training mode; the returned module is echoed as the cell output.
net.train()

DenseNet(
  (features): Sequential(
    (conv0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (norm0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu0): ReLU(inplace=True)
    (pool0): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (denseblock1): _DenseBlock(
      (denselayer1): _DenseLayer(
        (norm1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace=True)
        (conv1): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu2): ReLU(inplace=True)
        (conv2): Conv2d(128, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      )
      (denselayer2): _DenseLayer(
        (norm1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu

In [None]:
# Cross-entropy over the raw logits of the 2-way classifier head.
criterion = nn.CrossEntropyLoss()

In [None]:
# Adam over every parameter of `net` (backbone and classifier head alike).
optimizer = optim.Adam(net.parameters(), lr=0.001)

In [None]:
from torch.optim.lr_scheduler import StepLR
# Multiplies the learning rate by 0.1 after every 7 calls to scheduler.step().
scheduler = StepLR(optimizer, step_size=7, gamma=0.1)

In [None]:
# Pull a single batch from the training loader for a quick smoke test.
for data in dataloader[0]:
    X, y = data
    break

In [None]:
# Forward the sample batch through the network (one pair of logits per image).
outputs = net(X.to(device))

In [None]:
outputs

tensor([[-0.3621, -0.5597],
        [-0.2211,  0.2696],
        [-0.2365,  0.4368],
        [-0.2891,  0.9529],
        [ 0.1304,  0.4506],
        [-0.2733,  0.5774],
        [ 0.0156, -0.1525],
        [ 0.0904,  0.3849]], device='cuda:0', grad_fn=<AddmmBackward>)

In [None]:
# Loss of the sample batch, as a sanity check of the criterion and label format.
l = criterion(outputs, y.to(device))

In [None]:
l

tensor(0.8053, device='cuda:0', grad_fn=<NllLossBackward>)

In [None]:
def train(num_epochs=20):
    """Train and validate the notebook-level `net`, printing each phase's loss.

    Relies on the notebook globals `net`, `phases`, `dataloader`, `batchsize`,
    `device`, `criterion`, `optimizer` and `scheduler`.

    Args:
        num_epochs: number of passes over the phases (default 20, the value
            that used to be hard-coded).

    Differences from the previous version:
      * the learning-rate scheduler advances once per training epoch instead
        of once per batch (in both phases), which used to shrink the learning
        rate by 10x every 7 batches;
      * the loss is reported for every phase, not only for the last one;
      * validation runs without building autograd graphs;
      * a phase that yields no full batch no longer divides by zero.
    """
    for epoch in range(1, num_epochs + 1):
        for phase in phases:
            is_train = phase == 'train'
            if is_train:
                net.train()
            else:
                net.eval()

            dataset_size = 0
            running_loss = 0

            # Gradients are only needed while training.
            with torch.set_grad_enabled(is_train):
                # begin processing batch
                for data in dataloader[phases.index(phase)]:
                    X, labels = data
                    # Incomplete batches (the tail of the loader) are skipped.
                    if (len(labels) != batchsize):
                        continue
                    if is_train:
                        net.zero_grad()
                    outputs = net(X.to(device))
                    loss = criterion(outputs, labels.to(device))
                    dataset_size += len(labels)
                    # Weight the batch-mean loss by batch size so the final
                    # figure is a per-sample average.
                    running_loss += loss.item() * len(labels)
                    if is_train:
                        loss.backward()
                        optimizer.step()
                    torch.cuda.empty_cache()

            # One scheduler step per training epoch, after the optimizer updates.
            if is_train:
                scheduler.step()

            if dataset_size:
                print("Epoch " + str(epoch) + " Phase: ", phase, " Loss: ", round(running_loss/dataset_size, 2))
            else:
                print("Epoch " + str(epoch) + " Phase: ", phase, " Loss: ", "n/a (no full batch)")

In [None]:
# Run the training loop; progress is printed by train() itself.
train()

wfbqqomqmm.mp4 0
wfbqqomqmm.mp4 1
Epoch 1 Phase:  val  Loss:  0.31
wfbqqomqmm.mp4 1
wfbqqomqmm.mp4 0
Epoch 2 Phase:  val  Loss:  0.32
