## Setup

In [18]:
import os 
import os.path as osp
import random
import math
import multiprocessing as mp
import sys
import glob

import cv2
import numpy as np
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchvision
import torchvision.transforms as transforms
import pytorch_lightning as pl

import timm

# Seed the global RNGs through Lightning so runs are repeatable.
pl.seed_everything(42)

# Switches for the one-off data preparation stages below; the guarded cells are skipped when False.
preprocess_data = False
calibrate_data = False

Seed set to 42


## Preprocess

In [19]:
# Root folder of the raw input videos.
RAW_PATH = "/mnt/elice/dataset"
# Root folder the cropped clips are written to and the datasets read from.
PREPROCESSED_PATH = "/home/elicer/data"

def preprocess_worker(path):
    """Crop one raw video to a fixed face box and save it as a 224x224 clip.

    The first frame in which the Haar cascade finds a face defines the crop
    box (largest detection, enlarged by ``padding_ratio``).  That box is then
    applied to the following frames; at most ``30 * 30`` frames are written.
    Frames consumed while searching for a face are not written.

    Args:
        path: ``(source_path, destination_path)`` pair.

    Returns:
        None.  Nothing is written when the destination already exists or when
        the source has no readable frame with a detectable face.
    """
    global face_classifier
    if "face_classifier" not in globals():
        # Built lazily so every pool worker process creates its detector once.
        face_classifier = cv2.CascadeClassifier(
            cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
        )

    src_path, dst_path = path

    # Checked before opening anything so a skipped file leaks no capture handle.
    if osp.isfile(dst_path):
        return

    cap = cv2.VideoCapture(src_path)
    output = None
    try:
        # Scan forward until a frame yields at least one non-empty detection.
        face_region = None
        while face_region is None:
            ret, frame = cap.read()
            if not ret:
                # Unreadable video or no face in any frame: skip it instead of
                # crashing on a None frame and leaving a broken output file.
                print(f"[preprocess] no usable face found, skipped: {src_path}")
                return

            frame = cv2.resize(frame, None, fx=1, fy=10/16)
            candidates = face_classifier.detectMultiScale(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY))
            area = [w * h for _, _, w, h in candidates]
            if area and max(area) > 0:
                face_region = candidates[area.index(max(area))]

        # Enlarge the box around its centre and clamp the top-left corner to the frame.
        padding_ratio = 1.2
        x, y, w, h = face_region
        center_x, center_y = x + w // 2, y + h // 2
        w, h = w * padding_ratio, h * padding_ratio
        x, y = max(center_x - w // 2, 0), max(center_y - h // 2, 0)

        # The writer is created only once a crop box is known, so failures
        # above never leave an empty file that later runs would treat as done.
        output = cv2.VideoWriter(
            dst_path,
            cv2.VideoWriter_fourcc('m', 'p', '4', 'v'),
            cap.get(cv2.CAP_PROP_FPS),
            (224, 224),
        )

        count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            frame = cv2.resize(frame, None, fx=1, fy=10/16)
            frame = frame[int(y):int(y + h), int(x):int(x + w)]
            frame = cv2.resize(frame, (224, 224))

            count += 1
            output.write(frame)

            if count == 30 * 30:
                break
    finally:
        cap.release()
        if output is not None:
            output.release()

def preprocess(data_type):
    """Run ``preprocess_worker`` over every raw video of one split.

    Args:
        data_type: Split folder name under ``RAW_PATH``.  For ``"train"`` the
            ``fake`` and ``real`` sub-folders are processed; any other split
            is treated as a flat folder of videos.
    """
    # Both classes are required downstream: the validation split, calibrate()
    # and CustomDataset all read "fake" as well as "real".
    candidates = ["fake", "real"] if data_type == "train" else ["."]

    # Leave two cores free, but never ask for fewer than one worker
    # (mp.Pool rejects a process count below 1).
    num_workers = max(1, mp.cpu_count() - 2)

    for label in candidates:
        raw_path = osp.join(RAW_PATH, data_type, label)
        preprocessed_path = osp.join(PREPROCESSED_PATH, data_type, label)
        os.makedirs(preprocessed_path, exist_ok=True)

        raw_paths = glob.glob(osp.join(raw_path, "*"))
        preprocess_paths = [osp.join(preprocessed_path, osp.basename(path)) for path in raw_paths]

        with mp.Pool(num_workers) as pool:
            # list() drains the lazy imap iterator so tqdm can report progress.
            list(tqdm(pool.imap(preprocess_worker, zip(raw_paths, preprocess_paths)), total=len(raw_paths)))

In [20]:
# One-off stage: crop the raw training videos (runs only when the flag is set).
if preprocess_data:
    preprocess("train")

In [21]:
# One-off stage: crop the raw test videos (runs only when the flag is set).
if preprocess_data:
    preprocess("test")

In [22]:
import shutil

if preprocess_data:
    # Hold out 10% of each class's preprocessed training clips as the validation split.
    # NOTE(review): glob order is filesystem-dependent and every re-run moves
    # another 10% of what is left in train/ - run this cell only once.
    for label in ["fake", "real"]:
        train_path = osp.join(PREPROCESSED_PATH, "train", label)
        val_path = osp.join(PREPROCESSED_PATH, "val", label)
        os.makedirs(val_path, exist_ok=True)

        train_paths = glob.glob(osp.join(train_path, "*"))
        val_paths = train_paths[: int(len(train_paths) * 0.1)]

        for path in val_paths:
            # Build the destination explicitly: str.replace("train", "val") would
            # also rewrite "train" inside a file name or a parent directory.
            shutil.move(path, osp.join(val_path, osp.basename(path)))

## Calibration

In [23]:
from facenet_pytorch import MTCNN


def calibrate_worker(path):
    """Re-crop one raw video around the face MTCNN finds in its first frame.

    The first detection returned for the first frame defines the crop box
    (enlarged by ``padding_ratio``); when nothing is detected a fixed
    ``[0, 0, 480, 480]`` box is used.  The box is applied to the following
    frames and at most ``30 * 30`` frames are written as 224x224 images.
    The first frame itself is only used for detection and is not written.

    Args:
        path: ``(source_path, destination_path)`` pair.

    Returns:
        None.  Nothing is written when the destination already exists or the
        source has no readable first frame.
    """
    # A dedicated global name: the Haar worker caches its detector as
    # ``face_classifier``, and that object has no ``detect`` method.
    global mtcnn_detector
    if "mtcnn_detector" not in globals():
        # Built lazily so every pool worker process creates its detector once.
        mtcnn_detector = MTCNN(device="cuda")

    src_path, dst_path = path

    # Checked before opening anything so a skipped file leaks no capture handle.
    if osp.isfile(dst_path):
        return

    cap = cv2.VideoCapture(src_path)
    output = None
    try:
        ret, frame = cap.read()
        if not ret:
            # Skip unreadable sources instead of crashing on a None frame and
            # leaving a broken output file behind.
            print(f"[calibrate] unreadable video, skipped: {src_path}")
            return

        frame = cv2.resize(frame, None, fx=1, fy=12/16)
        rgb_batch = np.expand_dims(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), axis=0)
        # detect(...)[0] holds the boxes per batch item; [0] selects the only image.
        boxes = mtcnn_detector.detect(rgb_batch)[0][0]
        face_region = [0, 0, 480, 480] if boxes is None else boxes[0]

        # Convert corner format to x/y/w/h, enlarge around the centre and
        # clamp the top-left corner to the frame.
        padding_ratio = 1.4
        x1, y1, x2, y2 = face_region
        x, y, w, h = x1, y1, x2 - x1, y2 - y1
        center_x, center_y = x + w // 2, y + h // 2
        w, h = w * padding_ratio, h * padding_ratio
        x, y = max(center_x - w // 2, 0), max(center_y - h // 2, 0)

        # Created only after a frame was read, so failures never leave an
        # empty file that later runs would treat as already processed.
        output = cv2.VideoWriter(
            dst_path,
            cv2.VideoWriter_fourcc('m', 'p', '4', 'v'),
            cap.get(cv2.CAP_PROP_FPS),
            (224, 224),
        )

        count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            frame = cv2.resize(frame, None, fx=1, fy=12/16)
            frame = frame[int(y):int(y + h), int(x):int(x + w)]
            frame = cv2.resize(frame, (224, 224))

            count += 1
            output.write(frame)

            if count == 30 * 30:
                break
    finally:
        cap.release()
        if output is not None:
            output.release()

def calibrate(data_type):
    """Re-crop the clips listed in the split's "fault" files with ``calibrate_worker``.

    The fault files are text files under ``PREPROCESSED_PATH`` with one video
    name (without the ``.mp4`` extension) per line.  Results are written to
    ``calibrate_<data_type>/<label>`` under ``PREPROCESSED_PATH``.

    Args:
        data_type: ``"train"``, ``"val"`` or ``"test"``.  Validation clips are
            read from the raw ``train`` folder; the test split has no label
            sub-folders.
    """
    candidates = ["fake", "real"] if data_type != "test" else ["."]

    # The raw data has no "val" folder: validation clips come from raw train.
    fault_folder = "train" if data_type == "val" else data_type

    # Leave two cores free, but never ask for fewer than one worker.
    # NOTE(review): every worker builds an MTCNN on CUDA; confirm the pool's
    # start method allows CUDA initialisation in child processes.
    num_workers = max(1, mp.cpu_count() - 2)

    for label in candidates:
        if data_type != "test":
            fault_file = osp.join(PREPROCESSED_PATH, f"fault_{data_type}_{label}.txt")
        else:
            fault_file = osp.join(PREPROCESSED_PATH, "fault_test.txt")

        with open(fault_file) as f:
            # Blank lines would otherwise turn into a bogus ".mp4" path.
            fault_names = [line.strip() for line in f if line.strip()]

        fault_paths = [osp.join(RAW_PATH, fault_folder, label, name + ".mp4") for name in fault_names]

        calibrate_path = osp.join(PREPROCESSED_PATH, f"calibrate_{data_type}", label)
        os.makedirs(calibrate_path, exist_ok=True)

        calibrate_paths = [osp.join(calibrate_path, osp.basename(path)) for path in fault_paths]

        with mp.Pool(num_workers) as pool:
            # list() drains the lazy imap iterator so tqdm can report progress.
            list(tqdm(pool.imap(calibrate_worker, zip(fault_paths, calibrate_paths)), total=len(fault_paths)))

In [24]:
# One-off stage: re-crop the training clips listed in the fault files.
if calibrate_data:
    calibrate("train")

In [25]:
if calibrate_data:
    # Move the re-cropped clips from calibrate_train/<label> into train/<label>.
    for label in ["real", "fake"]:
        calibrate_path = osp.join(PREPROCESSED_PATH, "calibrate_train", label)
        target_dir = osp.join(PREPROCESSED_PATH, "train", label)

        calibrate_paths = glob.glob(osp.join(calibrate_path, "*"))
        for path in calibrate_paths:
            # Destination is built from directory + file name rather than a
            # substring replace on the whole path, which could hit other parts of it.
            shutil.move(path, osp.join(target_dir, osp.basename(path)))

In [26]:
# One-off stage: re-crop the validation clips listed in the fault files.
if calibrate_data:
    calibrate("val")

In [27]:
if calibrate_data:
    # Move the re-cropped clips from calibrate_val/<label> into val/<label>.
    for label in ["real", "fake"]:
        calibrate_path = osp.join(PREPROCESSED_PATH, "calibrate_val", label)
        target_dir = osp.join(PREPROCESSED_PATH, "val", label)

        calibrate_paths = glob.glob(osp.join(calibrate_path, "*"))
        for path in calibrate_paths:
            # Destination is built from directory + file name rather than a
            # substring replace on the whole path, which could hit other parts of it.
            shutil.move(path, osp.join(target_dir, osp.basename(path)))

In [28]:
# One-off stage: re-crop the test clips listed in the fault file.
if calibrate_data:
    calibrate("test")

In [29]:
if calibrate_data:
    # calibrate("test") writes into "calibrate_test" (label "." adds no real
    # sub-folder), so that is the folder to collect from - not "calibrate".
    calibrate_path = osp.join(PREPROCESSED_PATH, "calibrate_test")
    test_path = osp.join(PREPROCESSED_PATH, "test")

    calibrate_paths = glob.glob(osp.join(calibrate_path, "*"))
    for path in calibrate_paths:
        # Explicit destination: replacing "calibrate" inside the path would
        # turn "calibrate_test" into "test_test".
        shutil.move(path, osp.join(test_path, osp.basename(path)))

## Dataset & Data Module

In [30]:
from torch.utils.data import Dataset
from torchvision.io import read_video



class CustomDataset(Dataset):
    """Preprocessed face clips; each item is a random subset of frames from a random window.

    Args:
        config: Mapping providing ``"num_timesteps"`` (frames returned per item).
        data_type: Split folder under ``PREPROCESSED_PATH``.  ``"test"`` is a
            flat folder; any other split has ``fake`` and ``real`` sub-folders.
        transform: Optional callable applied to the decoded clip (``TCHW`` layout).
    """

    def __init__(self, config, data_type, transform=None):
        super().__init__()

        self.config = config
        self.data_type = data_type
        self.transform = transform

        if self.data_type != "test":
            fake_paths = glob.glob(osp.join(PREPROCESSED_PATH, data_type, "fake", "*"))
            real_paths = glob.glob(osp.join(PREPROCESSED_PATH, data_type, "real", "*"))

            self.paths = fake_paths + real_paths
            # Label convention: 1 = fake, 0 = real.
            self.labels = [1] * len(fake_paths) + [0] * len(real_paths)

        else:
            # Sorted so the item order is deterministic.
            self.paths = sorted(glob.glob(osp.join(PREPROCESSED_PATH, data_type, "*")))
            # Placeholder labels; no label folders exist for this split.
            self.labels = [1] * len(self.paths)

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        """Return ``(frames, label)``; ``idx`` may be redirected to a longer clip."""
        idx, start_time, end_time = self._get_interval(idx)

        video = read_video(self.paths[idx], start_time, end_time, output_format="TCHW", pts_unit="sec")[0]
        label = torch.tensor(self.labels[idx])

        if self.transform:
            video = self.transform(video)

        # Random horizontal flip (width is the last axis in TCHW).
        if random.random() > 0.5:
            video = video.flip(dims=[3])

        if video.shape[0] == 0:
            raise ValueError(f"no frames decoded from {self.paths[idx]}")

        # NOTE(review): the sampled indices are in random order, so the frames
        # are not chronological - confirm this is intended for the temporal model.
        video = video[self._sample_frame_indices(video.shape[0], self.config["num_timesteps"])]
        return video, label

    @staticmethod
    def _sample_frame_indices(num_frames, num_timesteps):
        """Pick ``num_timesteps`` frame indices out of ``num_frames`` decoded frames.

        Indices are distinct when enough frames exist; a clip that is too
        short is padded by drawing with replacement instead of failing.

        Raises:
            ValueError: If ``num_frames`` is not positive.
        """
        if num_frames <= 0:
            raise ValueError("cannot sample frames from an empty clip")
        if num_frames >= num_timesteps:
            return random.sample(range(num_frames), num_timesteps)
        return random.choices(range(num_frames), k=num_timesteps)

    @staticmethod
    def _probe(path):
        """Return ``(frame_rate, frame_count)`` as reported by OpenCV for ``path``."""
        cap = cv2.VideoCapture(path)
        try:
            return cap.get(cv2.CAP_PROP_FPS), int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        finally:
            cap.release()

    def _get_interval(self, idx, alpha=1.8):
        """Choose a random time window of ``int(num_timesteps * alpha)`` frames.

        For non-test splits, clips that are not longer than the window are
        skipped by advancing to the next index (wrapping around).

        Returns:
            ``(idx, start_time, end_time)`` with times in seconds; ``idx`` is
            the index of the clip that was finally chosen.

        Raises:
            RuntimeError: If no clip in a non-test split is long enough.
            ValueError: If the chosen clip reports a non-positive frame rate.
        """
        window = int(self.config["num_timesteps"] * alpha)
        frame_rate, frame_count = self._probe(self.paths[idx])

        if self.data_type != "test":
            attempts = 0
            while frame_count - window <= 0:
                attempts += 1
                # Every clip has been tried once: stop instead of looping forever.
                if attempts >= len(self.paths):
                    raise RuntimeError(
                        f"no {self.data_type} clip has more than {window} frames"
                    )
                idx = (idx + 1) % len(self.paths)
                frame_rate, frame_count = self._probe(self.paths[idx])

        if frame_rate <= 0:
            raise ValueError(f"cannot read the frame rate of {self.paths[idx]}")

        # Test clips are never skipped, so a short one starts at frame 0
        # rather than handing randint an empty range.
        start_index = random.randint(0, max(frame_count - window, 0))
        start_time = start_index / frame_rate
        end_time = min(start_index + window, frame_count) / frame_rate

        return idx, start_time, end_time

In [31]:
from torch.utils.data import DataLoader
from torchvision import transforms


class CustomDataModule(pl.LightningDataModule):
    """Lightning data module that builds the train/val/test ``CustomDataset`` loaders.

    Args:
        config: Mapping with ``"batch_size"`` and ``"num_workers"``; it is
            also handed to every ``CustomDataset``.
    """

    def __init__(self, config):
        super().__init__()
        # Keep the mapping itself: setup() must pass *this* config to the
        # datasets rather than whatever global named ``config`` happens to exist.
        self.config = config
        self.batch_size = config["batch_size"]
        self.num_workers = config["num_workers"]

    def setup(self, stage=None):
        """Create all three datasets (``stage`` is ignored)."""
        transform = transforms.Compose([
            transforms.ConvertImageDtype(torch.float32),
            # The usual ImageNet channel statistics.
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

        self.train_dataset = CustomDataset(self.config, "train", transform=transform)
        self.val_dataset = CustomDataset(self.config, "val", transform=transform)
        self.test_dataset = CustomDataset(self.config, "test", transform=transform)

    def _make_loader(self, dataset, shuffle):
        """Build a DataLoader with the configured batch size and worker count."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=shuffle,
        )

    def train_dataloader(self):
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        return self._make_loader(self.val_dataset, shuffle=False)

    def test_dataloader(self):
        return self._make_loader(self.test_dataset, shuffle=False)


## Model

In [32]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 64):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x.permute(1, 0, 2)
        x = x + self.pe[:x.size(0)]
        x = x.permute(1, 0, 2)
        return self.dropout(x)


class TemporalTransformer(nn.Module):
    """Transformer encoder over per-frame features; returns the class-token output.

    Args:
        input_dim: Feature size of every sequence element.
        hidden_dim: Feed-forward width inside each encoder layer.
        num_heads: Attention heads per layer.
        num_layers: Number of stacked encoder layers.
    """

    def __init__(self, input_dim, hidden_dim, num_heads, num_layers):
        super().__init__()

        self.cls_token = nn.Parameter(torch.randn(1, 1, input_dim))
        self.pos_encoder = PositionalEncoding(input_dim)
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                input_dim,
                nhead=num_heads,
                dim_feedforward=hidden_dim,
                batch_first=True,
            ),
            num_layers=num_layers,
        )
        # Not used by forward(); it stays so the module's parameters are unchanged.
        self.classifier = nn.Linear(input_dim, 1)

    def forward(self, x, order):
        """Encode ``x`` after taking ``order`` successive differences along axis 1."""
        num_clips = x.shape[0]

        # Each pass replaces the sequence by the differences of neighbouring
        # elements, shortening it by one.
        for _ in range(order):
            later, earlier = x[:, 1:], x[:, :-1]
            x = later - earlier

        # Prepend the learned class token to every sequence in the batch.
        prefix = self.cls_token.expand(num_clips, -1, -1)
        tokens = self.pos_encoder(torch.cat((prefix, x), dim=1))

        encoded = self.transformer_encoder(tokens)
        return encoded[:, 0]

In [33]:
from collections import OrderedDict


class CustomModel(pl.LightningModule):
    """Per-frame EfficientNet-B2 features fed to a temporal transformer; one logit per clip.

    Args:
        feature_dim: Size of the per-frame feature vector.
        hidden_dim: Feed-forward width of the temporal transformer.
        nhead: Attention heads of the temporal transformer.
    """

    def __init__(self, feature_dim=256, hidden_dim=512, nhead=4):
        super().__init__()

        self.feature_extractor = timm.create_model(
            "efficientnet_b2", pretrained=True, num_classes=feature_dim
        )
        self.temporal_transformer = TemporalTransformer(
            feature_dim, hidden_dim, num_heads=nhead, num_layers=2
        )
        self.classifier = nn.Linear(feature_dim, 1)

    def forward(self, x):
        batch, steps, channels, height, width = x.shape

        # Fold the time axis into the batch axis so the backbone sees single frames.
        frames = x.view(batch * steps, channels, height, width)
        features = self.feature_extractor(frames).view(batch, steps, -1)

        # order=1: the transformer works on differences between neighbouring frames.
        clip_embedding = self.temporal_transformer(features, order=1)
        return self.classifier(clip_embedding)

    def training_step(self, batch, batch_idx):
        clips, targets = batch
        logits = self(clips).reshape(-1)

        loss = F.binary_cross_entropy_with_logits(logits, targets.float())
        self.log("train loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        clips, targets = batch
        is_fake = (torch.sigmoid(self(clips)) > 0.5).reshape(-1)

        # Fraction of clips in the batch whose thresholded prediction equals the label.
        accuracy = (is_fake == targets).float().mean()
        self.log("val score", accuracy, prog_bar=True)
        return accuracy

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=1e-5)
        # NOTE(review): MultiplicativeLR multiplies the *current* rate by the
        # returned factor, so 0.80 ** epoch compounds faster than a plain 0.8
        # decay per epoch - confirm this schedule is the intended one.
        scheduler = optim.lr_scheduler.MultiplicativeLR(
            optimizer=optimizer,
            lr_lambda=lambda epoch: 0.80 ** epoch,
        )
        return [optimizer], [scheduler]

## Training

In [34]:
# Settings read by the data module and the datasets.
config = dict(
    batch_size=12,     # clips per batch
    num_workers=6,     # DataLoader worker processes
    num_timesteps=16,  # frames sampled from every clip
)

In [36]:
datamodule = CustomDataModule(config)
model = CustomModel()
# Start from previously saved weights; the file is expected to hold a "state_dict" entry.
model.load_state_dict(torch.load("/home/elicer/weight/final.ckpt")["state_dict"])

# Selects the float32 matmul precision mode (see torch.set_float32_matmul_precision).
torch.set_float32_matmul_precision("medium")

In [37]:
# Five epochs of mixed-precision GPU training; validation is checked via
# val_check_interval=300 and metrics are logged every 10 steps.
trainer = pl.Trainer(
    max_epochs=5, 
    log_every_n_steps=10, 
    val_check_interval=300,
    precision="16-mixed",
    accelerator="gpu",
)

model.train()
trainer.fit(model, datamodule=datamodule)
# Alternative kept for reference: resume from an earlier Lightning checkpoint.
# trainer.fit(model, datamodule=datamodule, ckpt_path="/home/elicer/lightning_logs/version_116/checkpoints/epoch=1-step=825.ckpt")

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
2023-11-30 22:10:19.294251: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-30 22:10:19.434417: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-30 22:10:20.550101: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ebd17940] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ebd17940] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ebd17940] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ebd17940] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ee3c6c00] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ec1fce00] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ece4e100] moov atom not found


Validation: |          | 0/? [00:00<?, ?it/s]

[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ebd5f880] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ebdffbc0] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645eed7edc0] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645f6c11c40] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ec4b8ac0] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ed163080] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ee162200] moov atom not found


Validation: |          | 0/? [00:00<?, ?it/s]

[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ecf9ef40] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ecf9ef40] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645f0914dc0] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ee32a940] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ee32a940] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ee32a940] moov atom not found


Validation: |          | 0/? [00:00<?, ?it/s]

[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ec496480] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ed274040] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ed274040] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ec496480] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ed2cb100] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ecf9ef40] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ed3ba100] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ecfca580] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ed2c2080] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ee4fb200] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ee15d740] moov atom not found


Validation: |          | 0/? [00:00<?, ?it/s]

[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645f6ae75c0] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645f6ae75c0] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ed4b0340] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ee15d740] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ed270480] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ee329fc0] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ec4fb900] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ee329fc0] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ec4fb900] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ed2688c0] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ed2688c0] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ed2688c0] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ec5d6140] moov atom not found


Validation: |          | 0/? [00:00<?, ?it/s]

[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645f6ba9500] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ee5d6a00] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645ee28ce40] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645f6d515c0] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645f9436340] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5645f9436340] moov atom not found
`Trainer.fit` stopped: `max_epochs=5` reached.


In [38]:
import time

# The wall-clock timestamp doubles as the checkpoint's file name; the
# inference section reads ``current_time`` back to locate the file.
current_time = time.time()
checkpoint_file = osp.join("/home/elicer/weight", f"{current_time}.ckpt")
trainer.save_checkpoint(checkpoint_file, weights_only=True)

## Inference

In [39]:
import pandas as pd

# Submission template; its "label" column is filled in further below.
submission = pd.read_csv("sample_submission.csv")

In [40]:
# Fresh data module for inference; setup() is called by hand because no Trainer drives it here.
datamodule = CustomDataModule(config)
datamodule.setup()
test_dataloader = datamodule.test_dataloader()

# Fall back to the CPU when no GPU is visible.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [41]:
# Uses the checkpoint written by the training section of this session
# (requires ``current_time`` from the save cell).
model_name = current_time
# To evaluate an older checkpoint instead, set its file stem here:
# model_name = "1701366401.355616"

model = CustomModel.load_from_checkpoint(osp.join("/home/elicer/weight", f"{model_name}.ckpt")).to(device)
model.eval()

CustomModel(
  (feature_extractor): EfficientNet(
    (conv_stem): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn1): BatchNormAct2d(
      32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
      (drop): Identity()
      (act): SiLU(inplace=True)
    )
    (blocks): Sequential(
      (0): Sequential(
        (0): DepthwiseSeparableConv(
          (conv_dw): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (bn1): BatchNormAct2d(
            32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
            (drop): Identity()
            (act): SiLU(inplace=True)
          )
          (se): SqueezeExcite(
            (conv_reduce): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
            (act1): SiLU(inplace=True)
            (conv_expand): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
            (gate): Sigmoid()
          )
          (conv_pw): Conv2d(32, 16, kern

In [65]:
# Holds the combined test predictions produced by the inference loop below.
total_pred = 0

In [72]:
# Test-time augmentation: every pass over the test loader draws a new random
# window, flip and frame subset per video (see CustomDataset.__getitem__).
# The per-video probabilities are averaged over the passes.
num_passes = 23

# Accumulate locally and assign the mean at the end, so re-running this cell
# cannot silently stack extra passes onto an earlier result, and the 0.5
# threshold applied afterwards is compared against a probability, not a sum.
pred_sum = 0
for _ in range(num_passes):
    pred = []
    with torch.no_grad():
        for data, _ in tqdm(test_dataloader):
            temp = torch.sigmoid(model(data.to(device))).reshape(-1).cpu().numpy().tolist()
            pred += temp
    pred_sum += np.array(pred)

total_pred = pred_sum / num_passes

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

In [79]:
# Threshold the combined predictions; 1 marks a video as fake.
is_fake = total_pred > 0.50
pred = is_fake.astype(int)
# Number of videos flagged as fake.
print(pred.sum())

# Map the 0/1 flags to the label strings written to the submission.
pred = np.where(pred == 1, "fake", "real").tolist()
submission["label"] = pred

0


In [75]:
# NOTE(review): this writes to the same file the template was read from,
# so the original sample_submission.csv is overwritten.
submission.to_csv("sample_submission.csv", index=False)