# Video-multi-label-classification

## Data Download

In [None]:
!pip install numpy
!pip install pandas
!pip install decord
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu117
!pip install pytorch-lightning
!pip install pytorchvideo
!pip install scikit-learn
!pip install scikit-multilearn
!pip install segmentation-models-pytorch
!pip install transformers
!pip install einops
!pip install tqdm
!git clone https://github.com/ugiugi0823/DACON-Car-Crash-Analysis.git
!gdown "1npn0T-pMOKw4hu39p2gbgaVq0podIpT0&confirm=t"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting decord
  Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl (13.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m56.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: decord
Successfully installed decord-0.6.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/, https://download.pytorch.org/whl/cu117
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-lightning
  Downloading pytorch_lightning-1.9.2-py3-none-any.whl (826 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m826.2/826.2 KB[0m [31m12

In [None]:
# Work from the cloned repository root so the relative paths in `config`
# (./data, ./checkpoint, ./submission) resolve; presumably this is also where
# the local `sampler` and `apollo` modules imported below live -- confirm.
%cd DACON-Car-Crash-Analysis

/content/DACON-Car-Crash-Analysis


In [36]:
!mkdir data
!mkdir checkpoint
!mkdir submission

mkdir: cannot create directory ‘data’: File exists


In [None]:
import os
from zipfile import ZipFile
import glob


# Directory that holds the dataset archive `open.zip` (presumably the file
# fetched by gdown in the first cell -- confirm).
# NOTE(review): `dir` shadows the built-in dir(); the name is kept so that any
# cell relying on it keeps working.
dir = '/content/'

# Extraction target: the data folder inside the cloned repository.
base_dir = '/content/DACON-Car-Crash-Analysis/data'

# The context manager closes the archive handle as soon as extraction finishes.
with ZipFile(os.path.join(dir, 'open.zip')) as archive:
    archive.extractall(base_dir)

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl

from einops import rearrange
from decord import VideoReader
from sklearn.metrics import f1_score
from torch.utils.data import Dataset, DataLoader
from segmentation_models_pytorch.losses import FocalLoss
from transformers import AutoModel, AutoImageProcessor, AutoConfig
from skmultilearn.model_selection import iterative_train_test_split
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorchvideo.transforms.transforms_factory import create_video_transform

from sampler import MultilabelBalancedRandomSampler
from apollo import Apollo

In [37]:
# Central configuration shared by the data pipeline, the model and the trainer.
config = {
    # Global random seed passed to pl.seed_everything.
    "seed":2023,
    # Hugging Face checkpoint used for the backbone, its config and its image processor.
    "model_name":"facebook/timesformer-base-finetuned-k400",
    # Training batch size; the validation/test loaders use twice this value.
    "batch_size":3,
    "learning_rate":1e-5,
    # Paths relative to the repository root.
    "data_dir":'./data',
    "checkpoint_dir":'./checkpoint',
    "submission_dir":'./submission',
    # Number of classes of each of the four classification heads
    # (one linear head and one one-hot block is created per entry).
    "n_classes":(2,3,4,3),
    # Combined label -> four per-head sub-labels.
    # -1 is the placeholder assigned to the unlabeled test set.
    # NOTE(review): the four positions presumably encode crash / ego-involvement /
    # weather / timing as in the DACON task description -- confirm.
    "label_dict":{
        -1:[-1,-1,-1,-1],
        0:[0,0,0,0],
        1:[1,1,1,1],
        2:[1,1,1,2],
        3:[1,1,2,1],
        4:[1,1,2,2],
        5:[1,1,3,1],
        6:[1,1,3,2],
        7:[1,2,1,1],
        8:[1,2,1,2],
        9:[1,2,2,1],
        10:[1,2,2,2],
        11:[1,2,3,1],
        12:[1,2,3,2]
    },
    # Inverse of label_dict (without the -1 placeholder): tuple of per-head
    # predictions -> combined label. The model looks predictions up here and
    # falls back to label 0 for any combination that is not listed.
    "label_reverse_dict":{
        (0,0,0,0):0,
        (1,1,1,1):1,
        (1,1,1,2):2,
        (1,1,2,1):3,
        (1,1,2,2):4,
        (1,1,3,1):5,
        (1,1,3,2):6,
        (1,2,1,1):7,
        (1,2,1,2):8,
        (1,2,2,1):9,
        (1,2,2,2):10,
        (1,2,3,1):11,
        (1,2,3,2):12,
    }
}

In [38]:
import pprint

# Sanity check: print the class count of each classification head.
pprint.pprint(config['n_classes'])

(2, 3, 4, 3)


In [39]:
# Seed the random number generators through Lightning using the configured seed.
pl.seed_everything(config['seed'])

INFO:lightning_fabric.utilities.seed:Global seed set to 2023


2023

In [40]:
# Read the train and test metadata tables from the data directory.
train_df, test_df = (
    pd.read_csv(f"{config['data_dir']}/{split_name}.csv")
    for split_name in ('train', 'test')
)

In [41]:
# Replace each string sample id by the integer found in its second
# underscore-separated field.
train_df['sample_id'] = train_df['sample_id'].map(lambda raw_id: int(raw_id.split('_')[1]))
test_df['sample_id'] = test_df['sample_id'].map(lambda raw_id: int(raw_id.split('_')[1]))

In [42]:
# Re-root every video path under the data directory: drop the first character
# of the stored path and prepend config['data_dir'].
train_df['video_path'] = train_df['video_path'].map(lambda rel_path: f"{config['data_dir']}{rel_path[1:]}")
test_df['video_path'] = test_df['video_path'].map(lambda rel_path: f"{config['data_dir']}{rel_path[1:]}")

In [43]:
# The test set gets the placeholder label -1, which label_dict maps to
# [-1, -1, -1, -1], so that VideoDataset can read `label` and `label_split`
# columns from test rows just as it does from train rows.
test_df['label']=-1
test_df['label_split'] = test_df['label'].apply(config['label_dict'].get)

In [44]:
# Expand each combined label into its four per-head sub-labels (a label missing
# from label_dict would yield None), then collect them into one array with a
# row per sample for the one-hot encoding that follows.
train_df['label_split'] = train_df['label'].apply(config['label_dict'].get)
train_label_split = np.array(train_df['label_split'].tolist())

In [45]:
# One-hot encode every head separately (rows of an identity matrix selected by
# the sub-label) and join the blocks column-wise into a single multi-hot vector
# per sample; the result is also stored on the frame as a list column.
train_label_multi_hot = np.concatenate(
    [
        np.eye(n_class, dtype=np.int32)[train_label_split[:, head]]
        for head, n_class in enumerate(config['n_classes'])
    ],
    axis=1,
)
train_df['label_multi_hot'] = train_label_multi_hot.tolist()

In [46]:
# Preview the prepared training table (ids, paths, labels and encoded labels).
train_df

Unnamed: 0,sample_id,video_path,label,label_split,label_multi_hot
0,0,./data/train/TRAIN_0000.mp4,7,"[1, 2, 1, 1]","[0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0]"
1,1,./data/train/TRAIN_0001.mp4,7,"[1, 2, 1, 1]","[0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0]"
2,2,./data/train/TRAIN_0002.mp4,0,"[0, 0, 0, 0]","[1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0]"
3,3,./data/train/TRAIN_0003.mp4,0,"[0, 0, 0, 0]","[1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0]"
4,4,./data/train/TRAIN_0004.mp4,1,"[1, 1, 1, 1]","[0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0]"
...,...,...,...,...,...
2693,2693,./data/train/TRAIN_2693.mp4,3,"[1, 1, 2, 1]","[0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0]"
2694,2694,./data/train/TRAIN_2694.mp4,5,"[1, 1, 3, 1]","[0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0]"
2695,2695,./data/train/TRAIN_2695.mp4,0,"[0, 0, 0, 0]","[1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0]"
2696,2696,./data/train/TRAIN_2696.mp4,0,"[0, 0, 0, 0]","[1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0]"


In [47]:
# Hold out 20% of the training rows for validation, stratifying on the
# multi-hot labels. iterative_train_test_split returns
# (X_train, y_train, X_test, y_test); the y parts are discarded because the
# label columns already travel inside the row arrays.
train_df_for_dataset, _ , val_df_for_dataset, _  = iterative_train_test_split(X=train_df.values, y=train_label_multi_hot, test_size=0.2)
test_df_for_dataset = test_df.values

In [48]:
# Column 4 of the row array is `label_multi_hot`; rebuild it as a 2-D array
# for the balanced sampler.
train_multi_hot_for_sampler = np.array(train_df_for_dataset[:,4].tolist())

In [49]:
class VideoDataset(Dataset):
    """Dataset that decodes one video per item and returns it with its labels.

    Args:
        df_for_dataset: 2-D array of rows (e.g. ``DataFrame.values``) whose
            first four columns are, in order: sample id, video path, combined
            label, and the list of per-head sub-labels.
        transform: optional callable applied to the ``(c, t, h, w)`` video
            tensor before it is rearranged to ``(t, c, h, w)``.
        num_frames: number of leading frames to decode from each video.
            Defaults to 50, the value that used to be hard-coded. Videos with
            fewer frames are read in full instead of requesting frame indices
            that do not exist.
    """

    def __init__(self, df_for_dataset, transform=None, num_frames=50):
        self.sample_id = df_for_dataset[:,0]  # integer sample id
        self.video_path = df_for_dataset[:,1]  # path of the video file
        self.label = df_for_dataset[:,2]  # combined label (-1 placeholder on the test set)
        # Per-head sub-labels stacked into one 2-D array, one row per sample.
        self.label_split = np.array(df_for_dataset[:,3].tolist())
        self.transform = transform
        self.num_frames = num_frames

    def __len__(self):
        """Return the number of samples."""
        return len(self.sample_id)

    def __getitem__(self, idx):
        """Decode sample ``idx``.

        Returns:
            dict with keys ``sample_id``, ``video`` (tensor laid out as
            ``t c h w``), ``label`` and ``label_split``.
        """
        sample_id = self.sample_id[idx]
        video_path = self.video_path[idx]
        label = self.label[idx]
        label_split = self.label_split[idx]

        vr = VideoReader(video_path)
        # Never ask the reader for more frames than the video contains.
        n_read = min(self.num_frames, len(vr))
        video = torch.from_numpy(vr.get_batch(range(n_read)).asnumpy())
        # The decoded batch is (t, h, w, c); the transform is fed (c, t, h, w).
        video = rearrange(video, 't h w c -> c t h w')

        if self.transform:
            video = self.transform(video)
        # Hand the clip to the model with time first: (t, c, h, w).
        video = rearrange(video, 'c t h w -> t c h w')

        sample = {
            'sample_id':sample_id,
            'video':video,
            'label':label,
            'label_split':label_split
        }

        return sample

In [50]:
# Load the checkpoint's model config and image processor; the transform cell
# below reads num_frames, image_mean, image_std and crop_size from them.
# Note: despite its name, `image_processor_config` holds the object returned by
# AutoImageProcessor.from_pretrained, not a config.
model_config = AutoConfig.from_pretrained(config['model_name'])
image_processor_config = AutoImageProcessor.from_pretrained(config['model_name'])

In [51]:
# Build the train and val clip transforms from one shared recipe; only `mode`
# differs. The number of sampled frames comes from the model config, the
# normalisation statistics and crop size from the image processor.
train_transform, val_transform = (
    create_video_transform(
        mode=split_mode,
        num_samples=model_config.num_frames,
        video_mean=tuple(image_processor_config.image_mean),
        video_std=tuple(image_processor_config.image_std),
        crop_size=tuple(image_processor_config.crop_size.values()),
    )
    for split_mode in ('train', 'val')
)

In [52]:
# Wrap the row arrays in datasets. Training uses the train-mode transform;
# validation and test share the val-mode transform.
train_dataset = VideoDataset(train_df_for_dataset, transform=train_transform)
val_dataset = VideoDataset(val_df_for_dataset, transform=val_transform)
test_dataset = VideoDataset(test_df_for_dataset, transform=val_transform)

In [53]:
# Training indices are drawn by MultilabelBalancedRandomSampler (from the local
# `sampler` module) built on the multi-hot training labels. The validation and
# test loaders use no sampler and twice the training batch size.
train_sampler = MultilabelBalancedRandomSampler(train_multi_hot_for_sampler)
train_dataloader = DataLoader(train_dataset, batch_size= config['batch_size'], sampler=train_sampler)
val_dataloader = DataLoader(val_dataset, batch_size = config['batch_size']*2)
test_dataloader = DataLoader(test_dataset, batch_size = config['batch_size']*2)

In [54]:
class PLVideoModel(pl.LightningModule):
    """Pretrained video backbone with one linear classification head per sub-label.

    ``config`` must provide ``model_name``, ``learning_rate``, ``n_classes``
    (one entry per head) and ``label_reverse_dict`` (tuple of per-head class
    indices -> combined label).
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.learning_rate = config['learning_rate']
        self.model = AutoModel.from_pretrained(config['model_name'])
        # LazyLinear infers its input width on the first forward pass, so the
        # backbone's hidden size does not have to be spelled out here.
        self.classifiers = nn.ModuleList([
            nn.LazyLinear(n_class) for n_class in config['n_classes']
        ])
        self.loss = FocalLoss('multiclass')

    def forward(self, x):
        """Return a list with one logits tensor per head for the video batch ``x``."""
        # Average the backbone's last hidden state over dim 1 (presumably the
        # token axis -- confirm) to obtain one feature vector per clip.
        features = self.model(x).last_hidden_state.mean(dim=1)
        return [classifier(features) for classifier in self.classifiers]

    def training_step(self, batch, batch_idx):
        """Compute the focal loss averaged over all heads and log it as ``train_loss``."""
        y_hats = self.forward(batch["video"])
        label_split = batch["label_split"]
        # Column i of label_split is the target of head i.
        head_losses = [
            self.loss(y_hat, label_split[:, i]) for i, y_hat in enumerate(y_hats)
        ]
        loss = sum(head_losses) / len(head_losses)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Return the per-head logits followed by the combined labels of the batch."""
        y_hats = self.forward(batch["video"])
        return [*y_hats, batch['label']]

    def predict_step(self, batch, batch_idx):
        """Return the per-head logits of the batch."""
        return self.forward(batch["video"])

    def validation_epoch_end(self, step_outputs):
        """Log the macro F1 between combined labels and decoded predictions as ``val_score``."""
        # Every step output is [*head_logits, label]: split the two parts.
        label = torch.cat([step_output[-1] for step_output in step_outputs]).tolist()
        pred = self._decode_predictions(
            [step_output[:-1] for step_output in step_outputs]
        )

        score = f1_score(label, pred, average='macro')
        self.log("val_score", score)
        return score

    def post_preproc(self, step_outputs):
        """Convert the list returned by ``trainer.predict`` into combined labels.

        Args:
            step_outputs: one entry per batch, each a list of per-head logits.

        Returns:
            list with one combined label per sample.
        """
        return self._decode_predictions(step_outputs)

    def _decode_predictions(self, step_logits):
        """Map per-batch, per-head logits to one combined label per sample."""
        if not step_logits:
            return []
        # zip(*...) regroups the per-batch lists by head; each head's logits are
        # concatenated over batches and reduced to a class index.
        head_preds = [
            torch.cat(head_logits).argmax(1) for head_logits in zip(*step_logits)
        ]
        combos = torch.stack(head_preds, dim=1).tolist()
        # Combinations that are not a valid label fall back to label 0.
        reverse_dict = self.config['label_reverse_dict']
        return [reverse_dict.get(tuple(combo), 0) for combo in combos]

    def configure_optimizers(self):
        """Optimise all parameters with Apollo at the configured learning rate."""
        optimizer = Apollo(self.parameters(), lr=self.learning_rate)
        return [optimizer]

In [None]:
# Keep the checkpoint with the highest validation score.
# config["model_name"] contains a "/", so checkpoints end up in a "facebook/"
# sub-directory of checkpoint_dir (see the path used in the prediction cell).
checkpoint_callback = ModelCheckpoint(
    monitor='val_score',
    dirpath=config['checkpoint_dir'],
    filename=f'{config["model_name"]}'+'-{epoch:02d}-{train_loss:.4f}-{val_score:.4f}',
    mode='max'
)
# Stop once `train_loss` has not improved for 3 checks.
# NOTE(review): early stopping watches the training loss while checkpointing
# watches `val_score`; consider monitoring `val_score` here too -- confirm intent.
early_stop_callback = EarlyStopping(
    monitor="train_loss",
    patience=3,
    verbose=False,
    mode="min"
)

pl_video_model = PLVideoModel(config)

# Train with 16-bit precision for at most 100 epochs.
trainer = pl.Trainer(
    max_epochs=100,
    accelerator='auto', 
    precision=16,
    callbacks=[early_stop_callback, checkpoint_callback]
                    
)
trainer.fit(pl_video_model, train_dataloader, val_dataloader)

Some weights of the model checkpoint at facebook/timesformer-base-finetuned-k400 were not used when initializing TimesformerModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing TimesformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TimesformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit None Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.u

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

In [None]:
# Restore a trained model and run it on the test set.
# The checkpoint path is specific to one training run (epoch, loss and score are
# part of the file name) and must be updated after retraining.
# `config` is passed explicitly because PLVideoModel.__init__ requires it and
# the module does not call save_hyperparameters().
pl_video_model_pretrained = PLVideoModel.load_from_checkpoint(
    "./checkpoint/facebook/timesformer-base-finetuned-k400-epoch=08-train_loss=0.1318-val_score=0.5356.ckpt",
    config=config
)

# A fresh Trainer with default settings is enough for inference.
trainer = pl.Trainer(accelerator='auto')
pred = trainer.predict(pl_video_model_pretrained, test_dataloader)

Some weights of the model checkpoint at facebook/timesformer-base-finetuned-k400 were not used when initializing TimesformerModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing TimesformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TimesformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3070') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision fo

Predicting: 0it [00:00, ?it/s]

In [None]:
# Turn the per-batch head logits returned by trainer.predict into one combined
# label per test sample.
pred_post_proc = pl_video_model_pretrained.post_preproc(pred)

In [None]:
# Load the submission template (presumably one row per test sample, in the same
# order as test.csv -- confirm).
submit = pd.read_csv(f"{config['data_dir']}/sample_submission.csv")

In [None]:
# Fill in the predictions; this relies on the template rows being in the same
# order as the samples served by test_dataloader.
submit['label'] = pred_post_proc

In [None]:
# Write the submission file without the DataFrame index column.
submit.to_csv(f"{config['submission_dir']}/testsubmit.csv", index=False)