## 라이브러리 IMPORT

In [5]:
import random
import pandas as pd
import numpy as np
import os
import cv2

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

#import albumentations as A
#from albumentations.pytorch.transforms import ToTensorV2
import torchvision.models as models

from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings(action='ignore') 

In [7]:
CFG = {
    'FPS':30,
    'IMG_SIZE':32,
    'EPOCHS':10,
    'LEARNING_RATE':3e-4,
    'BATCH_SIZE':4,
    'SEED':41
}

# 모델 생성 Facebook의 Timesformer

In [8]:
from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification
from transformers import AutoImageProcessor, TimesformerForVideoClassification

model = TimesformerForVideoClassification.from_pretrained("facebook/timesformer-base-finetuned-ssv2",num_labels=5,num_frames=30,ignore_mismatched_sizes=True)
feature_extractor = AutoImageProcessor.from_pretrained("facebook/timesformer-base-finetuned-ssv2")


Some weights of TimesformerForVideoClassification were not initialized from the model checkpoint at facebook/timesformer-base-finetuned-ssv2 and are newly initialized because the shapes did not match:
- timesformer.embeddings.time_embeddings: found shape torch.Size([1, 8, 768]) in the checkpoint and torch.Size([1, 30, 768]) in the model instantiated
- classifier.weight: found shape torch.Size([174, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([174]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

## train.csv 파일을 읽어온다

In [32]:
df = pd.read_csv('./train.csv')

In [33]:
from sklearn.preprocessing import OneHotEncoder
l = df['label']
ohe = OneHotEncoder(sparse=False)
# fit_transform은 train에만 사용하고 test에는 학습된 인코더에 fit만 해야한다
train_cat = ohe.fit_transform(df[['label']])
df

Unnamed: 0,id,path,label
0,TRAIN_000,./train/TRAIN_000.mp4,3
1,TRAIN_001,./train/TRAIN_001.mp4,0
2,TRAIN_002,./train/TRAIN_002.mp4,1
3,TRAIN_003,./train/TRAIN_003.mp4,4
4,TRAIN_004,./train/TRAIN_004.mp4,4
...,...,...,...
605,TRAIN_605,./train/TRAIN_605.mp4,0
606,TRAIN_606,./train/TRAIN_606.mp4,2
607,TRAIN_607,./train/TRAIN_607.mp4,1
608,TRAIN_608,./train/TRAIN_608.mp4,4


## VAL과 TRAIN 데이터셋 0.1 사이즈로 분할

In [34]:
train, val, _, _ = train_test_split(df, df['label'], test_size=0.1, random_state=CFG['SEED'])

# Custom 데이터셋 정의

In [35]:


class CustomDataset(Dataset):
    def __init__(self, video_path_list, label_list):
        self.video_path_list = video_path_list
        self.label_list = label_list
        
    def __getitem__(self, index):
        frames = list(np.array((self.get_video(self.video_path_list[index]))))
        #print(frames)
        #video = list(np.random.randn(16, 3, 224, 224))
        #print('gsdfsdfdsafdsafadf')
        #print(video)
        pixel_values = feature_extractor(frames,return_tensor="pt").pixel_values
        if self.label_list is not None:
            cell = [0] *5
            label = self.label_list[index]
            cell[label]+=1
            label = np.array(cell)
            label = label.astype(np.float32).tolist()
            #print(label)
            encoding = {"pixel_values" : torch.tensor(pixel_values).squeeze(),'labels': torch.tensor(label).squeeze()} 
            return encoding
        else:
            encoding = {"pixel_values" : torch.tensor(pixel_values).squeeze()}
            return encoding
        
    def __len__(self):
        return len(self.video_path_list)
    
    def get_video(self, path):
        frames = []
        cap = cv2.VideoCapture(path)
        for _ in range(CFG['FPS']):
            _, img = cap.read()
            #img = cv2.resize(img, (224, 224))
            #img = img / 255.
            frames.append(img)
        frames = np.array(tuple(frames))
        return torch.FloatTensor(np.array(frames)).permute(0, 3, 1, 2)

In [36]:
train_dataset = CustomDataset(train['path'].values, train['label'].values)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

val_dataset = CustomDataset(val['path'].values, val['label'].values)
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

# 훈련에 필요한 각종 파라미터 정의

In [43]:
from transformers import TrainingArguments, Trainer
batch_size = 2

args = TrainingArguments(
    output_dir = 'times',
    remove_unused_columns=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_steps=10,
    num_train_epochs=10,
    save_total_limit = 10,
    load_best_model_at_end=True,
    save_steps=275
    #max_steps=(train_dataset.num_videos // batch_size) * num_epochs,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [44]:
import evaluate

metric = evaluate.load("accuracy")

In [45]:
import numpy as np

In [46]:
def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions."""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)

# 훈련

In [47]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    #compute_metrics=compute_metrics,
    #data_collator=collate_fn,
)

In [48]:
trainer.train()

***** Running training *****
  Num examples = 549
  Num Epochs = 10
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 2750
  Number of trainable parameters = 121279493


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

# 훈련했던 모델의 가중치 파일과 전처리기를 불러옵니다

## 처음부터 가중치 파일로부터 추론할경우 아래의 model = TimesformerForVideoClassification.from_pretrained("E:/timesformer-base-ssv2/checkpoint-2750")에서 E:/timesformer-base-ssv2/checkpoint-2750를 경로에 맞게 바꾸시면 됩니다.

In [49]:
model = TimesformerForVideoClassification.from_pretrained("./times/checkpoint-2750")
feature_extractor = AutoImageProcessor.from_pretrained("facebook/timesformer-base-finetuned-ssv2")

loading configuration file ./times/checkpoint-2750\config.json
Model config TimesformerConfig {
  "_name_or_path": "facebook/timesformer-base-finetuned-ssv2",
  "architectures": [
    "TimesformerForVideoClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "attention_type": "divided_space_time",
  "drop_path_rate": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-06,
  "model_type": "timesformer",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_frames": 30,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "problem_type": "multi_label_classification",
  "qkv_bias": true,
  "torch_dtype": "float32",
  "transformers

# TESE.csv파일을 읽어와 추론을 합니다.

In [17]:
test = pd.read_csv('./test.csv')

In [18]:
test_dataset = CustomDataset(test['path'].values, None)
test_loader = DataLoader(test_dataset, batch_size = 1, shuffle=False, num_workers=0)

In [19]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.eval()
model.to(device)
predictions = []
with torch.no_grad():
    for batch_id, x in enumerate(tqdm(test_loader)):
        outputs = model(x['pixel_values'].to(device))
        logits = outputs.logits
        print(logits)
        probs  = logits.cpu().detach().numpy()
        class_ = np.argmax(probs)
        predictions.append(class_)

  0%|          | 0/153 [00:00<?, ?it/s]

tensor([[-5.4457,  4.6761, -6.3286, -5.2873, -4.9104]], device='cuda:0')
tensor([[-5.9229, -5.9610, -3.7331,  3.7620, -6.7892]], device='cuda:0')
tensor([[ 5.6294, -5.6800, -5.6347, -5.0933, -5.4685]], device='cuda:0')
tensor([[-5.9834, -5.1947,  4.6965, -4.7221, -6.8901]], device='cuda:0')
tensor([[-5.1755, -6.0655, -4.6566, -5.7257,  5.6133]], device='cuda:0')
tensor([[-5.1996, -5.6043,  5.7051, -5.9783, -4.9697]], device='cuda:0')
tensor([[-5.1315, -5.3775, -5.9836, -5.7376,  5.4706]], device='cuda:0')
tensor([[-6.2472, -6.2029, -4.8264,  4.4311, -5.5311]], device='cuda:0')
tensor([[-3.8584,  2.6701, -6.5978, -5.7971, -5.2993]], device='cuda:0')
tensor([[-6.2826, -6.4962,  0.1561, -2.1507, -5.1657]], device='cuda:0')
tensor([[-5.6307, -6.0634, -3.4440,  0.5440, -6.8746]], device='cuda:0')
tensor([[-6.2053, -6.0817, -2.0660,  1.8065, -6.7433]], device='cuda:0')
tensor([[ 4.9322, -5.3245, -5.6257, -5.7592, -5.3636]], device='cuda:0')
tensor([[-6.5837, -5.6309, -5.0798, -5.7058,  4.997

tensor([[-5.3840, -5.4612, -4.8333,  4.6005, -6.2425]], device='cuda:0')
tensor([[-5.8950,  4.8671, -5.3056, -4.6664, -6.3811]], device='cuda:0')
tensor([[-5.3992,  4.4882, -6.1350, -5.5738, -5.5669]], device='cuda:0')
tensor([[-4.5596, -3.8008,  1.2820, -4.4346, -8.0052]], device='cuda:0')
tensor([[-5.6932,  5.3402, -5.4417, -5.5394, -6.0742]], device='cuda:0')
tensor([[-5.3977,  6.1791, -5.2133, -5.7320, -5.4290]], device='cuda:0')
tensor([[-5.4625, -5.7001,  1.9833, -3.6620, -7.3164]], device='cuda:0')
tensor([[-5.8376, -4.3633, -4.0758, -4.9941, -3.3360]], device='cuda:0')
tensor([[-6.2549, -6.0618, -4.3074,  4.2416, -5.7237]], device='cuda:0')
tensor([[-4.9596,  4.5359, -5.9547, -4.9967, -5.1517]], device='cuda:0')
tensor([[-7.0433,  4.7003, -4.5260, -4.9491, -5.7864]], device='cuda:0')
tensor([[-6.0319, -6.3961, -3.9849,  3.1077, -6.4624]], device='cuda:0')
tensor([[-5.6153, -5.6684, -4.9411, -5.7821,  5.7118]], device='cuda:0')
tensor([[ 5.5260, -5.0707, -5.0350, -5.7433, -5.943

In [20]:
predictions

[1,
 3,
 0,
 2,
 4,
 2,
 4,
 3,
 1,
 2,
 3,
 3,
 0,
 4,
 1,
 4,
 3,
 1,
 4,
 1,
 2,
 2,
 3,
 3,
 2,
 3,
 1,
 4,
 4,
 1,
 3,
 1,
 0,
 4,
 3,
 4,
 2,
 3,
 2,
 0,
 2,
 4,
 4,
 3,
 3,
 4,
 4,
 0,
 4,
 1,
 3,
 4,
 4,
 0,
 1,
 3,
 4,
 2,
 0,
 4,
 0,
 3,
 4,
 2,
 4,
 2,
 1,
 0,
 2,
 4,
 3,
 1,
 3,
 3,
 3,
 0,
 4,
 4,
 3,
 0,
 1,
 4,
 3,
 1,
 2,
 2,
 4,
 3,
 0,
 2,
 1,
 2,
 2,
 0,
 4,
 1,
 4,
 0,
 2,
 3,
 2,
 4,
 1,
 4,
 0,
 2,
 0,
 0,
 3,
 1,
 0,
 1,
 4,
 3,
 1,
 1,
 2,
 1,
 1,
 2,
 4,
 3,
 1,
 1,
 3,
 4,
 0,
 2,
 1,
 4,
 0,
 0,
 4,
 2,
 4,
 4,
 0,
 2,
 3,
 0,
 0,
 0,
 4,
 1,
 3,
 1,
 2,
 2,
 2,
 4,
 0,
 1,
 2]

In [21]:
submit = pd.read_csv('./sample_submission.csv')

In [22]:
submit['label'] = predictions
submit.head()

Unnamed: 0,id,label
0,TEST_000,1
1,TEST_001,3
2,TEST_002,0
3,TEST_003,2
4,TEST_004,4


## 최종 csv파일 생성

In [23]:
submit.to_csv('./timesformer_submit_video-re.csv', index=False)