In [1]:
CUDA_LAUNCH_BLOCKING=1
import torch
import matplotlib.pyplot as plt
import os
from torchvision import transforms
import os.path
import numpy as np
from PIL import Image
from typing import List, Union, Tuple, Any

In [2]:
import os
import cv2
import torch
import matplotlib.pyplot as plt
from torchvision import transforms
from mpl_toolkits.axes_grid1 import ImageGrid
import os
import glob
from os.path import normpath, basename
from sklearn.model_selection import train_test_split
import time

start = time.time()

def check_mkdir(dir_name):
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
        
def create_frames_from_video(video_location, save_folder ,name_prefix='img', extension='jpg'):
    # Read the video from specified path
    cam = cv2.VideoCapture(video_location)
    currentframe = 1
    while(True):

        # reading from frame
        ret,frame = cam.read()
        if ret:
            # if video is still left continue creating images
            name= os.path.join(save_folder, f'{name_prefix}_{currentframe:05d}.{extension}')

            # writing the extracted images
            cv2.imwrite(name, frame)

            # increasing counter so that it will
            # show how many frames are created
            currentframe += 1
        else:
            break

    cam.release()
    cv2.destroyAllWindows()
    return currentframe

def plot_video(rows, cols, frame_list, plot_width, plot_height, title: str):
    fig = plt.figure(figsize=(plot_width, plot_height))
    grid = ImageGrid(fig, 111,  # similar to subplot(111)
                     nrows_ncols=(rows, cols),  # creates 2x2 grid of axes
                     axes_pad=0.3,  # pad between axes in inch.
                     )

    for index, (ax, im) in enumerate(zip(grid, frame_list)):
        # Iterating over the grid returns the Axes.
        ax.imshow(im)
        ax.set_title(index)
    plt.suptitle(title)
    plt.show()
    
def denormalize(video_tensor):
    """
    Undoes mean/standard deviation normalization, zero to one scaling,
    and channel rearrangement for a batch of images.
    args:
        video_tensor: a (FRAMES x CHANNELS x HEIGHT x WIDTH) tensor
    """
    inverse_normalize = transforms.Normalize(
        mean=[-0.485 / 0.229, -0.456 / 0.224, -0.406 / 0.225],
        std=[1 / 0.229, 1 / 0.224, 1 / 0.225]
    )
    return (inverse_normalize(video_tensor) * 255.).type(torch.uint8).permute(0, 2, 3, 1).numpy()

class ShiftWithChannelTensor:
    def __call__(self, data):
        return data.permute(1, 0, 2, 3).contiguous()

    def __repr__(self):
        return self.__class__.__name__ + '()'


In [3]:
'''MobilenetV2 in PyTorch.
See the paper "MobileNetV2: Inverted Residuals and Linear Bottlenecks" for more details.
'''
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

def conv_bn(inp, oup, stride):
    return nn.Sequential(
        nn.Conv3d(inp, oup, kernel_size=3, stride=stride, padding=(1,1,1), bias=False),
        nn.BatchNorm3d(oup),
        nn.ReLU6(inplace=True)
    )


def conv_1x1x1_bn(inp, oup):
    return nn.Sequential(
        nn.Conv3d(inp, oup, 1, 1, 0, bias=False),
        nn.BatchNorm3d(oup),
        nn.ReLU6(inplace=True)
    )


class InvertedResidual(nn.Module):
    def __init__(self, inp, oup, stride, expand_ratio):
        super(InvertedResidual, self).__init__()
        self.stride = stride

        hidden_dim = round(inp * expand_ratio)
        self.use_res_connect = self.stride == (1,1,1) and inp == oup

        if expand_ratio == 1:
            self.conv = nn.Sequential(
                # dw
                nn.Conv3d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
                nn.BatchNorm3d(hidden_dim),
                nn.ReLU6(inplace=True),
                # pw-linear
                nn.Conv3d(hidden_dim, oup, 1, 1, 0, bias=False),
                nn.BatchNorm3d(oup),
            )
        else:
            self.conv = nn.Sequential(
                # pw
                nn.Conv3d(inp, hidden_dim, 1, 1, 0, bias=False),
                nn.BatchNorm3d(hidden_dim),
                nn.ReLU6(inplace=True),
                # dw
                nn.Conv3d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
                nn.BatchNorm3d(hidden_dim),
                nn.ReLU6(inplace=True),
                # pw-linear
                nn.Conv3d(hidden_dim, oup, 1, 1, 0, bias=False),
                nn.BatchNorm3d(oup),
            )

    def forward(self, x):
        if self.use_res_connect:
            return x + self.conv(x)
        else:
            return self.conv(x)


class MobileNetV2(nn.Module):
    def __init__(self, num_classes=1000, sample_size=224, width_mult=1.):
        super(MobileNetV2, self).__init__()
        block = InvertedResidual
        input_channel = 32
        last_channel = 1280
        interverted_residual_setting = [
            # t, c, n, s
            [1,  16, 1, (1,1,1)],
            [6,  24, 2, (2,2,2)],
            [6,  32, 3, (2,2,2)],
            [6,  64, 4, (2,2,2)],
            [6,  96, 3, (1,1,1)],
            [6, 160, 3, (2,2,2)],
            [6, 320, 1, (1,1,1)],
        ]

        # building first layer
        assert sample_size % 16 == 0.
        input_channel = int(input_channel * width_mult)
        self.last_channel = int(last_channel * width_mult) if width_mult > 1.0 else last_channel
        self.features = [conv_bn(3, input_channel, (1,2,2))]
        # building inverted residual blocks
        for t, c, n, s in interverted_residual_setting:
            output_channel = int(c * width_mult)
            for i in range(n):
                stride = s if i == 0 else (1,1,1)
                self.features.append(block(input_channel, output_channel, stride, expand_ratio=t))
                input_channel = output_channel
        # building last several layers
        self.features.append(conv_1x1x1_bn(input_channel, self.last_channel))
        # make it nn.Sequential
        self.features = nn.Sequential(*self.features)

        # building classifier
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(self.last_channel, num_classes),
        )

        self._initialize_weights()

    def forward(self, x):
        x = self.features(x)
        x = F.avg_pool3d(x, x.data.size()[-3:])
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv3d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.kernel_size[2] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm3d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                n = m.weight.size(1)
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()

In [4]:
class ImglistToTensor(torch.nn.Module):
    """
    Converts a list of PIL images in the range [0,255] to a torch.FloatTensor
    of shape (NUM_IMAGES x CHANNELS x HEIGHT x WIDTH) in the range [0,1].
    Can be used as first transform for ``VideoFrameDataset``.
    """
    @staticmethod
    def forward(img_list: List[Image.Image]) -> 'torch.Tensor[NUM_IMAGES, CHANNELS, HEIGHT, WIDTH]':
        """
        Converts each PIL image in a list to
        a torch Tensor and stacks them into
        a single tensor.

        Args:
            img_list: list of PIL images.
        Returns:
            tensor of size ``NUM_IMAGES x CHANNELS x HEIGHT x WIDTH``
            
        
        """
        # print(type(img_list))
        # print(img_list)
        # print(np.array(img_list).shape)
        return torch.stack([transforms.functional.to_tensor(pic) for pic in img_list])
        # return torch.stack([transforms.functional.to_tensor(img_list)])

In [5]:
import os
import os.path
import numpy as np
from PIL import Image
from torchvision import transforms
import torch
from typing import List, Union, Tuple, Any


class VideoRecord(object):
    """
    Helper class for class VideoFrameDataset. This class
    represents a video sample's metadata.

    Args:
        root_datapath: the system path to the root folder
                       of the videos.
        row: A list with four or more elements where 1) The first
             element is the path to the video sample's frames excluding
             the root_datapath prefix 2) The  second element is the starting frame id of the video
             3) The third element is the inclusive ending frame id of the video
             4) The fourth element is the label index.
             5) any following elements are labels in the case of multi-label classification
    """
    def __init__(self, row, root_datapath):
        self._data = row
        self._path = os.path.join(root_datapath, row[0])


    @property
    def path(self) -> str:
        return self._path

    @property
    def num_frames(self) -> int:
        return self.end_frame - self.start_frame + 1  # +1 because end frame is inclusive
    @property
    def start_frame(self) -> int:
        return int(self._data[1])

    @property
    def end_frame(self) -> int:
        return int(self._data[2])

    @property
    def label(self) -> Union[int, List[int]]:
        # just one label_id
        if len(self._data) == 4:
            return int(self._data[3])
        # sample associated with multiple labels
        else:
            return [int(label_id) for label_id in self._data[3:]]

In [6]:
class VideoFrameDataset(torch.utils.data.Dataset):
    r"""
    A highly efficient and adaptable dataset class for videos.
    Instead of loading every frame of a video,
    loads x RGB frames of a video (sparse temporal sampling) and evenly
    chooses those frames from start to end of the video, returning
    a list of x PIL images or ``FRAMES x CHANNELS x HEIGHT x WIDTH``
    tensors where FRAMES=x if the ``ImglistToTensor()``
    transform is used.

    More specifically, the frame range [START_FRAME, END_FRAME] is divided into NUM_SEGMENTS
    segments and FRAMES_PER_SEGMENT consecutive frames are taken from each segment.

    Note:
        A demonstration of using this class can be seen
        in ``demo.py``
        https://github.com/RaivoKoot/Video-Dataset-Loading-Pytorch

    Note:
        This dataset broadly corresponds to the frame sampling technique
        introduced in ``Temporal Segment Networks`` at ECCV2016
        https://arxiv.org/abs/1608.00859.


    Note:
        This class relies on receiving video data in a structure where
        inside a ``ROOT_DATA`` folder, each video lies in its own folder,
        where each video folder contains the frames of the video as
        individual files with a naming convention such as
        img_001.jpg ... img_059.jpg.
        For enumeration and annotations, this class expects to receive
        the path to a .txt file where each video sample has a row with four
        (or more in the case of multi-label, see README on Github)
        space separated values:
        ``VIDEO_FOLDER_PATH     START_FRAME      END_FRAME      LABEL_INDEX``.
        ``VIDEO_FOLDER_PATH`` is expected to be the path of a video folder
        excluding the ``ROOT_DATA`` prefix. For example, ``ROOT_DATA`` might
        be ``home\data\datasetxyz\videos\``, inside of which a ``VIDEO_FOLDER_PATH``
        might be ``jumping\0052\`` or ``sample1\`` or ``00053\``.

    Args:
        root_path: The root path in which video folders lie.
                   this is ROOT_DATA from the description above.
        annotationfile_path: The .txt annotation file containing
                             one row per video sample as described above.
        num_segments: The number of segments the video should
                      be divided into to sample frames from.
        frames_per_segment: The number of frames that should
                            be loaded per segment. For each segment's
                            frame-range, a random start index or the
                            center is chosen, from which frames_per_segment
                            consecutive frames are loaded.
        imagefile_template: The image filename template that video frame files
                            have inside of their video folders as described above.
        transform: Transform pipeline that receives a list of PIL images/frames.
        test_mode: If True, frames are taken from the center of each
                   segment, instead of a random location in each segment.

    """
    def __init__(self,
                 root_path: str,
                 annotationfile_path: str,
                 num_segments: int = 3,
                 frames_per_segment: int = 1,
                 imagefile_template: str='img_{:05d}.jpg',
                 transform = None,
                 test_mode: bool = False):
        super(VideoFrameDataset, self).__init__()

        self.root_path = root_path
        self.annotationfile_path = annotationfile_path
        self.num_segments = num_segments
        self.frames_per_segment = frames_per_segment
        self.imagefile_template = imagefile_template
        self.transform = transform
        self.test_mode = test_mode

        self._parse_annotationfile()
        self._sanity_check_samples()

    def _load_image(self, directory: str, idx: int) -> Image.Image:
        return Image.open(os.path.join(directory, self.imagefile_template.format(idx))).convert('RGB')

    def _parse_annotationfile(self):
        self.video_list = [VideoRecord(x.strip().split(), self.root_path) for x in open(self.annotationfile_path)]

    def _sanity_check_samples(self):
        for record in self.video_list:
            if record.num_frames <= 0 or record.start_frame == record.end_frame:
                print(f"\nDataset Warning: video {record.path} seems to have zero RGB frames on disk!\n")

            elif record.num_frames < (self.num_segments * self.frames_per_segment):
                print(f"\nDataset Warning: video {record.path} has {record.num_frames} frames "
                      f"but the dataloader is set up to load "
                      f"(num_segments={self.num_segments})*(frames_per_segment={self.frames_per_segment})"
                      f"={self.num_segments * self.frames_per_segment} frames. Dataloader will throw an "
                      f"error when trying to load this video.\n")

    def _get_start_indices(self, record: VideoRecord) -> 'np.ndarray[int]':
        """
        For each segment, choose a start index from where frames
        are to be loaded from.

        Args:
            record: VideoRecord denoting a video sample.
        Returns:
            List of indices of where the frames of each
            segment are to be loaded from.
        """
        # choose start indices that are perfectly evenly spread across the video frames.
        if self.test_mode:
            distance_between_indices = (record.num_frames - self.frames_per_segment + 1) / float(self.num_segments)

            start_indices = np.array([int(distance_between_indices / 2.0 + distance_between_indices * x)
                                      for x in range(self.num_segments)])
        # randomly sample start indices that are approximately evenly spread across the video frames.
        else:
            max_valid_start_index = (record.num_frames - self.frames_per_segment + 1) // self.num_segments

            start_indices = np.multiply(list(range(self.num_segments)), max_valid_start_index) + \
                      np.random.randint(max_valid_start_index, size=self.num_segments)

        return start_indices

    def __getitem__(self, idx: int) -> Union[
        Tuple[List[Image.Image], Union[int, List[int]]],
        Tuple['torch.Tensor[num_frames, channels, height, width]', Union[int, List[int]]],
        Tuple[Any, Union[int, List[int]]],
        ]:
        """
        For video with id idx, loads self.NUM_SEGMENTS * self.FRAMES_PER_SEGMENT
        frames from evenly chosen locations across the video.

        Args:
            idx: Video sample index.
        Returns:
            A tuple of (video, label). Label is either a single
            integer or a list of integers in the case of multiple labels.
            Video is either 1) a list of PIL images if no transform is used
            2) a batch of shape (NUM_IMAGES x CHANNELS x HEIGHT x WIDTH) in the range [0,1]
            if the transform "ImglistToTensor" is used
            3) or anything else if a custom transform is used.
        """
        record: VideoRecord = self.video_list[idx]

        frame_start_indices: 'np.ndarray[int]' = self._get_start_indices(record)

        return self._get(record, frame_start_indices)

    def _get(self, record: VideoRecord, frame_start_indices: 'np.ndarray[int]') -> Union[
        Tuple[List[Image.Image], Union[int, List[int]]],
        Tuple['torch.Tensor[num_frames, channels, height, width]', Union[int, List[int]]],
        Tuple[Any, Union[int, List[int]]],
        ]:
        """
        Loads the frames of a video at the corresponding
        indices.

        Args:
            record: VideoRecord denoting a video sample.
            frame_start_indices: Indices from which to load consecutive frames from.
        Returns:
            A tuple of (video, label). Label is either a single
            integer or a list of integers in the case of multiple labels.
            Video is either 1) a list of PIL images if no transform is used
            2) a batch of shape (NUM_IMAGES x CHANNELS x HEIGHT x WIDTH) in the range [0,1]
            if the transform "ImglistToTensor" is used
            3) or anything else if a custom transform is used.
        """

        frame_start_indices = frame_start_indices + record.start_frame
        images = list()

        # from each start_index, load self.frames_per_segment
        # consecutive frames
        for start_index in frame_start_indices:
            frame_index = int(start_index)

            # load self.frames_per_segment consecutive frames
            for _ in range(self.frames_per_segment):
                image = self._load_image(record.path, frame_index)
                images.append(image)

                if frame_index < record.end_frame:
                    frame_index += 1

        if self.transform is not None:
            images = self.transform(images)

        return images, record.label

    def __len__(self):
        return len(self.video_list)

In [7]:
from torchvision import transforms
def eval_preprocess(size):
    preprocess = transforms.Compose([
        ImglistToTensor(),  # list of PIL images to (FRAMES x CHANNELS x HEIGHT x WIDTH) tensor
        transforms.Resize(size),  # image batch, resize smaller edge to 299
        transforms.CenterCrop(size),  # image batch, center crop to square 299x299
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ShiftWithChannelTensor()
    ])
    return preprocess

def train_preprocess(size):
    train_preprocess = transforms.Compose([
        ImglistToTensor(),  # list of PIL images to (FRAMES x CHANNELS x HEIGHT x WIDTH) tensor
        transforms.Resize(size),  # image batch, resize smaller edge to 299
        transforms.RandomCrop(size),  # image batch, center crop to square 299x299
        transforms.RandomHorizontalFlip(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ShiftWithChannelTensor()
    ])
    return train_preprocess

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import os
import numpy as np
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
PATH = 'model_state_dict.pt'
device = torch.device('cuda')
model = MobileNetV2(num_classes=6, sample_size=112, width_mult=1.0)
model.load_state_dict(torch.load(PATH),strict=False)
model.to(device)
# freezing layer
ct = 0
for child in model.children():
    for param in child.parameters():
            ct +=1
            if ct <120:
                param.requires_grad = False
optimizer = optim.Adam(filter(lambda p : p.requires_grad, model.parameters()), lr=0.0001)

In [9]:
'''import os
import glob
from os.path import normpath, basename
from sklearn.model_selection import train_test_split

DATA_PATH = 'transfer/activities'#dataset
# create dictionary of activities

list_activities= os.listdir(DATA_PATH)
list_dict= {}
for index,activity in enumerate(list_activities):
    list_dict[activity] = index
    
# create a note 
with open('label_notes.txt', 'w') as f:
    for key,value in list_dict.items():
        f.write(f"{key}: {value}")
        f.write('\n')

# create list of data
all_x =[]
all_y = []
for path, subdirs, files in os.walk(DATA_PATH):
    for name in files:
        all_x.append(os.path.join(path, name))
        all_y.append(list_dict[basename(normpath(path))])

print(f"Currently have {len(all_x)} video data...")  

# split to train and test
X_train, X_test, y_train, y_test = train_test_split(all_x, all_y, test_size=0.2, random_state=42, stratify=all_y)

# generate image for train & test
check_mkdir('transfer/train')#dataset/train
check_mkdir('transfer/test')#dataset/test

for key,value in list_dict.items():
    check_mkdir(os.path.join('transfer/train',str(value)))
    check_mkdir(os.path.join('transfer/test',str(value)))

with open('transfer/train/annotations.txt', 'w') as f:
    for index,video in enumerate(X_train):
        vid_in_folder = len(os.listdir(os.path.join('transfer/train',str(y_train[index]))))
        path_folder = os.path.join('transfer/train',str(y_train[index]),str(vid_in_folder+1).zfill(5))
        check_mkdir(path_folder)
        # parse video into frame
        last_frame = create_frames_from_video(video,path_folder)
        # create note
        f.write(f'{y_train[index]}/{str(vid_in_folder+1).zfill(5)} 1 {last_frame-1} {y_train[index]}')
        f.write('\n')
        
with open('transfer/test/annotations.txt', 'w') as f:
    for index,video in enumerate(X_test):
        vid_in_folder = len(os.listdir(os.path.join('transfer/test',str(y_test[index]))))
        path_folder = os.path.join('transfer/test',str(y_test[index]),str(vid_in_folder+1).zfill(5))
        check_mkdir(path_folder)
        # parse video into frame
        last_frame = create_frames_from_video(video,path_folder)
        # create note
        f.write(f'{y_test[index]}/{str(vid_in_folder+1).zfill(5)} 1 {last_frame-1} {y_test[index]}')
        f.write('\n')'''

'import os\nimport glob\nfrom os.path import normpath, basename\nfrom sklearn.model_selection import train_test_split\n\nDATA_PATH = \'transfer/activities\'#dataset\n# create dictionary of activities\n\nlist_activities= os.listdir(DATA_PATH)\nlist_dict= {}\nfor index,activity in enumerate(list_activities):\n    list_dict[activity] = index\n    \n# create a note \nwith open(\'label_notes.txt\', \'w\') as f:\n    for key,value in list_dict.items():\n        f.write(f"{key}: {value}")\n        f.write(\'\n\')\n\n# create list of data\nall_x =[]\nall_y = []\nfor path, subdirs, files in os.walk(DATA_PATH):\n    for name in files:\n        all_x.append(os.path.join(path, name))\n        all_y.append(list_dict[basename(normpath(path))])\n\nprint(f"Currently have {len(all_x)} video data...")  \n\n# split to train and test\nX_train, X_test, y_train, y_test = train_test_split(all_x, all_y, test_size=0.2, random_state=42, stratify=all_y)\n\n# generate image for train & test\ncheck_mkdir(\'transfe

In [10]:


n_epochs= 150
train_on_gpu = True
size = 112

eval_preprocess = eval_preprocess(size)

train_preprocess = train_preprocess(size)

train_dataset = VideoFrameDataset(
    root_path='transfer/train',
    annotationfile_path='transfer/train/annotations.txt',
    num_segments=16,
    frames_per_segment=1,
    imagefile_template='img_{:05d}.jpg',
    transform=train_preprocess,
    test_mode=False
)
test_dataset = VideoFrameDataset(
    root_path='transfer/test',
    annotationfile_path='transfer/test/annotations.txt',
    num_segments=1,
    frames_per_segment=16,
    imagefile_template='img_{:05d}.jpg',
    transform=eval_preprocess,
    test_mode=False
)

train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
    num_workers=0,#2
    pin_memory=True
)

test_dataloader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=8,
    shuffle=True,
    num_workers=0,#2
    pin_memory=True
)
'''
model = MobileNetV2(num_classes=6, sample_size=size, width_mult=1.)
#model.load_state_dict(torch.load('pretrained/kinetics_mobilenetv2_1.0x_RGB_16_best.pth'))'''

if train_on_gpu:
    model = nn.DataParallel(model)
    model.cuda()
    print('train on gpu')
else:
    model.cpu()
    print('train on cpu')
    
criterion = nn.CrossEntropyLoss()
#optimizer = optim.Adam(model.parameters(), lr=0.0001)
scheduler = ReduceLROnPlateau(optimizer, 'min',patience=10,verbose=True,min_lr=1e-10)

valid_loss_min = np.Inf # track change in validation loss

epoch_train_loss =[]
epoch_val_loss =[]
for epoch in range(1, n_epochs+1):
    # keep track of training and validation loss
    train_loss = 0.0
    valid_loss = 0.0
    
    ###################
    # train the model #
    ###################
    model.train()
    train_pred = np.array([],dtype='i')
    train_truth = np.array([],dtype='i')
    for data, target in train_dataloader:
        # move tensors to GPU if CUDA is available
        if train_on_gpu:
            data, target = data.cuda(), target.cuda()
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the batch loss
        loss = criterion(output, target)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update training loss
        train_loss += loss.item()*data.size(0)
        train_pred = np.concatenate((train_pred, np.argmax(output.clone().detach().cpu().numpy(),axis=1)))
        train_truth = np.concatenate((train_truth, target.clone().detach().cpu().numpy()))
        
    ######################    
    # validate the model #
    ######################
    model.eval()
    y_pred = np.array([],dtype='i')
    y_truth = np.array([],dtype='i')
    print('validation')
    with torch.no_grad():
        for data, target in test_dataloader:
            # move tensors to GPU if CUDA is available
            if train_on_gpu:
                data, target = data.cuda(), target.cuda()
            # forward pass: compute predicted outputs by passing inputs to the model
            output = model(data)
            # calculate the batch loss
            loss = criterion(output, target)
            # update average validation loss 
            valid_loss += loss.item()*data.size(0)
            y_pred = np.concatenate((y_pred, np.argmax(output.clone().detach().cpu().numpy(),axis=1)))
            y_truth = np.concatenate((y_truth, target.clone().detach().cpu().numpy()))
    
    # calculate average losses
    train_loss = train_loss/len(train_dataloader.sampler)
    valid_loss = valid_loss/len(test_dataloader.sampler)
    epoch_train_loss.append(train_loss)
    epoch_val_loss.append(valid_loss)
    
    trainacc=accuracy_score(train_truth, train_pred)
    acc = accuracy_score(y_truth, y_pred)
    rec = recall_score(y_truth, y_pred, average='macro')
    prec = precision_score(y_truth, y_pred, average='macro')
    f1 = f1_score(y_truth, y_pred, average='macro')
        
    # print training/validation statistics 
    print('Epoch: {} \tTraining Loss: {:.6f} \tTraining acc: {:.2f} \tValidation Loss: {:.6f} \tAccuracy: {:.2f} \tF1-Score: {:.2f}'.format(
    epoch, train_loss, trainacc, valid_loss, acc, f1))
    
    # save model if validation loss has decreased
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(model.module.state_dict(), 'P3D63_Global_2.pt')
        valid_loss_min = valid_loss
    """
    wandb.log({
        'train_loss': train_loss,
        'val_loss': valid_loss,
        'accuracy': acc,
        'f1-score': f1,
        'recall': rec,
        'precission': prec,
    }, step=epoch) 
    """
    
    scheduler.step(valid_loss)

train on gpu
validation


  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 1 	Training Loss: 1.396008 	Training acc: 0.45 	Validation Loss: 1.502460 	Accuracy: 0.43 	F1-Score: 0.17
Validation loss decreased (inf --> 1.502460).  Saving model ...
validation


  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 2 	Training Loss: 1.268910 	Training acc: 0.48 	Validation Loss: 1.554731 	Accuracy: 0.42 	F1-Score: 0.25
validation
Epoch: 3 	Training Loss: 1.170239 	Training acc: 0.52 	Validation Loss: 1.417397 	Accuracy: 0.45 	F1-Score: 0.39
Validation loss decreased (1.502460 --> 1.417397).  Saving model ...
validation
Epoch: 4 	Training Loss: 1.079528 	Training acc: 0.55 	Validation Loss: 1.395337 	Accuracy: 0.50 	F1-Score: 0.49
Validation loss decreased (1.417397 --> 1.395337).  Saving model ...
validation
Epoch: 5 	Training Loss: 1.002751 	Training acc: 0.59 	Validation Loss: 1.180161 	Accuracy: 0.54 	F1-Score: 0.58
Validation loss decreased (1.395337 --> 1.180161).  Saving model ...
validation
Epoch: 6 	Training Loss: 0.936023 	Training acc: 0.62 	Validation Loss: 1.371934 	Accuracy: 0.50 	F1-Score: 0.51
validation
Epoch: 7 	Training Loss: 0.877579 	Training acc: 0.64 	Validation Loss: 1.138179 	Accuracy: 0.56 	F1-Score: 0.57
Validation loss decreased (1.180161 --> 1.138179).  Saving m

validation
Epoch: 61 	Training Loss: 0.286395 	Training acc: 0.88 	Validation Loss: 0.776412 	Accuracy: 0.73 	F1-Score: 0.74
validation
Epoch: 62 	Training Loss: 0.301796 	Training acc: 0.88 	Validation Loss: 0.751565 	Accuracy: 0.74 	F1-Score: 0.77
validation
Epoch: 63 	Training Loss: 0.283289 	Training acc: 0.88 	Validation Loss: 0.721796 	Accuracy: 0.76 	F1-Score: 0.78
Validation loss decreased (0.724967 --> 0.721796).  Saving model ...
validation
Epoch: 64 	Training Loss: 0.284210 	Training acc: 0.88 	Validation Loss: 0.820832 	Accuracy: 0.72 	F1-Score: 0.75
validation
Epoch: 65 	Training Loss: 0.297467 	Training acc: 0.88 	Validation Loss: 0.840889 	Accuracy: 0.70 	F1-Score: 0.74
validation
Epoch: 66 	Training Loss: 0.283615 	Training acc: 0.89 	Validation Loss: 0.821976 	Accuracy: 0.74 	F1-Score: 0.75
validation
Epoch: 67 	Training Loss: 0.292987 	Training acc: 0.89 	Validation Loss: 0.750855 	Accuracy: 0.74 	F1-Score: 0.77
validation
Epoch: 68 	Training Loss: 0.299370 	Training 

validation
Epoch: 123 	Training Loss: 0.262791 	Training acc: 0.89 	Validation Loss: 0.804562 	Accuracy: 0.72 	F1-Score: 0.76
validation
Epoch: 124 	Training Loss: 0.260474 	Training acc: 0.89 	Validation Loss: 0.742497 	Accuracy: 0.74 	F1-Score: 0.79
validation
Epoch: 125 	Training Loss: 0.265810 	Training acc: 0.89 	Validation Loss: 0.826153 	Accuracy: 0.74 	F1-Score: 0.76
validation
Epoch: 126 	Training Loss: 0.265099 	Training acc: 0.89 	Validation Loss: 0.858387 	Accuracy: 0.72 	F1-Score: 0.74
validation
Epoch: 127 	Training Loss: 0.257599 	Training acc: 0.90 	Validation Loss: 0.854647 	Accuracy: 0.72 	F1-Score: 0.74
validation
Epoch: 128 	Training Loss: 0.262190 	Training acc: 0.89 	Validation Loss: 0.741840 	Accuracy: 0.75 	F1-Score: 0.78
validation
Epoch: 129 	Training Loss: 0.265143 	Training acc: 0.89 	Validation Loss: 0.822776 	Accuracy: 0.75 	F1-Score: 0.76
validation
Epoch: 130 	Training Loss: 0.262679 	Training acc: 0.90 	Validation Loss: 0.749655 	Accuracy: 0.74 	F1-Scor

In [11]:
test_dataset = VideoFrameDataset(
    root_path='test/finalfull/test',
    annotationfile_path='test/finalfull/test/annotations.txt',
    num_segments=1,
    frames_per_segment=16,
    imagefile_template='img_{:05d}.jpg',
    transform=eval_preprocess,
    test_mode=False
)
test_dataloader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=8,
    shuffle=True,
    num_workers=0,#2
    pin_memory=True
)
import timeit
start_time = timeit.default_timer()
model.eval()
y_pred = np.array([],dtype='i')
print('validation')
with torch.no_grad():
    for data, target in test_dataloader:
        # move tensors to GPU if CUDA is available
        if train_on_gpu:
            data, target = data.cuda(), target.cuda()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        y_pred = np.concatenate((y_pred, np.argmax(output.clone().detach().cpu().numpy(),axis=1)))
terminate_time = timeit.default_timer()
print("%f초" % (terminate_time - start_time))

validation
17.536867초


In [12]:
print(y_pred)

[0 5 1 5 0 5 1 1 1 5 1 5 1 1 0 1 1 5 1 0 5 0 0 0 1 5 1 1 1 0 0 5 5 5 5 0 1
 0 5 5 0 5 5 1 5 1 1 0 5 1 5 0 5 1 0 1 1 1 1 1 0 0 5 1 1 1 0 5 1 0 1 0 5 0
 0 1 0 5 1 5 0 5 5 5 1 5 1 5 1 5 5 5 5 0 5 1 5 5 5 1 5 1 0 1 0 5 1 0 0 5 1
 0 5 5 1 5 5 5 0 1 1 1 1 5 1 5 0 5 1 5 0 1 1 1 5 0 5 1 0 0 0 5 5 1 5 1 1 1
 5 5 5 1 0 0 5 1 1 5 0 1 0 1 0 5 1 0 5 5 0 0 5 5 0 1 5 0 0 1 0 5 5 1 0 5 1
 1 5 0 0 5 0 5 1 5 5 5 5 5 1 1 5 0 5 0 1 5 0 1 0 1 5 1 0 5 5 5 0 1 1 5 5 1
 1 0 1 1 5 1 0 0 1 5 5 5 1 1 1 5 0 5 0 0 5 5]


In [13]:
import numpy as numpy
y_truth=numpy.loadtxt('final/label.txt',dtype='int', delimiter="\n")
print(y_truth)

[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 1 5 1 1 1 1 2 2 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 5 1 1 5 1 1 1 5 1 1 5 1 1 1 1 1 5 1 1 1 1
 1 1 1 1 1 1 5 1 1 1 1 1 1 1 5 1 1 1 1 1 5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 1 1 5 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 5
 1 5 5 5 5 5 5 0 5 1 1 5 5 5 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 5 5 5 5 0 1 1 1 1 1 1 1 1 1 1 0
 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0]


In [47]:
accuracy_score(y_truth, y_pred)

0.3237704918032787

In [48]:
x=y_pred
i=0
for a in y_pred:
    if a == 1 or a == 5:
        print(a)
        x[i]=1
    else :
        x[i]=0
    i = i+1

1
1
1
5
5
5
5
5
5
1
5
5
1
1
5
5
1
5
1
5
1
1
5
5
1
5
5
1
1
5
1
1
5
5
5
5
5
1
1
5
5
5
5
5
5
5
1
5
5
5
1
1
5
5
1
1
5
5
5
5
1
1
5
1
1
1
1
1
5
5
1
1
1
1
1
5
1
5
1
1
1
5
1
5
5
5
5
1
5
5
1
5
1
1
5
5
5
1
1
1
1
1
5
1
5
1
1
5
1
5
5
5
1
1
1
5
5
5
5
1
5
1
5
1
5
1
1
1
5
1
1
1
5
5
5
5
5
1
1
1
1
1
5
1
1
1
1
5
5
5
1
1
5
1
5
5
1
5
1
5
5
5
1
5
1
5
1
1
1
1


In [49]:
print(x)

[1 1 0 1 0 1 0 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1
 0 1 0 1 1 0 0 0 1 1 1 1 1 1 1 1 0 1 0 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0
 1 0 1 0 0 1 1 0 0 1 1 1 0 0 0 1 1 0 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 0 0 0
 0 1 1 1 1 1 1 1 0 1 1 0 1 0 0 1 0 0 1 0 1 1 1 1 1 1 0 0 0 1 0 1 1 1 1 1 0
 1 1 1 1 1 0 1 1 1 0 1 0 1 0 1 1 0 0 1 1 1 1 1 0 1 1 1 1 0 0 0 1 1 1 1 0 1
 1 0 0 1 1 0 1 1 1 0 1 0 0 1 0 1 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1 0 0 1 1 1 1
 1 1 0 1 1 1 1 1 1 0 1 0 1 0 1 1 1 1 0 1 1 0]


In [57]:
y=y_truth
i=0
for a in y_truth:
    if a == 1 or a == 5:
        y[i]=True
    else :
        y[i]=False
    i = i+1

In [58]:
print(y)

[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0
 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0]


In [59]:
accuracy_score(x, y) #1과 5를 같은 행동이라고 생각했을때 

0.6311475409836066

In [61]:
x=y_pred
i=0
for a in y_pred:
    if a == 1 or a==3 or a == 5:
        x[i]=1
    else :
        x[i]=0
    i = i+1

1
1
5
1
5
5
1
5
1
5
5
1
3
1
1
5
5
1
5
1
1
1
1
5
1
1
5
1
5
1
5
1
1
5
5
1
1
5
1
1
1
5
5
1
3
3
1
1
5
5
5
5
5
5
3
5
5
1
1
1
1
5
5
1
1
1
3
5
3
5
5
5
5
5
1
1
5
5
5
1
1
1
3
1
1
5
1
5
1
1
1
1
5
1
1
5
1
1
1
1
5
1
5
1
5
3
5
1
1
5
1
5
1
1
5
5
5
5
1
1
1
1
5
1
3
1
5
1
1
5
5
1
1
5
5
5
1
1
5
1
5
5
5
1
1
1
5
5
5
5
1
5
5
1
5
1
1
1
1
1
5
1
5
1
1
1
5
1
1
1
1


In [62]:
print(x)

[1 0 1 1 0 1 0 1 1 0 1 1 0 1 0 1 0 1 1 1 1 1 0 0 1 0 1 1 1 0 1 1 0 1 0 0 1
 0 0 1 1 1 1 0 1 1 1 1 1 1 0 1 0 1 1 1 0 1 0 0 0 0 1 1 0 0 1 0 1 0 1 1 1 1
 1 0 1 0 0 1 1 1 1 1 0 1 0 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 1 1 1 1 0 0 1 1 1 0 1 1 0 1 1 1 1 1 0
 1 1 1 1 1 0 0 1 1 0 1 0 1 1 1 0 0 0 1 1 1 1 0 1 1 1 0 1 1 1 1 0 0 1 0 1 1
 1 1 0 1 0 0 1 1 1 1 1 1 0 1 0 1 0 1 1 1 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 1
 1 1 1 1 1 0 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1]


In [63]:
y=y_truth
i=0
for a in y_truth:
    if a == 1 or a==3 or a == 5:
        y[i]=True
    else :
        y[i]=False
    i = i+1

In [64]:
print(y)

[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0
 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0]


In [65]:
accuracy_score(x, y) #집중 비집중

0.6516393442622951