# Setup
Connect this notebook to your Google Drive.

In [1]:
# Mount Google Drive at /content/drive so the dataset paths used by the
# cells below can be reached from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Download the ChicagoFSWild dataset and extract its frames. The files are stored in the shared drive folder `/content/drive/Shareddrives/EECS 442 Project/`, which is the directory the cell below changes into.

In [None]:
%cd /content/drive/Shareddrives/'EECS 442 Project'/
#!rm ChicagoFSWild/*
#!wget https://dl.ttic.edu/ChicagoFSWild.tgz
#!tar -xzvf ChicagoFSWild.tgz
#!rm ChicagoFSWild.tgz
!tar -xkvf ChicagoFSWild/ChicagoFSWild-Frames.tgz
#!rm ChicagoFSWild/ChicagoFSWild-Frames.tgz

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
youtube_6/tom_humphries_6653/0022.jpg
tar: youtube_6/tom_humphries_6653/0022.jpg: Cannot open: File exists
youtube_6/tom_humphries_6653/0020.jpg
tar: youtube_6/tom_humphries_6653/0020.jpg: Cannot open: File exists
youtube_6/tom_humphries_6653/0017.jpg
tar: youtube_6/tom_humphries_6653/0017.jpg: Cannot open: File exists
youtube_6/tom_humphries_6653/0014.jpg
tar: youtube_6/tom_humphries_6653/0014.jpg: Cannot open: File exists
youtube_6/tom_humphries_6653/0009.jpg
tar: youtube_6/tom_humphries_6653/0009.jpg: Cannot open: File exists
youtube_6/peter_hauser_6917/
youtube_6/peter_hauser_6917/0003.jpg
tar: youtube_6/peter_hauser_6917/0003.jpg: Cannot open: File exists
youtube_6/peter_hauser_6917/0016.jpg
tar: youtube_6/peter_hauser_6917/0016.jpg: Cannot open: File exists
youtube_6/peter_hauser_6917/0012.jpg
tar: youtube_6/peter_hauser_6917/0012.jpg: Cannot open: File exists
youtube_6/peter_hauser_6917/0008.jpg
tar: youtube_6/pete

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm # Displays a progress bar

import os
import io
import imageio
from PIL import Image
import time
import pandas as pd
import threading

import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from torchsummary import summary
import torchvision
from torchvision import datasets, transforms
from torch.utils.data import Dataset, Subset, DataLoader, random_split

from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
# import pytorchvideo.models.resnet
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
# from pytorchvideo.transforms import (
#     ApplyTransformToKey,
#     ShortSideScale,
#     UniformTemporalSubsample
# )


  "The 'torchvision.transforms._functional_video' module is deprecated since 0.12 and will be removed in 0.14. "
  "The 'torchvision.transforms._transforms_video' module is deprecated since 0.12 and will be removed in 0.14. "


In [3]:
# Pick the compute device once; later cells move models and batches to it.
gpu_available = torch.cuda.is_available()
device = 'cuda' if gpu_available else 'cpu'
print(
    "Using the GPU. You are good to go!"
    if gpu_available
    else "Using the CPU. Overall speed may be slowed down"
)

Using the CPU. Overall speed may be slowed down


# **Dataset**
We will create a custom `Dataset` class for the ChicagoFSWild dataset.

In [4]:
class ImageLoader():
  """Loads image files as tensors and memoizes the result per path.

  The cache is an ordinary dict that is never evicted, so every distinct
  path that has been loaded stays in memory for the lifetime of the loader.
  """

  def __init__(self, shape, transform=None):
    """
    Args:
      shape: size passed to ``Image.draft`` as the requested decode size.
      transform: optional callable applied to the tensor before it is cached.
    """
    self.cache = {}
    # Kept for compatibility; none of the methods in this class acquire it.
    self.lock = threading.Lock()
    self.shape = shape
    self.transform = transform

  def load_image(self, path):
    """Return the (optionally transformed) tensor for ``path``, loading it on first use."""
    if path in self.cache:
      return self.cache[path]

    # The context manager closes the file handle even if decoding fails.
    with Image.open(path) as image:
      # draft() is only a hint to the decoder and may leave the mode unchanged,
      # so convert explicitly: every cached tensor then has three channels and
      # frames can be stacked safely.
      image.draft('RGB', self.shape)
      tensor = transforms.ToTensor()(image.convert('RGB'))

    if self.transform is not None:
      tensor = self.transform(tensor)

    self.cache[path] = tensor
    return tensor

  def load_images(self, paths):
    """Return a list with one loaded tensor per entry of ``paths``, in order."""
    return [self.load_image(path) for path in paths]
  


In [11]:
# Open directory containing dataset
%cd /content/drive/Shareddrives/'EECS 442 Project'/

# Number of most frequent labels that ChicagoFSWild keeps; samples with any
# other label are dropped when the dataset is built.
num_labels = 10

class ChicagoFSWild(Dataset):
  """Fixed-length frame clips from ChicagoFSWild, limited to the most frequent labels.

  The annotation file ``ChicagoFSWild/ChicagoFSWild.csv`` is read relative to
  the current working directory, and its columns are accessed by position:
  1 = clip directory, 4 = number of frames, 7 = label, 10 = partition.
  Only the ``num_labels`` (module-level constant) most frequent labels are
  kept. Label indices are assigned in order of first appearance over the
  whole file, so they are the same for every split.

  Args:
    split: partition name compared against column 10 (e.g. 'train', 'test').
    shape: size each frame is resized to.
    num_frames: number of frames sampled, evenly spaced, from every clip.
    max_size: upper bound on the number of samples; a negative value (the
      default) means no limit. This argument used to be ignored.
  """

  def __init__(self, split, shape=(64, 64), num_frames=8, max_size=-1):
    # Assumes we are inside 'EECS 442 Project' dir
    super().__init__()
    self.shape = shape
    self.root_dir = "."
    self.annotations = pd.read_csv(
        "ChicagoFSWild/ChicagoFSWild.csv"
    )

    self.transform_image = transforms.Compose([
        transforms.Resize(shape),
    ])
    self.transform_video = transforms.Compose([
        transforms.Resize((shape[-1], num_frames)),
    ])

    self.loader = ImageLoader(shape, transform=self.transform_image)

    # (1) Process labels: keep only the most frequent ones and index them.
    all_labels = self.annotations.iloc[:, 7]
    kept_labels = self._most_common_labels(all_labels, num_labels)

    self.label_itoa = []
    self.label_atoi = {}
    for label in all_labels:
      if label in kept_labels and label not in self.label_atoi:
        self.label_atoi[label] = len(self.label_itoa)
        self.label_itoa.append(label)

    # (2) Process filenames: one list of frame paths per retained clip.
    self.filenames = []
    self.labels = []
    rows = self.annotations.iloc[:, [1, 4, 7, 10]].itertuples(index=False, name=None)
    for path, num_files, label, partition in rows:
      # Honour the optional cap on the dataset size.
      if 0 <= max_size <= len(self.filenames):
        break

      # Keep only clips of the requested split whose label is among the most common.
      if label in kept_labels and partition == split:
        self.filenames.append(
            self._frame_paths(self.root_dir, path, num_files, num_frames)
        )
        self.labels.append(self.label_atoi[label])

  @staticmethod
  def _most_common_labels(labels, k):
    """Return the set of the ``k`` most frequent values in ``labels`` (``k`` > 0).

    Ties are resolved by a stable ascending sort on the count, so among
    equally frequent labels the ones first seen later are preferred.
    """
    counts = {}
    for label in labels:
      counts[label] = counts.get(label, 0) + 1

    ranked = sorted(counts.items(), key=lambda item: item[1])
    return {label for label, _ in ranked[-k:]}

  @staticmethod
  def _frame_paths(root_dir, path, num_files, num_frames):
    """Return ``num_frames`` frame paths spread evenly over frames 1..``num_files``.

    Frame files are named with a zero-padded 1-based index ('0001.jpg', ...).
    Clips shorter than ``num_frames`` repeat frames.
    """
    indices = np.linspace(1, int(num_files), num=num_frames).astype(int)
    return [
        os.path.join(root_dir, path, "{:04d}.jpg".format(index)) for index in indices
    ]

  def __len__(self):
    return len(self.filenames)

  def __getitem__(self, index):
    """Return ``(video, label)`` for the clip at ``index``.

    Frames are stacked along a new last axis, so with (C, H, W) frames the
    video is (C, H, W, T).
    NOTE(review): torchvision's video models are documented as taking
    (C, T, H, W) clips -- confirm this layout is what the models trained on
    this dataset expect.
    """
    label = self.labels[index]
    filenames = self.filenames[index]

    images = self.loader.load_images(filenames)
    video = torch.stack(images, dim=3)
    video = self.transform_video(video)

    return (video, label)

  def saveitem(self, index):
    """Debug helper: write the clip's frames to temp<i>.jpg in the current directory and print its label."""
    label = self.labels[index]
    filenames = self.filenames[index]

    images = self.loader.load_images(filenames)
    transform = transforms.ToPILImage()

    for i, image in enumerate(images):
      transform(image).save('temp{}.jpg'.format(i))

    print(self.label_itoa[label])

# Build the train and test splits (in that order) from the annotation file.
train_data, test_data = (ChicagoFSWild(part) for part in ("train", "test"))

# Sanity check: dump the frames of one training clip as temp<i>.jpg files
# and print its label.
train_data.saveitem(10)


/content/drive/Shareddrives/EECS 442 Project
ok


In [None]:
num_classes = 10

# Create CNN Model
class CNNModel(nn.Module):
    """Minimal 3D-CNN baseline: one Conv3d layer followed by a linear classifier.

    Args:
        input_shape: the three non-channel dimensions of an input clip. The
            default (64, 64, 8) matches the previously hard-coded size of
            ``fc1``.
        n_classes: number of output logits. Defaults to ``num_classes``; the
            classifier used to be fixed at 10 outputs regardless of it.

    ``conv_layer2``, ``fc2``, ``relu``, ``batch`` and ``drop`` are created but
    not used by ``forward``; they are kept so the module's attributes and
    ``state_dict`` keys stay the same.
    """

    def __init__(self, input_shape=(64, 64, 8), n_classes=num_classes):
        super().__init__()

        self.conv_layer1 = self._conv_layer_set(3, 32)
        self.conv_layer2 = self._conv_layer_set(32, 64)

        # conv_layer1 keeps the input extent (kernel 3, stride 1, padding 1),
        # so the flattened size is 32 channels times the number of voxels.
        voxels = 1
        for extent in input_shape:
            voxels *= extent
        self.fc1 = nn.Linear(32 * voxels, n_classes)

        self.fc2 = nn.Linear(128, n_classes)
        self.relu = nn.LeakyReLU()
        self.batch = nn.BatchNorm1d(128)
        self.drop = nn.Dropout(p=0.15)

    def _conv_layer_set(self, in_c, out_c):
        """Return a Sequential holding one Conv3d (kernel 3, stride 1, padding 1)."""
        conv_layer = nn.Sequential(
            nn.Conv3d(in_c, out_c, 3, 1, 1))
        return conv_layer

    def forward(self, x):
        """Map a batch of clips shaped (N, 3, *input_shape) to (N, n_classes) logits."""
        out = self.conv_layer1(x)
        out = out.view(out.size(0), -1)
        out = self.fc1(out)
        return out

print('Your network:')
CNNmodel = CNNModel().to(device)
# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() appended a stray "None" to the output.
summary(CNNmodel, (3,64,64,8), device=device) # visualize your model

Your network:
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv3d-1        [-1, 32, 64, 64, 8]           2,624
            Linear-2                   [-1, 10]      10,485,770
Total params: 10,488,394
Trainable params: 10,488,394
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.38
Forward/backward pass size (MB): 8.00
Params size (MB): 40.01
Estimated Total Size (MB): 48.39
----------------------------------------------------------------
None


In [None]:


class Network(nn.Module):
    """Two stacked Conv3d layers followed by a 10-way linear classifier.

    Args:
        input_shape: the three non-channel dimensions of an input clip. The
            default (128, 128, 10) matches the previously hard-coded size of
            ``fc1``.

    ``fc2``, ``relu``, ``batch`` and ``drop`` are created but not used by
    ``forward``; they are kept so the module's attributes stay the same.
    """

    def __init__(self, input_shape=(128, 128, 10)):
        super().__init__()

        self.conv_layer1 = nn.Conv3d(3, 32, 3, 1, 1)
        self.conv_layer2 = nn.Conv3d(32, 64, 3, 1, 1)

        # Both convolutions keep the input extent (kernel 3, stride 1,
        # padding 1), so the flattened size is 64 channels times the voxels.
        voxels = 1
        for extent in input_shape:
            voxels *= extent
        self.fc1 = nn.Linear(64 * voxels, 10)

        self.fc2 = nn.Linear(10, 10)
        self.relu = nn.LeakyReLU()
        self.batch = nn.BatchNorm1d(128)
        self.drop = nn.Dropout(p=0.15)

    def forward(self, x):
        """Map a batch of clips shaped (N, 3, *input_shape) to (N, 10) logits."""
        x = self.conv_layer1(x)
        x = self.conv_layer2(x)
        x = x.view(x.size(0), -1)
        # The debug prints that used to sit here ran on every forward pass
        # and have been removed.
        x = self.fc1(x)
        return x

print('Your network:')
baseline = torchvision.models.video.r3d_18().to(device)
# summary() prints the table itself and returns None, so it is called directly
# instead of being wrapped in print().
# NOTE(review): torchvision's video models are documented as taking
# (C, T, H, W) clips; (3,64,64,8) would then be read as 64 frames of 64x8
# pixels -- confirm the intended axis order.
summary(baseline, (3,64,64,8), device=device) # visualize your model

Your network:
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv3d-1        [-1, 64, 64, 32, 4]          28,224
       BatchNorm3d-2        [-1, 64, 64, 32, 4]             128
              ReLU-3        [-1, 64, 64, 32, 4]               0
      Conv3DSimple-4        [-1, 64, 64, 32, 4]         110,592
       BatchNorm3d-5        [-1, 64, 64, 32, 4]             128
              ReLU-6        [-1, 64, 64, 32, 4]               0
      Conv3DSimple-7        [-1, 64, 64, 32, 4]         110,592
       BatchNorm3d-8        [-1, 64, 64, 32, 4]             128
              ReLU-9        [-1, 64, 64, 32, 4]               0
       BasicBlock-10        [-1, 64, 64, 32, 4]               0
     Conv3DSimple-11        [-1, 64, 64, 32, 4]         110,592
      BatchNorm3d-12        [-1, 64, 64, 32, 4]             128
             ReLU-13        [-1, 64, 64, 32, 4]               0
     Conv3DSimple-14     

# Train

In [None]:
def train(trainloader, net, criterion, optimizer, device, epoch):
  '''
  Train `net` for one pass over `trainloader`.

  Args:
    trainloader: iterable yielding (sequences, labels) batches.
    net: model to optimize; it is switched to training mode.
    criterion: loss function called as criterion(output, labels).
    optimizer: optimizer stepped once per batch.
    device: device the batches are moved to before the forward pass.
    epoch: epoch number, used only in the progress message.

  Returns:
    The mean of the per-batch losses, or NaN when the loader yields no batches.
  '''
  start = time.time()
  running_loss = 0.0
  cnt = 0
  net = net.train()
  for sequences, labels in tqdm(trainloader):
    sequences = sequences.to(device)
    labels = labels.to(device)
    optimizer.zero_grad()

    output = net(sequences)
    loss = criterion(output, labels)
    loss.backward()
    optimizer.step()
    running_loss += loss.item()
    cnt += 1
  end = time.time()
  # An empty loader would otherwise raise ZeroDivisionError here.
  running_loss = running_loss / cnt if cnt else float('nan')
  print('\n [epoch %d] loss: %.3f elapsed time %.3f' %
      (epoch, running_loss, end-start))
  return running_loss


In [None]:
def evaluate(model, loader):  # Evaluate accuracy on validation / test set
    """Return the classification accuracy of `model` over `loader`.

    Batches are moved to the module-level `device`. Accuracy is the number of
    correct argmax predictions divided by the number of samples actually
    evaluated, so it stays correct when the loader uses a sampler or drops
    incomplete batches. Returns NaN when the loader yields no samples.
    """
    model.eval()  # Set the model to evaluation mode
    correct = 0
    total = 0
    with torch.no_grad():  # Do not calculate gradients, to speed up computation
        for batch, label in tqdm(loader):
            batch = batch.to(device)
            label = label.to(device)
            pred = model(batch)
            correct += (torch.argmax(pred, dim=1) == label).sum().item()
            total += label.size(0)
    acc = correct / total if total else float('nan')
    print("\n Evaluation accuracy: {}".format(acc))
    return acc

In [None]:
# Hyperparameters
train_batch_size = 16
val_batch_size = 16
learning_rate = 1e-3
weight_decay = 1e-4
num_epoch = 10


# Load data
# Shuffle the training set every epoch so batches are not drawn in the order
# of the annotation file.
train_loader = DataLoader(
  train_data,
  batch_size=train_batch_size,
  shuffle=True
)

# Evaluation is order-independent; use the configured batch size instead of 1.
test_loader = DataLoader(
  test_data,
  batch_size=val_batch_size
)


# Initialize network
# r3d_18 is built with a 400-class head by default; size the head to the
# number of labels the dataset actually contains.
net = torchvision.models.video.r3d_18(
  num_classes=len(train_data.label_itoa)
).to(device)
# Alternative backbone (not used): https://pytorchvideo.org/docs/tutorial_classification

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=learning_rate,
                       weight_decay=weight_decay)
criterion_base = nn.CrossEntropyLoss()
optimizer_base = optim.Adam(CNNmodel.parameters(), lr=learning_rate,
                       weight_decay=weight_decay)


# Train and evaluate each model in turn: first the baseline CNN, then the ResNet.
experiments = [
  ('baseline', CNNmodel, criterion_base, optimizer_base),
  ('Resnet', net, criterion, optimizer),
]

for run_name, run_model, run_criterion, run_optimizer in experiments:
  print('\nStart training for %s' % run_name)
  for epoch in range(num_epoch):
    print('-----------------Epoch = %d-----------------' % (epoch+1))
    trn_loss = train(train_loader, run_model, run_criterion, run_optimizer, device, epoch+1)

  evaluate(run_model, test_loader)





Start training for baseline
-----------------Epoch = 1-----------------


 75%|███████▌  | 43/57 [34:06<11:06, 47.59s/it]


FileNotFoundError: ignored