In [43]:
import torch
from torchvision import transforms, datasets
import torch.utils.data
import torch.nn as nn
import torch.optim as optim


import numpy as np
import matplotlib.pyplot as plt

### Process the Data

In [20]:
# todo: read the csv files and label the images

# ---------------------------------------- #
# 0 -> FS    Foreground shot               #
# 1 -> ECU   Extreme close up              #
# 2 -> CU    Close up                      #
# 3 -> MCU   Medium close up               #
# 4 -> MS    Medium shot                   #
# 5 -> MLS   Medium long shot              #
# 6 -> LS    Long shot                     #
# 7 -> ELS   Extreme long shot             #
# 8 -> INS   Insert                        #
# 9 -> NA    Not available                 #
# ---------------------------------------- #

import pandas as pd
import os
import shutil
import random

#Fill in the maximum number of all files in each folder
#======================================#
cnt = [0,0,0,0,0,0,0,0,0,0]
#======================================#


def label_the_images(trainset_dir, testset_dir, csv_dir, images_dir):
  """
  Based on the labels in the csv file, classify the images and save them.

  parameters：
      saveto_dir: the path to saving folder
      csv_dir: the path to csv files (which saves the labels)
      images_dir: the path to images 
  """

  df = pd.read_csv(csv_dir)
  labels = df['shotscale']
  images = []

  for filename in os.listdir(images_dir):
    if filename.endswith('.jpg') or filename.endswith('.png'):
      image_path = os.path.join(images_dir, filename)
      images.append(image_path)

  shot_scale_map = {
      0: 'FS',
      1: 'ECU',
      2: 'CU',
      3: 'MCU',
      4: 'MS',
      5: 'MLS',
      6: 'LS',
      7: 'ELS',
      8: 'INS',
      9: 'NA'
  }

  train_data_num = 0
  test_data_num = 0

  for i, label in enumerate(labels):
    if label == 9:
        continue  # 跳过 "NA" 图像

    if i >= len(images): 
        break
    else:
        name = images[i]
        l = shot_scale_map[label]
        cnt[label] += 1
    
        if random.random() <= 0.8:
            saveto_dir = trainset_dir
            train_data_num += 1
        else:
            saveto_dir = testset_dir
            test_data_num += 1
    
        saveto = os.path.join(saveto_dir, l)
        # os.makedirs(saveto, exist_ok=True)  # 如果目录不存在则创建
    
        new_name = f"{cnt[label]:04d}"  # 使用前导零格式化名称
        new_path = os.path.join(saveto, f"{new_name}.jpg")
        shutil.copy(name, new_path)
          
  print(train_data_num, 'images are saved to train set; ', test_data_num, 'images are saved to test set.')       
    

trainset_dir = './dataset/Classified/trainset'
testset_dir = './dataset/Classified/testset'

csv_dirs = ['./dataset/1950_antonioni_-_cronaca_di_un_amore/1950_antonioni_-_cronaca_di_un_amore.csv',
            './dataset/1960_bergman_-_djavulens_oga/1960_bergman_-_djavulens_oga.csv',
            './dataset/1969_fellini_-_satyricon/1969_fellini_-_satyricon.csv',
            './dataset/1980_godard_-_sauve_qui_peut_la_vie/1980_godard_-_sauve_qui_peut_la_vie.csv',
            './dataset/1990_scorsese_-_goodfellas/1990_scorsese_-_goodfellas.csv',
            './dataset/2007_tarr_-_a_londoni_ferfi/2007_tarr_-_a_londoni_ferfi.csv']

image_dirs = ['./dataset/1950_antonioni_-_cronaca_di_un_amore/images',
              './dataset/1960_bergman_-_djavulens_oga/images',
              './dataset/1969_fellini_-_satyricon/images',
              './dataset/1980_godard_-_sauve_qui_peut_la_vie/images',
              './dataset/1990_scorsese_-_goodfellas/images',
              './dataset/2007_tarr_-_a_londoni_ferfi/images']

# for csv_dir, image_dir in zip(csv_dirs, image_dirs):
#     label_the_images(trainset_dir, testset_dir, csv_dir, image_dir)

4707 images are saved to train set;  1174 images are saved to test set.
4024 images are saved to train set;  1008 images are saved to test set.
5926 images are saved to train set;  1484 images are saved to test set.
4065 images are saved to train set;  1014 images are saved to test set.
6670 images are saved to train set;  1700 images are saved to test set.
6387 images are saved to train set;  1620 images are saved to test set.


In [ ]:
'''
output:
4707 images are saved to train set;  1174 images are saved to test set.
4024 images are saved to train set;  1008 images are saved to test set.
5926 images are saved to train set;  1484 images are saved to test set.
4065 images are saved to train set;  1014 images are saved to test set.
6670 images are saved to train set;  1700 images are saved to test set.
6387 images are saved to train set;  1620 images are saved to test set.
'''

### Load Trainset and Testset

In [29]:
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((224, 224)),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

train_dataset = datasets.ImageFolder(root='./dataset/Classified/trainset', transform=transform)
test_dataset = datasets.ImageFolder(root='./dataset/Classified/testset', transform=transform)

In [30]:
#===--------===#
batch_size = 784
#===--------===#

In [31]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

### Define VGG-16 Network Model

In [22]:
# VGG16网络模型
class Vgg16_net(nn.Module):
    def __init__(self):
        super(Vgg16_net, self).__init__()

        # 第一层卷积层
        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1),
            # 对64通道特征图进行Batch Normalization
            nn.BatchNorm2d(64),
            # 对64通道特征图进行ReLU激活函数
            nn.ReLU(inplace=True),
            # 输入64通道特征图，输出64通道特征图，卷积核大小3x3，步长1，填充1
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1),
            # 对64通道特征图进行Batch Normalization
            nn.BatchNorm2d(64),
            # 对64通道特征图进行ReLU激活函数
            nn.ReLU(inplace=True),

            # 进行2x2的最大池化操作，步长为2
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        # 第二层卷积层
        self.layer2 = nn.Sequential(
            # 输入64通道特征图，输出128通道特征图，卷积核大小3x3，步长1，填充1
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1),
            # 对128通道特征图进行Batch Normalization
            nn.BatchNorm2d(128),
            # 对128通道特征图进行ReLU激活函数
            nn.ReLU(inplace=True),

            # 输入128通道特征图，输出128通道特征图，卷积核大小3x3，步长1，填充1
            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1),
            # 对128通道特征图进行Batch Normalization
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            # 进行2x2的最大池化操作，步长为2
            nn.MaxPool2d(2, 2)
        )
        # 第三层卷积层
        self.layer3 = nn.Sequential(
            # 输入为128通道，输出为256通道，卷积核大小为33，步长为1，填充大小为1
            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1),
            # 批归一化
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),

            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),

            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),

            nn.MaxPool2d(2, 2)
        )

        self.layer4 = nn.Sequential(
            nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),

            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),

            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),

            nn.MaxPool2d(2, 2)
        )

        self.layer5 = nn.Sequential(
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),

            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),

            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),

            nn.MaxPool2d(2, 2)
        )

        self.conv = nn.Sequential(
            self.layer1,
            self.layer2,
            self.layer3,
            self.layer4,
            self.layer5
        )

        self.fc = nn.Sequential(
            nn.Linear(512, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),

            nn.Linear(512, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),

            nn.Linear(256, 10)
        )

    def forward(self, x):
        x = self.conv(x)
        # 对张量的拉平(flatten)操作，即将卷积层输出的张量转化为二维，全连接的输入尺寸为512
        x = x.view(-1, 512)
        x = self.fc(x)
        return x

### Training

In [44]:
print(torch.cuda.device_count())
print(torch.cuda.is_available())
# print(torch.cuda.get_device_name(0))
print(torch.__version__)

0
False
2.2.2


In [38]:
#
batch_size = 16

# 每n个batch打印一次损失
num_print = 10

# 总迭代次数
epoch_num = 30
# 初始学习率
lr = 0.01
# 每n次epoch更新一次学习率
step_size = 10
# 更新学习率每次减半
gamma = 0.5

# Check if GPU is available
if torch.cuda.is_available():
    device = torch.device("cuda")  # Use GPU
    print("Using GPU for computations")
else:
    device = torch.device("cpu")  # Use CPU
    print("Using CPU for computations")
    
# 创建Vgg16_net模型，并将其移动到GPU或CPU
model = Vgg16_net().to(device)
# 定义损失函数为交叉熵
criterion = nn.CrossEntropyLoss()
# 定义优化器为随机梯度下降，学习率为lr，动量为0.8，权重衰减为0.001
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.8, weight_decay=0.001)
# 定义学习率调度器，每step_size次epoch将学习率减半
schedule = optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma, last_epoch=-1)

# 训练
loss_list = []

# 每个epoch循环训练一遍
for epoch in range(epoch_num):
    # 当前迭代次数
    ww = 0
    # 累计损失值
    running_loss = 0.0
    # 遍历数据加载器，获取每个batch
    for i, (inputs, labels) in enumerate(train_loader, 0):
        # 将数据和标签移动到GPU/CPU上
        inputs, labels = inputs.to(device), labels.to(device)
        # 梯度清零
        optimizer.zero_grad()
        # 输入数据进行前向传播，行到预测结果
        outputs = model(inputs)
        # 计算损失
        loss = criterion(outputs, labels).to(device)
        # 反向传播，计算每个参数的梯度
        loss.backward()
        # 更新参数
        optimizer.step()
        # 累计损失值
        running_loss += loss.item()
        # 将损失值放到loss_list
        loss_list.append(loss.item())

        # 打印当前epoch的平均损失值
        if (i + 1) % num_print == 0:
            print('[%d epoch,%d]  loss:%.6f' % (epoch + 1, i + 1, running_loss / num_print))
            running_loss = 0.0

    lr_1 = optimizer.param_groups[0]['lr']
    # 打印学习率
    print("learn_rate:%.15f" % lr_1)
    schedule.step()

Using CPU for computations


RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 10070523904 bytes.