# Tanpopo 表面付着物 ViT

In [1]:
!pip install transformers
!pip list | grep transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 14.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 75.5 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 95.2 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
transformers                  4.25.1


In [2]:
# Google Colab マウント
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive
import os
os.chdir('/content/drive/MyDrive/Tanpopo')

Mounted at /content/drive
/content/drive/MyDrive


In [3]:
from PIL import Image
import requests
import matplotlib.pyplot as plt

import torch
import torchvision
import torchvision.transforms as transforms
from torchvision import datasets, models, transforms
import torch.utils.data as data

In [4]:
#画像サイズがが704x480 #88x60
img_size = 224

#class_names = ['1Sputter', '2Fiber', '3Block', '4Bar', '5AGFragment']

# 標準化
mean = (0.5, 0.5, 0.5)
std = (0.5, 0.5, 0.5)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#torch.set_default_tensor_type('torch.cuda.FloatTensor')

In [5]:
batch_size =  "16" #@param[8, 16, 32, 64, 128, 256]
batch_size = int(batch_size)

epochs = "15" #@param[8, 10, 15, 20, 22, 25, 27, 29, 30, 31, 32, 33, 35, 45, 60, 120]
epochs = int(epochs)

### 関数、クラスの定義

In [6]:
import random
import glob
from sklearn.model_selection import train_test_split

def make_filepath_list(folderpath, phase='train'):
    """
    ファイルのパスを格納したリストを返す
    """
    #tmp = 0 # .DS_Storeが最初に読み込まれる
    test_file_list = []
    train_file_list = []
    valid_file_list = []
    class_names = []

    for index, top_dir in enumerate(sorted(os.listdir(folderpath))):
        file_dir = os.path.join(folderpath, top_dir)
        file_list = glob.glob(file_dir + '/*bmp')

        if top_dir != '.DS_Store':
            class_names.append(top_dir)

            if phase == 'test': # テストデータの場合
                test_file_list += [os.path.join(folderpath, top_dir, file).replace('\\', '/') for file in file_list]
                                                            
            else:
                # 各クラス(フォルダ)ごとに8割を訓練データ、2割を検証データとする
                file_list += [os.path.join(folderpath, top_dir, file).replace('\\', '/') for file in file_list]
                train_file, valid_file = train_test_split(file_list, train_size=0.8, test_size=0.2, random_state=0)
                train_file_list += train_file
                valid_file_list += valid_file

    if phase == 'test':
        return test_file_list, class_names

    return train_file_list, valid_file_list, class_names

In [7]:
class ImageTransform(object):
    """
    画像の前処理
    """
    def __init__(self, resize, mean, std):
        self.data_transform = {
            'train': transforms.Compose([
                # データオグメンテーション, 前処理
                transforms.Resize(256), # リサイズ
                transforms.CenterCrop(resize), # 切り取り
                transforms.RandomRotation(45), # ランタムに回転
                transforms.ColorJitter(), # ランダムに明るさ、コントラスト、彩度、色相を変化
                transforms.RandomHorizontalFlip(), # ランダムに左右(水平)反転
                transforms.RandomVerticalFlip(), # ランダムに上下(垂直)反転
                transforms.ToTensor(),
                transforms.Normalize(mean, std), # zcaと交換？
                # ZCA whitening追加する
            ]),
            'valid': transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(resize),
                transforms.ToTensor(),
                transforms.Normalize(mean, std),
            ]),
            'test': transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(resize),
                transforms.ToTensor(),
                transforms.Normalize(mean, std),
            ])
        }

    def __call__(self, img, phase='train'):
        return self.data_transform[phase](img)

In [8]:
from torchvision.io import read_image

class SurfaceObjectDataset(data.Dataset):
    """
    表面付着物のDatasetクラス
    PyTorchのDatasetクラスを継承
    """
    def __init__(self, file_list, classes, transform=None, phase='train'):
        #super().__init__()
        self.file_list = file_list
        self.transform = transform
        self.classes = classes
        self.phase = phase

        #self.img = None
        #self.label = None

    def __len__(self):
        """
        画像の枚数を返す
        """
        return len(self.file_list)

    def __getitem__(self, index):
        """
        前処理した画像データのTensor形式のデータとラベルを取得
        """
        # 指定したindexの画像を読み込む
        img_path = self.file_list[index]
        img = Image.open(img_path)

        # 画像ラベルをファイル名から抜き出す
        label = self.file_list[index].split('/')[6][:11]

        # ラベル名を数値に変換
        label = self.classes.index(label)

        # 画像の前処理を実施
        if self.transform is not None:
            img_transformed = self.transform(img, self.phase)
            #img_transformed = feature_extractor(images=img_transformed, return_tensors="pt")

        return img_transformed, label

### データ読み込み・前処理

In [9]:
# 訓練、検証データへのファイルパスを格納したリストを取得
train_file_list, valid_file_list, class_names = make_filepath_list('/content/drive/MyDrive/Tanpopo/TrainingData11', 'train')

print('train_file_list: ', train_file_list)
print('class_names: ', class_names)
class_num = len(class_names) # 5

# Datasetの作成
train_dataset = SurfaceObjectDataset(
    file_list = train_file_list, classes = class_names,
    transform = ImageTransform(img_size, mean, std),
    phase = 'train'
)

valid_dataset = SurfaceObjectDataset(
    file_list = valid_file_list, classes = class_names,
    transform = ImageTransform(img_size, mean, std),
    phase = 'valid'
)

# Dataloaderの作成
train_dataloader = data.DataLoader(
    train_dataset, batch_size = batch_size, shuffle=False
)

valid_dataloader = data.DataLoader(
    valid_dataset, batch_size = int(batch_size/2), shuffle=False
)

dataloaders_dict = {
    'train': train_dataloader,
    'valid': valid_dataloader
}

# # イテレータに変換
# batch_iterator = iter(dataloaders_dict['train'])

# inputs, labels = next(batch_iterator)
# print('inputs size: ', inputs.size())

train_file_list:  ['/content/drive/MyDrive/Tanpopo/TrainingData11/1Sputter/SP3A0216S000.bmp', '/content/drive/MyDrive/Tanpopo/TrainingData11/1Sputter/SG3A0055S000.bmp', '/content/drive/MyDrive/Tanpopo/TrainingData11/1Sputter/SF3A0057S010.bmp', '/content/drive/MyDrive/Tanpopo/TrainingData11/1Sputter/SP3A0153S000.bmp', '/content/drive/MyDrive/Tanpopo/TrainingData11/1Sputter/SP3A0254S000.bmp', '/content/drive/MyDrive/Tanpopo/TrainingData11/1Sputter/SP3A0279S011.bmp', '/content/drive/MyDrive/Tanpopo/TrainingData11/1Sputter/SF3A0098S010.bmp', '/content/drive/MyDrive/Tanpopo/TrainingData11/1Sputter/SP3A0279S011.bmp', '/content/drive/MyDrive/Tanpopo/TrainingData11/1Sputter/SM3A0157S008.bmp', '/content/drive/MyDrive/Tanpopo/TrainingData11/1Sputter/SM3A0071S014.bmp', '/content/drive/MyDrive/Tanpopo/TrainingData11/1Sputter/SG3A0024S000.bmp', '/content/drive/MyDrive/Tanpopo/TrainingData11/1Sputter/SM3A0154S009.bmp', '/content/drive/MyDrive/Tanpopo/TrainingData11/1Sputter/SP3A0127S000.bmp', '/cont

### モデルの作成

In [10]:
from transformers import ViTFeatureExtractor, ViTModel
# ファインチューニングされたモデルをロードして使う場合はViTForImageClassificationですぐに分類問題に適用できるようですが、
# 今回はファインチューニングの実装のところからも行いたいので、こちらは使いません。
# from transformers import ViTForImageClassification

# 前処理用クラス
#feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k', return_tensors="pt")

# モデル本体
# 順伝播時の出力にAttentionの結果もほしいときはoutput_attentions=Trueを指定する。
vit_model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k', output_attentions=True)

Downloading:   0%|          | 0.00/502 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/346M [00:00<?, ?B/s]

In [11]:
#print(vit_model)

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class ViTNet(nn.Module):
    def __init__(self, pretrained_vit_model, class_num):
        super(ViTNet, self).__init__()
        self.vit = pretrained_vit_model
        self.fc = nn.Linear(768, class_num) # 全結合層

    def _get_cls_vec(self, states): # fine tuning
        return states['last_hidden_state'][:, 0, :]

    def forward(self, input_ids):
        states = self.vit(input_ids)
        states = self._get_cls_vec(states)
        states = self.fc(states)
        return states
    
model_ViT = ViTNet(vit_model, class_num)
model_ViT.to(device)

ViTNet(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0): ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias

In [13]:
total_params = 0
trainable_params = 0

# まず全パラメータを勾配計算Falseにする
for param in model_ViT.parameters():
    param.requires_grad = False
    total_params += param.numel() # 全パラメータ数

# 追加したクラス分類用の全結合層を勾配計算ありに変更
for param in model_ViT.fc.parameters():
    param.requires_grad = True
    trainable_params += param.numel() # 訓練パラメータ数

optimizer = optim.AdamW(model_ViT.fc.parameters(), lr=0.001, weight_decay=0.5)

# optimizer = optim.RAdam([
#     {'params': model_ViT.fc.parameters(), 'lr': 0.001, 'weight_decay': 0.1}
# ])

criterion = nn.CrossEntropyLoss()

In [28]:
# パラメータ数を計算
print('='*20)
print('Total params:', '{:,}'.format(total_params))
print('Trainable params:', '{:,}'.format(trainable_params))
print('Non-trainable params:', '{:,}'.format(total_params - trainable_params))
print('='*20)
print('ViT')

batch_para = 134283973
print('\nVGG16+BathNorm　と　ViT　のパラメータの比較')
print('VGG16 BathNorm:', '{:,}'.format(batch_para))
print('TiT:', '{:,}'.format(total_params))
print('パラメータの差：', '{:,}'.format(batch_para - total_params))
print(f'パラメータの割合：{batch_para / total_params:.4f}倍')

Total params: 86,393,093
Trainable params: 3,845
Non-trainable params: 86,389,248
ViT

VGG16+BathNorm　と　ViT　のパラメータの比較
VGG16 BathNorm: 134,283,973
TiT: 86,393,093
パラメータの差： 47,890,880
パラメータの割合：1.5543倍


In [None]:
#!pip install torchinfo

In [None]:
# from torchsummary import summary
# #from torchinfo import summary
# model_ViT.to(device)
# summary(model=model_ViT, input_size=(3, 224, 224),
#         #col_names=['input_size', 'output_size', 'num_params'], 
#         #device=device,
#         #dtypes=['torch.IntTensor', 'torch.IntTensor', 'torch.IntTensor', 'torch.IntTensor'], 
#         #verbose=1
# )


In [None]:
import time
import copy

def train_model(model, criterion, optimizer, num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))

        # 各エポックには訓練フェーズと検証フェーズがあります
        for phase in ['train', 'valid']:
            if phase == 'train':
                model.train()  # モデルを訓練モードに設定します
            else:
                model.eval()   # モードを評価するモデルを設定します

            running_loss = 0.0
            running_corrects = 0

            # データをイレテートします
            for inputs, labels in dataloaders_dict[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # パラメータの勾配をゼロにします
                optimizer.zero_grad()

                # 順伝播
                # 訓練の時だけ、履歴を保持します
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # 訓練の時だけ逆伝播＋オプティマイズを行います
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # 損失を計算します
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            # if phase == 'train':
            #     scheduler.step()

            epoch_loss = running_loss / len(dataloaders_dict[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders_dict[phase].dataset)

            print(' {} Loss: {:.4f} Acc: {:.4f} '.format(
                phase, epoch_loss, epoch_acc), end='\t')

            # モデルをディープ・コピー
            if phase == 'valid' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # ベストモデルの重みをロード
    model.load_state_dict(best_model_wts)
    return model

### 学習・検証

In [None]:
PATH = '/content/drive/MyDrive/Tanpopo/model_ViT_weights.pth'
model_ViT.load_state_dict(torch.load(PATH)) # 学習前: 前回の重みを使う

<All keys matched successfully>

In [None]:
print('ViT Training :')
print('-'*10)
model_ViT = model_ViT.to(device)
#model_ViT = train_model(model_ViT, criterion, optimizer, num_epochs=epochs) # 学習

ViT Training :
----------
Epoch 0/64
 train Loss: 0.2093 Acc: 0.9462 	 valid Loss: 0.2799 Acc: 0.9400 	
Epoch 1/64
 train Loss: 0.1380 Acc: 0.9821 	 valid Loss: 0.2781 Acc: 0.9500 	
Epoch 2/64
 train Loss: 0.1612 Acc: 0.9692 	 valid Loss: 0.2782 Acc: 0.9500 	
Epoch 3/64
 train Loss: 0.1779 Acc: 0.9590 	 valid Loss: 0.2921 Acc: 0.9500 	
Epoch 4/64
 train Loss: 0.1628 Acc: 0.9667 	 valid Loss: 0.2984 Acc: 0.9500 	
Epoch 5/64
 train Loss: 0.1714 Acc: 0.9667 	 valid Loss: 0.2950 Acc: 0.9500 	
Epoch 6/64
 train Loss: 0.1789 Acc: 0.9590 	 valid Loss: 0.2930 Acc: 0.9400 	
Epoch 7/64
 train Loss: 0.1653 Acc: 0.9718 	 valid Loss: 0.2907 Acc: 0.9500 	
Epoch 8/64
 train Loss: 0.1707 Acc: 0.9641 	 valid Loss: 0.2992 Acc: 0.9400 	
Epoch 9/64
 train Loss: 0.1836 Acc: 0.9538 	 valid Loss: 0.3010 Acc: 0.9400 	
Epoch 10/64
 train Loss: 0.1671 Acc: 0.9564 	 valid Loss: 0.3025 Acc: 0.9400 	
Epoch 11/64
 train Loss: 0.1652 Acc: 0.9692 	 valid Loss: 0.2884 Acc: 0.9600 	
Epoch 12/64
 train Loss: 0.1559 Acc:

In [None]:
PATH = '/content/drive/MyDrive/Tanpopo/model_ViT_weights.pth'
#torch.save(model_ViT.state_dict(), PATH) # 重みを保存

### テスト

In [None]:
PATH = '/content/drive/MyDrive/Tanpopo/model_ViT_weights.pth'
#net.load_state_dict(torch.load(PATH)) # 重みをロード

# テストデータ
test_file_list, class_names_test = make_filepath_list('/content/drive/MyDrive/Tanpopo/TestData11', 'test')

print('test_file_list : ', test_file_list)
print('class_names_test : ', class_names_test)

# Datasetの作成
test_dataset = SurfaceObjectDataset(
    file_list = test_file_list, classes = class_names_test,
    transform = ImageTransform(img_size, mean, std),
    phase = 'test'
)

# Dataloaderの作成
test_dataloader = data.DataLoader(
    test_dataset, batch_size = int(batch_size/2), shuffle=False
)

dataloaders_dict['test'] = test_dataloader

odict_keys(['vit.embeddings.cls_token', 'vit.embeddings.position_embeddings', 'vit.embeddings.patch_embeddings.projection.weight', 'vit.embeddings.patch_embeddings.projection.bias', 'vit.encoder.layer.0.attention.attention.query.weight', 'vit.encoder.layer.0.attention.attention.query.bias', 'vit.encoder.layer.0.attention.attention.key.weight', 'vit.encoder.layer.0.attention.attention.key.bias', 'vit.encoder.layer.0.attention.attention.value.weight', 'vit.encoder.layer.0.attention.attention.value.bias', 'vit.encoder.layer.0.attention.output.dense.weight', 'vit.encoder.layer.0.attention.output.dense.bias', 'vit.encoder.layer.0.intermediate.dense.weight', 'vit.encoder.layer.0.intermediate.dense.bias', 'vit.encoder.layer.0.output.dense.weight', 'vit.encoder.layer.0.output.dense.bias', 'vit.encoder.layer.0.layernorm_before.weight', 'vit.encoder.layer.0.layernorm_before.bias', 'vit.encoder.layer.0.layernorm_after.weight', 'vit.encoder.layer.0.layernorm_after.bias', 'vit.encoder.layer.1.atten

In [None]:
from sklearn.metrics import confusion_matrix
import numpy as np

#正解率
def class_accuracy(label, conf_mat):
    return (conf_mat[label][label] + (np.sum(conf_mat) - (np.sum(conf_mat[:, label])+np.sum(conf_mat[label])-conf_mat[label][label]))) / np.sum(conf_mat)
    
#精度(適合率)
def class_precision(label, conf_mat):
    return conf_mat[label][label] / np.sum(conf_mat[label])

#再現率
def class_recall(label, conf_mat):
    return conf_mat[label][label] / np.sum(conf_mat[:, label])

labels_sum = None
predicted_sum = None

with torch.no_grad():
    for data in dataloaders_dict['test']:
        images, labels = data
        images = images.to(device)
        labels = labels.to(device)

        outputs = model_ViT(images)

        #outputs = nn.Softmax(dim=1)(outputs)

        _, predicted = torch.max(outputs, 1)

        if labels_sum is None:
            labels_sum = labels
            predicted_sum = predicted
        else:
            labels_sum = torch.cat([labels_sum, labels], dim=0)
            predicted_sum = torch.cat([predicted_sum, predicted], dim=0)

In [None]:
#混同行列
labels_sum = labels_sum.cpu()
predicted_sum = predicted_sum.cpu()
conf_mat = None
print('ViT network')
print(class_names_test)

Accuracy = []
Precision = []
Recall = []

conf_mat = confusion_matrix(labels_sum, predicted_sum)
print(conf_mat)
print()
for i in range(class_num):
    Accuracy = np.append(Accuracy, class_accuracy(i, conf_mat)*100)
    Precision = np.append(Precision, class_precision(i, conf_mat)*100)
    Recall = np.append(Recall, class_recall(i, conf_mat)*100)

np.set_printoptions(precision=1)

print('Accuracy : ', Accuracy)
print('Recall : ', Recall)
print('Precision : ', Precision)

ViT network
['1Sputter', '2Fiber', '3Block', '4Bar', '5AGFragment']
[[21  0  0  0  0]
 [ 0 21  0  0  0]
 [ 0  0 19  0  2]
 [ 0  2  0 19  0]
 [ 0  1  1  0 19]]

Accuracy :  [100.   97.1  97.1  98.1  96.2]
Recall :  [100.   87.5  95.  100.   90.5]
Precision :  [100.  100.   90.5  90.5  90.5]
