<a href="https://colab.research.google.com/github/ykitaguchi77/Laboratory_course/blob/master/demo_scraping_ResNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **画像のスクレイピング --> PyTorch Lightningで分類**



# **iCrawlerを用いてスクレイピング**

https://atmarkit.itmedia.co.jp/ait/articles/2010/28/news018.html

公式： https://icrawler.readthedocs.io/en/latest/builtin.html

In [None]:
!pip install icrawler
from icrawler.builtin import BingImageCrawler
import os

# Search keywords; each keyword is also used as the name of its download folder.
keywords = ["cat", "dog"]
# Passed to crawl() as max_num for every keyword.
max_num = 150

for keyword in keywords:
    output_dir = f"/content/{keyword}"

    # exist_ok=True keeps the cell safe to re-run and avoids the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # Download the images for this keyword into its own folder.
    crawler = BingImageCrawler(storage={"root_dir": output_dir})
    crawler.crawl(keyword=keyword, max_num=max_num)

## **Classify dog/cat using PyTorch Lightning**

https://tech.aru-zakki.com/from-pytorch-to-lightning/

In [1]:
import os
import json
import math
try:
    import japanize_matplotlib
except ModuleNotFoundError:
    !pip install japanize_matplotlib
    import japanize_matplotlib
import numpy as np
import time
import copy
import requests
from PIL import Image
from types import SimpleNamespace
from io import StringIO

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
sns.set()

from tqdm import tqdm

from sklearn.metrics import mean_squared_error

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data as data
import torchvision
from torchvision.datasets import CIFAR10
from torchvision import transforms

def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator.

    When CUDA is available, the CUDA generators are seeded as well, cuDNN is
    put into deterministic mode and its benchmark auto-tuner is disabled.
    """
    # Imported locally because the notebook's import cell does not import it.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # A GPU is present: seed the current device and all other devices.
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

def get_device():
    """Return the torch device to use: the first CUDA GPU if one is available, else the CPU."""
    # MPS is deliberately not considered: it caused errors with PyTorch Lightning.
    device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
    return torch.device(device_name)

In [6]:
# # Google driveをマウント
# from google.colab import drive

# drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [27]:
import shutil
from sklearn.model_selection import train_test_split

# Collect the image paths of each class.
# - sorted(): os.listdir() returns entries in arbitrary order, which would make
#   the seeded split below differ between runs on the same files.
# - isfile(): skip anything that is not a regular file (e.g. a stray
#   sub-directory), because it cannot be copied as an image later.
dog_paths = sorted(
    '/content/dog/' + f
    for f in os.listdir('/content/dog/')
    if os.path.isfile('/content/dog/' + f)
)
cat_paths = sorted(
    '/content/cat/' + f
    for f in os.listdir('/content/cat/')
    if os.path.isfile('/content/cat/' + f)
)

# Concatenate the paths and keep a parallel list of class labels.
all_paths = dog_paths + cat_paths
all_labels = ['dog'] * len(dog_paths) + ['cat'] * len(cat_paths)

# 80/20 train/validation split. Stratifying on the label keeps the dog/cat
# ratio the same in both subsets.
train_paths, valid_paths = train_test_split(
    all_paths, test_size=0.2, random_state=42, stratify=all_labels
)

# ディレクトリを作成
def create_or_clean_dir(directory_path):
    """Make ``directory_path`` an empty directory, discarding whatever was there.

    Args:
        directory_path: Path of the directory to (re)create. Missing parent
            directories are created as well.

    An existing directory tree is deleted first. A regular file or a symbolic
    link at that path is removed too (``shutil.rmtree`` refuses both), so the
    call ends with a fresh, empty directory in each of these cases.
    """
    if os.path.islink(directory_path) or os.path.isfile(directory_path):
        # rmtree cannot delete plain files or symlinks; unlink them instead.
        os.remove(directory_path)
    elif os.path.isdir(directory_path):
        shutil.rmtree(directory_path)
    os.makedirs(directory_path)
# Start from empty class folders so files from an earlier run cannot leak
# into the new split.
for split_name in ('train', 'valid'):
  for class_name in ('dog', 'cat'):
    create_or_clean_dir(f'/content/{split_name}/{class_name}')

# Copy every image into /content/<split>/<class>. The class is taken from the
# name of the folder that contains the image instead of a substring search
# over the whole path, so a file name that happens to contain "dog" or "cat"
# cannot send an image to the wrong class.
for split_name, split_paths in (('train', train_paths), ('valid', valid_paths)):
  for path in split_paths:
    class_name = os.path.basename(os.path.dirname(path))
    if class_name in ('dog', 'cat'):
      shutil.copy(path, f'/content/{split_name}/{class_name}')

In [25]:
import os
import torch
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import ImageFolder
import pytorch_lightning as pl
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.callbacks import ProgressBar, LearningRateMonitor
from pytorch_lightning.core.datamodule import LightningDataModule
import torch.nn.functional as F
from torchmetrics import Accuracy

class CatDogDataModule(LightningDataModule):
  """Lightning data module that serves the cat/dog image folders.

  Args:
    batch_size: Number of images per batch for both loaders.
    train_dir: Root folder of the training images, read with ImageFolder
      (one sub-folder per class).
    valid_dir: Root folder of the validation images, same layout.
    num_workers: Value forwarded to each DataLoader's ``num_workers``; the
      default 0 matches DataLoader's own default.
  """

  def __init__(self, batch_size=32, train_dir='/content/train',
               valid_dir='/content/valid', num_workers=0):
    super().__init__()
    self.batch_size = batch_size
    self.train_dir = train_dir
    self.valid_dir = valid_dir
    self.num_workers = num_workers
    # Resize the shorter side to 256, take the central 224x224 crop, convert
    # to a tensor and normalise each channel with the mean/std values
    # commonly used with ImageNet-pretrained torchvision models.
    self.data_transform = transforms.Compose([
      transforms.Resize(256),
      transforms.CenterCrop(224),
      transforms.ToTensor(),
      transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

  def setup(self, stage=None):
    """Create the datasets required for ``stage``.

    The training set is built for 'fit' (or when stage is None). The
    validation set is built for 'fit', 'validate' and None, so that
    ``val_dataloader`` also works when only validation is run.
    """
    if stage in (None, 'fit'):
      self.train_dataset = ImageFolder(root=self.train_dir, transform=self.data_transform)
    if stage in (None, 'fit', 'validate'):
      self.valid_dataset = ImageFolder(root=self.valid_dir, transform=self.data_transform)

  def train_dataloader(self):
    """Return the shuffled loader over the training set."""
    return DataLoader(self.train_dataset, batch_size=self.batch_size,
                      shuffle=True, num_workers=self.num_workers)

  def val_dataloader(self):
    """Return the loader over the validation set, in fixed order."""
    return DataLoader(self.valid_dataset, batch_size=self.batch_size,
                      shuffle=False, num_workers=self.num_workers)

class Classifier(pl.LightningModule):
  """ResNet-18 image classifier trained with cross-entropy and Adam.

  Args:
    num_classes: Output size of the replaced final layer (default 2).
    lr: Learning rate handed to Adam (default 1e-3).
  """

  def __init__(self, num_classes=2, lr=1e-3):
    super().__init__()
    self.lr = lr
    # Pretrained ResNet-18 loaded through torch.hub; its final fully connected
    # layer is swapped for a new one with num_classes outputs.
    self.model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)
    self.model.fc = torch.nn.Linear(self.model.fc.in_features, num_classes)

  def forward(self, x):
    """Return the raw class scores (logits) of the wrapped network for ``x``."""
    return self.model(x)

  def _shared_step(self, batch, stage):
    """Compute loss and accuracy for one batch and log them.

    The values are logged as ``{stage}_loss`` and ``{stage}_acc`` and shown
    in the progress bar. Returns the cross-entropy loss.
    """
    x, y = batch
    y_hat = self(x)
    loss = F.cross_entropy(y_hat, y)

    # Accuracy = fraction of samples whose highest-scoring class equals the label.
    preds = torch.argmax(y_hat, dim=1)
    acc = (preds == y).float().mean()

    self.log(f'{stage}_loss', loss, prog_bar=True)
    self.log(f'{stage}_acc', acc, prog_bar=True)
    return loss

  def training_step(self, batch, batch_idx):
    """Log 'train_loss' / 'train_acc' and return the loss to optimise."""
    return self._shared_step(batch, 'train')

  def validation_step(self, batch, batch_idx):
    """Log 'val_loss' / 'val_acc' for one validation batch; returns nothing."""
    self._shared_step(batch, 'val')

  def configure_optimizers(self):
    """Return an Adam optimizer over all parameters of the module."""
    return torch.optim.Adam(self.parameters(), lr=self.lr)

class PrintMetricsCallback(pl.Callback):
    """Print the logged loss and accuracy at the end of training and validation epochs.

    Values are read from ``trainer.logged_metrics`` under the keys
    'train_loss' / 'train_acc' and 'val_loss' / 'val_acc'.
    NOTE(review): the printed numbers are whatever ``logged_metrics`` holds at
    that moment; for metrics logged per step this may be the most recent batch
    rather than an epoch average — confirm against the module's log() calls.
    """

    @staticmethod
    def _print_metrics(trainer, label, key_prefix, lead=""):
        """Print one summary line, or nothing if either metric is not logged yet."""
        metrics = trainer.logged_metrics
        loss = metrics.get(f"{key_prefix}_loss")
        acc = metrics.get(f"{key_prefix}_acc")
        if loss is None or acc is None:
            # A missing metric must not abort training with a KeyError.
            return
        print(f"{lead}Epoch: {trainer.current_epoch}, {label} Loss: {float(loss):.4f}, {label} Acc: {float(acc):.4f}")

    def on_train_epoch_end(self, trainer, pl_module):
        """Print the training metrics for the epoch that just finished."""
        self._print_metrics(trainer, "Train", "train")

    def on_validation_epoch_end(self, trainer, pl_module):
        """Print the validation metrics, skipping the pre-training sanity check."""
        if trainer.sanity_checking:
            # The sanity check runs before any training; its numbers would be
            # printed as a misleading "Epoch: 0" line.
            return
        self._print_metrics(trainer, "Validation", "val", lead="\n")


# Seed the random number generators before the model is built so that the
# initialisation of the new final layer and the batch shuffling are repeatable.
pl.seed_everything(42)

data_module = CatDogDataModule(batch_size=32)
model = Classifier()

# TensorBoard logs are written under the relative directory 'logs/'.
logger = pl_loggers.TensorBoardLogger('logs/')

trainer = pl.Trainer(
  max_epochs=10,
  # 'auto' uses a GPU when one is present and falls back to the CPU instead of
  # raising on a CPU-only runtime, which accelerator='gpu' would do.
  accelerator='auto',
  devices=1,
  callbacks=[PrintMetricsCallback()],
  logger=logger
)

trainer.fit(model, datamodule=data_module)

Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type   | Params
---------------------------------
0 | model | ResNet | 11.2 M
---------------------------------
11.2 M    Trainable params
0         Non-trainable params
11.2 M    Total params
44.710    Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.
