<a href="https://colab.research.google.com/github/ykitaguchi77/Laboratory_course/blob/master/demo_scraping_ResNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**画像のスクレイピング**



#**iCrawlerを使う方法**

https://atmarkit.itmedia.co.jp/ait/articles/2010/28/news018.html

公式： https://icrawler.readthedocs.io/en/latest/builtin.html

In [None]:
!pip install icrawler
from icrawler.builtin import BingImageCrawler
import os

# List of keywords
keywords = ["cat", "dog", "bird"]
max_num = 300

for keyword in keywords:
    output_dir = f"/content/{keyword}"

    # Create the directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    crawler = BingImageCrawler(storage={"root_dir": output_dir})
    crawler.crawl(keyword=keyword, max_num=max_num)

##**Classify dog/cat using lightning Pytorch**

In [1]:
import os
import json
import math
try:
    import japanize_matplotlib
except ModuleNotFoundError:
    !pip install japanize_matplotlib
    import japanize_matplotlib
import numpy as np
import time
import copy
import requests
from PIL import Image
from types import SimpleNamespace
from io import StringIO

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
sns.set()

from tqdm import tqdm

from sklearn.metrics import mean_squared_error

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data as data
import torchvision
from torchvision.datasets import CIFAR10
from torchvision import transforms

def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # GPUありの場合
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

def get_device():
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
    # PytorchLightningでエラーが出るので、MPSはパス
    #elif torch.backends.mps.is_built():
    #    device = torch.device("mps:0")
    else:
        device = torch.device("cpu")
    return device

Collecting japanize_matplotlib
  Downloading japanize-matplotlib-1.1.3.tar.gz (4.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.1/4.1 MB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: japanize_matplotlib
  Building wheel for japanize_matplotlib (setup.py) ... [?25l[?25hdone
  Created wheel for japanize_matplotlib: filename=japanize_matplotlib-1.1.3-py3-none-any.whl size=4120257 sha256=571dd410332c3d7803a29afb046e75a6c3846fe174a4756ce3379f6ceb2ea333
  Stored in directory: /root/.cache/pip/wheels/61/7a/6b/df1f79be9c59862525070e157e62b08eab8ece27c1b68fbb94
Successfully built japanize_matplotlib
Installing collected packages: japanize_matplotlib
Successfully installed japanize_matplotlib-1.1.3


In [2]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms, models
from torchvision.datasets import ImageFolder
!pip install pytorch_lightning --q
from pytorch_lightning import LightningModule, Trainer
from PIL import Image

class CatsAndDogsDataset(Dataset):
    def __init__(self, cat_dir, dog_dir, transform=None):
        self.transform = transform

        cat_paths = [os.path.join(cat_dir, fname) for fname in os.listdir(cat_dir) if os.path.isfile(os.path.join(cat_dir, fname)) and (fname.endswith('.jpg') or fname.endswith('.png'))]
        dog_paths = [os.path.join(dog_dir, fname) for fname in os.listdir(dog_dir) if os.path.isfile(os.path.join(dog_dir, fname)) and (fname.endswith('.jpg') or fname.endswith('.png'))]

        self.image_paths = cat_paths + dog_paths
        self.labels = [0] * len(cat_paths) + [1] * len(dog_paths)  # 0 for cat, 1 for dog

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        image = Image.open(img_path).convert('RGB')

        label = self.labels[idx]

        if self.transform:
            image = self.transform(image)

        return image, label



class CatsAndDogsClassifier(LightningModule):
    def __init__(self, num_classes=2):
        super(CatsAndDogsClassifier, self).__init__()
        self.model = models.resnet18(pretrained=True)
        self.model.fc = nn.Linear(self.model.fc.in_features, num_classes)
        self.loss = nn.CrossEntropyLoss()

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.loss(y_hat, y)
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-3)
        return optimizer


if __name__ == "__main__":
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])



    train_dataset = CatsAndDogsDataset(cat_dir='/content/cat', dog_dir='/content/dog', transform=transform)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    model = CatsAndDogsClassifier()

    trainer = Trainer(max_epochs=5, gpus=1, accelerator="ddp", plugins=DDPPlugin(find_unused_parameters=False))

    trainer.fit(model, train_loader)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type             | Params
-------------------------------------------
0 | model | ResNet           | 11.2 M
1 | loss  | CrossEntropyLoss | 0     
-------------------------------------------
11.2 M    Trainable params
0         Non-trainable params
11.2 M    Total params
44.710    Total estimated model params size (MB)
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/fit_loop.py:293: The number of training batches (15) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


In [None]:
import csv

from requests import exceptions
import argparse
import requests
import cv2
import os

API_KEY = f"{bing_api_key}"
MAX_SIZE = 10
GROUP_SIZE = 5

# 取得したエンドポイントURL
URL = "https://api.bing.microsoft.com/v7.0/images/search"
OUTPUT = '/content/save_dir'

if not os.path.isdir(OUTPUT):
    os.mkdir(OUTPUT)

EXCEPTIONS = set([IOError, FileNotFoundError,
    exceptions.RequestException, exceptions.HTTPError,
    exceptions.ConnectionError, exceptions.Timeout])

search_terms = ["forest", "river", "house"]

# set the output csv file name
csv_file = "url_list.csv"

# create the csv file and write the headers
with open(csv_file, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Search term', 'Image URL'])

# loop over each search term and download images
for term in search_terms:
    print(f"[INFO] searching Bing API for '{term}'")

    # create the directory to save the images for the current search term
    output_dir = os.path.join(OUTPUT, term)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    headers = {"Ocp-Apim-Subscription-Key": API_KEY}
    params = {"q": term, "offset": 0, "count": GROUP_SIZE, "imageType": "Photo", "color": "ColorOnly"}

    # make the search
    search = requests.get(URL, headers=headers, params=params)
    search.raise_for_status()

    # grab the results from the search, including the total number of
    # estimated results returned by the Bing API
    results = search.json()
    est_num_results = min(results["totalEstimatedMatches"], MAX_RESULTS)
    print(f"[INFO] {est_num_results} total results for '{term}'")

    # initialize the total number of images downloaded thus far
    total = 0

    # loop over the estimated number of results in `GROUP_SIZE` groups
    for offset in range(0, est_num_results, GROUP_SIZE):
        # update the search parameters using the current offset, then
        # make the request to fetch the results
        params["offset"] = offset
        search = requests.get(URL, headers=headers, params=params)
        search.raise_for_status()
        results = search.json()

        # loop over the results
        for v in results["value"]:
            # try to download the image
            try:
                # make a request to download the image
                print("[INFO] fetching: {}".format(v["contentUrl"]))
                r = requests.get(v["contentUrl"], timeout=30)

                # build the path to the output image
                ext = v["contentUrl"][v["contentUrl"].rfind("."):]
                filename = f"{term}_{str(total).zfill(3)}{ext}"
                output_path = os.path.join(output_dir, filename)

                # write the image to disk
                with open(output_path, "wb") as f:
                    f.write(r.content)

                # write the URL to the csv file
                with open(csv_file, 'a', newline='') as f:
                    writer = csv.writer(f)
                    writer.writerow([term, v["contentUrl"]])

            # catch any errors that would not unable us to download the
            # image
            except Exception as e:
                print(f"[INFO] skipping: {v['contentUrl']}")

            # if we have reached the maximum number of images, break out
            # of the loop
            total += 1
            print(f"{total} images downloaded!")
            if total >= MAX_SIZE:
                break

        # if we have reached the maximum number of images, break out of
        # the loop
        if total >= MAX_SIZE:
            break


##**Chromedriverを用いる方法**

In [None]:
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
!pip install selenium==4.1.0 #新しいバージョンだとエラーが出るので旧バージョンにする

In [None]:
# これだとサムネイルしか取得できない

import requests
from bs4 import BeautifulSoup
import os

# Search query
search_query = "flowers"

# Number of images to download
num_images = 10

# Create a new folder for the images
if not os.path.exists(search_query):
    os.makedirs(search_query)

# URL to search Google Images
url = f"https://www.google.com/search?q={search_query}&tbm=isch"

# Send GET request
response = requests.get(url)

# Parse the HTML using Beautiful Soup
soup = BeautifulSoup(response.text, 'html.parser')

# Find all image tags
images = soup.find_all('img')

# Iterate through the images and download them
for i, img in enumerate(images[:num_images]):
    url = img['src']
    print(i)
    try:
        response = requests.get(url)
        open(f"{search_query}/{search_query}_{i}.jpg", "wb").write(response.content)
    except:
        print("download error")

In [None]:
!apt-get update
!curl -O https://chromedriver.storage.googleapis.com/110.0.5481.77/chromedriver_linux64.zip #Chromeのバージョンに合ったchromedriverのアドレスを設定
!unzip chromedriver_linux64.zip
!chmod +x chromedriver
!mv chromedriver /usr/local/bin/
!pip install selenium

from selenium import webdriver

# Chromeドライバーの設定
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--disable-gpu')
options.add_argument('--disable-browser-side-navigation')

# Googleで検索する
search_query = 'flowers'
url = f"https://www.google.com/search?q={search_query}&tbm=isch"
browser = webdriver.Chrome('chromedriver',options=options)
browser.get(url)


import os
from bs4 import BeautifulSoup
import base64
import requests
from io import BytesIO
from PIL import Image


# 画像のURLを取得する
soup = BeautifulSoup(browser.page_source, 'html.parser')
img_tags = soup.find_all('img', class_='rg_i')


urls = []
for img in img_tags:
    try:
        urls.append(img["src"])
    except:
        pass


# 画像をダウンロードする
if not os.path.exists(search_query):
    os.makedirs(search_query)

num_images = 10

counter = 0
for i in range(num_images):
    print(urls[i])
    image_data = base64.b64decode(urls[i].split(',')[1])

    # バイナリデータをBytesIOオブジェクトに書き込む
    image_stream = BytesIO(image_data)

    # PILで画像オブジェクトを作成する
    image = Image.open(image_stream)
    image_format = image.format

    # 画像のネーミング
    num= "{:04d}".format(i)
    file_name = f"{search_query}_{num}"
    new_image_path = f"{search_query}/{file_name}.{image_format}"


    # Save image to file
    image.save(new_image_path)
