In [1]:
from bs4 import BeautifulSoup # thư viện phân tích cú pháp HTML & XML và cấu trúc của web
from urllib.parse import urljoin, urlparse # các thư viện phân tích cú pháp url theo các components, và tạo các url theo form
import urllib.request # thư viện tạo http request và nhận vè response  
import time
from threading import Thread
import os


In [7]:
# Class that collects image URLs from a series of numbered pages
from typing import Any


class GetImageFromPages():
    """Collect image URLs from a run of numbered pages using worker threads.

    Page ``i`` is fetched from ``urlPage + str(i)`` for ``i`` in
    ``range(nPages)``. Calling the instance splits that range into contiguous
    batches, scrapes each batch in its own thread and returns every image URL
    found. Results accumulate in ``self.urlResults``, so calling the same
    instance twice returns the URLs of both runs.
    """

    def __init__(self, nThreads, nPages, urlPage):
        """Store the scraping parameters.

        Args:
            nThreads: Maximum number of worker threads to use.
            nPages: Number of pages to visit (page indices 0 .. nPages - 1).
            urlPage: URL prefix; the page index is appended to it as text.
        """
        self.nThreads = nThreads
        self.nPages = nPages
        self.urlPage = urlPage
        self.urlResults = []

    def isValid(self, url):
        """Return True when ``url`` has both a scheme and a network location.

        For "https://example.com/path" the scheme is "https" and the network
        location (netloc) is "example.com".
        """
        urlParsed = urlparse(url)
        return bool(urlParsed.netloc) and bool(urlParsed.scheme)

    def getAllImageUrls(self, url):
        """Return the image URLs referenced by the ``<img>`` tags of one page.

        Relative ``src`` values are resolved against ``url``, any query string
        is dropped, and only URLs accepted by ``isValid`` are kept.

        Args:
            url: Address of the page to download and parse.

        Returns:
            A list of image URLs in document order (duplicates are kept).
        """
        # Close the HTTP response as soon as the document has been parsed.
        with urllib.request.urlopen(url) as response:
            soup = BeautifulSoup(response, "html.parser")

        urls = []
        for img in soup.find_all("img"):
            img_url = img.attrs.get("src")
            if not img_url:
                continue

            # Make the address absolute, then cut everything from the first '?'.
            img_url = urljoin(url, img_url).split('?', 1)[0]

            if self.isValid(img_url):
                urls.append(img_url)
        return urls

    def main(self, start, end):
        """Scrape pages ``start`` .. ``end - 1`` and add their image URLs to ``self.urlResults``.

        A page that fails (network error, parse error, ...) is reported and
        skipped so the remaining pages of the batch are still processed.
        """
        for i in range(start, end):
            try:
                self.urlResults.extend(self.getAllImageUrls(self.urlPage + str(i)))
            except Exception as exc:
                print(f"Skipping page {i} of {self.urlPage}: {exc!r}")

    def __call__(self):
        """Scrape all pages in parallel and return the collected image URLs.

        The pages are split into contiguous batches, one thread per batch.
        The number of threads never exceeds ``nThreads`` (nor ``nPages``), and
        every page index in ``range(nPages)`` belongs to exactly one batch.

        Returns:
            ``self.urlResults`` after all worker threads have finished.
        """
        # Use at least one worker and never more workers than pages.
        workers = max(1, min(self.nThreads, self.nPages))
        # Ceiling division keeps the number of batches <= workers; the max()
        # guards against a zero step when there are no pages at all.
        batchSize = max(1, -(-self.nPages // workers))

        threads = [
            Thread(target=self.main,
                   args=(first, min(first + batchSize, self.nPages)))
            for first in range(0, self.nPages, batchSize)
        ]

        started = time.time()
        for thread in threads:
            thread.start()
        # join() blocks the calling thread until the joined thread has finished.
        for thread in threads:
            thread.join()
        finished = time.time()

        print(f"Handling those pages takes:  {finished - started} seconds")

        return self.urlResults

In [8]:
def UrlsToText(topicNames, topics, urlTopic, nPages, nThreads):
    """Scrape image URLs for every search term and save them to text files.

    For each ``(topic, names)`` pair from ``zip(topicNames, topics)`` the
    folder ``data/<topic>/urls`` is created, and for each name the file
    ``<topic>_<name>.txt`` is written there with one image URL per line.

    Args:
        topicNames: Topic labels, used as folder names and file-name prefixes.
        topics: One list of search terms per entry of ``topicNames``.
        urlTopic: Mapping of source label to a URL template containing a
            ``{name}`` placeholder.
        nPages: Number of result pages to visit per search term and source.
        nThreads: Upper bound on the worker threads used per scrape.
    """
    # At most one thread per two pages, but never 0 (which nPages < 2 would
    # otherwise produce and which the scraper cannot work with).
    workerCount = max(1, min(nThreads, nPages // 2))

    for topic, names in zip(topicNames, topics):
        dirPathUrls = f"data/{topic}/urls"
        os.makedirs(dirPathUrls, exist_ok=True)

        for name in names:
            collected = []
            for template in urlTopic.values():
                scraper = GetImageFromPages(workerCount, nPages, template.format(name=name))
                collected.extend(scraper())

            # dict.fromkeys drops duplicates (also across sources) while
            # keeping the first-seen order, so the file content is stable.
            resultOfName = list(dict.fromkeys(collected))

            outPath = f"{dirPathUrls}/{topic}_{name}.txt"
            print(f"{outPath} have {len(resultOfName)} images \n")
            # Explicit encoding: URLs may contain non-ASCII characters.
            with open(outPath, "w", encoding="utf-8") as f:
                f.write('\n'.join(resultOfName))
    
# Search-URL template per image source; "{name}" is replaced by the search term.
urltopic = {
    "freeimages": "https://www.freeimages.com/search/{name}/",
}
animal = ['cat', 'dog']
topic_names = ["animal"]
topics = [animal]
# os.cpu_count() returns None when the CPU count cannot be determined;
# fall back to a single thread in that case.
n_threads = os.cpu_count() or 1
print(n_threads)
n_page = 6
UrlsToText(topicNames=topic_names, topics=topics, urlTopic=urltopic,
           nPages=n_page, nThreads=n_threads)


20
Handling those pages takes:  1.190073013305664 seconds
data/animal/urls/animal_cat.txt have 314 images 

Handling those pages takes:  1.116774082183838 seconds
data/animal/urls/animal_dog.txt have 313 images 

