In [1]:
import re
import time
import json
import asyncio
import logging

import requests
import pandas as pd
from PIL import Image
import hashlib
import aiohttp

In [2]:
# Scratch check: open a throwaway aiohttp client session and print its type.
# The session is closed again as soon as the `async with` block exits.
async with aiohttp.ClientSession() as session:
    print(type(session))

<class 'aiohttp.client.ClientSession'>


In [2]:
# Configure logging for the notebook. Each record is rendered as
# "<timestamp> ~ <logger name> ~ <level> ~ <message>".
logging.basicConfig(
    format="%(asctime)s ~ %(name)s ~ %(levelname)s ~ %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)

# Logger used by collect_dataset for progress and warning messages.
logger = logging.getLogger(__name__)

In [3]:
# Root URL of the captcha site. It is fetched as-is for the captcha page, and
# the relative image paths scraped from that page are appended to it.
BASE_URL = "http://www.humansnotinvited.com/"

In [4]:
def find_all_image_urls(content: str) -> list:
    """Extract every relative captcha image URL found in ``content``.

    ``content`` is passed through ``str()`` before matching, so a ``bytes``
    payload is accepted as well (its ``repr`` text is what gets searched).

    Returns:
        A list of substrings shaped like
        ``captcha/image.php?image_name=<token>&id=<one character>``, in the
        order they appear. The list is empty when nothing matches.
    """
    # Raw string with the "." and "?" escaped so that only a literal
    # "image.php?" matches; an unescaped "." would accept any character there.
    # Only a single character is captured after "&id=".
    return re.findall(r'captcha/image\.php\?image_name=.*?&id=.', str(content))


def find_tag(content: str) -> str:
    """Return the captcha category stored in the ``name="category"`` input.

    ``content`` is passed through ``str()`` before matching, so a ``bytes``
    payload is accepted as well.

    Returns:
        The text of the ``value`` attribute that immediately precedes
        ``name="category"`` (the first such occurrence).

    Raises:
        IndexError: if ``content`` holds no ``value="..." name="category"`` pair.
    """
    # ``[^"]*`` keeps the capture inside a single attribute value. A lazy
    # ``.*?`` could start at an earlier ``value="`` and swallow unrelated
    # markup up to the category input.
    matches = re.findall(r'value="([^"]*)" name="category"', str(content))
    if not matches:
        raise IndexError('no value="..." name="category" input found in content')
    return matches[0]


async def load_image(session, image_url):
    """Download one captcha image and return the MD5 hex digest of its bytes.

    Args:
        session: object whose ``get(url, allow_redirects=True)`` is used as an
            async context manager yielding a response with an awaitable
            ``read()`` (an ``aiohttp.ClientSession`` in this notebook).
        image_url: path relative to ``BASE_URL``.

    Returns:
        The hexadecimal MD5 digest of the response body.
    """
    target = BASE_URL + image_url
    async with session.get(target, allow_redirects=True) as resp:
        payload = await resp.read()

    return hashlib.md5(payload).hexdigest()


async def load_page(session):
    """Fetch the captcha page and hash every image it references.

    Args:
        session: client session used for the page request and all image
            requests.

    Returns:
        A ``(tag, hashes)`` tuple: ``tag`` is the category reported by
        ``find_tag`` and ``hashes`` is the list of digests returned by
        ``load_image``, in the order the image URLs appear on the page.

    Raises:
        IndexError: propagated from ``find_tag`` when the page carries no
            category input.
    """
    async with session.get(BASE_URL, allow_redirects=True) as response:
        data = await response.read()

    # Parse the page completely before starting any download. find_tag raises
    # on a page without a category; failing here means no image requests have
    # been scheduled yet, so nothing is left running un-awaited.
    tag = find_tag(data)
    image_urls = find_all_image_urls(data)

    # gather() runs the downloads concurrently and keeps the input order.
    hashes = await asyncio.gather(
        *(load_image(session, image_url) for image_url in image_urls)
    )
    return tag, hashes


def create_image_tag_df(result):
    """Build an image-by-tag co-occurrence table.

    Args:
        result: iterable of ``(tag, image_ids)`` pairs.

    Returns:
        A DataFrame with one column per tag and one row per image id; each
        cell counts how many times the image id was listed under the tag,
        with missing combinations filled with 0.

    Side effects:
        Writes the table, before the missing cells are filled, to
        ``test.csv`` in the current working directory.
    """
    counts_by_tag = {}
    for tag, image_ids in result:
        per_image = counts_by_tag.setdefault(tag, {})
        for image_id in image_ids:
            if image_id in per_image:
                per_image[image_id] += 1
            else:
                per_image[image_id] = 1

    frame = pd.DataFrame(counts_by_tag)
    frame.to_csv("test.csv")
    return frame.fillna(0)


def create_tags2images_dict(df):
    """Assign each image (row) to the tag (column) holding its largest value.

    Args:
        df: table whose columns are tags and whose index holds image ids.

    Returns:
        A dict with one key per column, mapping to the list of index labels
        whose row maximum sits in that column, in index order. Ties go to the
        leftmost column because ``argmax`` returns the first maximum.
    """
    best_column = df.values.argmax(axis=1).tolist()
    pairs = list(zip(df.index, best_column))
    return {
        tag: [image_id for image_id, winner in pairs if winner == position]
        for position, tag in enumerate(df.columns)
    }


def save_json(data: dict, filename: str = "tags2images.json") -> None:
    """Serialize ``data`` as JSON with a four-space indent into ``filename``.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(filename, mode="w") as sink:
        json.dump(data, sink, indent=4)
        

def load_json(filename: str = "tags2images.json") -> dict:
    """Read ``filename`` and return its parsed JSON content."""
    with open(filename, mode="r") as source:
        parsed = json.loads(source.read())
    return parsed
    
    
async def collect_dataset(
    n_iterations: int, n_parallel_tasks: int, retry_delay: float = 2.0
) -> list:
    """Scrape the captcha page repeatedly and collect ``load_page`` results.

    Each iteration opens a fresh client session and fetches
    ``n_parallel_tasks`` pages concurrently. An iteration that fails with
    ``aiohttp.ServerDisconnectedError`` is logged, discarded and retried after
    ``retry_delay`` seconds; it does not count towards ``n_iterations``.

    Args:
        n_iterations: number of successful batches to complete.
        n_parallel_tasks: number of pages fetched concurrently per batch.
        retry_delay: seconds to wait before retrying a disconnected batch.

    Returns:
        A flat list with one ``load_page`` result per fetched page.
    """
    result = []

    # perf_counter() is monotonic, so the reported durations cannot be skewed
    # by system clock adjustments during a long run.
    started_at = time.perf_counter()
    i_iteration = 1
    while i_iteration <= n_iterations:
        batch_started_at = time.perf_counter()
        try:
            async with aiohttp.ClientSession() as session:
                batch = await asyncio.gather(
                    *(load_page(session) for _ in range(n_parallel_tasks))
                )
        except aiohttp.ServerDisconnectedError as error:
            logger.warning(str(error))
            # Yield to the event loop while waiting; a blocking time.sleep()
            # here would stall every other coroutine for the whole delay.
            await asyncio.sleep(retry_delay)
            continue

        # Only a fully successful batch is recorded and counted.
        result.extend(batch)
        now = time.perf_counter()
        logger.info(
            "iteration: %s/%s; batch time: %.2fs; total time: %.2fs",
            i_iteration,
            n_iterations,
            now - batch_started_at,
            now - started_at,
        )
        i_iteration += 1

    return result

In [5]:
async def run_collection_process(n_iterations: int, n_parallel_tasks: int) -> None:
    """Run the whole pipeline: scrape, count, assign images to tags, save.

    The scraped pages are turned into a tag/image count table, every image is
    assigned to one tag, and the resulting mapping is written with
    ``save_json`` using its default filename.
    """
    pages = await collect_dataset(n_iterations, n_parallel_tasks)
    save_json(create_tags2images_dict(create_image_tag_df(pages)))

In [6]:
# Number of batches to complete, and number of captcha pages fetched
# concurrently inside each batch.
N_ITERATIONS = 100
N_PARALLEL_TASKS = 10

# Scrape the site and persist the tag -> image-hash mapping
# (written to save_json's default file, tags2images.json).
await run_collection_process(N_ITERATIONS, N_PARALLEL_TASKS)

2021-08-08 15:36:48 ~ __main__ ~ INFO ~ iteration: 1/100; batch time: 4.50s; total time: 4.50s
2021-08-08 15:36:54 ~ __main__ ~ INFO ~ iteration: 2/100; batch time: 6.06s; total time: 10.57s
2021-08-08 15:37:00 ~ __main__ ~ INFO ~ iteration: 3/100; batch time: 5.95s; total time: 16.52s
2021-08-08 15:37:07 ~ __main__ ~ INFO ~ iteration: 4/100; batch time: 7.08s; total time: 23.60s
2021-08-08 15:37:13 ~ __main__ ~ INFO ~ iteration: 5/100; batch time: 6.07s; total time: 29.67s
2021-08-08 15:37:17 ~ __main__ ~ INFO ~ iteration: 6/100; batch time: 4.04s; total time: 33.72s
2021-08-08 15:37:23 ~ __main__ ~ INFO ~ iteration: 7/100; batch time: 6.07s; total time: 39.79s
2021-08-08 15:37:28 ~ __main__ ~ INFO ~ iteration: 8/100; batch time: 4.97s; total time: 44.75s
2021-08-08 15:37:33 ~ __main__ ~ INFO ~ iteration: 9/100; batch time: 4.95s; total time: 49.71s
2021-08-08 15:37:38 ~ __main__ ~ INFO ~ iteration: 10/100; batch time: 5.03s; total time: 54.74s
2021-08-08 15:37:43 ~ __main__ ~ INFO ~ 

2021-08-08 15:44:24 ~ __main__ ~ INFO ~ iteration: 85/100; batch time: 5.03s; total time: 460.86s
2021-08-08 15:44:30 ~ __main__ ~ INFO ~ iteration: 86/100; batch time: 6.06s; total time: 466.92s
2021-08-08 15:44:37 ~ __main__ ~ INFO ~ iteration: 87/100; batch time: 7.06s; total time: 473.98s
2021-08-08 15:44:43 ~ __main__ ~ INFO ~ iteration: 88/100; batch time: 6.03s; total time: 480.01s
2021-08-08 15:44:49 ~ __main__ ~ INFO ~ iteration: 89/100; batch time: 6.04s; total time: 486.05s
2021-08-08 15:44:54 ~ __main__ ~ INFO ~ iteration: 90/100; batch time: 5.11s; total time: 491.16s
2021-08-08 15:45:00 ~ __main__ ~ INFO ~ iteration: 91/100; batch time: 6.01s; total time: 497.17s
2021-08-08 15:45:06 ~ __main__ ~ INFO ~ iteration: 92/100; batch time: 5.15s; total time: 502.32s
2021-08-08 15:45:11 ~ __main__ ~ INFO ~ iteration: 93/100; batch time: 5.03s; total time: 507.35s
2021-08-08 15:45:17 ~ __main__ ~ INFO ~ iteration: 94/100; batch time: 6.01s; total time: 513.36s
2021-08-08 15:45:23 

In [26]:
def find_all_image_urls(content: str) -> list:
    """Extract every relative captcha image URL found in ``content``.

    NOTE(review): a function with this name is already defined in an earlier
    cell; this later definition is the one in effect once this cell has run.

    ``content`` is passed through ``str()`` before matching, so a ``bytes``
    payload is accepted as well (its ``repr`` text is what gets searched).

    Returns:
        A list of substrings shaped like
        ``captcha/image.php?image_name=<token>&id=<one character>``, in the
        order they appear. The list is empty when nothing matches.
    """
    # Raw string with the "." and "?" escaped so that only a literal
    # "image.php?" matches; an unescaped "." would accept any character there.
    # Only a single character is captured after "&id=".
    return re.findall(r'captcha/image\.php\?image_name=.*?&id=.', str(content))


async def get_correct_images_by_content(content):
    """Print which grid cells of a captcha page belong to its category.

    The category and image URLs are parsed from ``content``, every image is
    downloaded and hashed, and each hash is looked up among the hashes saved
    for that category in the JSON file read by ``load_json``. The 1-based
    positions of the matching images are printed, one per line.

    Args:
        content: captcha page markup containing the category input and the
            image URLs (with a literal ``&`` before ``id=``).

    Raises:
        KeyError: if the page's category is missing from the saved mapping.
            This is checked before any image is downloaded.
    """
    tags2images = load_json()
    new_tag = find_tag(content)
    if new_tag not in tags2images:
        # Fail before doing any network work for a category we cannot answer.
        raise KeyError(
            "tag {!r} is not present in the saved tags2images mapping".format(new_tag)
        )
    # A set gives constant-time membership checks in the loop below.
    known_hashes = set(tags2images[new_tag])
    new_images = find_all_image_urls(content)

    async with aiohttp.ClientSession() as session:
        # Download concurrently; gather() returns results in input order, so
        # position i still corresponds to the i-th image on the page.
        hashes = await asyncio.gather(
            *(load_image(session, img) for img in new_images)
        )

    print("GRID:\n| 1 | 2 | 3 |\n| 4 | 5 | 6 |\n| 7 | 8 | 9 |")
    print("SELECT NEXT IMAGES:")
    for ind, _hash in enumerate(hashes, 1):
        if _hash in known_hashes:
            print(ind)

In [29]:
# Sample captcha markup for a "spinners" challenge, used as input for
# get_correct_images_by_content. The "&amp;" entities are turned back into
# "&" because find_all_image_urls looks for a literal "&id=" in each URL.
content = """
<div class="captcha-container">

          <div class="header">
            <p>Select all squares with <strong>spinners</strong></p>
            <input type="hidden" value="spinners" name="category">
          </div>

          <div class="content" style="width: 520.452px;">
            <div class="captcha-image" data-token="$1$EDovgQgs$zMN3zLDuSFQu45hEM8h.R/" data-id="3" style="width: 169.484px;"><img src="captcha/image.php?image_name=$1$EDovgQgs$zMN3zLDuSFQu45hEM8h.R/&amp;id=3" alt=""></div><div class="captcha-image" data-token="$1$13rVZfBz$xWRevp/Cj7hDIY1kkJyKj." data-id="1" style="width: 169.484px;"><img src="captcha/image.php?image_name=$1$13rVZfBz$xWRevp/Cj7hDIY1kkJyKj.&amp;id=1" alt=""></div><div class="captcha-image" data-token="$1$0vlFPzLn$1cXUeswM45B9ff50jkPbP." data-id="8" style="width: 169.484px;"><img src="captcha/image.php?image_name=$1$0vlFPzLn$1cXUeswM45B9ff50jkPbP.&amp;id=8" alt=""></div><div class="captcha-image" data-token="$1$pB83T1Lk$qxPKKzjib2caplDn0D0ai1" data-id="1" style="width: 169.484px;"><img src="captcha/image.php?image_name=$1$pB83T1Lk$qxPKKzjib2caplDn0D0ai1&amp;id=1" alt=""></div><div class="captcha-image" data-token="$1$OzlYMAxz$fmizM9Fur6pTuo6D1eIwv." data-id="3" style="width: 169.484px;"><img src="captcha/image.php?image_name=$1$OzlYMAxz$fmizM9Fur6pTuo6D1eIwv.&amp;id=3" alt=""></div><div class="captcha-image" data-token="$1$MVDstfXS$Jg3qqITbIThRLtMJD5zGy/" data-id="4" style="width: 169.484px;"><img src="captcha/image.php?image_name=$1$MVDstfXS$Jg3qqITbIThRLtMJD5zGy/&amp;id=4" alt=""></div><div class="captcha-image" data-token="$1$mDKdgvKk$abmJMVVYA6uW2F053rqiQ0" data-id="9" style="width: 169.484px;"><img src="captcha/image.php?image_name=$1$mDKdgvKk$abmJMVVYA6uW2F053rqiQ0&amp;id=9" alt=""></div><div class="captcha-image" data-token="$1$JR1LqcFl$HnQTa0DMNoMBeNfw6SbS/." data-id="8" style="width: 169.484px;"><img src="captcha/image.php?image_name=$1$JR1LqcFl$HnQTa0DMNoMBeNfw6SbS/.&amp;id=8" alt=""></div><div class="captcha-image" data-token="$1$OwoOpPfP$GWxnL.IQ2rrhIuZglyc67." data-id="5" style="width: 169.484px;"><img src="captcha/image.php?image_name=$1$OwoOpPfP$GWxnL.IQ2rrhIuZglyc67.&amp;id=5" alt=""></div>          </div>
        </div>
""".replace("&amp;", "&")

In [30]:
# Print the 1-based grid positions whose image hash is recorded under the
# category found in the sample markup.
await get_correct_images_by_content(content)

GRID:
| 1 | 2 | 3 |
| 4 | 5 | 6 |
| 7 | 8 | 9 |
SELECT NEXT IMAGES:
3
4
5
6
7
9
