Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Danbooru download Cloudflare 403 error #90

Merged
merged 2 commits into from
Jan 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
ARG PYTHON_VERSION=3.8

ARG CURL_IMPERSONATE_VERSION=0.5-chrome
FROM lwthiker/curl-impersonate:${CURL_IMPERSONATE_VERSION} as curl

# Builder
FROM python:${PYTHON_VERSION}-alpine as builder

RUN apk add --update git build-base libffi-dev
RUN apk add --update git build-base libffi-dev curl-dev

WORKDIR /root

COPY --from=curl /usr/local/bin/curl_* /usr/local/bin/
COPY --from=curl /usr/local/lib/ /usr/local/lib/

# Install requirements
COPY requirements.txt /root
RUN pip install --prefix="/install" --no-warn-script-location -r requirements.txt
Expand Down Expand Up @@ -34,9 +40,18 @@ RUN apk add --no-cache curl
# Install FFmpeg
COPY --from=builder /usr/local/bin/ffmpeg /usr/local/bin/

# cURL Impersonate libraries
COPY --from=builder /usr/local/bin/curl_* /usr/local/bin/
COPY --from=builder /usr/local/lib/libcurl-* /usr/local/lib/

# Copy pip requirements
COPY --from=builder /install /usr/local

# Copy CA certificates for curl_cffi, can be removed once v0.6 is officially released
RUN PYTHON_LIB_PATH="$(python -c 'import site; print(site.getsitepackages()[0])')" &&\
CA_FILE="$(python -c 'import certifi; print(certifi.where())')" && \
cp "$CA_FILE" "$PYTHON_LIB_PATH"/curl_cffi/

WORKDIR /app
COPY nazurin ./nazurin

Expand Down
19 changes: 18 additions & 1 deletion Dockerfile.debian
Original file line number Diff line number Diff line change
@@ -1,12 +1,20 @@
ARG PYTHON_VERSION=3.8

ARG CURL_IMPERSONATE_VERSION=0.5-chrome
FROM lwthiker/curl-impersonate:${CURL_IMPERSONATE_VERSION} as curl

# Builder
FROM python:${PYTHON_VERSION}-slim as builder

RUN apt-get update && apt-get install -y --no-install-recommends git wget gcc xz-utils
RUN apt-get update && \
apt-get install -y --no-install-recommends \
git wget gcc xz-utils libcurl4-openssl-dev

WORKDIR /root

COPY --from=curl /usr/local/bin/curl_* /usr/local/bin/
COPY --from=curl /usr/local/lib/ /usr/local/lib/

# Install requirements
COPY requirements.txt /root
RUN pip install --prefix="/install" --no-warn-script-location -r requirements.txt
Expand Down Expand Up @@ -34,9 +42,18 @@ RUN apt-get update && apt-get install -y --no-install-recommends curl
# Install FFmpeg
COPY --from=builder /usr/local/bin/ffmpeg /usr/local/bin/

# cURL Impersonate libraries
COPY --from=builder /usr/local/bin/curl_* /usr/local/bin/
COPY --from=builder /usr/local/lib/libcurl-* /usr/local/lib/

# Copy pip requirements
COPY --from=builder /install /usr/local

# Copy CA certificates for curl_cffi, can be removed once v0.6 is officially released
RUN PYTHON_LIB_PATH="$(python -c 'import site; print(site.getsitepackages()[0])')" &&\
CA_FILE="$(python -c 'import certifi; print(certifi.where())')" && \
cp "$CA_FILE" "$PYTHON_LIB_PATH"/curl_cffi/

WORKDIR /app
COPY nazurin ./nazurin

Expand Down
53 changes: 27 additions & 26 deletions nazurin/config.py
Original file line number Diff line number Diff line change
@@ -1,50 +1,51 @@
from os import path
from typing import List, Optional

from environs import Env

env = Env()
# read config from .env file if exists
env.read_env()

ENV = env.str("ENV", default="production")
TOKEN = env.str("TOKEN")
ENV: str = env.str("ENV", default="production")
TOKEN: str = env.str("TOKEN")

# Webhook url, eg: https://xxx.fly.dev/, should end with '/'
WEBHOOK_URL = env.str("WEBHOOK_URL", default=None)
HOST = env.str("HOST", default="0.0.0.0")
# None when no webhook URL is configured (default)
WEBHOOK_URL: Optional[str] = env.str("WEBHOOK_URL", default=None)
HOST: str = env.str("HOST", default="0.0.0.0")
# Port is automatically set if on Heroku or fly.io
PORT = env.int("PORT", default=80)
PORT: int = env.int("PORT", default=80)

STORAGE = env.list("STORAGE", subcast=str, default=["Local"])
STORAGE_DIR = env.str("STORAGE_DIR", default="Pictures")
STORAGE: List[str] = env.list("STORAGE", subcast=str, default=["Local"])
STORAGE_DIR: str = env.str("STORAGE_DIR", default="Pictures")

DATABASE = env.str("DATABASE", default="Local")
DATABASE: str = env.str("DATABASE", default="Local")
# Nazurin data collection in database
NAZURIN_DATA = "nazurin"
NAZURIN_DATA: str = "nazurin"
# Ignored items in image caption
CAPTION_IGNORE = env.list("CAPTION_IGNORE", subcast=str, default=[])
CAPTION_IGNORE: List[str] = env.list("CAPTION_IGNORE", subcast=str, default=[])

GALLERY_ID = env.int("GALLERY_ID", default=None)
GALLERY_ID: Optional[int] = env.int("GALLERY_ID", default=None)

ADMIN_ID = env.int("ADMIN_ID")
IS_PUBLIC = env.bool("IS_PUBLIC", default=False)
ADMIN_ID: int = env.int("ADMIN_ID")
IS_PUBLIC: bool = env.bool("IS_PUBLIC", default=False)
# If IS_PUBLIC is True, the following items will be ignored
ALLOW_ID = env.list("ALLOW_ID", subcast=int, default=[])
ALLOW_USERNAME = env.list("ALLOW_USERNAME", default=[])
ALLOW_GROUP = env.list("ALLOW_GROUP", subcast=int, default=[])

RETRIES = env.int("RETRIES", default=5)
TIMEOUT = env.int("TIMEOUT", default=20)
DOWNLOAD_CHUNK_SIZE = env.int("DOWNLOAD_CHUNK_SIZE", default=4096)
PROXY = env.str("HTTP_PROXY", default=None)
UA = (
ALLOW_ID: List[int] = env.list("ALLOW_ID", subcast=int, default=[])
ALLOW_USERNAME: List[str] = env.list("ALLOW_USERNAME", default=[])
ALLOW_GROUP: List[int] = env.list("ALLOW_GROUP", subcast=int, default=[])

RETRIES: int = env.int("RETRIES", default=5)
TIMEOUT: int = env.int("TIMEOUT", default=20)
DOWNLOAD_CHUNK_SIZE: int = env.int("DOWNLOAD_CHUNK_SIZE", default=4096)
# None when no HTTP proxy is configured (default)
PROXY: Optional[str] = env.str("HTTP_PROXY", default=None)
UA: str = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/120.0.0.0 Safari/537.36"
)

# Local directory to store database and temporary files
DATA_DIR = "data"
TEMP_DIR = path.join(DATA_DIR, "temp")
CLEANUP_INTERVAL = env.int("CLEANUP_INTERVAL", default=7)
ACCESS_LOG_FORMAT = '%a "%r" %s %b "%{Referer}i" "%{User-Agent}i"'
DATA_DIR: str = "data"
TEMP_DIR: str = path.join(DATA_DIR, "temp")
CLEANUP_INTERVAL: int = env.int("CLEANUP_INTERVAL", default=7)
ACCESS_LOG_FORMAT: str = '%a "%r" %s %b "%{Referer}i" "%{User-Agent}i"'
14 changes: 5 additions & 9 deletions nazurin/models/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,16 @@

import aiofiles
import aiofiles.os
import aiohttp

from nazurin.config import DOWNLOAD_CHUNK_SIZE, STORAGE_DIR, TEMP_DIR
from nazurin.config import STORAGE_DIR, TEMP_DIR
from nazurin.utils import logger
from nazurin.utils.decorators import network_retry
from nazurin.utils.helpers import (
ensure_existence_async,
sanitize_filename,
sanitize_path,
)
from nazurin.utils.network import NazurinRequestSession


@dataclass
Expand Down Expand Up @@ -63,15 +63,11 @@ async def exists(self) -> bool:
return False

@network_retry
async def download(self, session: aiohttp.ClientSession):
async def download(self, session: NazurinRequestSession):
if await self.exists():
logger.info("File {} already exists", self.path)
return True
await ensure_existence_async(TEMP_DIR)
async with session.get(self.url) as response:
logger.info("Downloading {} to {}...", self.url, self.path)
response.raise_for_status()
async with aiofiles.open(self.path, "wb") as f:
async for chunk in response.content.iter_chunked(DOWNLOAD_CHUNK_SIZE):
await f.write(chunk)
logger.info("Downloading {} to {}...", self.url, self.path)
await session.download(self.url, self.path)
logger.info("Downloaded to {}", self.path)
7 changes: 5 additions & 2 deletions nazurin/models/illust.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import List

from nazurin.utils import Request
from nazurin.utils.network import NazurinRequestSession

from .caption import Caption
from .file import File
Expand All @@ -26,8 +27,10 @@ def has_image(self) -> bool:
def has_multiple_images(self) -> bool:
return len(self.images) > 1

async def download(self, **kwargs):
async with Request(**kwargs) as session:
async def download(
self, *, request_class: NazurinRequestSession = Request, **kwargs
):
async with request_class(**kwargs) as session:
tasks = []
for file in self.all_files:
if not file.url:
Expand Down
24 changes: 24 additions & 0 deletions nazurin/models/image.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import os
from dataclasses import dataclass

import aiohttp
from humanize import naturalsize

from nazurin.utils import Request, logger
from nazurin.utils.exceptions import NazurinError
from nazurin.utils.helpers import check_image

from .file import File

Expand Down Expand Up @@ -81,3 +84,24 @@ def set_size(self, value: int):
if value % 1 != 0:
raise TypeError("Image size must be an integer")
self._size = int(value)

async def download(self, session: aiohttp.ClientSession):
    """
    Download this image and validate the result, retrying on corruption.

    Delegates the transfer to the parent class, then checks the file
    with check_image(). Makes up to 3 attempts; invalid intermediate
    files are deleted before retrying, while the final invalid file is
    left on disk for debugging.

    :param session: HTTP session used for the transfer.
    :raises NazurinError: if every attempt produces an invalid image.
    """
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        await super().download(session)
        if await check_image(self.path):
            return
        logger.warning(
            "Downloaded image {} is not valid, retry {} / {}",
            self.path,
            attempt,
            max_attempts,
        )
        if attempt != max_attempts:
            # Keep the last one for debugging
            os.remove(self.path)
    raise NazurinError(
        "Download failed with invalid image, please check logs for details"
    )
9 changes: 5 additions & 4 deletions nazurin/sites/danbooru/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
from pybooru import Danbooru as danbooru
from pybooru import PybooruHTTPError

from nazurin.models import Caption, File, Illust, Image
from nazurin.models import Caption, File, Image
from nazurin.sites.danbooru.models import DanbooruIllust
from nazurin.utils.decorators import async_wrap
from nazurin.utils.exceptions import NazurinError
from nazurin.utils.helpers import is_image
Expand Down Expand Up @@ -43,12 +44,12 @@ async def get_post(self, post_id: Optional[int] = None, md5: Optional[str] = Non

async def view(
self, post_id: Optional[int] = None, md5: Optional[str] = None
) -> Illust:
) -> DanbooruIllust:
post = await self.get_post(post_id, md5)
illust = self.parse_post(post)
return illust

def parse_post(self, post) -> Illust:
def parse_post(self, post) -> DanbooruIllust:
"""Get images and build caption."""
# Get images
url = post["file_url"]
Expand Down Expand Up @@ -88,7 +89,7 @@ def parse_post(self, post) -> Illust:
"has_children": post["has_children"],
}
)
return Illust(imgs, caption, post, files)
return DanbooruIllust(imgs, caption, post, files)

@staticmethod
def get_storage_dest(post: dict, filename: str) -> Tuple[str, str]:
Expand Down
10 changes: 10 additions & 0 deletions nazurin/sites/danbooru/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from dataclasses import dataclass

from nazurin.models import Illust
from nazurin.utils.network import CurlRequest


@dataclass
class DanbooruIllust(Illust):
    """Illust subclass for Danbooru posts.

    Routes downloads through CurlRequest instead of the default session
    to work around Cloudflare 403 responses on Danbooru file URLs.
    """

    async def download(self, **kwargs):
        # Force the curl-based session; all other keyword arguments are
        # forwarded to Illust.download unchanged.
        await super().download(request_class=CurlRequest, **kwargs)
23 changes: 22 additions & 1 deletion nazurin/utils/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from mimetypes import guess_type
from pathlib import Path
from string import capwords
from typing import Callable, List
from typing import Callable, List, Union

import aiofiles
import aiofiles.os
Expand All @@ -18,8 +18,10 @@
InvalidHTTPUrlContent,
WrongFileIdentifier,
)
from PIL import Image

from nazurin.models import Caption
from nazurin.utils.decorators import async_wrap

from . import logger

Expand Down Expand Up @@ -190,3 +192,22 @@ async def remove_files_older_than(path: str, days: int):
await aiofiles.os.remove(entry.path)
elif entry.is_dir():
shutil.rmtree(entry.path)


@async_wrap
def check_image(path: Union[str, os.PathLike]) -> bool:
    """
    Check if file is a valid image and not truncated.

    Runs as a coroutine via @async_wrap because PIL decoding is blocking.

    :param path: Path of the image file to validate.
    :return: True if the file passes both header verification and a full
        pixel decode, False otherwise (a warning is logged).
    """

    try:
        # First pass: cheap structural verification. Pillow requires the
        # image to be reopened after verify() before any further use.
        with Image.open(path) as image:
            image.verify()

        # Second pass: fully decode pixel data, which catches truncated
        # files that verify() alone does not detect.
        # (Fixed: the original reopened the file a second time inside
        # this block, leaking an unclosed image handle.)
        with Image.open(path) as image:
            image.load()
        return True
    except OSError as error:
        logger.warning("Invalid image {}: {}", path, error)
        return False
Loading
Loading