-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add variants and locations images download
- Loading branch information
Showing
5 changed files
with
85 additions
and
211 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -134,4 +134,4 @@ dmypy.json | |
*.iml | ||
*.ipr | ||
out/ | ||
marvel-snap-cards | ||
marvel-snap |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,11 @@ | ||
![](https://github.com/vlmaier/marvel-snap-scrapr/actions/workflows/build.yml/badge.svg) | ||
# ![build](https://github.com/vlmaier/marvel-snap-scrapr/actions/workflows/build.yml/badge.svg) | ||
|
||
# Marvel SNAP Scrapr | ||
## Marvel SNAP Scrapr | ||
|
||
Scraper for https://marvelsnapzone.com to retrieve metadata of Marvel SNAP cards. | ||
Scraper for <https://marvelsnapzone.com> to retrieve metadata of Marvel SNAP cards. | ||
|
||
## How does it work? | ||
### How does it work? | ||
|
||
The script uses the Beautiful Soup Python library, which pulls data out of HTML or XML files. | ||
It scrapes the website https://marvelsnapzone.com, which is well-structured and provides all required metadata about Marvel SNAP cards. | ||
Selenium web driver is required because of the dynamic loading on the website. Otherwise, the card links are not available when going for a static approach. | ||
In the end, a list of dictionaries is created for all available cards. It can be used elsewhere to create a custom card database. | ||
The original version scraped the website and pulled the card metadata, including the image URL, out of the HTML page. | ||
|
||
If you only want to download the images then uncomment the `download_images()` function call. | ||
The new version uses the [API endpoint](https://marvelsnapzone.com/getinfo/?searchtype=cards&searchcardstype=true) (found by [@mlilback](https://github.com/mlilback)) to retrieve the data in JSON format. The API endpoint is used by the website to retrieve the data as well. Since JSON is already structured data, it is much easier to parse and extract the data. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1 @@ | ||
requests==2.31.0 | ||
beautifulsoup4~=4.12.2 | ||
selenium~=4.11.2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,213 +1,97 @@ | ||
import re | ||
from concurrent.futures import ThreadPoolExecutor | ||
import requests | ||
import os | ||
from bs4 import BeautifulSoup | ||
from selenium import webdriver | ||
from selenium.webdriver.chrome.options import Options | ||
from threading import Thread | ||
from datetime import datetime | ||
|
||
# Endpoints and output directory names used by the functions below.
# NOTE(review): CARDS_API_URL is assigned twice in this block and the later
# assignment replaces the earlier one. create_cards() POSTs to CARDS_API_URL
# while get_cards() uses it as its default GET endpoint -- confirm which value
# each caller is meant to see.
CARDS_API_URL = "http://localhost:8080/v1/cards" | ||
# Page loaded by scrap() through the Selenium browser.
MARVELSNAPZONE_URL = 'https://marvelsnapzone.com/cards' | ||
MARVELSNAPZONE_API_URL = 'https://marvelsnapzone.com/getinfo/?searchtype=cards&searchcardstype=true' | ||
CARDS_API_URL = 'https://marvelsnapzone.com/getinfo/?searchtype=cards&searchcardstype=true' | ||
# Passed to get_cards() by the script entry point to fetch locations.
LOCATIONS_API_URL = 'https://marvelsnapzone.com/getinfo/?searchtype=locations&searchcardstype=true' | ||
# ROOT_DIR is the parent of the three sub-directories created by
# create_directories().
ROOT_DIR = 'marvel-snap' | ||
CARDS_DIR = 'cards' | ||
VARIANTS_DIR = 'variants' | ||
LOCATIONS_DIR = 'locations' | ||
|
||
|
||
def get_cards(url: str = CARDS_API_URL):
    """
    Retrieves a list of cards from the Marvel SNAP Zone API.

    Args:
        url: The endpoint to query. Defaults to CARDS_API_URL.

    Returns:
        The list stored under "success" -> "cards" in the JSON response.
        An empty list is returned when the response status is not 200 or
        when either key is missing, so callers can always iterate the result.
    """
    print("[%s] %s" %
          (datetime.now(), f"Starting retrieving cards from {url}"))
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        print(f"Error: Request failed with status code {response.status_code}")
        return []

    json_data = response.json()
    # `or` also covers a payload where the key is present but null/false.
    success = json_data.get("success") or {}
    print("[%s] %s" % (datetime.now(), "Finished retrieving cards."))
    return success.get("cards") or []
|
||
|
||
def scrap():
    """
    Scrapes the card listing page and returns the card metadata.

    Loads MARVELSNAPZONE_URL in a headless Chrome browser, parses the rendered
    HTML, and builds one dictionary per anchor carrying the 'simple-card'
    class. The image of every card found is downloaded via download_images().

    Returns:
        A list of dictionaries with the keys 'name', 'cost', 'power',
        'ability', 'url', 'status' and 'source'.
    """
    print("[%s] %s" % (datetime.now(), "Starting scraping ..."))

    chrome_options = Options()
    chrome_options.add_argument("--headless=new")
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-extensions')
    chrome_options.add_argument('--disable-gpu')
    browser = webdriver.Chrome(options=chrome_options)
    try:
        browser.get(MARVELSNAPZONE_URL)
        html = browser.page_source
    finally:
        # Always shut the browser down; otherwise the Chrome process and its
        # driver are left running after the page source has been read.
        browser.quit()
    soup = BeautifulSoup(html, 'html.parser')

    # Only look for link with a 'simple-card' class; those are the cards.
    links = soup.findAll('a', {'class': 'simple-card'})

    characters = []
    for link in links:
        character = {
            # Capitalize every word.
            'name': link['data-name'].title(),
            'cost': link['data-cost'],
            'power': link['data-power'],
            # Strip HTML tags and capitalize.
            'ability': capitalize(BeautifulSoup(link['data-ability'], 'html.parser').text),
            # Remove query string.
            'url': link['data-src'].split('?')[0],
            'status': link['data-status'],
            'source': link['data-source']
        }
        characters.append(character)
        print("[%s] %s" % (datetime.now(), f"Found {character['name']}"))

    image_urls = [character['url'] for character in characters]
    download_images(image_urls)

    return characters
|
||
|
||
# Splits after sentence-ending punctuation; the capturing group keeps the
# separators in the split result so the text can be re-joined unchanged.
_SENTENCE_BREAK = re.compile(r'([.!?;:]\s*)')


def capitalize(text):
    """
    Upper-cases the first character of every sentence-like segment.

    The text is split at '.', '!', '?', ';' and ':' (plus trailing
    whitespace). Each resulting piece, including one-character pieces, gets
    its first character upper-cased; the rest of the piece is left untouched.

    Args:
        text: The text to capitalize.

    Returns:
        The re-joined text.
    """
    pieces = _SENTENCE_BREAK.split(text)
    # Slicing with [:1] is safe on the empty strings re.split can produce.
    return ''.join(piece[:1].upper() + piece[1:] for piece in pieces)
|
||
|
||
def download_images(urls, dir_name='marvel-snap-cards'):
    """
    Downloads all images concurrently into the given directory.

    Args:
        urls: An iterable of image URLs.
        dir_name: The target directory; it is created (including missing
            parents) when it does not exist yet.
    """
    # EAFP: attempt the creation instead of checking first, which avoids the
    # window between the existence check and the mkdir call.
    try:
        os.makedirs(dir_name)
    except FileExistsError:
        print("[%s] %s" % (datetime.now(), f"Directory '{dir_name}' already exists."))
    else:
        print("[%s] %s" % (datetime.now(), f"Directory '{dir_name}' created."))

    urls = list(urls)
    # A bounded pool instead of one thread per URL keeps the number of
    # simultaneous connections and threads under control for long URL lists.
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(download_image, url, dir_name) for url in urls]

    # The executor stores worker exceptions on the future; report them so a
    # failed download does not vanish silently.
    for url, future in zip(urls, futures):
        error = future.exception()
        if error is not None:
            print("[%s] %s" % (datetime.now(), f"Error downloading image from URL '{url}': {error}"))

    print("[%s] %s" % (datetime.now(), f"Finished downloading. Check '{dir_name}' directory."))
|
||
|
||
def download_image(url, dir_name):
    """
    Downloads a single image and stores it in the given directory.

    The file name is the last path segment of the URL with any query string
    removed. Request and file-system errors are reported and not re-raised.

    Args:
        url: The URL of the image.
        dir_name: The directory the image is written to.
    """
    print("[%s] %s" % (datetime.now(), f"Download image from {url}"))
    try:
        # Without a timeout a stalled connection would block this worker forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        file_name = url.rsplit('/', 1)[-1].rsplit('?', 1)[0]
        file_path = os.path.join(dir_name, file_name)
        with open(file_path, 'wb') as file:
            file.write(response.content)
    except (requests.exceptions.RequestException, OSError) as e:
        # OSError covers failures while writing the file (e.g. missing directory).
        print("[%s] %s" % (datetime.now(), f"Error downloading image from URL '{url}': {e}"))
|
||
|
||
def create_cards(cards):
    """
    Creates every released card by POSTing it to CARDS_API_URL.

    Cards whose 'status' is not "released" are skipped. The outcome of each
    request is printed.

    Args:
        cards: An iterable of card dictionaries with the keys 'name', 'cost',
            'power', 'ability', 'source', 'url' and 'status'.
    """
    for card in cards:
        if card["status"] != "released":
            continue

        body = {
            "name": parse_name(card["name"]),
            "cost": card["cost"],
            "power": card["power"],
            "ability": parse_ability(card["ability"]),
            "series": parse_source(card["source"]),
            "imageUrl": card["url"],
        }

        # A timeout keeps one stalled request from blocking the whole loop.
        response = requests.post(CARDS_API_URL, json=body, timeout=30)
        if response.status_code == requests.codes.created:
            print("[%s] %s" % (datetime.now(), f"Created card: {card['name']}"))
        else:
            print("[%s] %s" % (datetime.now(), f"Failed to create card: {card['name']} - {response.text}"))
|
||
|
||
def parse_name(name):
    """
    Normalizes a card name.

    Surrounding whitespace is removed and a few known spellings are replaced
    by their preferred form; any other name is returned stripped but
    otherwise unchanged.
    """
    preferred_spellings = {
        "Ant Man": "Ant-Man",
        "Jane Foster Mighty Thor": "Jane Foster The Mighty Thor",
        "Miles Morales": "Miles Morales: Spider-Man",
        "Super-Skrull": "Super Skrull",
    }

    stripped = name.strip()
    if stripped in preferred_spellings:
        return preferred_spellings[stripped]
    return stripped
|
||
|
||
# Terms that are rendered in bold, in the casing they are displayed with.
_BOLD_TERMS = (
    "On Reveal",
    "Ongoing",
    "Widow's Bite",
    "Rock",
    "Rocks",
    "Doombot",
    "Squirrel",
    "Demon",
    "Drone",
    "Mjolnir",
    "Tiger",
    "Limbo",
    "No ability",
)
# Longest terms first so that "Rocks" is matched as a whole instead of being
# consumed as "Rock" followed by a plain "s".
_BOLD_TERMS_BY_LENGTH = sorted(_BOLD_TERMS, key=len, reverse=True)
# One capturing group per term: match.lastindex identifies which term matched.
_BOLD_PATTERN = re.compile(
    "|".join(f"({re.escape(term)})" for term in _BOLD_TERMS_BY_LENGTH),
    re.IGNORECASE,
)
# A sign followed by a number without leading zero, e.g. "+2", "-1", "+10".
_SIGNED_NUMBER_PATTERN = re.compile(r"([+-])([1-9]\d*)")


def _bold_term(match):
    """Wraps the matched term, in its display casing, in a bold span."""
    term = _BOLD_TERMS_BY_LENGTH[match.lastindex - 1]
    return f"<span class='fw-bold'>{term}</span>"


def _color_signed_number(match):
    """Wraps a signed number in a bold span: green for '+', red for '-'."""
    color = "green" if match.group(1) == "+" else "red"
    return f"<span class='fw-bold' style='color: {color};'>{match.group(0)}</span>"


def parse_ability(ability):
    """
    Converts an ability text into its HTML representation.

    The text is stripped; an empty text becomes "No ability". Known keywords
    are matched case-insensitively and replaced by a bold span holding the
    keyword in its display casing. Signed numbers are wrapped in a bold span
    that is green for positive and red for negative values.

    Both substitutions run in a single pass each, so text inserted by one
    replacement is never matched again by a later keyword.

    Args:
        ability: The raw ability text.

    Returns:
        The ability text with HTML markup.
    """
    # Provide 'No ability' instead of empty string.
    ability = ability.strip() or "No ability"
    ability = _BOLD_PATTERN.sub(_bold_term, ability)
    return _SIGNED_NUMBER_PATTERN.sub(_color_signed_number, ability)
|
||
|
||
def parse_source(source):
    """
    Maps a card's source description to its series name.

    The markers are checked in the listed order and the series of the first
    marker contained in `source` is returned. An empty string is returned
    when no marker is found.
    """
    marker_to_series = (
        ('Collection Level 1-14', 'Starter'),
        ('Starter Card', 'Starter'),
        ('Recruit Season', 'Starter'),
        ('Pool 1', 'Series 1'),
        ('Pool 2', 'Series 2'),
        ('Pool 3', 'Series 3'),
        ('Pool 4', 'Series 4'),
        ('Pool 5', 'Series 5'),
        ('Season Pass', 'Season Pass'),
    )
    matching_series = (series for marker, series in marker_to_series if marker in source)
    return next(matching_series, '')
def download_images(urls, dir: str = ROOT_DIR):
    """
    Downloads images from the given URLs and stores them in the given directory.

    Downloads run on a pool of five worker threads. A failed download is
    reported and does not stop the remaining ones.

    Args:
        urls: A list of URLs to download images from.
        dir: The directory to store the images in. It is created when it
            does not exist yet. (The name shadows the builtin but is kept
            because callers may pass it by keyword.)
    """
    # Make the function usable on its own, without a prior directory setup.
    os.makedirs(dir, exist_ok=True)

    def download_image(url, dir: str = ROOT_DIR):
        print("[%s] %s" % (datetime.now(), f"Download image from {url}"))
        try:
            # Without a timeout a stalled connection would occupy a worker forever.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            file_name = url.rsplit('/', 1)[-1].rsplit('?', 1)[0]
            file_path = os.path.join(dir, file_name)
            with open(file_path, 'wb') as file:
                file.write(response.content)
        except (requests.exceptions.RequestException, OSError) as e:
            # OSError is handled here because an exception escaping a task
            # submitted to the executor is stored on the future and would
            # otherwise never be shown.
            print("[%s] %s" % (datetime.now(),
                               f"Error downloading image from URL '{url}': {e}"))

    with ThreadPoolExecutor(max_workers=5) as executor:
        for url in urls:
            executor.submit(download_image, url, dir)

    print("[%s] %s" %
          (datetime.now(), f"Finished downloading. Check '{dir}' directory."))
|
||
|
||
def create_directories():
    """
    Creates the directories for the card images.

    ROOT_DIR
    ├── CARDS_DIR
    ├── LOCATIONS_DIR
    └── VARIANTS_DIR

    Directories that already exist are left untouched.
    """
    for directory in (CARDS_DIR, VARIANTS_DIR, LOCATIONS_DIR):
        # makedirs creates ROOT_DIR on the way and, with exist_ok, avoids the
        # gap between an existence check and the creation call.
        os.makedirs(os.path.join(ROOT_DIR, directory), exist_ok=True)
|
||
|
||
if __name__ == '__main__':
    # `or []` keeps the script running when a request fails and no list is
    # returned.
    cards = get_cards() or []
    card_image_urls = [card['art'] for card in cards]
    variant_image_urls = [
        variant['art']
        for card in cards
        for variant in card.get('variants') or []
    ]

    locations = get_cards(LOCATIONS_API_URL) or []
    location_image_urls = [location['art'] for location in locations]

    # The target directories must exist before any download starts.
    create_directories()

    download_images(card_image_urls, os.path.join(ROOT_DIR, CARDS_DIR))
    download_images(variant_image_urls, os.path.join(ROOT_DIR, VARIANTS_DIR))
    download_images(location_image_urls, os.path.join(ROOT_DIR, LOCATIONS_DIR))