Add variants and locations images download

vlmaier committed Nov 2, 2023
1 parent 810e175 commit 821faaa
Showing 5 changed files with 85 additions and 211 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -134,4 +134,4 @@ dmypy.json
 *.iml
 *.ipr
 out/
-marvel-snap-cards
+marvel-snap
5 changes: 0 additions & 5 deletions Dockerfile

This file was deleted.

15 changes: 6 additions & 9 deletions README.md
@@ -1,14 +1,11 @@
-![](https://github.com/vlmaier/marvel-snap-scrapr/actions/workflows/build.yml/badge.svg)
+# ![build](https://github.com/vlmaier/marvel-snap-scrapr/actions/workflows/build.yml/badge.svg)
 
-# Marvel SNAP Scrapr
+## Marvel SNAP Scrapr
 
-Scraper for https://marvelsnapzone.com to retrieve metadata of Marvel SNAP cards.
+Scraper for <https://marvelsnapzone.com> to retrieve metadata of Marvel SNAP cards.
 
-## How does it work?
+### How does it work?
 
-The script uses the Beautiful Soup Python library, which pulls data out of HTML or XML files.
-It scraps the website https://marvelsnapzone.com which is well-structured and provides all required metadata about Marvel SNAP cards.
-Selenium web driver is required because of the dynamic loading on the website. Otherwise, the card links are not available when going for a static approach.
-In the end, a list of dictionaries is created for all available cards. It can be used elsewhere to create a custom card database.
+The original version scraped the website and pulled the card metadata, including the image URL, out of the HTML page.
 
-If you only want to download the images then uncomment the `download_images()` function call.
+The new version uses the [API endpoint](https://marvelsnapzone.com/getinfo/?searchtype=cards&searchcardstype=true) (found by [@mlilback](https://github.com/mlilback)) to retrieve the data in JSON format. The website uses the same endpoint itself, and since JSON is already structured, it is much easier to parse and extract the data.
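
A minimal sketch of what querying that endpoint looks like, assuming only the `success` -> `cards` response shape and the `art`/`variants` keys that the scrapr.py change below actually reads; the slice of three cards and the printed output are purely illustrative:

import requests

CARDS_API_URL = 'https://marvelsnapzone.com/getinfo/?searchtype=cards&searchcardstype=true'

response = requests.get(CARDS_API_URL)
response.raise_for_status()

# The payload nests the card list under 'success' -> 'cards',
# matching what get_cards() in scrapr.py unpacks.
cards = response.json().get('success', {}).get('cards', [])

for card in cards[:3]:  # first few entries, purely for illustration
    print(card['art'])                        # base card art URL
    for variant in card.get('variants', []):  # variant art, if any
        print('  variant:', variant['art'])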
2 changes: 0 additions & 2 deletions requirements.txt
@@ -1,3 +1 @@
 requests==2.31.0
-beautifulsoup4~=4.12.2
-selenium~=4.11.2
272 changes: 78 additions & 194 deletions scrapr.py
@@ -1,213 +1,97 @@
-import re
+from concurrent.futures import ThreadPoolExecutor
 import requests
 import os
-from bs4 import BeautifulSoup
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-from threading import Thread
 from datetime import datetime
 
-CARDS_API_URL = "http://localhost:8080/v1/cards"
-MARVELSNAPZONE_URL = 'https://marvelsnapzone.com/cards'
-MARVELSNAPZONE_API_URL = 'https://marvelsnapzone.com/getinfo/?searchtype=cards&searchcardstype=true'
+CARDS_API_URL = 'https://marvelsnapzone.com/getinfo/?searchtype=cards&searchcardstype=true'
+LOCATIONS_API_URL = 'https://marvelsnapzone.com/getinfo/?searchtype=locations&searchcardstype=true'
+ROOT_DIR = 'marvel-snap'
+CARDS_DIR = 'cards'
+VARIANTS_DIR = 'variants'
+LOCATIONS_DIR = 'locations'
 
 
-def get_cards():
-    print("[%s] %s" % (datetime.now(), "Starting retrieving cards ..."))
-    response = requests.get(MARVELSNAPZONE_API_URL)
+def get_cards(url: str = CARDS_API_URL):
+    """
+    Retrieves a list of cards from the Marvel SNAP Zone API.
+    Returns:
+        A list of cards, where each card is represented as a dictionary.
+    """
+    print("[%s] %s" %
+          (datetime.now(), f"Starting retrieving cards from {url}"))
+    response = requests.get(url)
 
     if response.status_code == 200:
         json_data = response.json()
         success = json_data.get("success", {})
         print("[%s] %s" % (datetime.now(), "Finished retrieving cards."))
         return success.get("cards", [])
     else:
         print(f"Error: Request failed with status code {response.status_code}")


-def scrap():
-    print("[%s] %s" % (datetime.now(), "Starting scraping ..."))
-
-    chrome_options = Options()
-    chrome_options.add_argument("--headless=new")
-    chrome_options.add_argument('--disable-dev-shm-usage')
-    chrome_options.add_argument('--disable-extensions')
-    chrome_options.add_argument('--disable-gpu')
-    browser = webdriver.Chrome(options=chrome_options)
-    browser.get(MARVELSNAPZONE_URL)
-    html = browser.page_source
-    soup = BeautifulSoup(html, 'html.parser')
-
-    # Only look for link with a 'simple-card' class; those are the cards.
-    links = soup.findAll('a', {'class': 'simple-card'})
-
-    characters = []
-    for link in links:
-        character = {
-            # Capitalize every word.
-            'name': link['data-name'].title(),
-            'cost': link['data-cost'],
-            'power': link['data-power'],
-            # Strip HTML tags and capitalize.
-            'ability': capitalize(BeautifulSoup(link['data-ability'], 'html.parser').text),
-            # Remove query string.
-            'url': link['data-src'].split('?')[0],
-            'status': link['data-status'],
-            'source': link['data-source']
-        }
-        characters.append(character)
-        print("[%s] %s" % (datetime.now(), f"Found {character['name']}"))
-
-    image_urls = [character['url'] for character in characters]
-    download_images(image_urls)
-
-    return characters
-
-
-def capitalize(text):
-    punctuation_filter = re.compile('([.!?;:]\s*)')
-    split_with_punctuation = punctuation_filter.split(text)
-    for i, j in enumerate(split_with_punctuation):
-        if len(j) > 1:
-            split_with_punctuation[i] = j[0].upper() + j[1:]
-    text = ''.join(split_with_punctuation)
-    return text


-def download_images(urls, dir_name='marvel-snap-cards'):
-    if not os.path.exists(dir_name):
-        os.mkdir(dir_name)
-        print("[%s] %s" % (datetime.now(), f"Directory '{dir_name}' created."))
-    else:
-        print("[%s] %s" % (datetime.now(), f"Directory '{dir_name}' already exists."))
-
-    threads = []
-    for url in urls:
-        threads.append(Thread(target=download_image, args=(url, dir_name)))
-        threads[-1].start()
-    for thread in threads:
-        thread.join()
-
-    print("[%s] %s" % (datetime.now(), f"Finished downloading. Check '{dir_name}' directory."))
-
-
-def download_image(url, dir_name):
-    print("[%s] %s" % (datetime.now(), f"Download image from {url}"))
-    try:
-        response = requests.get(url)
-        response.raise_for_status()
-        file_name = url.rsplit('/', 1)[-1].rsplit('?', 1)[0]
-        file_path = os.path.join(dir_name, file_name)
-        with open(file_path, 'wb') as file:
-            file.write(response.content)
-    except requests.exceptions.RequestException as e:
-        print("[%s] %s" % (datetime.now(), f"Error downloading image from URL '{url}': {e}"))


-def create_cards(cards):
-    for card in cards:
-        if card["status"] != "released":
-            continue
-
-        body = {
-            "name": parse_name(card["name"]),
-            "cost": card["cost"],
-            "power": card["power"],
-            "ability": parse_ability(card["ability"]),
-            "series": parse_source(card["source"]),
-            "imageUrl": card["url"],
-        }
-
-        response = requests.post(CARDS_API_URL, json=body)
-        if response.status_code == requests.codes.created:
-            print("[%s] %s" % (datetime.now(), f"Created card: {card['name']}"))
-        else:
-            print("[%s] %s" % (datetime.now(), f"Failed to create card: {card['name']} - {response.text}"))
-
-
-def parse_name(name):
-    name = name.strip()
-
-    name_mappings = {
-        "Ant Man": "Ant-Man",
-        "Jane Foster Mighty Thor": "Jane Foster The Mighty Thor",
-        "Miles Morales": "Miles Morales: Spider-Man",
-        "Super-Skrull": "Super Skrull",
-    }
-
-    return name_mappings.get(name, name)
-
-
-def parse_ability(ability):
-    ability = ability.strip()
-
-    # Provide 'No ability' instead of empty string.
-    if not ability:
-        ability = "No ability"
-
-    # All following words should be shown in bold.
-    bold_candidates = [
-        "On Reveal",
-        "Ongoing",
-        "Widow's Bite",
-        "Rock",
-        "Rocks",
-        "Doombot",
-        "Squirrel",
-        "Demon",
-        "Drone",
-        "Mjolnir",
-        "Tiger",
-        "Limbo",
-        "No ability",
-    ]
-
-    for candidate in bold_candidates:
-        if candidate.lower() in ability.lower():
-            ability = re.sub(
-                candidate.lower(),
-                f"<span class='fw-bold'>{candidate}</span>",
-                ability,
-                flags=re.IGNORECASE,
-            )
-
-    for i in range(1, 10):
-        # +[1-9] should be shown in bold and green color.
-        ability = re.sub(
-            fr"[+][{i}]",
-            f"<span class='fw-bold' style='color: green;'>+{i}</span>",
-            ability,
-        )
-        # -[1-9] should be shown in bold and red color.
-        ability = re.sub(
-            fr"[-][{i}]",
-            f"<span class='fw-bold' style='color: red;'>-{i}</span>",
-            ability,
-        )
-
-    return ability
-
-
-def parse_source(source):
-    series_map = {
-        'Collection Level 1-14': 'Starter',
-        'Starter Card': 'Starter',
-        'Recruit Season': 'Starter',
-        'Pool 1': 'Series 1',
-        'Pool 2': 'Series 2',
-        'Pool 3': 'Series 3',
-        'Pool 4': 'Series 4',
-        'Pool 5': 'Series 5',
-        'Season Pass': 'Season Pass'
-    }
-    for key in series_map:
-        if key in source:
-            return series_map[key]
-    return ''
+def download_images(urls, dir: str = ROOT_DIR):
+    """
+    Downloads images from the given URLs and stores them in the given directory.
+    Args:
+        urls: A list of URLs to download images from.
+        dir: The directory to store the images in.
+    """
+
+    def download_image(url, dir: str = ROOT_DIR):
+        print("[%s] %s" % (datetime.now(), f"Download image from {url}"))
+        try:
+            response = requests.get(url)
+            response.raise_for_status()
+            file_name = url.rsplit('/', 1)[-1].rsplit('?', 1)[0]
+            file_path = os.path.join(dir, file_name)
+            with open(file_path, 'wb') as file:
+                file.write(response.content)
+        except requests.exceptions.RequestException as e:
+            print("[%s] %s" % (datetime.now(),
+                               f"Error downloading image from URL '{url}': {e}"))
+
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        for url in urls:
+            executor.submit(download_image, url, dir)
+
+    print("[%s] %s" %
+          (datetime.now(), f"Finished downloading. Check '{dir}' directory."))
+
+
+def create_directories():
+    """
+    Creates the directories for the card images.
+    ROOT_DIR
+    ├── CARDS_DIR
+    ├── LOCATIONS_DIR
+    └── VARIANTS_DIR
+    """
+    if not os.path.exists(ROOT_DIR):
+        os.mkdir(ROOT_DIR)
+
+    directories = [CARDS_DIR, VARIANTS_DIR, LOCATIONS_DIR]
+
+    for directory in directories:
+        path = os.path.join(ROOT_DIR, directory)
+        if not os.path.exists(path):
+            os.mkdir(path)
 
 
 if __name__ == '__main__':
     cards = get_cards()
-    image_urls = [card['art'] for card in cards]
-    download_images(image_urls)
-    # characters = scrap()
-    # create_cards(characters)
+    card_image_urls = [card['art'] for card in cards]
+    variant_image_urls = [variant['art'] for card in cards for variant in card.get('variants', [])]
+
+    locations = get_cards(LOCATIONS_API_URL)
+    location_image_urls = [location['art'] for location in locations]
+
+    create_directories()
+
+    download_images(card_image_urls, os.path.join(ROOT_DIR, CARDS_DIR))
+    download_images(variant_image_urls, os.path.join(ROOT_DIR, VARIANTS_DIR))
+    download_images(location_image_urls, os.path.join(ROOT_DIR, LOCATIONS_DIR))
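
For reference, a minimal sketch of the card payload shape the new __main__ block assumes; the dictionary values and URLs below are made up, and only the 'art' key and the nested variants list with its own 'art' key mirror what the script actually reads:

# Hypothetical card entry as returned under success.cards; the URLs are
# invented, but the keys match what scrapr.py reads via card['art'] and
# card.get('variants', []).
card = {
    'art': 'https://example.com/cards/misty-knight.webp',
    'variants': [
        {'art': 'https://example.com/variants/misty-knight-01.webp'},
    ],
}

# The same comprehensions __main__ uses to collect download targets:
cards = [card]
card_image_urls = [c['art'] for c in cards]
variant_image_urls = [v['art'] for c in cards for v in c.get('variants', [])]
print(card_image_urls)     # ['https://example.com/cards/misty-knight.webp']
print(variant_image_urls)  # ['https://example.com/variants/misty-knight-01.webp']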
