## Arknights - Web Scraping Project

<img src="https://gamepress.gg/arknights/sites/arknights/files/2021-01/WhoIsRealBanner_0.jpeg" width="100%">

In [1]:
from pathlib import Path
from pprint import pprint

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

In [2]:
# Listing-page URL template; the `{}` placeholder is filled with a star rating.
URL = 'https://arknights.fandom.com/wiki/Operator/{}-star'

In [3]:
# Star ratings to scrape: 1 through 6 (the upper bound of range() is exclusive).
STARS = range(1, 7)

In [4]:
# Accumulates the absolute URL of every operator page found on the listing pages.
operators_urls = []

In [5]:
# Collect the wiki page URL of every operator, one star-rating listing at a time.
# The list is rebuilt from scratch here so that re-running this cell does not
# append duplicate URLs on top of a previous run's results.
operators_urls = []

for star in STARS:
    url = URL.format(star)

    # A timeout keeps the cell from hanging forever on a stalled connection.
    req = requests.get(url, timeout=30)
    if req.status_code != 200:
        raise requests.ConnectionError(
            'connection failed: {} returned HTTP {}.'.format(url, req.status_code)
        )

    soup = BeautifulSoup(req.text, 'html.parser')

    listing = soup.find('table', attrs={'class': 'mrfz-btable'})
    if listing is None:
        # Fail with a message that names the page instead of an AttributeError.
        raise RuntimeError('operator table not found on {}'.format(url))

    # Use <tbody> when the markup has one; otherwise search the table itself.
    table = listing.tbody if listing.tbody is not None else listing
    for row in table.find_all('tr'):
        columns = row.find_all('td')
        if not columns:
            # Rows without any <td> cell (presumably the header row) carry no link.
            continue

        # The second cell holds a site-relative link to the operator's page.
        operators_urls.append('https://arknights.fandom.com' + columns[1].a['href'])

In [6]:
# Sanity check: show the first five collected URLs.
operators_urls[:5]

['https://arknights.fandom.com/wiki/Castle-3',
 'https://arknights.fandom.com/wiki/Justice_Knight',
 'https://arknights.fandom.com/wiki/Lancet-2',
 'https://arknights.fandom.com/wiki/THRM-EX',
 'https://arknights.fandom.com/wiki/12F']

In [7]:
# Number of operator pages that will be scraped below.
len(operators_urls)

235

In [8]:
# Accumulates one dict of scraped fields per operator page.
operators = []

In [9]:
def _last_cell(row):
    """Return the last <td> element of a table row."""
    return row.find_all('td')[-1]


# Labels applied, by position, to the cells of each row of the attribute table.
# NOTE(review): labelling is purely positional; if a page's table has fewer
# columns than labels, values would be filed under the wrong stage, and more
# columns than labels raises IndexError — confirm against the wiki markup.
attrs = ['base', 'elite_1', 'elite_2', 'max', 'trust']

# Scrape every operator page into one flat dict per operator.
# The list is rebuilt from scratch here so that re-running this cell does not
# append duplicate records on top of a previous run's results.
operators = []

for url in tqdm(operators_urls):
    # A timeout keeps the loop from hanging forever on a stalled connection.
    req = requests.get(url, timeout=30)
    if req.status_code != 200:
        raise requests.ConnectionError(
            'connection failed: {} returned HTTP {}.'.format(url, req.status_code)
        )

    # Decode the response body as UTF-8 before handing it to the parser.
    req.encoding = 'utf-8'

    soup = BeautifulSoup(req.text, 'html.parser')

    operator = {}

    # --- Summary box ("op-info"): every field is read by row position. ---
    op_info = soup.find('div', attrs={'class': 'op-info'}).table.tbody
    op_info_rows = op_info.find_all('tr')
    header_row = op_info_rows[0]
    header_cell = _last_cell(header_row)
    img_info = header_row.td.find_all('span', recursive=False)

    # The name is the node that directly follows the first <br> of the first row.
    operator['name'] = header_row.find('br').next_sibling.text.strip()
    operator['class'] = img_info[0].a['title']
    operator['branch'] = img_info[1].a['title']
    operator['faction'] = img_info[2].a['title']
    operator['stars'] = header_row.find('a', attrs={'class': 'mw-redirect'})['title']
    operator['position'] = _last_cell(op_info_rows[1]).text.strip()
    operator['tags'] = [t.text.strip() for t in _last_cell(op_info_rows[2]).find_all('div')]
    operator['trait'] = _last_cell(op_info_rows[3]).text.strip()
    operator['availability'] = _last_cell(op_info_rows[4]).text.strip()
    operator['icon'] = op_info.find('div', attrs={'class': 'floatnone'}).img['data-src']
    operator['description'] = header_cell.find_all('div', recursive=False)[-1].div.text.strip()
    operator['phrase'] = header_cell.find('i').text.strip()

    # --- Infobox items: the <h3> label becomes a snake_case key. ---
    for info in soup.find_all('div', attrs={'class': 'pi-item'}):
        operator[info.h3.text.strip().replace(' ', '_').lower()] = info.div.text.strip()

    # --- Attribute table: the second "mrfz-btable" on the page, first row skipped. ---
    attrs_table = soup.find_all('table', attrs={'class': 'mrfz-btable'})[1].tbody
    for attr in attrs_table.find_all('tr')[1:]:
        attr_text = attr.th.text.strip().replace(' ', '_').lower()
        columns = attr.find_all('td')
        for i, col in enumerate(columns):
            # Keys look like "<label>_<row heading>", e.g. the label 'base' plus the row's <th> text.
            operator[attrs[i] + '_' + attr_text] = col.text.strip()

    # --- Artwork: caption -> image URL. ---
    image_collection = soup.find('div', attrs={'class': 'pi-image-collection'})
    if image_collection is not None:
        images = [img.a['href'] for img in image_collection.find_all('figure')]
        captions = [c.text.strip() for c in image_collection.find_all('li')]
        operator['images'] = dict(zip(captions, images))
    else:
        # No image collection on the page: use the single infobox image, keyed by name.
        operator['images'] = {operator['name']: soup.find('figure', attrs={'class': 'pi-item pi-image'}).a['href']}

    operators.append(operator)

  0%|          | 0/235 [00:00<?, ?it/s]

In [10]:
# Build one row per scraped operator; keys missing from an operator's dict
# become NaN in that row. The trailing expression displays the first rows.
df = pd.DataFrame(operators)
df.head()

Unnamed: 0,name,class,branch,faction,stars,position,tags,trait,availability,icon,...,arts_adaptability,paradox_sim.,community_nickname(s),age,real_name,operator_rec._1,operator_rec._2,leitmotif,full_name,english
0,Castle-3,Guard,Dreadnought,Rhodes Island,1-star,Melee,"[Support, Robot]",Blocks 1 enemy and ignores the Deployment Limi...,Recruitment,https://static.wikia.nocookie.net/mrfz/images/...,...,,,,,,,,,,
1,"""Justice Knight""",Sniper,Marksman,Pinus Sylvestris,1-star,Ranged,"[Support, Robot]",Attacks aerial enemies first and ignores the D...,Recruitment,https://static.wikia.nocookie.net/mrfz/images/...,...,,,,,,,,,,
2,Lancet-2,Medic,Medic,Rhodes Island,1-star,Ranged,"[Healing, Robot]",Restores the HP of allies and ignores the Depl...,Recruitment; TR-10,https://static.wikia.nocookie.net/mrfz/images/...,...,,,,,,,,,,
3,THRM-EX,Specialist,Executor,Rhodes Island,1-star,Melee,"[Nuker, Robot]",Does not attack and ignores the Deployment Lim...,Recruitment; 7-2,https://static.wikia.nocookie.net/mrfz/images/...,...,,,,,,,,,,
4,12F,Caster,Splash,Rhodes Island,2-star,Ranged,[Starter],Deals AOE Arts Damage,Recruitment; TR-6,https://static.wikia.nocookie.net/mrfz/images/...,...,Standard,Survivor,"[CN] 太子 (""The Prince"")[1]",,,,,,,


In [12]:
# Persist the scraped table. The target folder is created first because
# DataFrame.to_csv does not create missing parent directories.
output_path = Path('src/data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)