In [1]:
import pickle
import re
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup as BS
from tqdm import tqdm

In this section we crawl Wookieepedia's "Canon articles" category to collect the URL of every Canon article, and store the resulting dictionary as a pickle.

In [3]:
# Walk the paginated "Canon articles" category listing and collect one
# {article key: absolute URL} entry per member article.
page_url = 'https://starwars.fandom.com/wiki/Category:Canon_articles'
base_url = 'https://starwars.fandom.com'
request_timeout = 30  # seconds; without a timeout a stalled connection hangs the crawl
pages = {}
page_num = 1

while page_url is not None:
    result = requests.get(page_url, timeout=request_timeout)
    # Fail loudly on an HTTP error: an error page has no member links and no
    # "next" link, so the loop would otherwise stop and pickle a truncated list.
    result.raise_for_status()
    soup = BS(result.content, "html.parser")

    # extract urls
    links = soup.find_all('a', class_='category-page__member-link')
    links_before = len(pages)
    if links:
        for link in links:
            href = link.get('href')
            if href is None:
                # Anchor without a target: nothing to record.
                continue
            key = href.split('/')[-1]
            # Sub-categories are listed alongside articles; keep articles only.
            if 'Category:' not in key:
                pages[key] = base_url + href
        print(f'Page {page_num} - {len(pages) - links_before} new links ({page_url})')
        page_num += 1

    # get next page button
    next_urls = soup.find_all("a", class_='category-page__pagination-next')
    new_url = next_urls[0].get('href') if next_urls else None
    if new_url == page_url:
        # The "next" link points back at the current page: stop instead of looping forever.
        break
    page_url = new_url  # None when there is no next page, which ends the loop

print(f'Number of pages: {len(pages)}')

# Save to disk
with open('canon_articles.pkl', 'wb') as f:
    pickle.dump(pages, f, protocol=pickle.HIGHEST_PROTOCOL)

Page 1 - 199 new links (https://starwars.fandom.com/wiki/Category:Canon_articles)
Page 2 - 200 new links (https://starwars.fandom.com/wiki/Category:Canon_articles?from=234+BBY%0A234+BBY)
Page 3 - 200 new links (https://starwars.fandom.com/wiki/Category:Canon_articles?from=511+BBY%0A511+BBY)
Page 4 - 200 new links (https://starwars.fandom.com/wiki/Category:Canon_articles?from=Aargonar%0AAargonar)
Page 5 - 200 new links (https://starwars.fandom.com/wiki/Category:Canon_articles?from=Administrator%0AAdministrator)
Page 6 - 200 new links (https://starwars.fandom.com/wiki/Category:Canon_articles?from=Airship+%28Carnelion+IV%29%0AAirship+%28Carnelion+IV%29)
Page 7 - 200 new links (https://starwars.fandom.com/wiki/Category:Canon_articles?from=All+Terrain+Armored+Transport+%28original%29%0AAll+Terrain+Armored+Transport+%28original%29)
Page 8 - 200 new links (https://starwars.fandom.com/wiki/Category:Canon_articles?from=Ambush+of+a+rebel+convoy+at+Derra%0AAmbush+of+a+rebel+convoy+at+Derra)
Page 

We have created the dictionary with all current canon articles.  Now we will scrape each of them for its title, sidebar (infobox) information, first paragraph, and links to other canon articles.

In [4]:
scraped = {}
failed = {}
partition_size = 5000
folder = '../WPscraped/'
request_timeout = 30  # seconds; without a timeout one stalled connection blocks the whole run
last_number = 0       # number of the last partition written; stays 0 until the first save

# Same pattern as before, written as a raw string so the backslashes are not
# treated as (invalid) string escapes. It removes "(...)" and "[...]" spans,
# i.e. parentheticals and citation markers such as "[1]".
bracketed_re = re.compile(r"[\(\[].*?[\)\]]")


def _clean_sidebar_value(raw_text):
    """Turn the text of one infobox value into a string or a list of strings.

    Citation markers and parentheticals are removed. Every "]" is treated as a
    separator between entries, so "Human[1]Twi'lek[2]" becomes
    ["Human", "Twi'lek"] and "Human[1]" becomes "Human".

    A trailing separator comma is removed only when one is actually present.
    Dropping the last character unconditionally truncated values that carry no
    citation ("Tatooine" became "Tatooin").
    """
    text = '], '.join(raw_text.split(']'))
    text = bracketed_re.sub('', text).strip()
    if text.endswith(','):
        text = text[:-1]
    text = text.replace(',,', ',')
    if ',' in text:
        return [piece.strip() for piece in text.split(',') if piece.strip() != '']
    return text


def _parse_sidebar(soup):
    """Return (side_bar, is_character) extracted from the page's infobox sections.

    side_bar maps section title -> {attribute label -> cleaned value}.
    is_character is True when any attribute is labelled 'Species'.
    """
    side_bar = {}
    is_character = False
    for section in soup.find_all('section', class_='pi-item'):
        header = section.find('h2')
        title = '<no category>' if header is None else header.text
        # setdefault: sections sharing a title (e.g. several untitled ones) are
        # merged instead of the later one wiping out the earlier one.
        group = side_bar.setdefault(title, {})
        for item in section.find_all('div', class_='pi-item'):
            label = item.find('h3', class_='pi-data-label')
            attr = '<no attribute>' if label is None else label.text
            if attr == 'Species':
                is_character = True
            value_node = item.find('div', class_='pi-data-value')
            if value_node is None:
                # No value element: skip this row rather than failing the whole page.
                continue
            group[attr] = _clean_sidebar_value(value_node.text)
    return side_bar, is_character


def _scrape_page(session, key, page_url, known_keys):
    """Fetch one article and return its data dict, or None if it has no title heading.

    Raises on network/HTTP errors; the caller records those pages in `failed`.
    """
    result = session.get(page_url, timeout=request_timeout)
    result.raise_for_status()
    soup = BS(result.content, "html.parser")

    # Get title
    heading = soup.find('h1', id='firstHeading')
    if heading is None:
        return None

    side_bar, is_character = _parse_sidebar(soup)

    paragraph = ''
    keywords = []
    raw_content = soup.find('div', class_='mw-parser-output')
    if raw_content is not None:
        # First top-level paragraph that does not contain 'aside' in its markup.
        # `paragraph` stays '' when there is none, so a page can never inherit
        # the paragraph of the previously scraped article.
        for candidate in raw_content.find_all('p', recursive=False):
            if 'aside' in str(candidate):
                continue
            paragraph = bracketed_re.sub('', candidate.text)
            break

        # cross-links: hrefs whose last path segment is another known article key
        found = set()
        for link in raw_content.find_all('a'):
            href = link.get('href')
            if href is None:
                continue
            target = href.split('/')[-1]
            if target in known_keys and target != key:
                found.add(target)
        keywords = sorted(found)  # sorted so repeated runs give the same order

    return {
        'url': page_url,
        'title': heading.text,
        'is_character': is_character,
        'side_bar': side_bar,
        'paragraph': paragraph,
        'crosslinks': keywords,
    }


def _save_partition(records, number):
    """Pickle `records` as partition `number` inside `folder`."""
    fn = folder + f'starwars_all_canon_data_{number}.pickle'
    with open(fn, 'wb') as f:
        pickle.dump(records, f, protocol=pickle.HIGHEST_PROTOCOL)


# Make sure the output folder exists before starting the long scrape, so a
# missing folder is not discovered only when the first partition is written.
Path(folder).mkdir(parents=True, exist_ok=True)

# One Session for the whole run so connections to the site are reused.
with requests.Session() as session:
    for ix, (key, page_url) in tqdm(enumerate(pages.items()), total=len(pages)):
        try:
            record = _scrape_page(session, key, page_url, pages)
        except Exception as exc:
            # `Exception`, not a bare except: Ctrl-C (KeyboardInterrupt) must
            # still be able to stop the loop.
            print(f'Failed! {key}: {exc!r}')
            failed[key] = page_url
        else:
            if record is not None:
                scraped[key] = record

        # save partition
        # Outside the try block: a failed or skipped page at a partition
        # boundary no longer skips the save, and a write error surfaces
        # immediately instead of being counted as a failed page.
        if (ix + 1) % partition_size == 0:
            last_number = (ix + 1) // partition_size
            _save_partition(scraped, last_number)
            scraped = {}

# Save final part to disk
_save_partition(scraped, last_number + 1)

100%|██████████| 39460/39460 [4:51:26<00:00,  2.26it/s]  


Now that the canon data has been scraped and the unnecessary pieces of data stripped out, we have clean datasets from which we can extract dataframes for characters, planets, and vehicles.

In [5]:
from pathlib import Path
# Import the submodule explicitly: a plain `import urllib` does not load
# `urllib.parse`, so `urllib.parse.unquote` would only work if some other
# library had already imported it.
import urllib.parse


# Every partition pickle in the scrape output folder, in sorted path order.
files = sorted(Path('../WPscraped').glob('*.pickle'))
files

[WindowsPath('../WPscraped/starwars_all_canon_data_1.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_2.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_3.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_4.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_5.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_6.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_7.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_8.pickle')]

In [6]:
# Merge every partition pickle into one {article key: record} dictionary.
# NOTE(review): pickle.load executes whatever the file contains; only load
# partitions produced by this notebook.
data = {}
for partition_path in files:
    with open(partition_path, 'rb') as handle:
        data.update(pickle.load(handle))

len(data)

39460

In [7]:
def remove_url_stuff(text):
    """Percent-decode `text` and remove every double and single quote character."""
    decoded = urllib.parse.unquote(text)
    for quote_char in ('"', "'"):
        decoded = decoded.replace(quote_char, '')
    return decoded

In [8]:
# Re-key every record by its decoded article key and decode its crosslinks the
# same way, so keys and crosslink targets stay comparable.
# The record dicts are updated in place and `data` is rebound to the re-keyed dict.
cleaned = {}
for raw_key, record in tqdm(data.items()):
    clean_key = remove_url_stuff(raw_key)
    cleaned[clean_key] = record
    record['crosslinks'] = list(map(remove_url_stuff, record['crosslinks']))
data = cleaned

100%|██████████| 39460/39460 [00:01<00:00, 36189.95it/s]
