In [2]:
import requests
from bs4 import BeautifulSoup as BS
import pandas as pd
import pickle
import re
from tqdm import tqdm
from pathlib import Path
import urllib
import pyarrow

We will pull the Star Wars pickles to extract just the planets/locales information

In [3]:
files = sorted(Path('../WPscraped').glob('*.pickle'))
files

[WindowsPath('../WPscraped/starwars_all_canon_data_1.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_2.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_3.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_4.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_5.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_6.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_7.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_8.pickle')]

In [4]:
#create the dictionary to run
data = {}
for fn in files:
    with open(fn, 'rb') as f:
        part = pickle.load(f)
    data.update(part)

# The code is to open the data and to count to make sure all files in the .pickles are carried over
len(data)

39460

In [5]:
#find the key in the dictionary
def find_key(key_name, data):
    for key, value in data.items():
        if key_name == key:
            return value
        if isinstance(value, dict):
            value = find_key(key_name, value)
            if value is not None:
                return value
    return None

def get_first(key_name, data):
    result = find_key(key_name, data)
    if isinstance(result, list):
        result = result[0]
    return result

In [6]:
# test on the planet Ilum

print(data['Ilum'])

{'url': 'https://starwars.fandom.com/wiki/Ilum', 'title': '\n\t\t\t\t\tIlum\t\t\t\t', 'is_character': False, 'side_bar': {'Astrographical information': {'Region': 'Unknown Regions', 'Sector': '7G sector', 'System': ['Ilum system', 'Starkiller Base system'], 'Suns': ['Asar', 'Unidentified star'], 'Orbital position': '5', 'Moons': ['2', '0'], 'Grid square': ['G-7'], 'Trade routes': 'Metellos-Ilum hyperspace route', 'Rotation period': '66 hours', 'Orbital period': ['301 days', 'Variable']}, 'Physical information': {'Diameter': '660 kilometers', 'Atmosphere': ['Breathable'], 'Climate': 'Frigid', 'Primary terrain': ['Crystallized glaciers', 'Ice plains', 'Mountains', 'Forests'], 'Points of interest': ['Jedi Temple', 'Crystal Cave', 'Imperial Trench', 'Starkiller Base'], 'Flora': 'Trees', 'Fauna': ['Asharl panther', 'Blismal', 'Gorgodon', 'Razhak', 'Snowfeather bird', 'Unidentified creature']}, 'Societal information': {'Immigrated species': 'Human', 'Population': ['5', '200'], 'Major exports

In [7]:
# test on the planet Corellia

print(data['Corellia'])

{'url': 'https://starwars.fandom.com/wiki/Corellia', 'title': '\n\t\t\t\t\tCorellia\t\t\t\t', 'is_character': False, 'side_bar': {'Astrographical information': {'Region': 'Core Worlds', 'Sector': 'Corellian sector', 'System': 'Corellia system', 'Suns': ['1:', 'Corell'], 'Moons': ['Multiple:', "Corellia's nearest moon", 'Gus', 'Gus Talon'], 'Grid square': 'M-11', 'Trade routes': ['Corellian Run', 'Corellian Trade Spine'], 'Rotation period': '25 standard hours', 'Orbital period': '329 standard days'}, 'Physical information': {'Class': 'Terrestrial', 'Diameter': ['11', '000 kilometers'], 'Atmosphere': 'Breathable', 'Climate': 'Temperate', 'Primary terrain': ['Forests', 'Jungles', 'Oceans', 'Industrial urban'], 'Points of interest': ['Bottoms', 'Gilded Descent Casino', 'Imperial training center', 'Navigation institute', 'Santhe Shipyards'], 'Flora': 'Tree', 'Fauna': ['Bluevev glider', 'Coppergrin', 'Corellian hound', 'Creedok', 'Corellian sand panther', 'Fingerlip garpon', 'Fleek eel', 'Ga

In [8]:
# test on the planet Coruscant

print(data['Coruscant'])

{'url': 'https://starwars.fandom.com/wiki/Coruscant', 'title': '\n\t\t\t\t\tCoruscant\t\t\t\t', 'is_character': False, 'side_bar': {'Astrographical information': {'Region': ['Core Worlds', 'The Interior'], 'Sector': ['Corusca sector', 'Coruscant subsector'], 'System': 'Coruscant system', 'Suns': '1: Coruscant Prime', 'Moons': ['4', 'Centax-1', 'Centax-2', 'Centax-3', 'Hesperidium'], 'Grid square': 'L-9', 'XYZ coordinates': ['0', '0', '0'], 'Trade routes': ['Corellian Run', 'Koros Trunk Line', 'Metellos Trade Route', 'Namadii Corridor', 'Nexus Route', 'Perlemian Trade Route'], 'Rotation period': '24 standard hours', 'Orbital period': '365 standard days'}, 'Physical information': {'Diameter': ['12', '240 kilometers'], 'Atmosphere': 'Breathable', 'Climate': 'Temperate ', 'Primary terrain': ['Ecumenopolis', 'Dense', 'planetwide multilevel urban city'], 'Points of interest': ['Bureau of Ships and Services Heritage Museum', 'CoCo Town', 'Coruscant spaceport', 'Coruscant Imperial shipyard', '

In [9]:
# test on the moon Yavin 4

print(data['Yavin_4'])

{'url': 'https://starwars.fandom.com/wiki/Yavin_4', 'title': '\n\t\t\t\t\tYavin 4\t\t\t\t', 'is_character': False, 'side_bar': {'Astrographical information': {'Region(s)': 'Outer Rim Territories', 'Sector': 'Gordian Reach', 'System': 'Yavin system', 'Planet': 'Yavin Prime', 'Grid square': 'P-6', 'Orbital period': ['4', '818 days']}, 'Physical information': {'Diameter': ['10', '200 kilometers'], 'Atmosphere': 'Type I ', 'Climate': ['Temperate', 'to tropical'], 'Primary terrain': 'Jungles', 'Point(s) of interest': ['Ferra Groves', 'Flat Mountain of Yavin', 'Great Temple', 'Massassi Valley', 'Skygazer Hill', 'Yavin 4 Defense Force station'], 'Flora': ['Climbing fern', 'Bioluminescent orchids', 'Koyo', 'Massassi tree', 'Moss', 'Vines'], 'Fauna': ['Angler', 'Armored eel', 'Leviathan grub', 'Lizard crab', 'Piranha beetle', 'Runyip', 'Stintaril', 'Whisper bird', 'Woolamander', 'Unidentified worm'], 'Other lifeforms': ['Algae', 'Grenade fungi']}, 'Societal information': {'Immigrated species': 

In [15]:
# Create an empty dictionary to store the planet data
planet_dict = {}

# Loop through each key in the dictionary
for key in data:
    # Check if the item contains "Astrographical information" in its side bar before extracting data
    if "Astrographical information" in data[key]["side_bar"]:
        # Extract the planet name from the title column
        planet_name = data[key]["title"].strip()

        # Extract the suns value from the Astrographical information section
        suns = data[key]["side_bar"]["Astrographical information"].get("Suns", None) 

        # Extract the moons value from the Astrographical information section
        moons = data[key]["side_bar"]["Astrographical information"].get("Moons", None)

        # Extract the atmosphere value from the Physical information section
        physical_info = data[key]["side_bar"].get("Physical information", {})
        atmosphere = physical_info.get("Atmosphere", None)

        # Extract the climate value from the Physical information section
        climate = physical_info.get("Climate", None)

        # Extract the native species value from the Societal information section
        societal_info = data[key]["side_bar"].get("Societal information", {})
        native_species = societal_info.get("Native species", None) 

        # Extract the immigrated species value from the Societal information section
        immigrated_species = societal_info.get("Immigrated species", None)
                      
        # Extract the population value from the Societal information section
        population = societal_info.get("Population", None)
            
        # Extract the affiliation value from the Societal information section
        affiliation = societal_info.get("Affiliation", None)
        
        # Extract the major exports value from the Economic information section
        economics = data[key]["side_bar"].get("Economic information", {})
        major_exports = economics.get("Major exports", None)
            
        # Create a dictionary with all of the planets's data
        planets = {
                "suns": suns,
                "moons": moons,
                "atmosphere": atmosphere,
                "climate": climate,
                "native_species": native_species,
                "immigrated_species": immigrated_species,
                "population": population,
                "major_exports": major_exports,
                "affiliation": affiliation
                }

        # Add the planet's name and its corresponding dictionary to our main dictionary
        planet_dict[planet_name] = planets

In [16]:
# create a dataframe from the planet dictionary
astro_df = pd.DataFrame(planet_dict)

# transpose dataframe
astro_df = astro_df.T

In [17]:
astro_df

Unnamed: 0,suns,moons,atmosphere,climate,native_species,immigrated_species,population,major_exports,affiliation
3rd Moon,,,,,,Various,,,
7G sector,,,,,,,,,Galactic Empire
Yasooska,,,,,,,,,
Aakaash,,,,,,,,,
Aakaash system,Aakaash,,,,,Humans,174.2 billion,,Land & Sky Corporation
...,...,...,...,...,...,...,...,...,...
Zorbia,,,,,,,,,
Zygerria,1,1,Breathable,Temperate,Zygerrian,,,,"[Zygerrian Slave Empire, Confederacy of Indepe..."
Zygerrian system,1,,,,Zygerrian,,,,
Zyzek,,,,,,,,,


In [18]:
astro_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2562 entries, 3rd Moon to Zyzek system
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   suns                381 non-null    object
 1   moons               250 non-null    object
 2   atmosphere          403 non-null    object
 3   climate             205 non-null    object
 4   native_species      475 non-null    object
 5   immigrated_species  395 non-null    object
 6   population          102 non-null    object
 7   major_exports       0 non-null      object
 8   affiliation         789 non-null    object
dtypes: object(9)
memory usage: 200.2+ KB


In [19]:
# convert columns with lists to string
astro_df['suns'] = astro_df['suns'].astype('str')
astro_df['moons'] = astro_df['moons'].astype('str')
astro_df['atmosphere'] = astro_df['atmosphere'].astype('str')
astro_df['climate'] = astro_df['climate'].astype('str')
astro_df['native_species'] = astro_df['native_species'].astype('str')
astro_df['immigrated_species'] = astro_df['immigrated_species'].astype('str')
astro_df['population'] = astro_df['population'].astype('str')
astro_df['major_exports'] = astro_df['major_exports'].astype('str')
astro_df['affiliation'] = astro_df['affiliation'].astype('str')

# create the Planets/Regions parquet

astro_df.to_parquet('../WPscraped/StarWars_Planets.parquet', index=False)