In [23]:
import requests
from bs4 import BeautifulSoup as BS
import pandas as pd
import pickle
import re
from tqdm import tqdm
from pathlib import Path
import urllib
import pyarrow

We will pull the Star Wars pickles to extract just the planets/locales information

In [24]:
files = sorted(Path('../WPscraped').glob('*.pickle'))
files

[WindowsPath('../WPscraped/starwars_all_canon_data_1.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_2.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_3.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_4.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_5.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_6.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_7.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_8.pickle')]

In [25]:
#create the dictionary to run
data = {}
for fn in files:
    with open(fn, 'rb') as f:
        part = pickle.load(f)
    data.update(part)

# The code is to open the data and to count to make sure all files in the .pickles are carried over
len(data)

39460

In [26]:
#find the key in the dictionary
def find_key(key_name, data):
    for key, value in data.items():
        if key_name == key:
            return value
        if isinstance(value, dict):
            value = find_key(key_name, value)
            if value is not None:
                return value
    return None

def get_first(key_name, data):
    result = find_key(key_name, data)
    if isinstance(result, list):
        result = result[0]
    return result

In [27]:
# test on the planet Ilum

print(data['Ilum'])

{'url': 'https://starwars.fandom.com/wiki/Ilum', 'title': '\n\t\t\t\t\tIlum\t\t\t\t', 'is_character': False, 'side_bar': {'Astrographical information': {'Region': 'Unknown Regions', 'Sector': '7G sector', 'System': ['Ilum system', 'Starkiller Base system'], 'Suns': ['Asar', 'Unidentified star'], 'Orbital position': '5', 'Moons': ['2', '0'], 'Grid square': ['G-7'], 'Trade routes': 'Metellos-Ilum hyperspace route', 'Rotation period': '66 hours', 'Orbital period': ['301 days', 'Variable']}, 'Physical information': {'Diameter': '660 kilometers', 'Atmosphere': ['Breathable'], 'Climate': 'Frigid', 'Primary terrain': ['Crystallized glaciers', 'Ice plains', 'Mountains', 'Forests'], 'Points of interest': ['Jedi Temple', 'Crystal Cave', 'Imperial Trench', 'Starkiller Base'], 'Flora': 'Trees', 'Fauna': ['Asharl panther', 'Blismal', 'Gorgodon', 'Razhak', 'Snowfeather bird', 'Unidentified creature']}, 'Societal information': {'Immigrated species': 'Human', 'Population': ['5', '200'], 'Major exports

In [28]:
# test on the planet Corellia

print(data['Corellia'])

{'url': 'https://starwars.fandom.com/wiki/Corellia', 'title': '\n\t\t\t\t\tCorellia\t\t\t\t', 'is_character': False, 'side_bar': {'Astrographical information': {'Region': 'Core Worlds', 'Sector': 'Corellian sector', 'System': 'Corellia system', 'Suns': ['1:', 'Corell'], 'Moons': ['Multiple:', "Corellia's nearest moon", 'Gus', 'Gus Talon'], 'Grid square': 'M-11', 'Trade routes': ['Corellian Run', 'Corellian Trade Spine'], 'Rotation period': '25 standard hours', 'Orbital period': '329 standard days'}, 'Physical information': {'Class': 'Terrestrial', 'Diameter': ['11', '000 kilometers'], 'Atmosphere': 'Breathable', 'Climate': 'Temperate', 'Primary terrain': ['Forests', 'Jungles', 'Oceans', 'Industrial urban'], 'Points of interest': ['Bottoms', 'Gilded Descent Casino', 'Imperial training center', 'Navigation institute', 'Santhe Shipyards'], 'Flora': 'Tree', 'Fauna': ['Bluevev glider', 'Coppergrin', 'Corellian hound', 'Creedok', 'Corellian sand panther', 'Fingerlip garpon', 'Fleek eel', 'Ga

In [29]:
# test on the planet Coruscant

print(data['Coruscant'])

{'url': 'https://starwars.fandom.com/wiki/Coruscant', 'title': '\n\t\t\t\t\tCoruscant\t\t\t\t', 'is_character': False, 'side_bar': {'Astrographical information': {'Region': ['Core Worlds', 'The Interior'], 'Sector': ['Corusca sector', 'Coruscant subsector'], 'System': 'Coruscant system', 'Suns': '1: Coruscant Prime', 'Moons': ['4', 'Centax-1', 'Centax-2', 'Centax-3', 'Hesperidium'], 'Grid square': 'L-9', 'XYZ coordinates': ['0', '0', '0'], 'Trade routes': ['Corellian Run', 'Koros Trunk Line', 'Metellos Trade Route', 'Namadii Corridor', 'Nexus Route', 'Perlemian Trade Route'], 'Rotation period': '24 standard hours', 'Orbital period': '365 standard days'}, 'Physical information': {'Diameter': ['12', '240 kilometers'], 'Atmosphere': 'Breathable', 'Climate': 'Temperate ', 'Primary terrain': ['Ecumenopolis', 'Dense', 'planetwide multilevel urban city'], 'Points of interest': ['Bureau of Ships and Services Heritage Museum', 'CoCo Town', 'Coruscant spaceport', 'Coruscant Imperial shipyard', '

In [55]:
# Create an empty dictionary to store the planet data
planet_dict = {}

# Loop through each key in the dictionary
for key in data:
    # Check if the item contains "Astrographical information" in its side bar before extracting data
    if "Astrographical information" in data[key]["side_bar"]:
        # Extract the planet name from the title column
        planet_name = data[key]["title"].strip()

        # Extract the suns value from the Astrographical information section
        if "Suns" in data[key]["side_bar"]["Astrographical information"] and ":" in data[key]["side_bar"]["Astrographical information"]["Suns"]:
            suns = data[key]["side_bar"]["Astrographical information"]["Suns"].split(": ")[1]
        else:
            continue #skip because it does not have a sun

        # Extract the moons value from the Astrographical information section
        if "Moons" in data[key]["side_bar"]["Astrographical information"]:
            moons = data[key]["side_bar"]["Astrographical information"]["Moons"]
        else:
            moons = None

        # Extract the atmosphere value from the Physical information section
        if "Physical information" in data[key]["side_bar"] and "Atmosphere" in data[key]["side_bar"]["Physical information"]:
            atmosphere = data[key]["side_bar"]["Physical information"]["Atmosphere"]
        else:
            atmosphere = None

        # Extract the climate value from the Physical information section
        if "Physical information" in data[key]["side_bar"] and "Climate" in data[key]["side_bar"]["Physical information"]:
            climate = data[key]["side_bar"]["Physical information"]["Climate"]
        else:
            climate = None

        # Extract the native species value from the Societal information section
        if "Societal information" in data[key]["side_bar"] and "Native species" in data[key]["side_bar"]["Societal information"]:
            native_species = data[key]["side_bar"]["Societal information"]["Native species"]
        else:
            native_species = None

        # Extract the immigrated species value from the Societal information section
        if "Societal information" in data[key]["side_bar"] and "Immigrated species" in data[key]["side_bar"]["Societal information"]:
            immigrated_species = data[key]["side_bar"]["Societal information"]["Immigrated species"]
        else:
            immigrated_species = None

        # Extract the population value from the Societal information section
        if "Societal information" in data[key]["side_bar"] and "Population" in data[key]["side_bar"]["Societal information"]:
            population = data[key]["side_bar"]["Societal information"]["Population"]
        else:
            population = None

        # Extract the major exports value from the Economic information section
        if "Economic information" in data[key]["side_bar"] and "Major exports" in data[key]["side_bar"]["Economic information"]:
            major_exports = data[key]["side_bar"]["Economic information"]["Major exports"]
        else:
            major_exports = None

        # Extract the affiliation value from the Societal information section
        if "Societal information" in data[key]["side_bar"] and "Affiliation" in data[key]["side_bar"]["Societal information"]:
            affiliation = data[key]["side_bar"]["Societal information"]["Affiliation"]
        else:
            affiliation = None

        # Check if planet has a sun and atmosphere before adding it to our dictionary
        if suns is not None and atmosphere is not None:
            # Create a dictionary with all of the planet's data
            planet_data = {
                "suns": suns,
                "moons": moons,
                "atmosphere": atmosphere,
                "climate": climate,
                "native_species": native_species,
                "immigrated_species": immigrated_species,
                "population": population,
                "major_exports": major_exports,
                "affiliation": affiliation
            }

            # Add the planet's name and its corresponding dictionary to our main dictionary
            planet_dict[planet_name] = planet_data

In [56]:
# creating a loop to test the first 3 items to make sure it works properly

for planet in list(planet_dict.keys())[:30]:
    print(planet)
    print(planet_dict[planet])

Alderaan
{'suns': "Alderaan's sun", 'moons': '0', 'atmosphere': 'Breathable', 'climate': None, 'native_species': None, 'immigrated_species': 'Human', 'population': ['2 billion', '95% humans', '5% other'], 'major_exports': None, 'affiliation': ['Elder Houses', 'House of Organa', 'Galactic Republic', 'Galactic Empire', 'Alliance to Restore the Republic']}
Anoat
{'suns': "Anoat's star", 'moons': '1', 'atmosphere': 'Polluted', 'climate': 'Toxic', 'native_species': None, 'immigrated_species': None, 'population': None, 'major_exports': None, 'affiliation': ['Noble Court', 'Jedi Order', 'Galactic Empire', 'Resistance']}
Atollon
{'suns': 'Ashbo', 'moons': '1', 'atmosphere': 'Breathable', 'climate': 'Arid', 'native_species': None, 'immigrated_species': ['Human', 'Lasat', "Twi'lek"], 'population': ['438', '77% Human', "19% Twi'lek", '4% other'], 'major_exports': None, 'affiliation': ['Alliance to Restore the Republic', 'Phoenix Cell']}
Cato Neimoidia
{'suns': 'Neri', 'moons': 'At least 2', 'atmo

In [51]:
planets = pd.DataFrame(planet_dict)

In [52]:
planets

Unnamed: 0,Alderaan,Anoat,Atollon,Cato Neimoidia,Christophsis,Coruscant,Crait,D'Qar,Dantooine,Dathomir,Felucia,Lothal,Mustafar,Naboo,Onderon,Pijal,Sirpar,Taanab,Utapau
suns,Alderaan's sun,Anoat's star,Ashbo,Neri,Christophsis' sun,Coruscant Prime,Crait,Ileenium,Dina,Domir,Felix,Lothal,Priate,Naboo,Prael,Pijal's sun,Sirpar's sun,Tive,Utapau
moons,0,1,1,At least 2,1: Leesis,"[4, Centax-1, Centax-2, Centax-3, Hesperidium]",0,2,2,4,8,2,,"[3, Ohma-D'un, Onoam, Veruna]","[At least 4:, Dagri, Dxun, Evas, Suthre]",1: Pijal's moon,,1: Taanab's moon,9
atmosphere,Breathable,Polluted,Breathable,Breathable,Breathable,Breathable,"[Breathable, Oxygen, Nitrogen]",Breathable,Type I,Breathable,Type I,Type I breathable,Type II,Breathable,Type I,Type I,Breathable,Type I,Type I
climate,,Toxic,Arid,,,Temperate,Temperate,Temperate,Temperate,Temperate,Hot and humid,Temperate,"[Hot, ash-laden, and stormy]",Temperate,Temperate,Temperate,Hot,Mild,Temperate
native_species,,,,,,"[Human, Taung, Zhell]",,,,"[Dathomirian, Fromprath]","[Felucian, Jungle Felucian]",Loth-wolf,Mustafarian,Gungans,,,Human,Human,"[Pau'an, Utai]"
immigrated_species,Human,,"[Human, Lasat, Twi'lek]",Neimoidian,"[Human, Christophsian]","[Besalisk, Balosar, Bith, Cerean, Cosian, Gran...",,,Humans,"[Human, Zabrak, Zeffonian, Toydarian]","[Gossam, Kyuzo, Tee-muss]","[Anx, Aqualish, Balosar, Bardottan, Chagrian, ...","[Alazmec, Falleen, Human, A large variety of o...","[Elders, Humans, Naboo]","[Bivall, Human, Ithorian, Togruta, Twi'lek]","[Abednedos, Duros, Humans, Ithorians, Nautolan...",,"[Bantha, Staga, Roba, Nerf]","[Amani, Sugi]"
population,"[2 billion, 95% humans, 5% other]",,"[438, 77% Human, 19% Twi'lek, 4% other]",,,"[Trillions, 78% humans, 22% other]",Uninhabited,,,"[Roughly 600, 90% human, 7% Zabrak, 3% other, ...",425 million,,"[Approximately 15, 000, –20, 000]","[4.5 billion:, 72% Gungan, 27% Human, 1% Other]",,,,980 million,"[95 million:, 65% Utai, 30% Pau'an, 5% other]"
major_exports,,,,,,,,,,,,,,,,,,,
affiliation,"[Elder Houses, House of Organa, Galactic Repub...","[Noble Court, Jedi Order, Galactic Empire, Res...","[Alliance to Restore the Republic, Phoenix Cell]","[Galactic Republic, Trade Federation, InterGal...","[Galactic Republic, Galactic Empire]","[Dai Bendu, Galactic Republic, Jedi Order, Sit...","[Alliance to Restore the Republic, Resistance]","[Alliance to Restore the Republic, Resistance]","[Jedi Order, Galactic Republic, Alliance to Re...","[Nightsisters, Sith, Confederacy of Independen...","[Commerce Guild, Confederacy of Independent Sy...","[Jedi Order, Galactic Republic, Galactic Empir...","[Jedi Order, Sith, Techno Union, Black Sun, Co...","[Gungan High Council, Royal House of Naboo, Ga...","[Galactic Republic, Confederacy of Independent...","[Pijali monarchy, Czerka Corporation, Galactic...","[Galactic Empire, Arkanis Academy]",,"[Techno Union, Confederacy of Independent Syst..."


In [45]:
planets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9 entries, suns to affiliation
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Alderaan        6 non-null      object
 1   Anoat           5 non-null      object
 2   Atollon         7 non-null      object
 3   Cato Neimoidia  5 non-null      object
 4   Christophsis    5 non-null      object
 5   Coruscant       8 non-null      object
 6   Crait           6 non-null      object
 7   D'Qar           6 non-null      object
 8   Dantooine       6 non-null      object
 9   Dathomir        8 non-null      object
 10  Felucia         8 non-null      object
 11  Lothal          7 non-null      object
 12  Mustafar        7 non-null      object
 13  Naboo           8 non-null      object
 14  Onderon         6 non-null      object
 15  Pijal           6 non-null      object
 16  Sirpar          5 non-null      object
 17  Taanab          7 non-null      object
 18  Utapau

In [53]:
# Create an empty dataframe to store the planet data
planets = pd.DataFrame(columns=["Planet Name", "Suns", "Moons", "Atmosphere", "Climate", "Native Species", "Immigrated Species", "Population", "Major Exports", "Affiliation"])

# Loop through each key in the dictionary
for key in data:
    # Check if the item contains "Astrographical information" in its side bar before extracting data
    if "Astrographical information" in data[key]["side_bar"]:
        # Extract the planet name from the title column
        planet_name = data[key]["title"].strip()

        # Extract the suns value from the Astrographical information section
        if "Suns" in data[key]["side_bar"]["Astrographical information"] and ":" in data[key]["side_bar"]["Astrographical information"]["Suns"]:
            suns = data[key]["side_bar"]["Astrographical information"]["Suns"].split(": ")[1]
        else:
            continue #skip if it does not have a sun

        # Extract the moons value from the Astrographical information section
        if "Moons" in data[key]["side_bar"]["Astrographical information"]:
            moons = data[key]["side_bar"]["Astrographical information"]["Moons"]
        else:
            moons = None

        # Extract the atmosphere value from the Physical information section
        if "Physical information" in data[key]["side_bar"] and "Atmosphere" in data[key]["side_bar"]["Physical information"]:
            atmosphere = data[key]["side_bar"]["Physical information"]["Atmosphere"]
        else:
            atmosphere = None

        # Extract the climate value from the Physical information section
        if "Physical information" in data[key]["side_bar"] and "Climate" in data[key]["side_bar"]["Physical information"]:
            climate = data[key]["side_bar"]["Physical information"]["Climate"]
        else:
            climate = None

        # Extract the native species value from the Societal information section
        if "Societal information" in data[key]["side_bar"] and "Native species" in data[key]["side_bar"]["Societal information"]:
            native_species = data[key]["side_bar"]["Societal information"]["Native species"]
        else:
            native_species = None

        # Extract the immigrated species value from the Societal information section
        if "Societal information" in data[key]["side_bar"] and "Immigrated species" in data[key]["side_bar"]["Societal information"]:
            immigrated_species = data[key]["side_bar"]["Societal information"]["Immigrated species"]
        else:
            immigrated_species = None

        # Extract the population value from the Societal information section
        if "Societal information" in data[key]["side_bar"] and "Population" in data[key]["side_bar"]["Societal information"]:
            population = data[key]["side_bar"]["Societal information"]["Population"]
        else:
            population = None

        # Extract the major exports value from the Economic information section
        if "Economic information" in data[key]["side_bar"] and "Major exports" in data[key]["side_bar"]["Economic information"]:
            major_exports = data[key]["side_bar"]["Economic information"]["Major exports"]
        else:
            major_exports = None

        # Extract the affiliation value from the Societal information section
        if "Societal information" in data[key]["side_bar"] and "Affiliation" in data[key]["side_bar"]["Societal information"]:
            affiliation = data[key]["side_bar"]["Societal information"]["Affiliation"]
        else:
            affiliation = None

        # Check if planet has a sun and atmosphere before adding it to our dictionary
        if suns is not None and atmosphere is not None:
            # Add a row to the planets dataframe with this planet's data
            planets.loc[len(planets)] = [planet_name, suns, moons, atmosphere, climate, native_species, immigrated_species, population, major_exports, affiliation]

In [48]:
planets

Unnamed: 0,Planet Name,Suns,Moons,Atmosphere,Climate,Native Species,Immigrated Species,Population,Major Exports,Affiliation
0,Alderaan,Alderaan's sun,0,Breathable,,,Human,"[2 billion, 95% humans, 5% other]",,"[Elder Houses, House of Organa, Galactic Repub..."
1,Anoat,Anoat's star,1,Polluted,Toxic,,,,,"[Noble Court, Jedi Order, Galactic Empire, Res..."
2,Atollon,Ashbo,1,Breathable,Arid,,"[Human, Lasat, Twi'lek]","[438, 77% Human, 19% Twi'lek, 4% other]",,"[Alliance to Restore the Republic, Phoenix Cell]"
3,Cato Neimoidia,Neri,At least 2,Breathable,,,Neimoidian,,,"[Galactic Republic, Trade Federation, InterGal..."
4,Christophsis,Christophsis' sun,1: Leesis,Breathable,,,"[Human, Christophsian]",,,"[Galactic Republic, Galactic Empire]"
5,Coruscant,Coruscant Prime,"[4, Centax-1, Centax-2, Centax-3, Hesperidium]",Breathable,Temperate,"[Human, Taung, Zhell]","[Besalisk, Balosar, Bith, Cerean, Cosian, Gran...","[Trillions, 78% humans, 22% other]",,"[Dai Bendu, Galactic Republic, Jedi Order, Sit..."
6,Crait,Crait,0,"[Breathable, Oxygen, Nitrogen]",Temperate,,,Uninhabited,,"[Alliance to Restore the Republic, Resistance]"
7,D'Qar,Ileenium,2,Breathable,Temperate,,,,,"[Alliance to Restore the Republic, Resistance]"
8,Dantooine,Dina,2,Type I,Temperate,,Humans,,,"[Jedi Order, Galactic Republic, Alliance to Re..."
9,Dathomir,Domir,4,Breathable,Temperate,"[Dathomirian, Fromprath]","[Human, Zabrak, Zeffonian, Toydarian]","[Roughly 600, 90% human, 7% Zabrak, 3% other, ...",,"[Nightsisters, Sith, Confederacy of Independen..."


In [57]:
# Create an empty dictionary to store the planet data
planet_dict = {}

# Loop through each key in the dictionary
for key in data:
    # Check if the item contains "Astrographical information" in its side bar before extracting data
    if "Astrographical information" in data[key]["side_bar"]:
        # Extract the planet name from the title column
        planet_name = data[key]["title"].strip()

        # Extract the suns value from the Astrographical information section
        if "Suns" in data[key]["side_bar"]["Astrographical information"]:
            suns = data[key]["side_bar"]["Astrographical information"]["Suns"]
        else:
            suns = None 

        # Extract the moons value from the Astrographical information section
        if "Moons" in data[key]["side_bar"]["Astrographical information"]:
            moons = data[key]["side_bar"]["Astrographical information"]["Moons"]
        else:
            moons = None

        # Extract the atmosphere value from the Physical information section
        if "Physical information" in data[key]["side_bar"] and "Atmosphere" in data[key]["side_bar"]["Physical information"]:
            atmosphere = data[key]["side_bar"]["Physical information"]["Atmosphere"]
        else:
            atmosphere = None

        # Extract the climate value from the Physical information section
        if "Physical information" in data[key]["side_bar"] and "Climate" in data[key]["side_bar"]["Physical information"]:
            climate = data[key]["side_bar"]["Physical information"]["Climate"]
        else:
            climate = None

        # Extract the native species value from the Societal information section
        if "Societal information" in data[key]["side_bar"] and "Native species" in data[key]["side_bar"]["Societal information"]:
            native_species = data[key]["side_bar"]["Societal information"]["Native species"]
        else:
            native_species = None

        # Extract the immigrated species value from the Societal information section
        if "Societal information" in data[key]["side_bar"] and "Immigrated species" in data[key]["side_bar"]["Societal information"]:
            immigrated_species = data[key]["side_bar"]["Societal information"]["Immigrated species"]
        else:
            immigrated_species = None

        # Extract the population value from the Societal information section
        if "Societal information" in data[key]["side_bar"] and "Population" in data[key]["side_bar"]["Societal information"]:
            population = data[key]["side_bar"]["Societal information"]["Population"]
        else:
            population = None

        # Extract the major exports value from the Economic information section
        if "Economic information" in data[key]["side_bar"] and "Major exports" in data[key]["side_bar"]["Economic information"]:
            major_exports = data[key]["side_bar"]["Economic information"]["Major exports"]
        else:
            major_exports = None

        # Extract the affiliation value from the Societal information section
        if "Societal information" in data[key]["side_bar"] and "Affiliation" in data[key]["side_bar"]["Societal information"]:
            affiliation = data[key]["side_bar"]["Societal information"]["Affiliation"]
        else:
            affiliation = None

        # Check if planet has a sun and atmosphere before adding it to our dictionary
        if population is not None:
            # Create a dictionary with all of the planet's data
            planet_data = {
                "suns": suns,
                "moons": moons,
                "atmosphere": atmosphere,
                "climate": climate,
                "native_species": native_species,
                "immigrated_species": immigrated_species,
                "population": population,
                "major_exports": major_exports,
                "affiliation": affiliation
            }

            # Add the planet's name and its corresponding dictionary to our main dictionary
            planet_dict[planet_name] = planet_data