In [1]:
import requests
from bs4 import BeautifulSoup as BS
import pandas as pd
import pickle
import re
from tqdm import tqdm
from pathlib import Path
import urllib
import pyarrow

In [2]:
files = sorted(Path('../WPscraped').glob('*.pickle'))
files

[WindowsPath('../WPscraped/starwars_all_canon_data_1.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_2.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_3.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_4.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_5.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_6.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_7.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_8.pickle')]

In [3]:
#create the dictionary to run
data = {}
for fn in files:
    with open(fn, 'rb') as f:
        part = pickle.load(f)
    data.update(part)

# The code is to open the data and to count to make sure all files in the .pickles are carried over
len(data)

39460

In [4]:
#find the key in the dictionary
def find_key(key_name, data):
    for key, value in data.items():
        if key_name == key:
            return value
        if isinstance(value, dict):
            value = find_key(key_name, value)
            if value is not None:
                return value
    return None

def get_first(key_name, data):
    result = find_key(key_name, data)
    if isinstance(result, list):
        result = result[0]
    return result

In [5]:
# test on the species Weequay

print(data['Weequay'])

{'url': 'https://starwars.fandom.com/wiki/Weequay', 'title': '\n\t\t\t\t\tWeequay\t\t\t\t', 'is_character': False, 'side_bar': {'Biological classification': {'Designation': 'Sentient'}, 'Physical characteristics': {'Average height': '1.89 meters', 'Skin color': ['Blue', 'Brown', 'Gray', 'Pink', 'Red', 'Yellow'], 'Hair color': ['Black', 'Blond', 'Gray'], 'Eye color': ['Black', 'Gold', 'Gray'], 'Distinctions': ['Tough', 'leathery skin', 'that provided resistance to blasterfire']}, 'Sociocultural characteristics': {'Homeworld': 'Sriluur', 'Habitat': 'Deserts'}}, 'paragraph': 'Weequays were a sentient species from the planet Sriluur common throughout the galaxy. Weequay possessed tough, leathery skin resistant to desert environments and blaster fire. Many Weequay were mercenaries, pirates, and bodyguards for the Hutt Clan. Notable Weequays included Hondo Ohnaka, who led the Ohnaka Gang, and the Jedi Sora Bulq.\n', 'crosslinks': ['Hutt_Clan', 'Bodyguard', 'Bounty_hunter', 'Leather', 'Galact

In [8]:
# test on the species Rodian

print(data['Rodian'])

{'url': 'https://starwars.fandom.com/wiki/Rodian', 'title': '\n\t\t\t\t\tRodian\t\t\t\t', 'is_character': False, 'side_bar': {'Biological classification': {'Designation': 'Sentien', 'Classification': 'Reptilian'}, 'Physical characteristics': {'Average height': '1.75 meters', 'Skin color': ['Usually green', 'sometimes yellow', 'red', 'or turquoise'], 'Eye color': ['Usually blue', 'black or purple', 'sometimes red', 'or green'], 'Distinctions': ['Large and round pupil-less eyes', 'snouts', 'pointed ears', 'antennae', 'scaled and usually green skin']}, 'Sociocultural characteristics': {'Homeworld': 'Rodia', 'Habitat': 'Swamps', 'Language': ['Galactic Basic Standard', 'Rodian']}}, 'paragraph': "Rodians were green-skinned reptilian humanoids native to the planet Rodia. During the Clone Wars, Rodia was represented by Senator Onaconda Farr in the Galactic Republic's senate. He briefly sided with the Trade Federation during the war, under the promise of food and protection from pirates. Rodia 

In [9]:
# test on the species Zabrak

print(data['Zabrak'])

{'url': 'https://starwars.fandom.com/wiki/Zabrak', 'title': '\n\t\t\t\t\tZabrak\t\t\t\t', 'is_character': False, 'side_bar': {'Biological classification': {'Designation': 'Sentient', 'Classification': 'Near-human', 'Subspecies': ['Dathomirian', 'Iridonian']}, 'Physical characteristics': {'Skin color': ['Light to dark tones', 'orange', 'red', 'yellow'], 'Hair color': ['Black', 'Dark blue', 'Purple'], 'Distinctions': ['Vestigial horns', 'two hearts', 'facial tattoo']}, 'Sociocultural characteristics': {'Homeworld': ['Iridonia', 'Dathomir'], 'Diet': 'Carnivorous', 'Language': 'Zabraki'}}, 'paragraph': 'Zabraks were a near-human carnivorous species native to the planets Iridonia and Dathomir. Most members of the species had distinctive horns atop their heads, as well as two hearts. Although most Zabraks lived on Iridonia, some settled on the planet Dathomir, where the females of the species, known as Nightsisters, ruled over the Nightbrother males and practiced powerful dark side magick. N

In [23]:
# Create an empty dictionary to store the species data
species_dict = {}

# Loop through each key in the dictionary
for key in data:
    # Check if the item contains "Biological classification" in its side bar before extracting data
    if "Biological classification" in data[key]["side_bar"]:
        # Extract the planet name from the title column
        species_name = data[key]["title"].strip()

        # Extract the Designation value from the Biological classification section
        designation = data[key]["side_bar"]["Biological classification"].get("Designation", None) 

        # Extract the Classification value from the Biological classification section
        classification = data[key]["side_bar"]["Biological classification"].get("Classification", None)

        # Check if "Physical characteristics" exists in side_bar before trying to access it
        if "Physical characteristics" in data[key]["side_bar"]:
            # Extract the Height value from the Physical characteristics section
            height = data[key]["side_bar"]["Physical characteristics"].get("Height", None) 

            # Extract the Skin color value from the Biological classification section
            skin_color = data[key]["side_bar"]["Physical characteristics"].get("Skin color", None)

            # Extract the Hair color value from the Biological classification section
            hair_color = data[key]["side_bar"]["Physical characteristics"].get("Hair color", None)

            # Extract the Distinctions value from the Biological classification section
            distinctions = data[key]["side_bar"]["Physical characteristics"].get("Distinctions", None)

        # Check if "Sociocultural characteristics" exists in side_bar before trying to access it
        if "Sociocultural characteristics" in data[key]["side_bar"]:
            # Extract the Homeworld value from the Sociocultural characteristics section
            homeworld = data[key]["side_bar"]["Sociocultural characteristics"].get("Homeworld", None)

            # Extract the Language value from the Sociocultural characteristics section
            language = data[key]["side_bar"]["Sociocultural characteristics"].get("Language", None)

            # Create a dictionary with all of the species's data
            species = {
                "designation": designation,
                "classification": classification,
                "height": height,
                "skin_color": skin_color,
                "hair_color": hair_color,
                "distinctions": distinctions,
                "homeworld": homeworld,
                "language": language
                }

            # Add the planet's name and its corresponding dictionary to our main dictionary
            species_dict[species_name] = species


Go back and check designation for "sentient".  there is a sentien.  update.

In [26]:
# create a dataframe from the species dictionary
species_df = pd.DataFrame(species_dict)

# transpose dataframe
species_df = species_df.T

In [27]:
species_df

Unnamed: 0,designation,classification,height,skin_color,hair_color,distinctions,homeworld,language
Abednedo,Sentient,,,"[Brown, Cream, Gray, Orange, Pink, Tan]","[Blond, Brown, Gray, White]",Dangling mouth tendrils,Abednedo,Abednedish
Abersyn symbiote,Sentien,"[Parasite, Symbiote]",,"[Brown, Cream, Gray, Orange, Pink, Tan]","[Blond, Brown, Gray, White]",Dangling mouth tendrils,,
Abyssin,Sentient,,,"[Blue, Green]",Green and white,"[Cycloptic, regenerative abilities]",Byss,
Accipiptero,,Reptavian,,Brown,,,Dagobah,
Acklay,Non-sentient,Amphibious crustacean,,Green,,"[Grappling hands, Stretchy stomachs, Razor-sha...",Vendaxa,
...,...,...,...,...,...,...,...,...
Zhell,Sentient,,,Brown,,,Coruscant,
Zillo Beast,Semi-sentient,Reptile,,"[Brown, Orange, Tan]",,,Malastare,
Zixon,Sentient,,,,Green,,Minfar,Galactic Basic Standard
Zombie,,,,,Green,,"[Dandoran, Dathomir, Ktath'atn, Geonosis, Gloam]",


In [28]:
species_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1051 entries, Abednedo to Zygerrian
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   designation     733 non-null    object
 1   classification  676 non-null    object
 2   height          0 non-null      object
 3   skin_color      688 non-null    object
 4   hair_color      262 non-null    object
 5   distinctions    701 non-null    object
 6   homeworld       989 non-null    object
 7   language        179 non-null    object
dtypes: object(8)
memory usage: 73.9+ KB


In [30]:
# convert columns with lists to string
species_df['designation'] = species_df['designation'].astype('str')
species_df['classification'] = species_df['classification'].astype('str')
species_df['height'] = species_df['height'].astype('str')
species_df['skin_color'] = species_df['skin_color'].astype('str')
species_df['hair_color'] = species_df['hair_color'].astype('str')
species_df['distinctions'] = species_df['distinctions'].astype('str')
species_df['homeworld'] = species_df['homeworld'].astype('str')
species_df['language'] = species_df['language'].astype('str')

# create the Species parquet

species_df.to_parquet('../WPscraped/StarWars_Species.parquet', index=False)