In [1]:
import requests
from bs4 import BeautifulSoup as BS
import pandas as pd
import pickle
import re
from tqdm import tqdm
from pathlib import Path
import urllib
import pyarrow

In [2]:
files = sorted(Path('../WPscraped').glob('*.pickle'))
files

[WindowsPath('../WPscraped/starwars_all_canon_data_1.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_2.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_3.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_4.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_5.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_6.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_7.pickle'),
 WindowsPath('../WPscraped/starwars_all_canon_data_8.pickle')]

In [3]:
#create the dictionary to run
data = {}
for fn in files:
    with open(fn, 'rb') as f:
        part = pickle.load(f)
    data.update(part)

# The code is to open the data and to count to make sure all files in the .pickles are carried over
len(data)

39460

In [4]:
#find the key in the dictionary
def find_key(key_name, data):
    for key, value in data.items():
        if key_name == key:
            return value
        if isinstance(value, dict):
            value = find_key(key_name, value)
            if value is not None:
                return value
    return None

def get_first(key_name, data):
    result = find_key(key_name, data)
    if isinstance(result, list):
        result = result[0]
    return result

In [5]:
# test on the vehicle Millennium Falcon

print(data['Millennium_Falcon'])

{'url': 'https://starwars.fandom.com/wiki/Millennium_Falcon', 'title': '\nMillennium Falcon ', 'is_character': False, 'side_bar': {'Production information': {'Manufacturer': 'Corellian Engineering Corporation', 'Product line': 'YT-series', 'Model': 'Heavily modified YT-1300fp light freighter', 'Class': 'Light freighter', 'Modified by': ['A technician', 'Lando Calrissian', 'Han Solo', 'Chewbacca', 'Gannis Ducain', 'Vanver and Toursant Irving', 'Unkar Plutt', 'Rey', 'Rose Tico', 'Klaus "Doc" Vandagante']}, 'Technical specifications': {'Length': ['c. 34.52', '–34.75 meters'], 'Width': ['25.61 meters'], 'Height/depth': ['7.8 meters'], 'MGLT': '75 MGLT', 'Maximum speed (atmosphere)': ['1', '200 kph', '1', '050 kph'], 'Engine unit(s)': ['Girodyne SRB42 sublight engines', 'Landing jets'], 'Hyperdrive rating': ['Class 1.0', 'Class 0.5', 'Class 10'], 'Hyperdrive system': ['Avatar-10 hyperdrive', 'Isu-Sim SSP05 hyperdrive', '401 series hyperdrive', 'Backup hyperdrive'], 'Power plant': ['Quadex p

In [6]:
# test on the vehicle Lady Luck

print(data['Lady_Luck'])

{'url': 'https://starwars.fandom.com/wiki/Lady_Luck', 'title': '\nLady Luck ', 'is_character': False, 'side_bar': {'Production information': {'Manufacturer': 'SoroSuub Corporation', 'Model': 'Luxury 3000 space yacht', 'Class': 'Yacht'}, 'Technical specifications': {'Length': 'Around 50 meters', 'Crew': 'Pilot ', 'Other systems': 'First-class autopilot'}, 'Usage': {'Owner(s)': ['An Orthellin royal mistress', 'Landonis Balthazar Calrissian']}}, 'paragraph': '\nThe Lady Luck was a Luxury 3000 space yacht belonging to Lando Calrissian, who bought it shortly after the Battle of Endor from an Orthellin royal mistress. Calrissian, Kaasha Bateen, Florx Biggles and Lobot used the ship to travel to Chandrila. Many years later, while living as a "hermit" on the planet Pasaana, he kept the Lady Luck under an unmarked tent owned by an intermediary group. Calrissian later traveled to the moon Ajan Kloss aboard the Lady Luck to help the Resistance.\n', 'crosslinks': ['Moon', 'Yacht', 'Gender', 'Stand

In [7]:
# test on the vehicle Lady Luck

print(data['All_Terrain_Armored_Transport'])

{'url': 'https://starwars.fandom.com/wiki/All_Terrain_Armored_Transport', 'title': '\n\t\t\t\t\tAll Terrain Armored Transport\t\t\t\t', 'is_character': False, 'side_bar': {'Production information': {'Manufacturer': ['Kuat Drive Yards', 'Kuat-Entralla Drive Yards'], 'Line': 'All-terrain vehicle', 'Class': 'Combat walker', 'Models': ['All Terrain Armored Transport', 'All Terrain Armored Transport', 'All Terrain Armored Cargo Transport', 'Elite AT-AT', 'First Order All Terrain Armored Transport']}, 'Usage': {'Role(s)': ['Military Walker', 'Mechanized infantry', 'Self-propelled artillery', 'Troop transport'], 'Year introduced': 'During the Clone Wars', 'Affiliation': ['Galactic Republic', 'Galactic Empire', 'Mantis crew', 'Alliance to Restore the Republic', 'Imperial territory G5-623', 'New Republic', 'First Order']}}, 'paragraph': "The All Terrain Armored Transport  was a four-legged combat walker of the All-terrain vehicle line used by the ground forces of the Galactic Republic, Galactic

In [23]:
# Create an empty dictionary to store the vehicle data
vehicle_dict = {}

# Loop through each key in the dictionary
for key in data:
    # Check if the item contains "Production information" in its side bar before extracting data
    if "Production information" in data[key]["side_bar"]:
        # Extract the vehicle name from the title column
        vehicle_name = data[key]["title"].strip()
        
        # Extract the manufacturer value from the Production information section
        manufacturer = data[key]["side_bar"]["Production information"].get("Manufacturer", None) 

        # Extract the model value from the Production information section
        model = data[key]["side_bar"]["Production information"].get("Model", None)

        # Extract the class value from the Production information section
        vclass = data[key]["side_bar"]["Production information"].get("Class", None)

        # Check if "Technical specifications" exists in side_bar before trying to access it
        if "Technical specifications" in data[key]["side_bar"]:
            # Extract the length value from the Technical specifications section
            length = data[key]["side_bar"]["Technical specifications"].get("Length", None)

            # Extract the crew value from the Technical specifications section
            crew = data[key]["side_bar"]["Technical specifications"].get("Crew", None)

        # Check if "Usage" exists in side_bar before trying to access it
        if "Usage" in data[key]["side_bar"]:
            # Extract the owners value from the Usage section
            owners = data[key]["side_bar"]["Usage"].get("Owner(s)", None)

            # Extract the affiliation value from the Usage section
            affiliation = data[key]["side_bar"]["Usage"].get("Affiliation", None)

            # Check if both model and class are not None and class does not contain "droid" before adding to vehicle_dict
            if model is not None and vclass is not None and "droid" not in vclass:
                # Create a dictionary with all of the vehicles' data
                vehicles = {
                "vehicle_name": vehicle_name,
                "manufacturer": manufacturer,
                "model": model,
                "class": vclass,
                "length": length,
                "crew": crew,
                "owner(s)": owners,
                "affiliation": affiliation
                }
                
            # Add the vehicle's name and its corresponding dictionary to the main dictionary
                vehicle_dict[vehicle_name] = vehicles
            
    

In [25]:
# create a dataframe from the vehicles dictionary
vehicle_df = pd.DataFrame(vehicle_dict)

# transpose dataframe
vehicle_df = vehicle_df.T

In [26]:
vehicle_df

Unnamed: 0,vehicle_name,manufacturer,model,class,length,crew,owner(s),affiliation
"""Changeling"" Mark 71NB","""Changeling"" Mark 71NB",Ravager Mechanics,"""Changeling"" Mark 71NB",Racing starfighter,,2,,"[Ace Squadron, Colossus resistance]"
125-Z treadspeeder bike,125-Z treadspeeder bike,Aratech-Loratus Corporation,125-Z treadspeeder bik,Speeder,4.17 meters,1,,"[First Order, Resistance]"
49AX3,49AX3,,Dreadnought-class heavy cruiser,Heavy cruiser,,,,"[Galactic Empire, Alliance to Restore the Repu..."
614-AvA speeder bike,614-AvA speeder bike,Aratech Repulsor Company,614-AvA,Speeder bike,4.4 meters,[Pilot],,"[Galactic Empire, Lothal resistance group, Spe..."
712-AvA speeder bike,712-AvA speeder bike,Aratech Repulsor Company,712-AvA,Speeder bike,,,,
...,...,...,...,...,...,...,...,...
Zephyr-G swoop,Zephyr-G swoop,Mobquet Swoops and Speeders,Zephyr-G,Repulsorcraft,3.68 meters,1 pilot,,"[Skywalker family, Lars family]"
Zephyr-K,Zephyr-K,,Zephyr-K,Repulsorcraft,,1,,
Zeta-class Heavy Cargo Shuttle,Zeta-class Heavy Cargo Shuttle,"[Sienar Fleet Systems, Telgorn Corporation]",Zeta-class cargo shuttle,"[Cargo, shuttle]",35.50 meters,"[Pilot, Co-pilot]",,"[Galactic Empire, Imperial Navy, Alliance to R..."
Zeta-class shuttle,Zeta-class shuttle,,Zeta-class shuttle,Shuttle,,,,"[Galactic Empire, Imperial Navy, Inquisitorius..."


In [27]:
vehicle_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1049 entries, "Changeling" Mark 71NB to Zeva's ship
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   vehicle_name  1049 non-null   object
 1   manufacturer  621 non-null    object
 2   model         1049 non-null   object
 3   class         1049 non-null   object
 4   length        552 non-null    object
 5   crew          618 non-null    object
 6   owner(s)      180 non-null    object
 7   affiliation   965 non-null    object
dtypes: object(8)
memory usage: 73.8+ KB


In [28]:
# convert columns with lists to string
vehicle_df['manufacturer'] = vehicle_df['manufacturer'].astype('str')
vehicle_df['model'] = vehicle_df['model'].astype('str')
vehicle_df['class'] = vehicle_df['class'].astype('str')
vehicle_df['length'] = vehicle_df['length'].astype('str')
vehicle_df['crew'] = vehicle_df['crew'].astype('str')
vehicle_df['owner(s)'] = vehicle_df['owner(s)'].astype('str')
vehicle_df['affiliation'] = vehicle_df['affiliation'].astype('str')


# create the Vehicles parquet

vehicle_df.to_parquet('../WPscraped/StarWars_Vehicles.parquet', index=False)