In [None]:
import requests
from bs4 import BeautifulSoup as BS
import pandas as pd
import pickle
import re
from tqdm import tqdm
from pathlib import Path
import urllib

We load the pickled scrape results and extract only the entries that are Star Wars characters.

In [None]:
# Gather every pickled scrape result; sorting keeps the load order stable across runs.
files = list(Path('../WPscraped').glob('*.pickle'))
files.sort()
files

In [None]:
# Merge all pickled parts into one dict. Files are applied in `files` order, so
# when two parts share a key the later file wins.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# that were produced by a trusted source.
data = {}
for pickle_path in files:
    with pickle_path.open('rb') as handle:
        data.update(pickle.load(handle))

len(data)

In [None]:
def remove_url_shizzle(text):
    """Percent-decode ``text`` and strip all single and double quote characters.

    Args:
        text: A (possibly URL-encoded) string.

    Returns:
        The decoded string with every ``"`` and ``'`` removed. Quotes that were
        percent-encoded (``%22``, ``%27``) are removed as well, because decoding
        happens first.
    """
    # A bare `import urllib` does not load the `parse` submodule; import it
    # explicitly so this helper does not rely on some other package having
    # pulled it in as a side effect.
    import urllib.parse

    return urllib.parse.unquote(text).replace('"', '').replace("'", '')

In [None]:
# Re-key the scraped pages and rewrite every crosslink with the same decoded,
# quote-free form, then rebind `data` to the re-keyed dict.
cleaned = {}
for raw_key, entry in tqdm(data.items()):
    decoded_key = remove_url_shizzle(raw_key)
    cleaned[decoded_key] = entry
    # `entry` is the very object stored in `cleaned`, so this updates it in place.
    entry['crosslinks'] = list(map(remove_url_shizzle, entry['crosslinks']))
data = cleaned

In [None]:
def find_key(key_name, data):
    """Search a nested dict depth-first for ``key_name``.

    Entries are visited in insertion order. For each entry the key itself is
    compared first; if it does not match and the value is a dict, that dict is
    searched before moving on to the next entry. Only dict values are
    descended into (lists are not).

    Args:
        key_name: The key to look for.
        data: The (possibly nested) dict to search.

    Returns:
        The value stored under the first match, or ``None`` if there is no
        match. A match inside a nested dict whose value is ``None`` is
        indistinguishable from "not found", so the search carries on with the
        remaining entries of the enclosing dict.
    """
    for name, child in data.items():
        if key_name == name:
            return child
        if not isinstance(child, dict):
            continue
        nested_hit = find_key(key_name, child)
        if nested_hit is not None:
            return nested_hit
    return None

def get_first(key_name, data):
    """Look up ``key_name`` with ``find_key`` and collapse list values to one item.

    Args:
        key_name: Key to search for in the nested dict ``data``.
        data: Nested dict that is handed to ``find_key``.

    Returns:
        The first element when the found value is a non-empty list, ``None``
        when it is an empty list or nothing was found, and the found value
        itself otherwise.
    """
    result = find_key(key_name, data)
    if isinstance(result, list):
        # An empty list has no first element; report it as missing rather than
        # raising IndexError.
        return result[0] if result else None
    return result

In [None]:
# Output column -> side-bar label. Only the first value of each is kept.
# The order here is the column order of the resulting DataFrame.
side_bar_columns = {
    'home_world': 'Homeworld',
    'gender': 'Gender',
    'height': 'Height',
    'eye_color': 'Eye color',
    'skin_color': 'Skin color',
    'hair_color': 'Hair color',
    'weight': 'Mass',
}

# Build one flat record per page flagged as a character.
result = []
for key, part in data.items():
    if not part['is_character']:
        continue
    side_bar = part['side_bar']
    row = {
        'name': part['title'],
        'key': key,
        'url': part['url'],
        'description': part['paragraph'],
        'species_2nd': None,
        'species_3rd': None,
    }

    # Species may be a list (several species) or a single value; keep up to
    # three and print any longer list so the truncation is visible.
    species = find_key('Species', side_bar)
    if isinstance(species, list):
        # Guard the empty list: there is no first element to take.
        row['species'] = species[0] if species else None
        if len(species) > 1:
            row['species_2nd'] = species[1]
        if len(species) > 2:
            row['species_3rd'] = species[2]
        if len(species) > 3:
            print(species)
    elif species is None:
        # No 'Species' entry in the side bar: record it as missing instead of
        # failing on None.strip().
        row['species'] = None
    else:
        row['species'] = species.strip()

    for column, label in side_bar_columns.items():
        row[column] = get_first(label, side_bar)

    result.append(row)
df = pd.DataFrame(result)

# Fix some errors in the gender column: fold spelling variants and typos onto
# one canonical label each.
gender_map = {
    **dict.fromkeys(('Male', 'Mal', 'male', 'Males'), 'Male'),
    **dict.fromkeys(('Female', 'Femal', 'female', 'Femle'), 'Female'),
    'Non-binary': 'Non-binary',
}
# Values that are not in the map (including missing ones) end up as the
# string 'None'.
df['gender'] = df['gender'].map(gender_map).fillna('None')

# normalize height: convert the free-text side-bar values to meters (float)
FEET_TO_METERS = 0.3048    # one international foot, exact by definition
INCHES_TO_METERS = 0.0254  # one inch, exact by definition


def _parse_height(text):
    """Convert one raw height string to meters.

    The branches are tried in a fixed order: values mentioning 'meter', values
    mentioning 'feet'/'ft', values ending in 'c', the literal '5:1', and
    finally a bare number.

    Args:
        text: Raw height string taken from the side bar.

    Returns:
        The height in meters as a float, or ``None`` for the literal ``'5:1'``.

    Raises:
        ValueError: ``text`` is not a string, a number token cannot be
            converted, or the wording matches none of the handled patterns.
        IndexError: ``text`` has fewer tokens than the matched pattern needs.
    """
    if not isinstance(text, str):
        raise ValueError(f'unsupported height value: {text!r}')
    tokens = text.split()

    if 'meter' in text:
        # NOTE(review): the unit token itself is not inspected in most
        # branches, so a value such as '183 centimeters' would be read as
        # meters - confirm no such values occur in the data.
        if len(tokens) == 2:
            # '<number> <unit>'; for an 'a/b' number keep the part before '/'.
            return float(tokens[0].split('/')[0])
        if tokens[0] in ('Around', 'Over'):
            return float(tokens[1])
        if tokens[0] == 'At':
            # Number is the third token - presumably 'At least <number> ...'.
            return float(tokens[2])
        if tokens[-1] == 'shoulder':
            return float(tokens[0])
        if tokens[-1] == 'meters':
            return float(tokens[-2])
        if tokens[1] == 'millimeters':
            # Hard-coded result for the millimeter-valued entry.
            # NOTE(review): presumably a single known record - confirm.
            return 1.7015
        if tokens[1] == 'meters':
            return float(tokens[0])
        raise ValueError(f'unrecognised metric height: {text!r}')

    if 'feet' in text or 'ft' in text:
        if tokens[0] in ('Around', 'Almost'):
            return FEET_TO_METERS * int(tokens[1])
        if len(tokens) == 4:
            # Four tokens: first is feet, third is inches.
            return FEET_TO_METERS * int(tokens[0]) + INCHES_TO_METERS * int(tokens[2])
        if len(tokens) == 2:
            return FEET_TO_METERS * int(tokens[0])
        raise ValueError(f'unrecognised imperial height: {text!r}')

    if text.endswith('c'):
        # Trailing 'c' - presumably a truncated 'cm'; the number is divided by 100.
        return float(text[:-1]) / 100
    if text == '5:1':
        # A ratio, not a height.
        return None
    return float(text)


translate = {None: None}
unparsed_heights = []
# dropna() skips missing heights wherever they occur, instead of assuming the
# first unique value is the missing one.
for raw_height in df['height'].dropna().unique():
    try:
        translate[raw_height] = _parse_height(raw_height)
    except (ValueError, IndexError):
        # Keep going: one odd value must not leave all later heights unmapped.
        translate[raw_height] = None
        unparsed_heights.append(raw_height)
if unparsed_heights:
    print(f'Could not parse {len(unparsed_heights)} height value(s): {unparsed_heights}')
df['height'] = df['height'].map(translate)

In [None]:
# Persist the cleaned character table; the row index is not written to the file.
parquet_path = '../WPscraped/StarWars_Characters.parquet'
df.to_parquet(path=parquet_path, index=False)