In [1]:
import pandas as pd
import pyarrow
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [21]:
# Read in the Star Trek Astronomical Objects

sta = pd.read_parquet('../WPscraped/StarTrek_Astronomical_Objects.parquet')

# Read in the Star Wars Astronomical Objects

swa = pd.read_parquet('../WPscraped/StarWars_Planets.parquet')

In [19]:
# View sta info
sta.info()

# View sta
sta.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2404 entries, 0 to 2403
Data columns (total 2 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   name                    2404 non-null   object
 1   astronomicalObjectType  2363 non-null   object
dtypes: object(2)
memory usage: 37.7+ KB


Unnamed: 0,name,astronomicalObjectType
0,'aucdet IX,PLANET
1,'etnap Nebula,NEBULA
2,1 Centauri,STAR_SYSTEM
3,11 Leonis Minoris,STAR_SYSTEM
4,1889 V,COMET


In [22]:
# View swa info
swa.info()

# View swa
swa.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2562 entries, 0 to 2561
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   name                2562 non-null   object
 1   suns                2562 non-null   object
 2   moons               2562 non-null   object
 3   atmosphere          2562 non-null   object
 4   climate             2562 non-null   object
 5   native_species      2562 non-null   object
 6   immigrated_species  2562 non-null   object
 7   population          2562 non-null   object
 8   major_exports       2562 non-null   object
 9   affiliation         2562 non-null   object
 10  description         2562 non-null   object
dtypes: object(11)
memory usage: 220.3+ KB


Unnamed: 0,name,suns,moons,atmosphere,climate,native_species,immigrated_species,population,major_exports,affiliation,description
0,3rd Moon,,,,,,Various,,,,The 3rd Moon was a terrestrial moon located in...
1,7G sector,,,,,,,,,Galactic Empire,The 7G sector was a sector of space located in...
2,Yasooska,,,,,,,,,,Yasooska was one of the four moons of the plan...
3,Aakaash,,,,,,,,,,Aakaash was a star of the star system of the s...
4,Aakaash system,Aakaash,,,,,Humans,174.2 billion,,Land & Sky Corporation,The Aakaash system was located in the Oplovis ...


In [9]:
# Let's look into the Star Trek astronomical objects

stcount = sta['astronomicalObjectType'].value_counts()

stcount

PLANET              1061
STAR_SYSTEM          529
SECTOR               190
STAR                 125
REGION               113
M_CLASS_PLANET        95
NEBULA                62
CLUSTER               37
CONSTELLATION         30
MOON                  29
COMET                 17
PLANETOID             17
ASTEROID_BELT         12
GAS_GIANT_PLANET      11
ASTEROID               8
GALAXY                 8
L_CLASS_PLANET         6
M_CLASS_MOON           3
QUASAR                 3
ROGUE_PLANET           2
Y_CLASS_PLANET         2
K_CLASS_PLANET         2
H_CLASS_PLANET         1
Name: astronomicalObjectType, dtype: int64

It looks like there are 23 types of astronomical objects listed in the Star Trek data.  We will combine some of the information into consolidated objects. All planets will be grouped together, all moons will be grouped together, and asteroids and asteroid belts will be grouped together.

In [13]:
# We will start by creating a boolean column for all planets

sta['planet'] = sta['astronomicalObjectType'].str.contains('PLANET')

# We will create a boolean column for all moons

sta['moon'] = sta['astronomicalObjectType'].str.contains('MOON')

# We will create a boolean column for all asteroids

sta['asteroid'] = sta['astronomicalObjectType'].str.contains('ASTEROID')

# view updated dataframe

sta.head()


Unnamed: 0,name,astronomicalObjectType,planet,moon,asteroid
0,'aucdet IX,PLANET,True,False,False
1,'etnap Nebula,NEBULA,False,False,False
2,1 Centauri,STAR_SYSTEM,False,False,False
3,11 Leonis Minoris,STAR_SYSTEM,False,False,False
4,1889 V,COMET,False,False,False


In [17]:
# Create a count for # of planets

stplanets = sta['planet'].sum()
print(f' There are {stplanets} planets listed')

# Create a count for # of moons
stmoons = sta['moon'].sum()
print(f' There are {stmoons} moons listed')

# Create a count for # of asteroids
stasteroid = sta['asteroid'].sum()
print(f' There are {stasteroid} asteroids listed')

 There are 1197 planets listed
 There are 32 moons listed
 There are 20 asteroids listed


We have the Star Trek data, so let's work on the Star Wars data.  There are no fields which easily identify the type of astronomincal object.  There are a lot of columns and data for Star Wars' objects, so we will create a new dataframe and see if we can group items together through the description paragraph.

In [29]:
# Create a new dataframe with name and description

swa_short = swa[['name', 'description']]

# Test swa_short
swa_short.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2562 entries, 0 to 2561
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         2562 non-null   object
 1   description  2562 non-null   object
dtypes: object(2)
memory usage: 40.2+ KB


In [32]:
# We will start by dropping items that have a name with sector or system

swa_2 = swa_short[swa_short['name'].str.contains('sector|system') == False]

swa_2.head(10)

Unnamed: 0,name,description
0,3rd Moon,The 3rd Moon was a terrestrial moon located in...
2,Yasooska,Yasooska was one of the four moons of the plan...
3,Aakaash,Aakaash was a star of the star system of the s...
5,Aaloth,Aaloth was a Twi'lek colony planet in the Gaul...
6,Aargau,Aargau was an astronomical object located in t...
7,Ab Dalis,Ab Dalis was a swampy wasteland world situated...
9,Abafar,Abafar was a remote desert planet located with...
11,Abafed,Abafed was a yellow star situated in the Abafa...
12,Abednedo (planet),Abednedo was a planet located on the Corellian...
13,Abelor,Abelor was a planet located in the Mid Rim and...


In [36]:
# We will drop items that have sun in the name

swa_2 = swa_2[swa_2['name'].str.contains('sun') == False]

swa_2.head(10)

Unnamed: 0,name,description,planet,moon,asteroid
0,3rd Moon,The 3rd Moon was a terrestrial moon located in...,False,True,False
2,Yasooska,Yasooska was one of the four moons of the plan...,True,True,False
3,Aakaash,Aakaash was a star of the star system of the s...,False,False,False
5,Aaloth,Aaloth was a Twi'lek colony planet in the Gaul...,True,False,False
6,Aargau,Aargau was an astronomical object located in t...,False,False,False
7,Ab Dalis,Ab Dalis was a swampy wasteland world situated...,False,False,False
9,Abafar,Abafar was a remote desert planet located with...,True,False,False
11,Abafed,Abafed was a yellow star situated in the Abafa...,True,False,False
12,Abednedo (planet),Abednedo was a planet located on the Corellian...,True,False,False
13,Abelor,Abelor was a planet located in the Mid Rim and...,True,False,False


In [33]:
# We will create a boolean column for all planets

swa_2['planet'] = swa_2['description'].str.contains('planet')

# We will create a boolean column for all moons

swa_2['moon'] = swa_2['description'].str.contains('moon')

# We will create a boolean column for all asteroids

swa_2['asteroid'] = swa_2['description'].str.contains('asteroid')

# view updated dataframe

swa_2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  swa_2['planet'] = swa_2['description'].str.contains('planet')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  swa_2['moon'] = swa_2['description'].str.contains('moon')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  swa_2['asteroid'] = swa_2['description'].str.contains('asteroid')


Unnamed: 0,name,description,planet,moon,asteroid
0,3rd Moon,The 3rd Moon was a terrestrial moon located in...,False,True,False
2,Yasooska,Yasooska was one of the four moons of the plan...,True,True,False
3,Aakaash,Aakaash was a star of the star system of the s...,False,False,False
5,Aaloth,Aaloth was a Twi'lek colony planet in the Gaul...,True,False,False
6,Aargau,Aargau was an astronomical object located in t...,False,False,False


We've created the subset for Star Wars astronomical objects, however, it appears that some of the descriptions may contain both planet and moon.  We will flag those and create a new dataframe for those to review to see how many there are to determine if there is a practical solution without reviewing each item.

In [37]:
# Create a dataframe with both planet and moon true booleans

swa_true = swa_2[(swa_2['planet'] == True) & (swa_2['moon'] == True)]

swa_true.info()
swa_true.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 201 entries, 2 to 2527
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         201 non-null    object
 1   description  201 non-null    object
 2   planet       201 non-null    bool  
 3   moon         201 non-null    bool  
 4   asteroid     201 non-null    bool  
dtypes: bool(3), object(2)
memory usage: 5.3+ KB


Unnamed: 0,name,description,planet,moon,asteroid
2,Yasooska,Yasooska was one of the four moons of the plan...,True,True,False
37,Agaris' moon,A moon orbited the planet Agaris in Wild Space.\n,True,True,False
49,Ajara,Ajara was a gas giant planet located in the Ca...,True,True,False
56,Alaris,Alaris was a planet of the Mytaranor sector's ...,True,True,False
57,Alaris Prime,Alaris Prime was one of the several moons that...,True,True,False
