# Data Collection - Falcon 9 WikiPedia Page (Webscraping)

In [102]:
import sys

import requests
from bs4 import BeautifulSoup
import re
import unicodedata
import pandas as pd
import datetime

Normally when doing webscraping, we have to deal with HTML content with different text formatting and many unwanted content in it. Following helper functions helps to extract relevant data from scrapped HTML content.

In [3]:
def date_time(table_cells):
    """
    This function returns the data and time from the HTML  table cell
    Input: the  element of a table data cell extracts extra row
    """
    return [data_time.strip() for data_time in list(table_cells.strings)][0:2]

def booster_version(table_cells):
    """
    This function returns the booster version from the HTML  table cell 
    Input: the  element of a table data cell extracts extra row
    """
    out=''.join([booster_version for i,booster_version in enumerate( table_cells.strings) if i%2==0][0:-1])
    return out

def landing_status(table_cells):
    """
    This function returns the landing status from the HTML table cell 
    Input: the  element of a table data cell extracts extra row
    """
    out=[i for i in table_cells.strings][0]
    return out


def get_mass(table_cells):
    mass=unicodedata.normalize("NFKD", table_cells.text).strip()
    if mass:
        mass.find("kg")
        new_mass=mass[0:mass.find("kg")+2]
    else:
        new_mass=0
    return new_mass


def extract_column_from_header(row):
    """
    This function returns the landing status from the HTML table cell 
    Input: the  element of a table data cell extracts extra row
    """
    if (row.br):
        row.br.extract()
    if row.a:
        row.a.extract()
    if row.sup:
        row.sup.extract()
        
    colunm_name = ' '.join(row.contents)
    
    # Filter the digit and empty names
    if not(colunm_name.strip().isdigit()):
        colunm_name = colunm_name.strip()
        return colunm_name    


When examine Falcon 9 Wikipedia page I can see that, there are two pages that I can fetch data from.
- Old launches from 2010 - 2019 - https://en.wikipedia.org/wiki/List_of_Falcon_9_and_Falcon_Heavy_launches_(2010%E2%80%932019)
- Latest launches 2020 to present date - https://en.wikipedia.org/wiki/List_of_Falcon_9_and_Falcon_Heavy_launches

(It seems Wikipedia archives/sparate data from orginal page after 10 years)

However, in this project I will use both HTML pages and extract whole dataset.

In [4]:
url_old = "https://en.wikipedia.org/wiki/List_of_Falcon_9_and_Falcon_Heavy_launches_(2010%E2%80%932019)"
url_new = "https://en.wikipedia.org/wiki/List_of_Falcon_9_and_Falcon_Heavy_launches"

In [5]:
# Load html content from two sites
html_old = requests.get(url_old)
html_old_respose = html_old.text

In [6]:
html_new = requests.get(url_new)
html_new_response = html_new.text

#### Process HTML content

In [7]:
soup_old = BeautifulSoup(html_old_respose, 'html.parser')
soup_old.title

<title>List of Falcon 9 and Falcon Heavy launches (2010–2019) - Wikipedia</title>

*soup_old* BeautifulSoup object has been created.

In [23]:
soup_new = BeautifulSoup(html_new_response, 'html.parser')
soup_new.title

<title>List of Falcon 9 and Falcon Heavy launches - Wikipedia</title>

*soup_new* BeautifulSoup object has been created

In [13]:
table_headers_list = soup_old.find(name='table', class_="wikitable plainrowheaders collapsible").find_all('th')

In [19]:
# collect column names from HTML table
column_names = []
for theader in table_headers_list:
    col_header = extract_column_from_header(theader)
    if col_header is not None and len(col_header) > 0:
        column_names.append(col_header)

In [20]:
column_names

['Flight No.',
 'Date and time ( )',
 'Launch site',
 'Payload',
 'Payload mass',
 'Orbit',
 'Customer',
 'Launch outcome']

In [37]:
# create a dictionary for storing data
launch_dict= dict.fromkeys(column_names)
launch_dict

{'Flight No.': None,
 'Date and time ( )': None,
 'Launch site': None,
 'Payload': None,
 'Payload mass': None,
 'Orbit': None,
 'Customer': None,
 'Launch outcome': None}

*Date and time ( )* column is not ncessary. Let's delete it and add some more usable columns to the dictionary.

In [38]:
del(launch_dict['Date and time ( )'])

# add more usable columns and initialize them with empty lists
launch_dict['Flight No.'] = []
launch_dict['Launch site'] = []
launch_dict['Payload'] = []
launch_dict['Payload mass'] = []
launch_dict['Orbit'] = []
launch_dict['Customer'] = []
launch_dict['Launch outcome'] = []
# Added some new columns
launch_dict['Version Booster'] = []
launch_dict['Booster landing'] = []
launch_dict['Date'] = []
launch_dict['Time'] = []

Let's extract data from both BeautifulSoup objects and fill up dictionary.

Usually, HTML tables in Wiki pages are likely to contain unexpected annotations and other types of noises, such as reference links `B0004.1[8]`, missing values `N/A [e]`, inconsistent formatting, etc.

Folowing code snippet will extract and filter relevant content and store them in the dict object.

In [39]:
# following loop will extract and store old data first, and new data second.
soup_objects = [soup_old, soup_new]

for soup in soup_objects:
    #Extract each table 
    for table_number, table in enumerate(soup.find_all(name='table', class_="wikitable plainrowheaders collapsible")):
       # get table row 
        for rows in table.find_all("tr"):
            #check to see if first table heading is as number corresponding to launch a number 
            if rows.th:
                if rows.th.string:
                    flight_number = rows.th.string.strip()
                    flag = flight_number.isdigit()
            else:
                flag = False
            #get table element 
            row = rows.find_all('td')
            #if it is number save cells in a dictonary 
            if flag:
                # Flight Number value
                launch_dict['Flight No.'].append(flight_number)

                datatimelist = date_time(row[0])

                # Date value
                date = datatimelist[0].strip(',')
                launch_dict['Date'].append(date)

                # Time value
                time = datatimelist[1]
                launch_dict['Time'].append(time)

                # Booster version
                bv = booster_version(row[1])
                if not(bv):
                    bv = row[1].a.string
                launch_dict['Version Booster'].append(bv)

                # Launch Site
                launch_site = row[2].a.string
                launch_dict['Launch site'].append(launch_site)

                # Payload
                payload = row[3].a.string
                launch_dict['Payload'].append(payload)

                # Payload Mass
                payload_mass = get_mass(row[4])
                launch_dict['Payload mass'].append(payload_mass)

                # Orbit
                orbit = row[5].a.string
                launch_dict['Orbit'].append(orbit)

                # Customer
                if row[6].a is not None:
                    customer = row[6].a.string
                else:
                    customer = row[6].text
                launch_dict['Customer'].append(customer)

                # Launch outcome
                launch_outcome = list(row[7].strings)[0]        
                launch_dict['Launch outcome'].append(launch_outcome)

                # Booster landing
                booster_landing = landing_status(row[8])
                launch_dict['Booster landing'].append(booster_landing)

Check for the length of each List in the dict object. They must be the same. Otherwise an error occurs when creating Pandas DataFrame.

In [40]:
for launch_dict_item in launch_dict:
    print(launch_dict_item, len(launch_dict[launch_dict_item]))

Flight No. 241
Launch site 241
Payload 241
Payload mass 241
Orbit 241
Customer 241
Launch outcome 241
Version Booster 241
Booster landing 241
Date 241
Time 241


Every List object has the same length of 241. Now let's create the Pandas DataFrame.

In [65]:
data_from_wiki = pd.DataFrame(launch_dict)

In [66]:
data_from_wiki.head()

Unnamed: 0,Flight No.,Launch site,Payload,Payload mass,Orbit,Customer,Launch outcome,Version Booster,Booster landing,Date,Time
0,1,CCSFS,Dragon Spacecraft Qualification Unit,N,LEO,SpaceX,Success\n,F9 v1.0,Failure,4 June 2010,18:45
1,2,CCSFS,SpaceX COTS Demo Flight 1,U,LEO,NASA,Success,F9 v1.0,Failure,8 December 2010,15:43
2,3,CCSFS,SpaceX COTS Demo Flight 2,525 kg,LEO,NASA,Success,F9 v1.0,No attempt\n,22 May 2012,07:44
3,4,CCSFS,SpaceX CRS-1,"4,700 kg",LEO,NASA,Success\n,F9 v1.0,No attempt,8 October 2012,00:35
4,5,CCSFS,SpaceX CRS-2,"4,877 kg",LEO,NASA,Success\n,F9 v1.0,No attempt\n,1 March 2013,15:10


Looks like *\n* line break is still there in some columns. Let's look for them and remove them.

In [67]:
for col in launch_dict:
    num_of_line_breaks = (data_from_wiki[col].str.find('\n') > -1).sum()
    if num_of_line_breaks > 0:
        data_from_wiki[col] = data_from_wiki[col].str.strip('\n')

Convert *Date* and *Time* values to "YYYY-MM-DD" and "HH:mm:ss" format.

In [68]:
data_from_wiki['Date'] = pd.to_datetime(data_from_wiki['Date']).dt.date

In [69]:
data_from_wiki['Time'] = pd.to_datetime(data_from_wiki['Time']).dt.time

In [70]:
data_from_wiki.head()

Unnamed: 0,Flight No.,Launch site,Payload,Payload mass,Orbit,Customer,Launch outcome,Version Booster,Booster landing,Date,Time
0,1,CCSFS,Dragon Spacecraft Qualification Unit,N,LEO,SpaceX,Success,F9 v1.0,Failure,2010-06-04,18:45:00
1,2,CCSFS,SpaceX COTS Demo Flight 1,U,LEO,NASA,Success,F9 v1.0,Failure,2010-12-08,15:43:00
2,3,CCSFS,SpaceX COTS Demo Flight 2,525 kg,LEO,NASA,Success,F9 v1.0,No attempt,2012-05-22,07:44:00
3,4,CCSFS,SpaceX CRS-1,"4,700 kg",LEO,NASA,Success,F9 v1.0,No attempt,2012-10-08,00:35:00
4,5,CCSFS,SpaceX CRS-2,"4,877 kg",LEO,NASA,Success,F9 v1.0,No attempt,2013-03-01,15:10:00


In [71]:
data_from_wiki.shape

(241, 11)

In [87]:
data_from_wiki.dtypes

Flight No.         object
Launch site        object
Payload            object
Payload mass       object
Orbit              object
Customer           object
Launch outcome     object
Version Booster    object
Booster landing    object
Date               object
Time               object
dtype: object

Now the dataset is clear. Let save it as a *csv* file for future use.

In [106]:
data_from_wiki.to_csv('spacex_falcon9_wiki.csv', index=False)

Now we have the dataset in a CSV file. We can do further processing using this csv file.

- Please refer [Data_collection_spacex_api.ipynb](Data_collection_spacex_api.ipynb) for data collection SpaceX API.
- Please refer [Complete_dataset.ipynb](Complete_dataset.ipynb) to see the implementatin of the final dataset.
- Please refer [SpaceX.ipynb](SpaceX.ipynb) Notebook file for the rest of the project.