## Web scraping Falcon 9 and Falcon Heavy Launches Records from Wikipedia


In [None]:
!pip3 install beautifulsoup4
!pip3 install requests



In [None]:
import sys

import requests
from bs4 import BeautifulSoup
import re
import unicodedata
import pandas as pd

In [None]:
def date_time(table_cells):
    """
    This function returns the data and time from the HTML  table cell
    Input: the  element of a table data cell extracts extra row
    """
    return [data_time.strip() for data_time in list(table_cells.strings)][0:2]

def booster_version(table_cells):
    """
    This function returns the booster version from the HTML  table cell
    Input: the  element of a table data cell extracts extra row
    """
    out=''.join([booster_version for i,booster_version in enumerate( table_cells.strings) if i%2==0][0:-1])
    return out

def landing_status(table_cells):
    """
    This function returns the landing status from the HTML table cell
    Input: the  element of a table data cell extracts extra row
    """
    out=[i for i in table_cells.strings][0]
    return out


def get_mass(table_cells):
    mass=unicodedata.normalize("NFKD", table_cells.text).strip()
    if mass:
        mass.find("kg")
        new_mass=mass[0:mass.find("kg")+2]
    else:
        new_mass=0
    return new_mass


def extract_column_from_header(row):
    """
    This function returns the landing status from the HTML table cell
    Input: the  element of a table data cell extracts extra row
    """
    if (row.br):
        row.br.extract()
    if row.a:
        row.a.extract()
    if row.sup:
        row.sup.extract()

    colunm_name = ' '.join(row.contents)

    # Filter the digit and empty names
    if not(colunm_name.strip().isdigit()):
        colunm_name = colunm_name.strip()
        return colunm_name

In [None]:
static_url = "https://en.wikipedia.org/w/index.php?title=List_of_Falcon_9_and_Falcon_Heavy_launches&oldid=1027686922"
response =requests.get(static_url)  #fetch the HTML content of the Wikipedia page

#create a beautifulsoup object from the HTML response
soup = BeautifulSoup(response.text, 'html.parser')  #BeautifulSoup: parse解析 the HTML content from the resposne

print(soup.title.string)

List of Falcon 9 and Falcon Heavy launches - Wikipedia


**Task 2**


In [None]:

# Define the extract_column_from_header function
#remove unnecessary tag, only display the meaningful column names
def extract_column_from_header(row):
    """
    This function extracts and cleans column names from table headers.
    Input: HTML row element
    Output: Cleaned column name as a string
    """

    #unnecessary tags (<br>, <a>, <sup>)
    if (row.br):
        row.br.extract()
    if row.a:
        row.a.extract()
    if row.sup:
        row.sup.extract()

    column_name = ' '.join(row.contents)

    # Filter out digits and empty names
    if not(column_name.strip().isdigit()):
        column_name = column_name.strip()
        return column_name


In [None]:
# Locate the target table containing Falcon 9 launch records
html_tables = soup.find_all('table', {'class': 'wikitable'})
first_launch_table = html_tables[2]  # Assuming the third table contains relevant data

In [None]:
# Extract column names using the provided extract_column_from_header function
column_names = []
for row in first_launch_table.find_all('th'):
    column_name = extract_column_from_header(row)
    if column_name is not None and len(column_name) > 0:
        column_names.append(column_name)

# Print the extracted column names
print(column_names)

['Flight No.', 'Date and time ( )', 'Launch site', 'Payload', 'Payload mass', 'Orbit', 'Customer', 'Launch outcome']


In [None]:
launch_dict= dict.fromkeys(column_names)


# Let's initial the launch_dict with each value to be an empty list
launch_dict['Flight No.'] = []
launch_dict['Launch site'] = []
launch_dict['Payload'] = []
launch_dict['Payload mass'] = []
launch_dict['Orbit'] = []
launch_dict['Customer'] = []
launch_dict['Launch outcome'] = []
# Added some new columns
launch_dict['Version Booster']=[]
launch_dict['Booster landing']=[]
launch_dict['Date']=[]
launch_dict['Time']=[]

In [None]:
 # Initialize extracted_row counter
extracted_row = 0

# Iterate through each table
for table_number, table in enumerate(soup.find_all('table', 'wikitable plainrowheaders collapsible')):
    # Get table rows
    for rows in table.find_all('tr'):
        # Check to see if first table heading is a number corresponding to a launch number
        if rows.th:
            if rows.th.string:
                flag = rows.th.string.strip()
                flag = flag.isdigit()
        else:
            flag = False

        # If it's a valid row, extract data
        if flag:
            extracted_row += 1
            row = rows.find_all('td')

            # Flight Number
            flight_number = rows.th.string.strip()
            launch_dict['Flight No.'].append(flight_number)

            # Date and Time
            datatimelist = date_time(row[0])
            date = datatimelist[0].strip()
            time = datatimelist[1].strip()
            launch_dict['Date'].append(date)
            launch_dict['Time'].append(time)

            # Booster Version
            bv = booster_version(row[1])
            launch_dict['Version Booster'].append(bv)

            # Launch Site
            launch_site = row[2].a.string.strip() if row[2].a else row[2].string.strip()
            launch_dict['Launch site'].append(launch_site)

            # Payload
            payload = row[3].string.strip() if row[3].string else None
            launch_dict['Payload'].append(payload)

            # Payload Mass
            payload_mass = get_mass(row[4])
            launch_dict['Payload mass'].append(payload_mass)

            # Orbit
            orbit = row[5].string.strip() if row[5].string else None
            launch_dict['Orbit'].append(orbit)

            # Customer
            customer = row[6].string.strip() if row[6].string else None
            launch_dict['Customer'].append(customer)

            # Launch Outcome
            launch_outcome = landing_status(row[7])
            launch_dict['Launch outcome'].append(launch_outcome)

            # Booster Landing Outcome
            booster_landing = landing_status(row[8])
            launch_dict['Booster landing'].append(booster_landing)

# Convert the dictionary into a Pandas DataFrame
launch_df = pd.DataFrame(launch_dict)

# Display the DataFrame to verify results
print(launch_df.head())


  Flight No. Launch site Payload  Payload mass Orbit Customer Launch outcome  \
0          1       CCAFS    None           NaN  None     None        Success   
1          2       CCAFS    None           NaN  None     None        Success   
2          3       CCAFS    None         525.0  None     None        Success   
3          4       CCAFS    None        4700.0  None     None        Success   
4          5       CCAFS    None        4877.0  None     None        Success   

    Version Booster Booster landing              Date   Time  
0  F9 v1.07B0003.18         Failure      4 June 2010,  18:45  
1  F9 v1.07B0004.18         Failure  8 December 2010,  15:43  
2  F9 v1.07B0005.18      No attempt      22 May 2012,  07:44  
3  F9 v1.07B0006.18      No attempt   8 October 2012,  00:35  
4  F9 v1.07B0007.18      No attempt     1 March 2013,  15:10  


In [None]:
import requests
import pandas as pd

# Perform GET request on SpaceX API
url = "https://api.spacexdata.com/v4/launches"
response = requests.get(url)
data = response.json()

# Convert response to DataFrame using pd.json_normalize
df = pd.json_normalize(data)

# Extract the year from the first row in the column static_fire_date_utc
first_row_date = df.loc[0, 'static_fire_date_utc']
first_row_year = pd.to_datetime(first_row_date).year

print(first_row_year)  # Output: 2006


2006


In [1]:


# Count missing values in the 'landpad' column
missing_values = df['landpad'].isnull().sum()

print(missing_values)  # Output: 26


NameError: name 'df' is not defined