In [1]:
!pip3 install beautifulsoup4
!pip3 install requests



In [2]:
import requests
from bs4 import BeautifulSoup
import re
import unicodedata
import pandas as pd

def date_time(table_cells):
    """
    This function returns the date and time from the HTML table cell
    Input:  the element of a table data cell (anything exposing a
            ``strings`` iterable of text fragments)
    Output: a list with the first two text fragments of the cell, each
            stripped of surrounding whitespace. The list holds fewer than
            two items (and may be empty) when the cell has fewer than two
            text fragments.
    """
    # Slicing never raises IndexError, so no exception handling is needed:
    # a cell with too little data simply yields a shorter list.
    return [text.strip() for text in table_cells.strings][:2]

def booster_version(table_cells):
    """
    This function builds the booster version text from the HTML table cell
    Input:  the element of a table data cell (anything exposing a
            ``strings`` iterable of text fragments)
    Output: the cell's text fragments concatenated, skipping fragments that
            are digit-only or whitespace-only, with every space removed
    """
    kept = []
    for fragment in table_cells.strings:
        if fragment.isdigit():
            continue
        if fragment.strip() == '':
            continue
        kept.append(fragment)
    joined = ''.join(kept)
    return joined.replace(" ", "")

def landing_status(table_cells):
    """
    This function returns the landing status from the HTML table cell
    Input:  the element of a table data cell (anything exposing a
            ``strings`` iterable of text fragments)
    Output: the first text fragment of the cell, unmodified, or None when
            the cell contains no text at all
    """
    # next() with a default avoids the IndexError that indexing [0] on an
    # empty list would raise, and stops after the first fragment instead of
    # materialising every string in the cell.
    return next(iter(table_cells.strings), None)

def get_mass(table_cells):
    """
    This function extracts the payload mass from the HTML table cell
    Input:  the element of a table data cell (anything exposing a ``text``
            attribute)
    Output: the first number found in the cell text as a string of digits
            with thousands separators removed (e.g. "4877"), or the integer
            0 when the text contains no number
    """
    # NFKD folds compatibility characters (such as non-breaking spaces) into
    # their plain equivalents before matching.
    mass = unicodedata.normalize("NFKD", table_cells.text).strip()
    if mass:
        # The match must start with a digit; otherwise a stray comma in the
        # text would match on its own and yield an empty string.
        mass_match = re.search(r'([0-9][0-9,]*)', mass)
        if mass_match:
            return mass_match.group(1).replace(",", "")
    return 0

def extract_column_from_header(row):
    """
    This function extracts column name from the HTML table header cell
    Input:  the element of a table header cell. NOTE: the element is
            modified in place - its first <br>, <a> and <sup> descendants
            are removed from it.
    Output: the remaining text of the cell with surrounding whitespace
            stripped, or None when that text consists only of digits
    """
    if row.br:
        row.br.extract()
    if row.a:
        row.a.extract()
    if row.sup:
        row.sup.extract()

    # Text nodes are str instances and can be joined directly. Any element
    # still present (e.g. a second link or another inline tag) is not a str
    # and would make str.join raise TypeError, so use its text instead.
    parts = []
    for child in row.contents:
        parts.append(child if isinstance(child, str) else child.get_text())
    column_name = ''.join(parts).strip()

    # Filter the digit-only names
    if column_name.isdigit():
        return None
    return column_name

# TASK 1: Request the Falcon9 Launch Wiki page from its URL
static_url = "https://en.wikipedia.org/wiki/List_of_Falcon_9_and_Falcon_Heavy_launches"

# requests applies no timeout by default, so pass one explicitly to keep the
# script from hanging forever if the server never answers.
response = requests.get(static_url, timeout=30)

# Check for successful request
if response.status_code == 200:
    print("Successfully retrieved the Wiki page.")
else:
    print(f"Failed to retrieve the Wiki page. Status code: {response.status_code}")
    # exit() is a helper injected by the site module for interactive use and
    # is not guaranteed to exist; SystemExit works everywhere and reports a
    # non-zero status for the failed download.
    raise SystemExit(1)

# Create a BeautifulSoup object from the HTML response
soup = BeautifulSoup(response.text, 'html.parser')

# Print the page title to verify if the BeautifulSoup object was created properly
print(soup.title)

# TASK 2: Extract all column/variable names from the HTML table header
html_tables = soup.find_all('table')

# Stop with a clear message instead of failing later with an AttributeError
# on None when the page contains no tables at all.
if not html_tables:
    print("No tables found on the Wiki page.")
    raise SystemExit(1)

# The first table on the page is used as the launch table.
# NOTE(review): this relies on the page layout - confirm the first table is
# really the launch records table for the page revision being scraped.
first_launch_table = html_tables[0]

column_names = []

# Apply find_all() function with `th` element on first_launch_table and keep
# every non-empty header name (digit-only headers come back as None).
for row in first_launch_table.find_all('th'):
    name = extract_column_from_header(row)
    if name:
        column_names.append(name)

# Check the extracted column names
print("Extracted Column Names:", column_names)

# TASK 3: Create a data frame by parsing the launch HTML tables

# Declare the output columns explicitly, each backed by its own list.
# Seeding the dictionary from the scraped header names left every header
# that did not exactly match one of these keys as a None-valued entry, which
# pandas turned into a meaningless all-None column in the CSV.
launch_dict = {
    'Flight No.': [],
    'Launch site': [],
    'Payload': [],
    'Payload mass': [],
    'Orbit': [],
    'Customer': [],
    'Launch outcome': [],
    'Version Booster': [],
    'Booster landing': [],
    # Date and time were parsed before but never stored; keep them.
    'Date': [],
    'Time': [],
}

extracted_row = 0

# Extract the data from the rows
for rows in first_launch_table.find_all('tr'):
    # Get the data cells of this table row
    row = rows.find_all('td')
    if not row:
        # Rows without <td> cells (e.g. header rows) carry no launch data
        continue

    extracted_row += 1

    # Flight Number value: a running count of the rows that contain data
    # cells, not a number read from the table itself
    launch_dict['Flight No.'].append(extracted_row)

    # Date and time values; date_time() may return fewer than two items
    datatimelist = date_time(row[0])
    date = datatimelist[0].strip(',') if datatimelist and datatimelist[0] else None
    launch_time = datatimelist[1] if len(datatimelist) > 1 else None
    launch_dict['Date'].append(date)
    launch_dict['Time'].append(launch_time)

    # Every remaining cell is optional: short rows get None so that all
    # column lists keep the same length.

    # Launch site
    launch_dict['Launch site'].append(row[1].a.string if len(row) > 1 and row[1].a else None)

    # Payload
    launch_dict['Payload'].append(row[2].a.string if len(row) > 2 and row[2].a else None)

    # Payload Mass
    launch_dict['Payload mass'].append(get_mass(row[3]) if len(row) > 3 else None)

    # Orbit
    launch_dict['Orbit'].append(row[4].a.string if len(row) > 4 and row[4].a else None)

    # Customer
    launch_dict['Customer'].append(row[5].string if len(row) > 5 and row[5].string else None)

    # Launch Outcome
    launch_dict['Launch outcome'].append(landing_status(row[6]) if len(row) > 6 else None)

    # Booster Version
    launch_dict['Version Booster'].append(booster_version(row[7]) if len(row) > 7 else None)

    # Booster Landing
    launch_dict['Booster landing'].append(landing_status(row[8]) if len(row) > 8 else None)

print("Extracted Rows:", extracted_row)
df = pd.DataFrame(launch_dict)
df.to_csv("spacex_web_scraped.csv", index=False)

Successfully retrieved the Wiki page.
<title>List of Falcon 9 and Falcon Heavy launches - Wikipedia</title>
Extracted Column Names: ['Flight No.', 'Date andtime ()', 'Launchsite', 'Payload', 'Payload mass', 'Orbit', 'Customer', 'Launchoutcome', 'FH 5', 'FH 6', 'FH 7', 'FH 8', 'FH 9']
Extracted Rows: 203
