In [None]:
import boto3
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import requests
import io
from bs4 import BeautifulSoup
from scipy.stats import zscore
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tqdm import tqdm
from dotenv import load_dotenv
import os

In [None]:
# `sagemaker` is used below but is not part of the notebook's import cell, so a
# fresh kernel would fail with NameError. Import it here so this cell is
# self-sufficient under Restart & Run All.
import sagemaker

# SageMaker session plus the S3 bucket / IAM role it resolves for this environment.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# AWS region taken from the default boto3 session configuration.
region = boto3.Session().region_name

# Retrieving European Space Agency (ESA) Data

In [None]:
# Download every object record from the ESA DISCOSweb API, page by page, and
# collect the per-object "attributes" payloads into `esa_df`.
load_dotenv()

# API Endpoint
esa_api_url = "https://discosweb.esoc.esa.int/api/objects"

# Personal access token, read from the environment (.env) so it is never
# hardcoded in the notebook.
token = os.getenv('esa_token')

# Set up authentication headers
headers = {
    "Authorization": f"Bearer {token}",
    "DiscosWeb-Api-Version": "2",
    "Accept": "application/vnd.api+json"
}

# Query Parameters (Retrieving all object attributes)
params = {
    "page[size]": 100,  # Max allowed per page
}

# Seconds to wait on a single request before giving up; without a timeout a
# stalled connection would hang the cell indefinitely.
esa_request_timeout = 60

# List to store all object records
all_data = []
page = 1  # Start with page 1

with tqdm(desc="Fetching Data", unit=" records", smoothing=0.1) as pbar:
    while True:
        params["page[number]"] = page  # Set current page number
        response = requests.get(
            esa_api_url,
            headers=headers,
            params=params,
            timeout=esa_request_timeout,
        )

        # Any non-200 answer ends the crawl; records fetched so far are kept.
        if response.status_code != 200:
            print(f"❌ Error: {response.status_code}, {response.text}")
            break

        data = response.json()
        records = data.get("data")

        # A missing or empty "data" list means we have walked past the last page.
        if not records:
            break

        all_data.extend(records)   # Store raw data
        pbar.update(len(records))  # Update progress bar
        page += 1                  # Move to next page

# Extract all object attributes
esa_df = pd.DataFrame([obj["attributes"] for obj in all_data])

# Save as Parquet for efficiency
# esa_df.to_parquet("Data to use/full_esa_data.parquet", index=False)

# Report the size of the frame that was actually built. The previous message
# referenced an undefined `df` and claimed a Parquet save that is commented out.
print(f"✅ Retrieved {len(esa_df)} records.")

# Scrape Lost Object Data from CelesTrak

In [None]:
# Scrape the "lost objects" table from CelesTrak into `lost_object_df` and
# write it to CSV.

# Base URL for lost satellites
base_url = "https://celestrak.org/satcat/lost.php"

# Folder the CSV is written to, and the per-request timeout in seconds.
output_dir = "Data to use"
request_timeout = 30

# Initialize empty list to store data
all_rows = []

# Column names of the scraped table. Kept under its own name so this cell does
# not rebind `headers` (the ESA auth-header dict), and initialised to None so a
# failed download cannot build the DataFrame from stale or missing column names.
column_names = None

# Fetch the first page to determine pagination
response = requests.get(base_url, timeout=request_timeout)
if response.status_code != 200:
    print(f"Failed to retrieve data: {response.status_code}")
else:
    first_soup = BeautifulSoup(response.text, "html.parser")

    # Find the total number of pages (if pagination exists)
    pagination = first_soup.find("div", class_="pagination")
    if pagination:
        pages = [a.text for a in pagination.find_all("a") if a.text.isdigit()]
        total_pages = max(map(int, pages)) if pages else 1
    else:
        total_pages = 1  # If no pagination, assume one page

    print(f"Total pages found: {total_pages}")

    # Loop through all pages
    for page in range(1, total_pages + 1):
        print(f"Scraping page {page}...")

        if page == 1:
            # Reuse the page already downloaded above instead of fetching it twice.
            soup = first_soup
        else:
            response = requests.get(f"{base_url}?page={page}", timeout=request_timeout)
            if response.status_code != 200:
                print(f"Failed to retrieve page {page}: {response.status_code}")
                continue
            soup = BeautifulSoup(response.text, "html.parser")

        table = soup.find("table")
        if table is None:
            continue

        # Take the header cells from the first table found; normally page 1.
        if column_names is None:
            column_names = [header.text.strip() for header in table.find_all("th")]

        # Extract data rows
        for row in table.find_all("tr")[1:]:  # Skip header row
            cols = [col.text.strip() for col in row.find_all("td")]
            if cols:
                all_rows.append(cols)

# Convert to DataFrame. With no header cells found, `columns=None` lets pandas
# fall back to default integer column labels.
lost_object_df = pd.DataFrame(all_rows, columns=column_names or None)

# Ingest to folder, creating it first so a fresh checkout does not fail here.
os.makedirs(output_dir, exist_ok=True)
lost_object_df.to_csv(os.path.join(output_dir, "lost_objects.csv"), index=False)


# Retrieve Low Earth Orbit (LEO) Object Data from Space-Track

In [None]:
# Log in to Space-Track and load the satellite-catalog query result into `leo_df`.

# Space-Track login credentials, read from the environment (.env)
USERNAME = os.getenv('user')
PASSWORD = os.getenv('pass')

# Space-Track API URL for querying LEO satellites (sample query from website)
url = "https://www.space-track.org/basicspacedata/query/class/satcat/PERIOD/<128/DECAY/null-val/CURRENT/Y/"

# Seconds to wait on each request before giving up instead of hanging the cell.
space_track_timeout = 120

# Create a session for authentication (keeps the login cookie for the query)
session = requests.Session()

# Authenticate with space-track.org
login_url = "https://www.space-track.org/ajaxauth/login"
login_data = {"identity": USERNAME, "password": PASSWORD}

if not USERNAME or not PASSWORD:
    # os.getenv returns None for unset variables; flag it early so the cause is obvious.
    print("Space-Track credentials are not set; expected 'user' and 'pass' in the environment.")

login_response = session.post(login_url, data=login_data, timeout=space_track_timeout)

if login_response.status_code != 200:
    # Previously the login result was ignored and a failure only surfaced as a
    # confusing error on the data query.
    print(f"Failed to log in to Space-Track: {login_response.status_code}")
    print(login_response.text)
else:
    # NOTE(review): the login endpoint may answer 200 even for rejected
    # credentials — confirm against the Space-Track docs. The status check on
    # the query below still reports that case.
    # Fetch JSON data
    response = session.get(url, timeout=space_track_timeout)

    # Ensure request was successful
    if response.status_code == 200:
        json_data = response.json()  # Convert response to JSON format

        # Convert JSON to Pandas DataFrame
        leo_df = pd.DataFrame(json_data)

        #leo_df.to_csv("Data to use/leo_objects.csv", index = False)

    else:
        print("Failed to retrieve data. Check credentials or API access.")
        print(response.text)
