# Final Project

Find all QSOs in SDSS DR18 within my assigned redshift range ($0 < z < 1.1$).

In [1]:
import os
import time

import pandas as pd
from astroquery.gaia import Gaia
from astroquery.sdss import SDSS

In [2]:
# Count the SpecObj rows classified as QSO with redshift 0 < z < 1.1.
query1 = """
SELECT COUNT(*)
FROM SpecObj AS s
WHERE
s.Class = 'QSO'  
AND s.z > 0 AND s.z < 1.1
"""

In [3]:
# Run the count query through astroquery's SDSS interface and display the result.
# NOTE(review): no data_release is passed, so astroquery's configured default
# release is used -- confirm that it resolves to DR18 as stated in the prose.
QSO_df = SDSS.query_sql(query1).to_pandas()
QSO_df

Unnamed: 0,Column1
0,295722


In [4]:
# Select bestObjID for every SpecObj row classified as QSO with 0 < z < 1.1.
# NOTE(review): presumably bestObjID is the photometric object matched to each
# spectrum, so the same ID may appear more than once -- verify against the schema.
query2 = """
SELECT bestObjID
FROM SpecObj AS s
WHERE
s.Class = 'QSO'  
AND s.z > 0 AND s.z < 1.1
"""

In [5]:
# Fetch the selected IDs into a DataFrame with a single bestObjID column.
# NOTE(review): uses astroquery's default data release, same as the count query -- confirm.
QSO = SDSS.query_sql(query2).to_pandas()

Cross-match them with Gaia DR3. For the cross-match, use Gaia's pre-computed cross-match table (`gaiadr3.sdssdr13_best_neighbour`), which lists Gaia sources already matched to SDSS DR13.

In [7]:
# Configuration
batch_size = 1000  # Number of IDs per batch
output_file = "gaia_results_all_batches.csv"  # Output file for results
failed_batches_file = "failed_batches.txt"  # Log file for failed batches
max_retries = 3  # Number of attempts per batch before it is logged as failed
retry_delay_s = 2  # Base pause between attempts; multiplied by the attempt number


def _build_crossmatch_query(object_ids):
    """Return the ADQL query selecting Gaia matches for the given SDSS object IDs.

    Parameters
    ----------
    object_ids : iterable
        SDSS object IDs; each is converted with ``str`` and quoted in the IN list.

    Returns
    -------
    str
        Query against ``gaiadr3.sdssdr13_best_neighbour`` returning ``source_id``
        and ``original_ext_source_id`` for the requested IDs.
    """
    batch_ids = "','".join(map(str, object_ids))
    return f"""
    SELECT s.source_id, s.original_ext_source_id
    FROM gaiadr3.sdssdr13_best_neighbour AS s
    WHERE s.original_ext_source_id IN ('{batch_ids}')
    """


def _merge_into_output(new_rows):
    """Merge ``new_rows`` into ``output_file`` and return the combined DataFrame.

    The CSV is reloaded (when it exists), concatenated with ``new_rows``,
    de-duplicated and written back, so every completed batch is on disk before
    the next one starts.
    """
    existing_results = pd.read_csv(output_file) if os.path.exists(output_file) else pd.DataFrame()
    merged = pd.concat([existing_results, new_rows], ignore_index=True).drop_duplicates()
    merged.to_csv(output_file, index=False)
    return merged


# Main batch processing loop
# NOTE(review): Gaia.launch_job is the synchronous entry point, which the archive
# caps in row count (2000 rows per the astroquery docs) -- keep batch_size well below that.
for start in range(0, len(QSO), batch_size):
    # Clamp the end so the last (short) batch is reported and logged with its real range.
    end = min(start + batch_size, len(QSO))
    batch_number = start // batch_size + 1
    query3 = _build_crossmatch_query(QSO['bestObjID'][start:end])

    success = False
    for attempt in range(1, max_retries + 1):
        try:
            print(f"Processing batch {batch_number} ({start} to {end} IDs)... Attempt {attempt}")
            job = Gaia.launch_job(query3)
            batch_results = job.get_results().to_pandas()
            print(f"Batch {batch_number} returned {len(batch_results)} rows.")

            combined_results = _merge_into_output(batch_results)
            print(f"Batch {batch_number} completed. Total rows saved: {len(combined_results)}")

            success = True  # Mark batch as successful
            break
        except Exception as e:
            # Deliberately broad: any failure in the query or the save counts as a
            # failed attempt (KeyboardInterrupt is not an Exception and still propagates).
            print(f"Error processing batch {start} to {end}: {e}")
            if attempt < max_retries:
                # Brief, growing pause so a reset connection is not hit again immediately.
                time.sleep(retry_delay_s * attempt)

    if not success:
        # Log failed batch to file so the reprocessing pass below can pick it up.
        with open(failed_batches_file, "a") as log_file:
            log_file.write(f"{start},{end}\n")
        print(f"Batch {start} to {end} failed after {max_retries} retries. Moving to the next batch.")

# Reprocess failed batches
print("Reprocessing failed batches...")
try:
    with open(failed_batches_file, "r") as log_file:
        # Skip blank lines and collapse repeated ranges (the log is append-only,
        # so the same range can be recorded by more than one run).
        failed_batches = list(dict.fromkeys(line.strip() for line in log_file if line.strip()))
except FileNotFoundError:
    failed_batches = []

for batch_range in failed_batches:
    start, end = map(int, batch_range.split(","))
    query3 = _build_crossmatch_query(QSO['bestObjID'][start:end])
    try:
        print(f"Reprocessing batch {start} to {end}...")
        job = Gaia.launch_job(query3)
        batch_results = job.get_results().to_pandas()

        combined_results = _merge_into_output(batch_results)
        print(f"Batch {start} to {end} reprocessed successfully. Total rows saved: {len(combined_results)}")

        # Remove successfully reprocessed batch from log
        with open(failed_batches_file, "r") as log_file:
            lines = log_file.readlines()
        with open(failed_batches_file, "w") as log_file:
            log_file.writelines(line for line in lines if line.strip() != f"{start},{end}")
    except Exception as e:
        # The range stays in the log so a later run can try it again.
        print(f"Failed to reprocess batch {start} to {end}: {e}")

# Final message
print("All processing completed.")

Processing batch 1 (0 to 1000 IDs)... Attempt 1
Batch 1 returned 920 rows.
Batch 1 completed. Total rows saved: 920
Processing batch 2 (1000 to 2000 IDs)... Attempt 1
Error processing batch 1000 to 2000: [Errno 54] Connection reset by peer
Processing batch 2 (1000 to 2000 IDs)... Attempt 2
Batch 2 returned 956 rows.
Batch 2 completed. Total rows saved: 1876
Processing batch 3 (2000 to 3000 IDs)... Attempt 1
Batch 3 returned 960 rows.
Batch 3 completed. Total rows saved: 2836
Processing batch 4 (3000 to 4000 IDs)... Attempt 1
Batch 4 returned 945 rows.
Batch 4 completed. Total rows saved: 3781
Processing batch 5 (4000 to 5000 IDs)... Attempt 1
Batch 5 returned 906 rows.
Batch 5 completed. Total rows saved: 4687
Processing batch 6 (5000 to 6000 IDs)... Attempt 1
Error processing batch 5000 to 6000: [Errno 54] Connection reset by peer
Processing batch 6 (5000 to 6000 IDs)... Attempt 2
Batch 6 returned 882 rows.
Batch 6 completed. Total rows saved: 5569
Processing batch 7 (6000 to 7000 IDs

Next, we move on to downloading the spectra.