In [1]:
import tqdm
import multiprocessing as mp
import typing

class Cluster:
    """Record holding one cluster's catalogue parameters.

    The six constructor arguments are stored unchanged as attributes of the
    same name. ``str()`` yields the bare name; ``repr()`` yields the name
    followed by every other field as ``key=value`` pairs.
    """

    def __init__(self, name: str, dm: float, log_age: float, fe_h: float, e_b_sub_v: float, memb: int):
        # NOTE(review): field meanings are inferred from their names only
        # (distance modulus, log age, [Fe/H], E(B-V), member count) -- confirm.
        self.name, self.dm, self.log_age = name, dm, log_age
        self.fe_h, self.e_b_sub_v, self.memb = fe_h, e_b_sub_v, memb

    def __str__(self):
        """Return the cluster name only."""
        return str(self.name)

    def __repr__(self):
        """Return ``name(dm=..., log_age=..., fe_h=..., e_b_sub_v=..., memb=...)``."""
        field_names = ("dm", "log_age", "fe_h", "e_b_sub_v", "memb")
        rendered = ", ".join(
            f"{field}={getattr(self, field)}" for field in field_names
        )
        return f"{self.name}({rendered})"


# Catalogue parameters for the clusters handled in this notebook, one Cluster
# per entry. Keyword arguments are used so each number is labelled in place.
# NOTE(review): units and the source of these values are not recorded here -- add a citation.
clusters = [
    Cluster(name="IC 2391", dm=5.908, log_age=7.70, fe_h=-0.01, e_b_sub_v=0.030, memb=254),
    Cluster(name="NGC 6475", dm=7.234, log_age=8.54, fe_h=0.02, e_b_sub_v=0.049, memb=874),
    Cluster(name="NGC 2360", dm=10.229, log_age=8.98, fe_h=-0.03, e_b_sub_v=0.090, memb=848),
    # fe_h is NaN for this entry (presumably no measurement available -- confirm).
    Cluster(name="NGC 6793", dm=8.894, log_age=8.78, fe_h=float("nan"), e_b_sub_v=0.272, memb=271),
    Cluster(name="NGC 2232", dm=7.575, log_age=7.70, fe_h=0.11, e_b_sub_v=0.031, memb=241),
]

In [2]:
import sys
import os
import time
import re
import json

import requests
from urllib.parse import quote as urlencode

import numpy as np

import pprint

# Shared pretty-printer (4-space indent) used by later cells to display
# nested query responses.
pp = pprint.PrettyPrinter(indent=4)


def mast_query(request, timeout=300.0):
    """Perform a MAST query.

    Parameters
    ----------
    request : dict
        The MAST request json object.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.post``. Without one, requests
        waits indefinitely, so a stalled connection would hang the caller
        (and, in a worker pool, the whole pool). Defaults to 300 seconds.

    Returns
    -------
    head, content
        ``head`` is the response HTTP headers and ``content`` is the returned
        data decoded as UTF-8 text.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not answer within ``timeout``.
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status.
    """

    # Base API url
    request_url = "https://mast.stsci.edu/api/v0/invoke"

    # Grab Python Version
    version = ".".join(map(str, sys.version_info[:3]))

    # Create Http Header Variables
    headers = {"Content-type": "application/x-www-form-urlencoded",
               "Accept": "text/plain",
               "User-agent": "python-requests/" + version}

    # Encode the request as a url-quoted json string
    req_string = urlencode(json.dumps(request))

    # Perform the HTTP request
    resp = requests.post(
        request_url,
        data="request=" + req_string,
        headers=headers,
        timeout=timeout,
    )

    # Fail loudly on an HTTP error instead of passing an error page on to
    # callers that immediately feed the content to json.loads.
    resp.raise_for_status()

    # Pull out the headers and response content
    head = resp.headers
    content = resp.content.decode("utf-8")

    return head, content

def set_filters(parameters):
    """Turn a ``{name: values}`` mapping into a list of filter dicts.

    Each item of ``parameters`` becomes ``{"paramName": name, "values": values}``,
    in the mapping's iteration order.
    """
    filters = []
    for param_name, allowed_values in parameters.items():
        filters.append({"paramName": param_name, "values": allowed_values})
    return filters

def set_min_max(min, max):
    """Wrap a lower and an upper bound as ``[{"min": min, "max": max}]``."""
    # The parameter names shadow the builtins inside this function only; they
    # are kept because callers may pass them by keyword.
    bounds = dict(min=min, max=max)
    return [bounds]

def name_resolver(name: str) -> list[dict[str, typing.Any]]:
    """Look up an object name with the ``Mast.Name.Lookup`` service.

    Parameters
    ----------
    name : str
        Object name to resolve (for example ``"IC 2391"``).

    Returns
    -------
    list of dict
        The ``"resolvedCoordinate"`` entry of the decoded response.

    Raises
    ------
    KeyError
        If the decoded response has no ``"resolvedCoordinate"`` key.
    json.JSONDecodeError
        If the response body is not valid JSON.
    """
    resolver_request = {
        "service": "Mast.Name.Lookup",
        "params": {
            "input": name,
            "format": "json"
        },
    }

    # Only the body is needed; the response headers are discarded.
    _headers, resolved_object_string = mast_query(resolver_request)
    resolved_object = json.loads(resolved_object_string)
    return resolved_object["resolvedCoordinate"]

def cone(name: str, ra: float, dec: float, radius: float, page: int = 1, completion_count: typing.Optional[typing.Any] = None) -> dict[str, typing.Any]:
    """Fetch one page of a Gaia DR2 cone search from MAST.

    Parameters
    ----------
    name : str
        Sent as the ``"input"`` parameter of the request.
    ra, dec : float
        Centre of the cone.
        NOTE(review): presumably degrees, matching the name-resolver output -- confirm.
    radius : float
        Cone radius.
        NOTE(review): presumably degrees -- confirm against the MAST docs.
    page : int, optional
        1-based page number to request; each page holds up to 5000 rows.
    completion_count : object with a ``value`` attribute, optional
        Shared progress counter incremented by one once the page has been
        fetched and decoded (e.g. ``multiprocessing.Value`` or a manager
        ``Value`` proxy).

    Returns
    -------
    dict
        The decoded JSON response, returned as-is.
        NOTE(review): no status field of the response is inspected here;
        callers index the result directly.
    """
    cone_request = {
        # No leading/trailing whitespace in the service identifier.
        "service": "Mast.Catalogs.GaiaDR2.Cone",
        "params": {
            "ra": ra,
            "dec": dec,
            "radius": radius,
            "input": name
        },
        "format": "json",
        "pagesize": 5000,
        "page": page,
        "removenullcolumns": False,
        "removenullrows": False,
        "removecache": False,
        "columnsconfigid": "Mast.Catalogs.Gaia.Cone"
    }

    # Only the body is needed; the response headers are discarded.
    _headers, mast_data_string = mast_query(cone_request)
    mast_data = json.loads(mast_data_string)

    if completion_count is not None:
        # ``value += 1`` is a separate read and write, so concurrent workers
        # can lose updates. Take the counter's lock when it exposes one
        # (multiprocessing.Value does); manager proxies have no lock, so the
        # increment stays best-effort there.
        get_lock = getattr(completion_count, "get_lock", None)
        if callable(get_lock):
            with get_lock():
                completion_count.value += 1
        else:
            completion_count.value += 1

    return mast_data

In [3]:
# Resolve the cluster name through name_resolver and keep the first returned
# entry; its 'ra', 'decl' and 'radius' fields feed the cone searches below.
ic_2391_a = name_resolver("IC 2391")[0]
pp.pprint(ic_2391_a)

{   'cacheDate': 'Mar 3, 2025, 1:32:45 PM',
    'cached': True,
    'canonicalName': 'IC 2391',
    'decl': -52.991,
    'majorAxis': 1.6283333333333334,
    'minorAxis': 1.6283333333333334,
    'objectType': 'OpenCluster',
    'positionAngle': 0.0,
    'ra': 130.292,
    'radius': 0.8141666666666667,
    'resolver': 'SIMBADCFA',
    'resolverTime': 284,
    'searchRadius': -1.0,
    'searchString': 'ic 2391'}


In [4]:
# Fetch the first page (cone's default page=1) of the cone search centred on
# the resolved position, using the resolver-reported radius.
ic_2391_b = cone("IC 2391", ic_2391_a['ra'], ic_2391_a['decl'], ic_2391_a['radius'])
print("Query done, printing")
# Show only the top-level keys; the payload itself is large.
pp.pprint(ic_2391_b.keys())

Query done, printing
dict_keys(['status', 'msg', 'data', 'fields', 'paging'])


In [5]:
# Print the response metadata (everything except the rows themselves).
pp.pprint({
    k: ic_2391_b[k]
    for k in ['status', 'msg', 'paging', 'fields']
})

print("\n\n")

# Print the first data row as a sample of the per-source record.
pp.pprint(ic_2391_b['data'][0])

{   'fields': [   {'name': 'solution_id', 'type': 'string'},
                  {'name': 'designation', 'type': 'string'},
                  {'name': 'source_id', 'type': 'string'},
                  {'name': 'random_index', 'type': 'string'},
                  {'name': 'ref_epoch', 'type': 'float'},
                  {'name': 'ra', 'type': 'float'},
                  {'name': 'ra_error', 'type': 'float'},
                  {'name': 'dec', 'type': 'float'},
                  {'name': 'dec_error', 'type': 'float'},
                  {'name': 'parallax', 'type': 'float'},
                  {'name': 'parallax_error', 'type': 'float'},
                  {'name': 'parallax_over_error', 'type': 'float'},
                  {'name': 'pmra', 'type': 'float'},
                  {'name': 'pmra_error', 'type': 'float'},
                  {'name': 'pmdec', 'type': 'float'},
                  {'name': 'pmdec_error', 'type': 'float'},
                  {'name': 'ra_dec_corr', 'type': 'float'},
       

In [6]:
# Start from the rows of the first page, which was already fetched above.
all_data = [*ic_2391_b['data']]

# Pages still to download: 2 .. pagesFiltered (inclusive).
page_assignments = list(range(2, ic_2391_b['paging']['pagesFiltered']+1))

# Counter shared with the workers; starts at 1 because page 1 is already done.
manager = mp.Manager()
completed_count = manager.Value('i', 1)

def update_progress():
    """Print the number of fetched pages once per second until all are done."""
    while completed_count.value < len(page_assignments)+1:
        print(f"\rProgress: {completed_count.value}/{len(page_assignments)+1}", end="")
        time.sleep(1)

def filtered_cone(*args, **kwargs):
    """Run ``cone`` with the given arguments and return only its 'data' rows."""
    return cone(*args, **kwargs)['data']

def flatten(xss):
    """Concatenate an iterable of iterables into one flat list."""
    return [x for xs in xss for x in xs]

# daemon=True so the reporter can never outlive the kernel process.
status_updater = mp.Process(target=update_progress, daemon=True)
status_updater.start()

try:
    with mp.Pool(int(mp.cpu_count() * 1.5)) as pool:
        remaining_pages = pool.starmap(
            filtered_cone,
            [
                ("IC 2391", ic_2391_a['ra'], ic_2391_a['decl'], ic_2391_a['radius'], page, completed_count)
                for page in page_assignments
            ]
        )
finally:
    # Always stop and reap the reporter, even when a worker raised; otherwise
    # it would keep printing progress after the cell has failed.
    status_updater.terminate()
    status_updater.join()

all_data.extend(flatten(remaining_pages))
print("\rAll tasks completed")

All tasks completed


In [7]:
# Total number of rows collected across all fetched pages.
pp.pprint(len(all_data))

197375


In [8]:
# List the columns available on a row (using the last row fetched).
all_data[-1].keys()

dict_keys(['solution_id', 'designation', 'source_id', 'random_index', 'ref_epoch', 'ra', 'ra_error', 'dec', 'dec_error', 'parallax', 'parallax_error', 'parallax_over_error', 'pmra', 'pmra_error', 'pmdec', 'pmdec_error', 'ra_dec_corr', 'ra_parallax_corr', 'ra_pmra_corr', 'ra_pmdec_corr', 'dec_parallax_corr', 'dec_pmra_corr', 'dec_pmdec_corr', 'parallax_pmra_corr', 'parallax_pmdec_corr', 'pmra_pmdec_corr', 'astrometric_n_obs_al', 'astrometric_n_obs_ac', 'astrometric_n_good_obs_al', 'astrometric_n_bad_obs_al', 'astrometric_gof_al', 'astrometric_chi2_al', 'astrometric_excess_noise', 'astrometric_excess_noise_sig', 'astrometric_params_solved', 'astrometric_primary_flag', 'astrometric_weight_al', 'astrometric_pseudo_colour', 'astrometric_pseudo_colour_error', 'mean_varpi_factor_al', 'astrometric_matched_observations', 'visibility_periods_used', 'astrometric_sigma5d_max', 'frame_rotator_object_type', 'matched_observations', 'duplicated_source', 'phot_g_n_obs', 'phot_g_mean_flux', 'phot_g_mean

In [11]:
# Columns consulted by the threshold cuts in filter_row.
FILTER_COLUMNS = {'visibility_periods_used', 'astrometric_excess_noise', 'parallax_over_error', 'phot_g_mean_flux_over_error', 'phot_bp_mean_flux_over_error', 'phot_rp_mean_flux_over_error'}
# Columns kept in the final output rows.
DATA_COLUMNS = {'source_id', 'phot_g_mean_flux', 'phot_g_mean_mag', 'bp_rp', 'bp_g', 'g_rp'}
# Everything that must be present (non-None) for a row to be usable.
RELEVANT_COLUMNS = FILTER_COLUMNS.union(DATA_COLUMNS)

# Stage 1: project every fetched row onto the relevant columns.
relevant_data = [
    {column: source_row[column] for column in RELEVANT_COLUMNS}
    for source_row in all_data
]

def filter_row(row: dict[str, ...]) -> bool:
    """Return True only for rows that are complete and pass every threshold cut.

    A row is rejected when any relevant column is None, or when any of the six
    comparisons below fails.
    NOTE(review): the thresholds are hard-coded with no source given -- cite
    where these cuts come from.
    """
    for column in RELEVANT_COLUMNS:
        if row[column] is None:
            return False

    if not row['visibility_periods_used'] >= 9:
        return False
    if not row['astrometric_excess_noise'] < 1:
        return False
    if not row['parallax_over_error'] > 10:
        return False
    if not row['phot_g_mean_flux_over_error'] > 50:
        return False
    if not row['phot_bp_mean_flux_over_error'] > 20:
        return False
    return row['phot_rp_mean_flux_over_error'] > 20

# Stage 2: keep only the rows accepted by filter_row.
relevant_data = [candidate for candidate in relevant_data if filter_row(candidate)]
# Stage 3: drop the filter-only columns now that the cuts have been applied.
relevant_data = [
    {column: kept_row[column] for column in DATA_COLUMNS}
    for kept_row in relevant_data
]
pp.pprint(len(relevant_data))

9562
