In [277]:
import re
import json
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [123]:
wiki_domain = "en.wikipedia.org"

def send_wiki_request(url):
    resp = requests.get(f"https://{wiki_domain}/wiki/{url}")
    return resp

### Parse the list of telescopes available on Wikipedia

In [139]:
list_of_telescopes_url = "Lists_of_telescopes"

In [140]:
resp = send_wiki_request(list_of_telescopes_url)

In [141]:
soup = BeautifulSoup(resp.content, 'html.parser')

title = soup.find(id="firstHeading")
print(title.string)

Lists of telescopes


In [142]:
wiki_telescope_lists = []
for a in soup.find_all('a', href=True):
    link = a['href']
    if link.startswith("/wiki/List_of") and link.endswith("telescopes"):
        wiki_telescope_lists.append(link.split('/')[-1])

In [143]:
wiki_telescope_lists

['List_of_large_optical_telescopes',
 'List_of_largest_infrared_telescopes',
 'List_of_largest_optical_reflecting_telescopes',
 'List_of_largest_optical_refracting_telescopes',
 'List_of_optical_telescopes',
 'List_of_radio_telescopes',
 'List_of_solar_telescopes',
 'List_of_space_telescopes',
 'List_of_X-ray_space_telescopes',
 'List_of_optical_telescopes']

In [144]:
def parse_telescope_name(name):
    abbreviation = re.findall(r"(\(.*\))", name)
    if abbreviation:
        name = name.split(abbreviation[0])[0]
    name_itself = name.split('[')[0].split(',')[0]
    return name_itself.strip()

In [254]:
telescopes = []

for wiki_list in wiki_telescope_lists:
    print(f'Processing {wiki_list} page.')
    
    resp = send_wiki_request(wiki_list)
    soup = BeautifulSoup(resp.content, 'html.parser')
    
    found_tables = soup.find_all('table')
    pd_tables = pd.read_html(str(found_tables))
        
    for found_table in pd_tables:

        df = pd.DataFrame(found_table)
        df_cols = df.columns.tolist()
        
        name_cols = [col for col in df_cols if 'Name' in str(col)]
        if not name_cols or len(name_cols) > 1:
            continue
        else:  
            name_col = name_cols[0]
            df['name_parsed'] = df[name_col].apply(parse_telescope_name)
            telescopes.extend(df['name_parsed'].values.tolist())
            
    print(f"Processed {wiki_list} page, collected {len(telescopes)} telescopes in total")

Processing List_of_large_optical_telescopes page.
Processed List_of_large_optical_telescopes page, collected 197 telescopes in total
Processing List_of_largest_infrared_telescopes page.
Processed List_of_largest_infrared_telescopes page, collected 217 telescopes in total
Processing List_of_largest_optical_reflecting_telescopes page.
Processed List_of_largest_optical_reflecting_telescopes page, collected 271 telescopes in total
Processing List_of_largest_optical_refracting_telescopes page.
Processed List_of_largest_optical_refracting_telescopes page, collected 365 telescopes in total
Processing List_of_optical_telescopes page.
Processed List_of_optical_telescopes page, collected 365 telescopes in total
Processing List_of_radio_telescopes page.
Processed List_of_radio_telescopes page, collected 536 telescopes in total
Processing List_of_solar_telescopes page.
Processed List_of_solar_telescopes page, collected 590 telescopes in total
Processing List_of_space_telescopes page.
Processed Lis

In [255]:
len(set(telescopes))

717

### Parse the list of astronomical objects on Wikipedia (later referenced as 'sources')

In [247]:
list_of_objects_url = "Lists_of_astronomical_objects"

In [248]:
resp = send_wiki_request(list_of_objects_url)

In [249]:
soup = BeautifulSoup(resp.content, 'html.parser')

title = soup.find(id="firstHeading")
print(title.string)

Lists of astronomical objects


In [250]:
wiki_astro_obj_lists = []
for a in soup.find_all('a', href=True):
    link = a['href']
    if link.endswith("astronauts"):  # This is from a `See Also` list
        break
    if link.startswith("/wiki/List_of"):
        wiki_astro_obj_lists.append(link.split('/')[-1])

In [251]:
wiki_astro_obj_lists

['List_of_Solar_System_objects',
 'List_of_gravitationally_rounded_objects_of_the_Solar_System',
 'List_of_Solar_System_objects_most_distant_from_the_Sun',
 'List_of_Solar_System_objects_by_size',
 'List_of_natural_satellites',
 'List_of_meteor_showers',
 'List_of_minor_planets',
 'List_of_exceptional_asteroids',
 'List_of_minor_planet_moons',
 'List_of_damocloids',
 'List_of_centaurs_(small_Solar_System_bodies)',
 'List_of_trans-Neptunian_objects',
 'List_of_unnumbered_minor_planets',
 'List_of_possible_dwarf_planets',
 'List_of_largest_exoplanets',
 'List_of_brown_dwarfs',
 'List_of_nearest_stars_and_brown_dwarfs',
 'List_of_brightest_stars',
 'List_of_hottest_stars',
 'List_of_nearest_bright_stars',
 'List_of_most_luminous_stars',
 'List_of_most_massive_stars',
 'List_of_largest_known_stars',
 'List_of_smallest_stars',
 'List_of_oldest_stars',
 'List_of_stars_with_proplyds',
 'List_of_variable_stars',
 'List_of_semiregular_variable_stars',
 'List_of_stars_that_dim_oddly',
 'List_of_

In [261]:
def parse_source_name(name):
    if name and type(name)==str:
        abbreviation = re.findall(r"(\(.*\))", name)
        if abbreviation:
            name = name.split(abbreviation[0])[0]
        name_itself = name.split('[')[0].split(',')[0]
        return name_itself.strip()
    else:
        return ''

In [263]:
sources = []

for wiki_list in wiki_astro_obj_lists:
    print(f'Processing {wiki_list} page.')
    
    resp = send_wiki_request(wiki_list)
    soup = BeautifulSoup(resp.content, 'html.parser')
    
    found_tables = soup.find_all('table')
    pd_tables = pd.read_html(str(found_tables)) if found_tables else []
        
    for found_table in pd_tables:

        df = pd.DataFrame(found_table)
        df_cols = df.columns.tolist()
        
        name_cols = []
        for col in df_cols:
            if ('Name' in str(col) or 'Designation' in str(col) or 'Star' in str(col) 
                or 'binaries' in str(col) or 'binary systems' in str(col) or 'Burst' in str(col)
               or 'Constellation' in str(col) or 'Cluster' in str(col) or 'Galaxy' in str(col)):
                name_cols.append(col)
        
        if not name_cols or len(name_cols) > 1:
            continue
        else:  
            name_col = name_cols[0]
            df['name_parsed'] = df[name_col].apply(parse_source_name)
            sources.extend(df['name_parsed'].values.tolist())
            
    print(f"Processed {wiki_list} page, collected {len(sources)} sources overall")

Processing List_of_Solar_System_objects page.
Processed List_of_Solar_System_objects page, collected 0 sources overall
Processing List_of_gravitationally_rounded_objects_of_the_Solar_System page.
Processed List_of_gravitationally_rounded_objects_of_the_Solar_System page, collected 0 sources overall
Processing List_of_Solar_System_objects_most_distant_from_the_Sun page.
Processed List_of_Solar_System_objects_most_distant_from_the_Sun page, collected 0 sources overall
Processing List_of_Solar_System_objects_by_size page.
Processed List_of_Solar_System_objects_by_size page, collected 0 sources overall
Processing List_of_natural_satellites page.
Processed List_of_natural_satellites page, collected 238 sources overall
Processing List_of_meteor_showers page.
Processed List_of_meteor_showers page, collected 278 sources overall
Processing List_of_minor_planets page.
Processed List_of_minor_planets page, collected 278 sources overall
Processing List_of_exceptional_asteroids page.
Processed List

In [264]:
len(set(sources))

3070

### Save the values in jsonl spacy-friendly format

In [276]:
telescopes = list(set([t for t in telescopes if t]))
sources = list(set([s for s in sources if s]))

In [278]:
with open('patterns_found.jsonl', 'w') as f:
    for source in sources:
        json_line = {"label": "SOURCE", "pattern": [{"lower": source.lower()}]}
        f.write(f"{json.dumps(json_line)}\n")
        
    for telescope in telescopes:
        json_line = {"label": "TELESCOPE", "pattern": [{"lower": telescope.lower()}]}
        f.write(f"{json.dumps(json_line)}\n")