# EDA

### Setup

In [7]:
from urllib.parse import urlencode, quote_plus, unquote, parse_qs, parse_qsl, unquote_plus
import pandas as pd
import re
import requests
import time
import json
from tqdm import tqdm

In [8]:
token="mgczpeQ1Yprfi6uPrYBnmlC3g41nCTNEcGsYQbGm"

# 1️⃣ Examples

### Example strings

In [9]:
encoded_examples = [
    "filter_database_fq_database=AND&filter_database_fq_database=database%3A%22physics%22&fq=%7B!type%3Daqp%20v%3D%24fq_database%7D&fq_database=(database%3A%22physics%22)&q=%2Bauthor%3A%22%5ERoman%22%20%2Byear%3A2021&sort=date%20desc%2C%20bibcode%20desc&p_=0",
    "q=%20author%3A%22Roman%22%20year%3A2011&sort=date%20desc%2C%20bibcode%20desc&p_=0",
    "q=%2Bauthor%3A%22%5ERoman%22%20%2Byear%3A2021&sort=date%20desc%2C%20bibcode%20desc&p_=0",
    "q=author:\"Gardiner,+Emiko+C.\"&fl=id&start=0&sort=date+desc,bibcode+desc&rows=10&wt=json&p_=0",
    "q=full:\"substructures\"++full:\"protoplanetary+disk\"+abs:\"accretion+rate\""
]

# unquote example
for ex in encoded_examples:
    print(unquote(ex))

filter_database_fq_database=AND&filter_database_fq_database=database:"physics"&fq={!type=aqp v=$fq_database}&fq_database=(database:"physics")&q=+author:"^Roman" +year:2021&sort=date desc, bibcode desc&p_=0
q= author:"Roman" year:2011&sort=date desc, bibcode desc&p_=0
q=+author:"^Roman" +year:2021&sort=date desc, bibcode desc&p_=0
q=author:"Gardiner,+Emiko+C."&fl=id&start=0&sort=date+desc,bibcode+desc&rows=10&wt=json&p_=0
q=full:"substructures"++full:"protoplanetary+disk"+abs:"accretion+rate"


#### Make API call using example

### 3 Examples

|   SOURCE          |  QUERY                                                                                |
|-------------------|---------------------------------------------------------------------------------------|
| website url       | q=%20author%3A%22Starck%2C%20Jean-Luc%22&sort=date%20desc%2C%20bibcode%20desc&p_=0    |
| colab notebook    | q=author%3AStarck%2C+Jean-Luc                                                         |


# 2️⃣ Parse

### Get queries

#### Approach 1: Naive Query Extraction

**Note**: This code does not handle bigqueries correctly currently, or other complicated queries. I need to get some help.

It also ignores non-escaped '+' and '-' characters. So `+author:x` is treated as `author:x`

In [10]:
def extract_queries_basic(path):
    # Read the entire file
    with open(path, 'r') as f:
        content = f.read()

    # Use regex to find all the api requests that start with 'q='
    matches = re.findall(r'q=.*?&', content)
    queries = [match[2:-1] for match in matches]

    # Convert to dataframe
    df = pd.DataFrame(queries, columns=['query'])

    return df

#### Approach 2: More Complex Query Extraction

##### Constrains on what gets extracted

In [11]:
def clear_big_query(q):
    if '__clearBigQuery' in q:
        return bool(q['__clearBigQuery'][0])
    return True

def gt_1_row(q):
    if 'rows' in q:
        return int(q['rows'][0]) > 1
    return True

def not_big_query(q):
    return q['q'][0] != '*:*'

def not_identifier(q):
    return 'identifier' not in q['q'][0]

def not_bibcode(q):
    return 'bibcode' not in q['q'][0]

def not_doi(q):
    return 'doi' not in q['q'][0]

def long_fl(q):
    """Long fl parameter can be used to identify machine-generated queries"""
    return NotImplementedError

def not_all(q):
    return q['q'][0] != '*'

##### Extract queries using constraints

In [14]:
def extract_queries(path: str, constraints: list[callable]) -> list[dict]:
    queries = []

    with open(path, 'r') as f:
        for line in f.readlines():
            if line[:2] != 'q=':
                continue

            parsed = parse_qs(line.strip())

            skip = False
            for constraint in constraints:
                if not constraint(parsed):
                    skip=True
                    break
            if skip:
                continue
            
            # print(parsed)
            queries.append(parsed)

    q_params = [query['q'][0] for query in queries]
    df = pd.DataFrame(q_params, columns=['query']).drop_duplicates()
    return df
 

In [15]:
base_path = "../../data/queries/"
constraints = [clear_big_query, gt_1_row, not_big_query, not_identifier, not_bibcode, not_doi, not_all]

df = extract_queries(base_path + "searches.txt", constraints=constraints)
df['query'].to_csv(base_path + "queries.csv")
print(len(df))
df.head(20)

402


Unnamed: 0,query
0,ngc 1502
1,"abs:""Hercules"""
4,"title:""Hercules"""
7,"author:""^Ogilvie"" year:2004 property:refereed"
9,"author:""Gardiner, Emiko C."""
11,abs:(HD 183143 catalogue dibs)
12,"author:""^Roman"" year:2021"
16,"author:(""tsujimoto,t."")"
22,"R.V. Wagoner et al., Astrophys. J. 148, 3 (1967)"
24,"title:""Hercules"" abs:""dwarf"""


#### Validate Query Strings

Try to make requests to the API using a random sample of these query strings. The status is stored in the response

In [16]:
def validate_queries(df: pd.DataFrame):
    df = df.copy()
    results = []
    for _, q in tqdm(df['query'].items()):
        query = {"q": f"{q}"}
        encoded_query = urlencode(query)
        result = requests.get("https://api.adsabs.harvard.edu/v1/search/query?{}".format(encoded_query), \
                        headers={'Authorization': 'Bearer ' + token})
        results.append(result)
        time.sleep(0.1)
    
    df['results_json'] = [r.json() for r in results]
    return df

In [17]:
df_validated = validate_queries(df)
df_validated.head(5)

Unnamed: 0,query,results_json
0,ngc 1502,"{'responseHeader': {'status': 0, 'QTime': 304,..."
1,"abs:""Hercules""","{'responseHeader': {'status': 0, 'QTime': 173,..."
4,"title:""Hercules""","{'responseHeader': {'status': 0, 'QTime': 17, ..."
7,"author:""^Ogilvie"" year:2004 property:refereed","{'responseHeader': {'status': 0, 'QTime': 52, ..."
9,"author:""Gardiner, Emiko C.""","{'responseHeader': {'status': 0, 'QTime': 64, ..."


Extract information from the responses and add these to the dataframe

In [19]:
def extract_status(result):
    return result['responseHeader']['status']

def extract_num_found(result):
    try:
        return int(result['response']['numFound'])
    except KeyError:
        return None
    
df_validated['status'] = df_validated['results_json'].apply(extract_status)
df_validated['n_results_found'] = df_validated['results_json'].apply(extract_num_found)


df_validated.head(5)

Unnamed: 0,query,results_json,status,n_results_found
0,ngc 1502,"{'responseHeader': {'status': 0, 'QTime': 304,...",0,67.0
1,"abs:""Hercules""","{'responseHeader': {'status': 0, 'QTime': 173,...",0,20079.0
4,"title:""Hercules""","{'responseHeader': {'status': 0, 'QTime': 17, ...",0,5210.0
7,"author:""^Ogilvie"" year:2004 property:refereed","{'responseHeader': {'status': 0, 'QTime': 52, ...",0,1.0
9,"author:""Gardiner, Emiko C.""","{'responseHeader': {'status': 0, 'QTime': 64, ...",0,11.0


In [20]:
# check for any failed requests
df_validated[df_validated['status'] != 0]

Unnamed: 0,query,results_json,status,n_results_found
372,"bibstem:""MNRAS"", fulltext:""TESS""","{'responseHeader': {'status': 400, 'QTime': 8,...",400,
933,"author:(^""dotto"") abs:(dart)","{'responseHeader': {'status': 400, 'QTime': 2,...",400,


Save results to a file

In [22]:
# Rearrange and save to file
df_validated = df_validated[['status', 'n_results_found', 'query', 'results_json']].reset_index(drop=True)
df_validated.to_csv(base_path + "parsed_queries.csv", index=True)

#### ChatGPT

In [52]:
# Status: not working
qs1 = """q=pubdate:[2020-01+TO+9999-12]+author:("ellis,+g")&fl=identifier,[citations],abstract,author,book_author,orcid_pub,orcid_user,orcid_other,bibcode,citation_count,comment,doi,id,keyword,page,property,pub,pub_raw,pubdate,pubnote,read_count,title,volume,links_data,esources,data,citation_count_norm,email,doctype&fq_database=database:+astronomy&start=0&__fq_database=AND&__fq_database=astronomy&fq={!type%3Daqp+v%3D$fq_database
"""

qs2 = '''q=author:"Manzo-Martínez,+Ezequiel"&stats=true&fl=id&start=0&sort=date+desc,bibcode+desc&rows=10&wt=json&p_=0&stats.field=citation_count
'''

qs3 = '''q=++author:"^Roman"++year:2021&filter_database_fq_database=AND&filter_database_fq_database=database:"astronomy"&fl=identifier,[citations],abstract,author,book_author,orcid_pub,orcid_user,orcid_other,bibcode,citation_count,comment,doi,id,keyword,page,property,pub,pub_raw,pubdate,pubnote,read_count,title,volume,links_data,esources,data,citation_count_norm,email,doctype&fq_database=(database:"astronomy")&start=0&fq={!type%3Daqp+v%3D$fq_database
'''

In [55]:
# First, let's assume your query string is part of a full URL, as it typically would be in an actual HTTP request.
# If you have just the query string, you could skip the urlparse step.

# full_url = 'http://example.com?' + query_string  # Uncomment this line if you're working with a full URL.
# parsed_url = urlparse(full_url)  # Uncomment this line if you're working with a full URL.
# query_string = parsed_url.query  # Uncomment this line if you're working with a full URL.
def reconstruct_params_dict(query_string):
    # Parse the query string into a dictionary, where each key corresponds to a parameter, and each value is a list of values for that parameter.
    params = parse_qs(query_string)

    # Now, `params` is a dictionary where the values are lists. If you know each key only has one value, you might want to simplify this.
    simple_params = {k: v[0] for k, v in params.items()}

    # The query string may have URL encoding, so we should decode it to get the original characters.
    for key in simple_params:
        simple_params[key] = unquote_plus(simple_params[key])

    # If you need to convert the '+' back to spaces, particularly for the 'q' parameter, you can do so.
    if 'q' in simple_params:
        simple_params['q'] = simple_params['q'].replace('+', ' ')

    return simple_params

In [56]:
sp = reconstruct_params_dict(qs3)
sp

{'q': '  author:"^Roman"  year:2021',
 'filter_database_fq_database': 'AND',
 'fl': 'identifier,[citations],abstract,author,book_author,orcid_pub,orcid_user,orcid_other,bibcode,citation_count,comment,doi,id,keyword,page,property,pub,pub_raw,pubdate,pubnote,read_count,title,volume,links_data,esources,data,citation_count_norm,email,doctype',
 'fq_database': '(database:"astronomy")',
 'start': '0',
 'fq': '{!type=aqp v=$fq_database\n'}

##### API request using dictionary constructied from logged query string

Now actually try to do the search

In [57]:
encoded_query = urlencode(simple_params)
results = requests.get("https://api.adsabs.harvard.edu/v1/search/query?{}".format(encoded_query), \
                       headers={'Authorization': 'Bearer ' + token})

# format the response in a nicely readable format
results.json()

{'responseHeader': {'status': 400,
  'QTime': 6,
  'params': {'q': '  author:"^Roman"  year:2021',
   'filter_database_fq_database': 'AND',
   'fl': 'identifier,[citations],abstract,author,book_author,orcid_pub,orcid_user,orcid_other,bibcode,citation_count,comment,doi,id,keyword,page,property,pub,pub_raw,pubdate,pubnote,read_count,title,volume,links_data,esources,data,citation_count_norm,email,doctype',
   'fq_database': '(database:"astronomy")',
   'start': '0',
   'internal_logging_params': 'X-Amzn-Trace-Id=Root=1-652daf01-3478e6cc0e29e9351caf05d2',
   'fq': '{!type=aqp v=$fq_database\n',
   'rows': '10',
   'wt': 'json'}},
 'error': {'metadata': ['error-class',
   'org.apache.solr.common.SolrException',
   'root-error-class',
   'org.apache.solr.search.SyntaxError'],
  'msg': "org.apache.solr.search.SyntaxError: Expected identifier at pos 26 str='{!type=aqp v=$fq_database\n'",
  'code': 400}}

#### More general parsing

In [None]:
def extract_queries(path):
    # open file and use regex to extract all api requests

    # parse the query strings with regex

    # ignore everything except for the query, `q`

    # convert to dataframe

    pass

In [None]:
query_pattern = r'q=.*?&wt=json'

def extract_queries(path, pattern):
    # Read the entire file
    with open(path, 'r') as f:
        content = f.read()

    # Use regex to find all the api requests that start with 'q='
    matches = re.findall(pattern, content)

    # Parse the query strings with parse_qs and extract the 'q' parameter
    queries = [parse_qs(match)['q'][0].split() for match in matches]

    # Convert to dataframe
    df = pd.DataFrame(queries, columns=['query'])

    return df

def count_query_fields(df):
    def parse_query_str(s : str) -> list[str]:
        pass