In [None]:
import xml.etree.ElementTree as ET
from collections import Counter
from urllib.parse import quote

import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns

def count_arxiv_papers(url, timeout=30):
    """Return the total number of results the arXiv API reports for a query URL.

    The count is read from the OpenSearch ``totalResults`` element of the
    XML response; it is the number of matches for the query, not the number
    of entries contained in this particular response.

    :param url: Full query URL to request.
    :param timeout: Seconds to wait for the server before giving up.
    :return: The reported total as an int, or 0 when the element is missing
        or empty.
    :raises requests.HTTPError: If the server answers with an error status.
    """
    # Bounded wait so a stalled connection cannot hang the notebook cell forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error document as results.
    response.raise_for_status()

    # Parse the raw bytes so the XML parser honours the document's own
    # encoding declaration rather than a guessed text decoding.
    root = ET.fromstring(response.content)

    # totalResults lives in the OpenSearch namespace, not the Atom one.
    total_results_tag = root.find(".//{http://a9.com/-/spec/opensearch/1.1/}totalResults")

    if total_results_tag is None or not (total_results_tag.text or "").strip():
        return 0
    return int(total_results_tag.text)

#print(count_arxiv_papers("http://export.arxiv.org/api/query?search_query=abs:time-series"))

def scrape_page(url, timeout=30):
    """
    Fetches one page of arXiv API results and collects each entry's title and date.
    :param url: Full query URL for the page to fetch.
    :param timeout: Seconds to wait for the server before giving up.
    :return: Dictionary with two parallel lists, "titles" and "dates", holding
             one item per Atom <entry> in document order. Each date is the
             first 10 characters of the entry's <published> text.
    :raises requests.HTTPError: If the server answers with an error status.
    :raises AttributeError: If an entry has no <published> or <title> element.
    """
    atom_ns = "{http://www.w3.org/2005/Atom}"

    data = {"titles": [],
            #"authors": [],
            #"arXiv_subject_codes": [],
            "dates": []
            }

    # Bounded wait so a stalled connection cannot hang the notebook cell forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error document as results.
    response.raise_for_status()

    # Parse the raw bytes so the XML parser honours the document's own
    # encoding declaration rather than a guessed text decoding.
    root = ET.fromstring(response.content)

    # Entries are direct children of the feed root; keep both lists in step
    # by appending to them together for every entry.
    for entry in root.findall(atom_ns + "entry"):
        published_date = entry.find(atom_ns + "published").text
        title = entry.find(atom_ns + "title").text
        data["titles"].append(title)
        data["dates"].append(published_date[:10])  # leading 10 chars, e.g. "2020-01-28"

    return data


def merge_dictionaries(*dicts):
    """Combine dictionaries by adding up the values stored under each key.

    A key missing from one of the inputs contributes 0 for that input.
    Calling with no arguments yields an empty dictionary.
    """
    # Union of every key seen in any input.
    every_key = set().union(*(d.keys() for d in dicts))

    # One total per key, summed across all inputs.
    return {k: sum(d.get(k, 0) for d in dicts) for k in every_key}


def merge_page_dicts(*dicts):
    """Concatenate the per-key lists of several page dictionaries.

    The keys of the first dictionary decide which keys appear in the result;
    every input must provide each of those keys (a missing one raises
    KeyError). Items keep the order of the inputs. With no inputs, a
    dictionary of empty lists for "titles", "dates", "authors" and
    "arXiv_subject_codes" is returned.
    """
    if len(dicts) == 0:
        # NOTE(review): this empty result carries "authors" and
        # "arXiv_subject_codes", which scrape_page does not currently emit.
        return {"titles": [], "dates": [], "authors": [], "arXiv_subject_codes": []}

    combined = {}
    for field in dicts[0].keys():
        items = []
        for page in dicts:
            items.extend(page[field])
        combined[field] = items
    return combined


def paginate_search(keyword, num_hits, search_type="abs"):
    '''Build the list of page URLs needed to walk through every hit of a search.

    Pages are requested 2000 results at a time, starting at offset 0, and no
    page starts at or beyond offset 20,000. `search_type` is accepted but is
    not used by this function.
    '''
    page_size = 2000
    offset_cap = 20_000

    urls_list = []
    for offset in range(0, offset_cap, page_size):
        # Stop as soon as the offset is no longer below the reported hit count.
        if not (offset < num_hits):
            break
        urls_list.append(set_query_url(keyword, start=offset, max=page_size))

    return urls_list


def set_query_url(keyword, start=None, max=None, search_type="abs"):
    """Build an arXiv API query URL for a keyword search.

    :param keyword: Plain-text search term. It is percent-encoded before being
        placed in the URL, so characters such as ``&``, ``#`` or spaces cannot
        break or truncate the query string. ``+`` is left untouched so terms
        already joined with ``+`` keep working.
    :param start: Index of the first result to return. Must be given together
        with ``max``.
    :param max: Number of results to return. Must be given together with
        ``start``. (The name shadows the builtin but is kept for callers that
        pass it by keyword.)
    :param search_type: Field prefix placed before the keyword; defaults to
        ``"abs"``. Other prefixes are inserted unchanged.
    :return: The query URL as a string.
    :raises AssertionError: If exactly one of ``start`` / ``max`` is given.
    """
    # Raised explicitly rather than via `assert` so the check still runs when
    # Python is started with -O, while keeping the same exception type/message.
    if (start is None) != (max is None):
        raise AssertionError("If you pass a start value, you must pass a max value, and vice versa.")

    base_url = "http://export.arxiv.org/api/query?"
    encoded_keyword = quote(str(keyword), safe="+")
    search_term = f"search_query={search_type}:{encoded_keyword}"

    if start is None:
        # No paging requested: leave the server's defaults in effect.
        extension = ""
    else:
        extension = f"&start={start}&max_results={max}"

    return base_url + search_term + extension


def get_subfield_data(keywords):
    '''Collect arXiv search results for each keyword in a representative set.

    For every keyword this prints the counting query URL and the number of
    hits it reports, then scrapes every result page and merges the pages.
    Returns a dictionary mapping each keyword to its merged page dictionary
    (as produced by merge_page_dicts).
    '''
    results_by_keyword = {}

    for term in keywords:
        # First request only establishes how many hits there are.
        count_url = set_query_url(term)
        print(count_url)
        total_hits = count_arxiv_papers(count_url)
        print(total_hits)

        # Then fetch each page in order and fold them into one dictionary.
        pages = [scrape_page(page_url) for page_url in paginate_search(term, total_hits)]
        results_by_keyword[term] = merge_page_dicts(*pages)

    return results_by_keyword


2


In [None]:


# Scrape every page of results for two sample search terms.
dictionary = get_subfield_data(["time-series", "ARIMA"])
# NOTE(review): this prints every scraped title; the output can be very long.
print(dictionary)

# Turn each keyword's merged lists into a table and show it.
for key, value in dictionary.items():
    keyword_df = pd.DataFrame.from_dict(value)
    print(keyword_df)



http://export.arxiv.org/api/query?search_query=abs:time-series
2
http://export.arxiv.org/api/query?search_query=abs:ARIMA
401
{'time-series': {'titles': ['Dynamic clustering of time series data', 'The asteroseismic potential of CHEOPS'], 'dates': ['2020-01-28', '2018-11-05']}, 'ARIMA': {'titles': ['Wiman and Arima theorems for quasiregular mappings', 'Power Computations for Intervention Analysis', 'Using ARIMA to Predict the Expansion of Subscriber Data Consumption', 'Stock Price Correlation Coefficient Prediction with ARIMA-LSTM Hybrid\n  Model', 'Time Series Analysis and Forecasting of COVID-19 Cases Using LSTM and\n  ARIMA Models', 'Modeling Data Containing Outliers using ARIMA Additive Outlier\n  (ARIMA-AO)', 'Anomaly and Fraud Detection in Credit Card Transactions Using the ARIMA\n  Model', 'A q-Analogue of the embedding chain $U(6) \\supset G \\supset SO(3)$', 'The Role of Nuclear Physics in Understanding the Cosmos and the Origin\n  of Elements', 'Autoregressive Times Series Met

In [69]:
import re
# Load the search terms, one per line, with surrounding whitespace removed.
with open("arXiv_search_terms.txt", 'r', encoding='utf-8') as f:
    keywords = list(map(str.strip, f))

# Preview the entries at positions 2 through 16.
print(keywords[2:17])



# with open("arXiv_search_terms.txt", 'r', encoding='utf-8') as f:
#     text = f.read()
#     # This regex will capture sequences of letters, digits, underscores, colons, or hyphens.
#     # It does *not* convert everything to lowercase, so you keep the original case.
#     keywords = re.findall(r'[\w:-]+', text)
#     print(keywords)

# if __name__ == "__main__":
#     file_path = "survival_analysis.txt"  # Replace with your actual filename
#     keywords_list = file_to_keywords(file_path)
#     print(keywords_list)

['Survival Analysis', 'Time-to-Event', 'Cox Proportional Hazards', 'Kaplan-Meier', 'Accelerated Failure Time', 'Competing Risks', 'Frailty Models', 'Hazard Function', 'Censoring', 'Hazard Ratio', 'Parametric Survival Models', 'Cure Models', 'Left Truncation', 'Interval Censoring', 'Recurrent Events']
