<a href="https://colab.research.google.com/github/vanchanr/arXiv_analytics/blob/main/arXiv_analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install xmltodict, which this notebook imports to parse the arXiv API's XML responses.
!pip3 install xmltodict

In [None]:
import xmltodict
import requests
import json
import time
import re

In [None]:
def cleanSpaces(s):
    """Flatten newlines and squeeze whitespace runs in a string.

    Every newline is turned into a space, and then each run of two or more
    whitespace characters is replaced by one space. A lone whitespace
    character (for example a single tab, or one leading/trailing space) is
    left untouched; the string is not stripped.

    Args:
        s: The text to normalise.

    Returns:
        The normalised text.
    """
    flattened = ' '.join(s.split('\n'))
    return re.sub(r'\s\s+', ' ', flattened)

def createPaper(paper):
    """Convert one parsed arXiv feed entry into a flat paper record.

    The input dict is not modified.

    Args:
        paper: A dict for a single feed entry, as produced by xmltodict. The
            keys read are 'title', 'summary', 'updated', 'id', 'author'
            (item(s) with a 'name' key) and 'category' (item(s) with an
            '@term' key).

    Returns:
        A dict with the keys 'title', 'author' (list of names), 'abstract',
        'lastUpdated', 'category' (list of terms) and 'url'.

    Raises:
        KeyError: If 'title', 'summary', 'updated' or 'id' is missing.
    """
    def asList(value):
        # A field that occurs once arrives as a single item rather than a
        # one-element list; a missing or empty field becomes an empty list.
        if value is None:
            return []
        return value if isinstance(value, list) else [value]

    # Normalise into locals instead of writing back into the caller's dict.
    authors = asList(paper.get('author'))
    categories = asList(paper.get('category'))
    return {
        'title': cleanSpaces(paper['title']),
        'author': [author['name'] for author in authors],
        'abstract': cleanSpaces(paper['summary']),
        'lastUpdated': paper['updated'],
        'category': [category['@term'] for category in categories],
        'url': paper['id']
    }

In [None]:
# Query the arXiv API for papers matching a keyword, page through the results
# in batches, and dump the simplified paper records to '<keyword>.json'.
url = 'http://export.arxiv.org/api/query'

pageSize = 200              # papers requested per API call
maxBatches = 10             # upper bound on API calls (at most pageSize * maxBatches papers)
requestDelaySeconds = 3     # pause between calls; the arXiv API manual asks for about 3 seconds
requestTimeoutSeconds = 30  # do not hang forever on a stalled connection

keyword = input("Enter the keyword to search papers: ")
queryParams = {
    'search_query': 'all:{}'.format(keyword),
    'start': 0,
    'max_results': pageSize
}

outLs = []
batch = 1
while batch <= maxBatches:
    try:
        resp = requests.get(url, params=queryParams, timeout=requestTimeoutSeconds)
        resp.raise_for_status()
        feed = xmltodict.parse(resp.text)['feed']
        if 'entry' not in feed:
            # No entries in this page: nothing more to fetch.
            break
        entries = feed['entry']
        # xmltodict returns a single dict (not a list) when the feed holds
        # exactly one entry; iterating that dict would walk its keys.
        if not isinstance(entries, list):
            entries = [entries]
        for entry in entries:
            outLs.append(createPaper(entry))
        print("Batch-{}: processed {} papers".format(batch, len(entries)))
    except Exception as err:
        # Best effort: report the failure and move on to the next batch.
        # Catching Exception (not a bare except) keeps Ctrl-C working.
        print("Error while processing batch-{}: {!r}".format(batch, err))
    batch += 1
    queryParams['start'] += pageSize
    time.sleep(requestDelaySeconds)
print("Total processed papers:", len(outLs))
try:
    # ensure_ascii=False keeps non-ASCII text readable, so write as UTF-8.
    with open('{}.json'.format(keyword), mode='w', encoding='utf-8') as jsonFile:
        json.dump(outLs, jsonFile, indent=2, ensure_ascii=False)
    print("Successfully created the json dump file !")
except Exception as err:
    print("Error while creating json dump file: {!r}".format(err))

In [None]:
# List the working directory, e.g. to confirm that the '<keyword>.json' dump was created.
!ls

In [None]:
import json
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

In [None]:
def plotHelper(plotBy, countData):
    """Show a bar chart of counts with the value written above each bar.

    Args:
        plotBy: Name of the grouping; used in the title and as the x-axis
            label.
        countData: Dict mapping each bar's label to its count. Bars are drawn
            in the dict's iteration order.
    """
    plt.bar(countData.keys(), countData.values())
    # Annotate every bar with its count, centred just above the bar top.
    for label, count in countData.items():
        plt.text(label, count, str(count), va='bottom', ha='center')
    plt.xlabel(plotBy)
    plt.ylabel('Count')
    plt.title('Number of Papers by {}'.format(plotBy))
    plt.xticks(rotation=30)
    plt.show()

def plotYearWise(jsonDump):
    """Plot the number of papers per year of their 'lastUpdated' timestamp.

    Args:
        jsonDump: Iterable of paper dicts, each with a 'lastUpdated' string
            in the form 'YYYY-MM-DDTHH:MM:SSZ'.

    Raises:
        ValueError: If a 'lastUpdated' value does not match that format.
    """
    timestampFormat = '%Y-%m-%dT%H:%M:%SZ'
    countsByYear = {}
    for paper in jsonDump:
        updatedAt = datetime.strptime(paper['lastUpdated'], timestampFormat)
        yearLabel = str(updatedAt.year)
        countsByYear[yearLabel] = countsByYear.get(yearLabel, 0) + 1

    # Rebuild the dict in ascending year order so the bars read left to right.
    orderedCounts = {yearLabel: countsByYear[yearLabel] for yearLabel in sorted(countsByYear)}
    plotHelper('year', orderedCounts)

# field: 'author' or 'category'
# plots the topN (default 10) most frequent values
def plotFieldWise(jsonDump, field, topN=10):
    """Plot the most frequent values of a list-valued paper field.

    Args:
        jsonDump: Iterable of paper dicts; paper[field] must be a list of
            hashable values (e.g. author names or category terms).
        field: Either 'author' or 'category'. Any other value makes the
            function return without plotting.
        topN: How many of the most frequent values to plot. Defaults to 10;
            pass None to plot every value.
    """
    if field not in ('author', 'category'):
        return

    counts = {}
    for paper in jsonDump:
        for name in paper[field]:
            counts[name] = counts.get(name, 0) + 1

    # The sort is stable, so values with equal counts keep first-seen order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    plotHelper(field, dict(ranked[:topN]))

In [None]:
# Load a previously saved paper dump ('<name>.json') for plotting.
researchArea = input("Enter the filename without '.json': ")
jsonDump = []
# Read as UTF-8: the dump is written with encoding='utf-8' and
# ensure_ascii=False, so the platform default encoding may not decode it.
with open('{}.json'.format(researchArea), mode='r', encoding='utf-8') as jsonFile:
    jsonDump = json.load(jsonFile)

In [None]:
# Bar chart of the number of loaded papers per last-updated year.
plotYearWise(jsonDump)

In [None]:
# Bar chart of the 10 authors that appear on the most loaded papers.
plotFieldWise(jsonDump, 'author')

In [None]:
# Bar chart of the 10 categories that appear on the most loaded papers.
plotFieldWise(jsonDump, 'category')