In [3]:
import json

def count_json_rows(file_path):
    rows = []
    with open(file_path, 'r') as f:
        for line in f:
            try:
                data = json.loads(line)
                rows.append(data)
            except json.JSONDecodeError:
                continue
    return len(rows)

In [4]:
rows = count_json_rows('/Users/tayjohnny/Documents/My_MTECH/PLP/arvix/arxiv-metadata-oai-snapshot.json')
print(f'There are {rows} rows in the JSON file.')

There are 2535068 rows in the JSON file.


In [8]:
import json

def get_created_range(file_path):
    created_dates = []
    with open(file_path, 'r') as f:
        for line in f:
            try:
                data = json.loads(line)
                created_dates.extend([version['created'] for version in data.get('versions', [])])
            except json.JSONDecodeError:
                continue
    return min(created_dates), max(created_dates)

In [10]:
min_date, max_date = get_created_range('/Users/tayjohnny/Documents/My_MTECH/PLP/arvix/arxiv-metadata-oai-snapshot.json')
print(f'The range of the "created" field is from {min_date} to {max_date}.')

The range of the "created" field is from Fri, 1 Apr 1994 00:00:00 GMT to Wed, 9 Sep 2020 23:58:23 GMT.


In [2]:
import urllib.request as libreq
with libreq.urlopen('http://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=1') as url:
    r = url.read()
print(r)

b'<?xml version="1.0" encoding="UTF-8"?>\n<feed xmlns="http://www.w3.org/2005/Atom">\n  <link href="http://arxiv.org/api/query?search_query%3Dall%3Aelectron%26id_list%3D%26start%3D0%26max_results%3D1" rel="self" type="application/atom+xml"/>\n  <title type="html">ArXiv Query: search_query=all:electron&amp;id_list=&amp;start=0&amp;max_results=1</title>\n  <id>http://arxiv.org/api/cHxbiOdZaP56ODnBPIenZhzg5f8</id>\n  <updated>2024-08-08T00:00:00-04:00</updated>\n  <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">213192</opensearch:totalResults>\n  <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>\n  <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">1</opensearch:itemsPerPage>\n  <entry>\n    <id>http://arxiv.org/abs/cond-mat/0102536v1</id>\n    <updated>2001-02-28T20:12:09Z</updated>\n    <published>2001-02-28T20:12:09Z</published>\n    <title>Impact of Electron-Electron C

In [4]:
import urllib, urllib.request
url = 'http://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=5'
data = urllib.request.urlopen(url)
print(data.read().decode('utf-8'))

<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <link href="http://arxiv.org/api/query?search_query%3Dall%3Aelectron%26id_list%3D%26start%3D0%26max_results%3D5" rel="self" type="application/atom+xml"/>
  <title type="html">ArXiv Query: search_query=all:electron&amp;id_list=&amp;start=0&amp;max_results=5</title>
  <id>http://arxiv.org/api//zp8MUTg8VZFGJrhBDklDEFp5ug</id>
  <updated>2024-08-08T00:00:00-04:00</updated>
  <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">213192</opensearch:totalResults>
  <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>
  <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">5</opensearch:itemsPerPage>
  <entry>
    <id>http://arxiv.org/abs/cond-mat/0102536v1</id>
    <updated>2001-02-28T20:12:09Z</updated>
    <published>2001-02-28T20:12:09Z</published>
    <title>Impact of Electron-Electron Cusp on Configur

In [20]:
import xml.etree.ElementTree as ET
import pandas as pd

url = 'http://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=10000&sortBy=lastUpdatedDate&sortOrder=descending'

# http://export.arxiv.org/api/query?search_query=ti:"electron thermal conductivity"&sortBy=lastUpdatedDate&sortOrder=ascending


response = urllib.request.urlopen(url)
data = response.read().decode('utf-8')

root = ET.fromstring(data)
entries = root.findall('.//{http://www.w3.org/2005/Atom}entry')
data = []
for entry in entries:
    title = entry.find('.//{http://www.w3.org/2005/Atom}title').text
    summary = entry.find('.//{http://www.w3.org/2005/Atom}summary').text
    published = entry.find('.//{http://www.w3.org/2005/Atom}published').text
    category = entry.find('.//{http://www.w3.org/2005/Atom}category').attrib.get('term')
    data.append({'Title': title, 'Summary': summary, 'Published': published, 'Category': category})


df = pd.DataFrame(data)

# Convert the 'Published' column to datetime format
df['Published'] = pd.to_datetime(df['Published'])

# Sort the DataFrame by 'Published' date in descending order (latest on top)
df = df.sort_values(by='Published', ascending=False)

display(df)

Unnamed: 0,Title,Summary,Published,Category
0,Manipulable compact many-body localization and...,Geometric frustration is known to completely...,2024-08-07 17:59:06+00:00,cond-mat.str-el
2,Superfluid quantum criticality and the thermal...,The neutron star starts to cool down shortly...,2024-08-07 17:46:49+00:00,nucl-th
3,Enhanced Cooper Pairing via Random Matrix Phon...,There is rich experimental evidence that gra...,2024-08-07 17:36:57+00:00,cond-mat.supr-con
4,Role of time-frequency correlations in two-pho...,Excitation energy transfer is a photophysica...,2024-08-07 17:00:23+00:00,quant-ph
6,Capturing Nonlinear Electron Dynamics with Ful...,Attosecond X-ray pulses are the key to study...,2024-08-07 15:57:51+00:00,physics.optics
...,...,...,...,...
1357,Hidden-anisotropy-induced $π$ phase shift in a...,Laser-induced magnetization precession of an...,2013-08-05 13:26:16+00:00,cond-mat.mes-hall
2117,Development of Hydrogen Bonding Magnetic React...,The proton-magnetic reaction is commonly use...,2012-10-26 10:25:03+00:00,q-bio.OT
6364,Vacuum Potentials for the Two Only Permanent F...,"The two only species of isolatable, smallest...",2011-11-14 07:49:13+00:00,physics.gen-ph
1994,Wave-Particle Duality in the Negative Informat...,Quantum theory reveals astonishing and count...,2007-01-22 20:05:32+00:00,quant-ph


In [21]:
df[df['Category'].str.contains('AI')]



Unnamed: 0,Title,Summary,Published,Category
202,Integrating Large Language Models and Knowledg...,"Aerospace manufacturing companies, such as T...",2024-08-03 07:42:53+00:00,cs.AI
1510,Predicting Heart Failure with Attention Learni...,Cardiovascular diseases (CVDs) encompass a g...,2024-07-11 08:33:42+00:00,cs.AI
1639,Cue Point Estimation using Object Detection,Cue points indicate possible temporal bounda...,2024-07-09 12:56:30+00:00,cs.AI
1644,iASiS: Towards Heterogeneous Big Data Analysis...,The vision of IASIS project is to turn the w...,2024-07-09 10:52:19+00:00,cs.AI
2015,ACR: A Benchmark for Automatic Cohort Retrieval,Identifying patient cohorts is fundamental t...,2024-06-20 23:04:06+00:00,cs.AI
2573,A Large Language Model Outperforms Other Compu...,"High-throughput phenotyping, the automated m...",2024-06-20 22:05:34+00:00,cs.AI
2733,Automatic generation of insights from workers'...,New technologies such as Machine Learning (M...,2024-06-18 15:55:11+00:00,cs.AI
3784,Leveraging Open-Source Large Language Models f...,Social Determinants of Health (SDOH) play a ...,2024-05-30 02:33:28+00:00,cs.AI
3818,Learning from Litigation: Graphs and LLMs for ...,Electronic Discovery (eDiscovery) involves i...,2024-05-29 15:08:55+00:00,cs.AI
4014,Augmented Risk Prediction for the Onset of Alz...,Alzheimer's disease (AD) is the fifth-leadin...,2024-05-26 03:05:10+00:00,cs.AI


In [22]:
df[df['Category'].str.contains('AI')][df['Summary'].str.contains('security')]


  df[df['Category'].str.contains('AI')][df['Summary'].str.contains('security')]


Unnamed: 0,Title,Summary,Published,Category
3784,Leveraging Open-Source Large Language Models f...,Social Determinants of Health (SDOH) play a ...,2024-05-30 02:33:28+00:00,cs.AI
7438,MKF-ADS: Multi-Knowledge Fusion Based Self-sup...,Control Area Network (CAN) is an essential c...,2024-03-07 07:40:53+00:00,cs.AI


In [23]:
df[(df['Category'].str.contains('cs')) & (~df['Category'].str.contains('physics'))]


Unnamed: 0,Title,Summary,Published,Category
11,MaxMind: A Memory Loop Network to Enhance Soft...,The application of large language models to ...,2024-08-07 15:27:22+00:00,cs.SE
36,Improving the quality of Persian clinical text...,Background: The accuracy of spelling in Elec...,2024-08-07 08:31:42+00:00,cs.CL
45,Automatic identification of the area covered b...,The acorn is the fruit of the oak and is an ...,2024-08-07 04:42:10+00:00,cs.CV
58,HeTraX: Energy Efficient 3D Heterogeneous Many...,Transformers have revolutionized deep learni...,2024-08-06 18:48:01+00:00,cs.AR
68,Potential and Limitation of High-Frequency Cor...,This paper explores the potential of cryogen...,2024-08-06 17:16:19+00:00,cs.AR
...,...,...,...,...
1100,Evaluation of Rounding Functions in Nearest-Ne...,A novel evaluation study of the most appropr...,2020-03-15 18:17:36+00:00,cs.CV
4631,FDive: Learning Relevance Models using Pattern...,The detection of interesting patterns in lar...,2019-07-29 15:37:43+00:00,cs.LG
9744,Model-less Active Compliance for Continuum Rob...,Endowing continuum robots with compliance wh...,2019-02-24 13:32:13+00:00,cs.RO
4053,Confidence Trigger Detection: Accelerating Rea...,Real-time object tracking necessitates a del...,2019-02-02 01:52:53+00:00,cs.CV
