In [1]:
import pandas as pd

### Load the Json Data in Chunks

In [24]:
# Load the JSON Data in Chunks
df = pd.read_json(
    '..\\archive\\arxiv-metadata-oai-snapshot.json',
    lines=True,
    chunksize=1000
)

df

<pandas.io.json._json.JsonReader at 0x28d4db7c7d0>

In [25]:
# First row in First Chunk
for chunk in df:
    print(chunk.iloc[1])
    break

id                                                         704.0002
submitter                                              Louis Theran
authors                             Ileana Streinu and Louis Theran
title                      Sparsity-certifying Graph Decompositions
comments                      To appear in Graphs and Combinatorics
journal-ref                                                    None
doi                                                            None
report-no                                                      None
categories                                            math.CO cs.CG
license           http://arxiv.org/licenses/nonexclusive-distrib...
abstract            We describe a new algorithm, the $(k,\ell)$-...
versions          [{'version': 'v1', 'created': 'Sat, 31 Mar 200...
update_date                                              2008-12-13
authors_parsed             [[Streinu, Ileana, ], [Theran, Louis, ]]
Name: 1, dtype: object


### Filtering Function

In [28]:
# function to filter NLP Papers
def check_category(category):
    return 'cs.CL' in category

### Filter all chunks and Concat them for Filtered Dataset

In [27]:
# Filter Each chunk
chunks = []
for chunk in df:
    filtered = chunk.loc[chunk['categories'].apply(check_category), ['id', 'doi', 'title', 'abstract', 'categories', 'update_date']]
    chunks.append(filtered)

len(chunks)

2661

In [29]:
df = pd.concat(chunks)
df

Unnamed: 0,id,doi,title,abstract,categories,update_date
2082,704.2083,,Introduction to Arabic Speech Recognition Usin...,In this paper Arabic was investigated from t...,cs.CL cs.AI,2007-05-23
2200,704.2201,,Arabic Speech Recognition System using CMU-Sph...,In this paper we present the creation of an ...,cs.CL cs.AI,2007-05-23
3661,704.3662,,An Automated Evaluation Metric for Chinese Tex...,"In this paper, we propose an automated evalu...",cs.HC cs.CL,2013-10-29
3664,704.3665,,On the Development of Text Input Method - Less...,Intelligent Input Methods (IM) are essential...,cs.CL cs.HC,2007-05-23
3707,704.3708,,Network statistics on early English Syntax: St...,This paper includes a reflection on the role...,cs.CL,2007-05-23
...,...,...,...,...,...,...
2414645,cs/9912009,10.1109/TAI.1996.560480,Deduction over Mixed-Level Logic Representatio...,A system is described that uses a mixed-leve...,cs.CL,2016-11-15
2414652,cs/9912016,,HMM Specialization with Selective Lexicalization,We present a technique which complements Hid...,cs.CL cs.LG,2007-05-23
2414653,cs/9912017,,Mixed-Level Knowledge Representation and Varia...,A system is described that uses a mixed-leve...,cs.CL,2007-05-23
2624826,physics/0307117,10.1103/PhysRevE.68.061107,Symbolic stochastic dynamical systems viewed a...,A theory of systems with long-range correlat...,physics.data-an cond-mat.stat-mech cs.CL math-...,2016-09-08


### Save Filtered Data to CSV

In [36]:
df.to_csv('filtered_data.csv')

# Store Filtered Data Efficiently
df.to_feather('filtered_data.feather')

In [37]:
df = pd.read_csv('filtered_data.csv', index_col=[0], low_memory=False)

print(df.shape)
df.head()

(77117, 6)


Unnamed: 0,id,doi,title,abstract,categories,update_date
2082,704.2083,,Introduction to Arabic Speech Recognition Usin...,In this paper Arabic was investigated from t...,cs.CL cs.AI,2007-05-23
2200,704.2201,,Arabic Speech Recognition System using CMU-Sph...,In this paper we present the creation of an ...,cs.CL cs.AI,2007-05-23
3661,704.3662,,An Automated Evaluation Metric for Chinese Tex...,"In this paper, we propose an automated evalu...",cs.HC cs.CL,2013-10-29
3664,704.3665,,On the Development of Text Input Method - Less...,Intelligent Input Methods (IM) are essential...,cs.CL cs.HC,2007-05-23
3707,704.3708,,Network statistics on early English Syntax: St...,This paper includes a reflection on the role...,cs.CL,2007-05-23


In [2]:
df = pd.read_feather('filtered_data.feather')

print(df.shape)
df.head()

(77117, 6)


Unnamed: 0,id,doi,title,abstract,categories,update_date
2082,704.2083,,Introduction to Arabic Speech Recognition Usin...,In this paper Arabic was investigated from t...,cs.CL cs.AI,2007-05-23
2200,704.2201,,Arabic Speech Recognition System using CMU-Sph...,In this paper we present the creation of an ...,cs.CL cs.AI,2007-05-23
3661,704.3662,,An Automated Evaluation Metric for Chinese Tex...,"In this paper, we propose an automated evalu...",cs.HC cs.CL,2013-10-29
3664,704.3665,,On the Development of Text Input Method - Less...,Intelligent Input Methods (IM) are essential...,cs.CL cs.HC,2007-05-23
3707,704.3708,,Network statistics on early English Syntax: St...,This paper includes a reflection on the role...,cs.CL,2007-05-23


In [4]:
df['title'].iloc[102]

'Offloading Cognition onto Cognitive Technology'