In [2]:
import pandas as pd
import json

In [3]:
from itertools import islice

In [5]:
def stream_jsonl_to_csv(input_file, output_file, chunk_size=10000):
    """
    Stream JSONL file to CSV without loading everything into memory
    """
    def jsonl_generator():
        with open(input_file, 'r') as f:
            for i, line in enumerate(f):
                try:
                    yield json.loads(line.strip())
                except json.JSONDecodeError:
                    continue
    
    gen = jsonl_generator()
    first_chunk = True
    
    while True:
        # Get next chunk of data
        chunk_data = list(islice(gen, chunk_size))
        if not chunk_data:
            break
            
        df = pd.DataFrame(chunk_data)
        df.to_csv(output_file, 
                 mode='w' if first_chunk else 'a', 
                 header=first_chunk, 
                 index=False)
        first_chunk = False
        print(f"Processed {len(chunk_data)} records...")


In [None]:
stream_jsonl_to_csv('arxiv-metadata-oai-snapshot.json', 'arxiv.csv')

In [4]:
data = pd.read_csv('arxiv.csv')
data.head()

  data = pd.read_csv('arxiv.csv')


Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,hep-ph,,A fully differential calculation in perturba...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2008-11-26,"[['Balázs', 'C.', ''], ['Berger', 'E. L.', '']..."
1,704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,,math.CO cs.CG,http://arxiv.org/licenses/nonexclusive-distrib...,"We describe a new algorithm, the $(k,\ell)$-...","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2008-12-13,"[['Streinu', 'Ileana', ''], ['Theran', 'Louis'..."
2,704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,,physics.gen-ph,,The evolution of Earth-Moon system is descri...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2008-01-13,"[['Pan', 'Hongjun', '']]"
3,704.0004,David Callan,David Callan,A determinant of Stirling cycle numbers counts...,11 pages,,,,math.CO,,We show that a determinant of Stirling cycle...,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2007-05-23,"[['Callan', 'David', '']]"
4,704.0005,Alberto Torchinsky,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,,math.CA math.FA,,In this paper we show how to compute the $\L...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2013-10-15,"[['Abu-Shammala', 'Wael', ''], ['Torchinsky', ..."


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2866787 entries, 0 to 2866786
Data columns (total 14 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   id              object
 1   submitter       object
 2   authors         object
 3   title           object
 4   comments        object
 5   journal-ref     object
 6   doi             object
 7   report-no       object
 8   categories      object
 9   license         object
 10  abstract        object
 11  versions        object
 12  update_date     object
 13  authors_parsed  object
dtypes: object(14)
memory usage: 306.2+ MB


In [6]:
import re

In [7]:
keep_cols = ['id', 'title', 'abstract', 'authors', 'categories', 'update_date']
df = data[keep_cols]

# --- Step 2: Drop missing or duplicate titles/abstracts ---
df.dropna(subset=['title', 'abstract'], inplace=True)
df.drop_duplicates(subset=['title', 'abstract'], inplace=True)

# --- Step 3: Define text cleaning function with equation preservation ---
def clean_text(text):
    text = str(text).lower()

    # Step 1: Replace math expressions with a placeholder token
    text = re.sub(r'\$[^$]+\$', ' EQUATIONTOKEN ', text)

    # Step 2: Remove LaTeX commands like \cite, \ref, etc.
    text = re.sub(r'\\[a-zA-Z]+', '', text)

    # Step 3: Remove unwanted symbols but keep letters and spaces
    text = re.sub(r'[^a-z\s]', ' ', text)

    # Step 4: Restore our placeholder safely
    text = text.replace('equationtoken', '<EQUATION>')

    # Step 5: Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # # Step 6: Remove stopwords (but keep <EQUATION>)
    # words = [w for w in text.split() if w not in stop_words or w == '<EQUATION>']

    return text


# --- Step 4: Apply cleaning ---
df['clean_title'] = df['title'].apply(clean_text)
df['clean_abstract'] = df['abstract'].apply(clean_text)

# --- Step 5: Handle update_date ---
df['update_date'] = pd.to_datetime(df['update_date'], errors='coerce')
df = df[df['update_date'].notnull()]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(subset=['title', 'abstract'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(subset=['title', 'abstract'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_title'] = df['title'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2866364 entries, 0 to 2866786
Data columns (total 8 columns):
 #   Column          Dtype         
---  ------          -----         
 0   id              object        
 1   title           object        
 2   abstract        object        
 3   authors         object        
 4   categories      object        
 5   update_date     datetime64[ns]
 6   clean_title     object        
 7   clean_abstract  object        
dtypes: datetime64[ns](1), object(7)
memory usage: 196.8+ MB


In [9]:
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2866364 entries, 0 to 2866786
Data columns (total 8 columns):
 #   Column          Dtype         
---  ------          -----         
 0   id              object        
 1   title           object        
 2   abstract        object        
 3   authors         object        
 4   categories      object        
 5   update_date     datetime64[ns]
 6   clean_title     object        
 7   clean_abstract  object        
dtypes: datetime64[ns](1), object(7)
memory usage: 196.8+ MB


In [10]:
df.head()

Unnamed: 0,id,title,abstract,authors,categories,update_date,clean_title,clean_abstract
0,704.0001,Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",hep-ph,2008-11-26,calculation of prompt diphoton production cros...,a fully differential calculation in perturbati...
1,704.0002,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-...",Ileana Streinu and Louis Theran,math.CO cs.CG,2008-12-13,sparsity certifying graph decompositions,we describe a new algorithm the pebble game wi...
2,704.0003,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...,Hongjun Pan,physics.gen-ph,2008-01-13,the evolution of the earth moon system based o...,the evolution of earth moon system is describe...
3,704.0004,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,David Callan,math.CO,2007-05-23,a determinant of stirling cycle numbers counts...,we show that a determinant of stirling cycle n...
4,704.0005,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...,Wael Abu-Shammala and Alberto Torchinsky,math.CA math.FA,2013-10-15,from dyadic to,in this paper we show how to compute the norm ...


In [11]:
categories = (df['categories']
              .dropna()
              .str.split()    
              .explode()      
              .str.strip()
              .unique())

import pandas as pd
# pd.Series(sorted(categories), name='category').to_csv('categories_list.txt', index=False, header=False)
print(f"{categories} unique categories found.")

['hep-ph' 'math.CO' 'cs.CG' 'physics.gen-ph' 'math.CA' 'math.FA'
 'cond-mat.mes-hall' 'gr-qc' 'cond-mat.mtrl-sci' 'astro-ph' 'math.NT'
 'math.AG' 'math.AT' 'hep-th' 'math.PR' 'hep-ex' 'nlin.PS'
 'physics.chem-ph' 'q-bio.MN' 'math.NA' 'cond-mat.str-el'
 'cond-mat.stat-mech' 'math.RA' 'physics.optics' 'physics.comp-ph'
 'q-bio.PE' 'q-bio.CB' 'quant-ph' 'q-bio.QM' 'hep-lat' 'nucl-th' 'math.OA'
 'math.QA' 'math-ph' 'math.MP' 'nlin.CD' 'physics.plasm-ph'
 'physics.space-ph' 'nlin.SI' 'cs.IT' 'math.IT' 'cs.NE' 'cs.AI'
 'physics.ed-ph' 'math.DG' 'cond-mat.soft' 'physics.pop-ph' 'cs.DS'
 'math.CV' 'math.DS' 'physics.soc-ph' 'nucl-ex' 'math.RT' 'cond-mat.other'
 'physics.flu-dyn' 'physics.data-an' 'cs.CE' 'cs.MS' 'cs.NA' 'math.GR'
 'cond-mat.supr-con' 'math.AC' 'math.SG' 'cs.CC' 'math.KT' 'math.GT'
 'math.AP' 'physics.class-ph' 'q-bio.OT' 'physics.bio-ph' 'q-bio.BM'
 'nlin.CG' 'cs.DM' 'cs.LO' 'cond-mat.dis-nn' 'math.MG' 'physics.atom-ph'
 'math.SP' 'math.ST' 'stat.TH' 'physics.ao-ph' 'physics.i

In [12]:
category_map = {
    # ---------- Computer Science ----------
    'cs.AI': 'Computer Science – Artificial Intelligence',
    'cs.AR': 'Computer Science – Architecture',
    'cs.CC': 'Computer Science – Computational Complexity',
    'cs.CE': 'Computer Science – Computational Engineering',
    'cs.CG': 'Computer Science – Computational Geometry',
    'cs.CL': 'Computer Science – Computation and Language',
    'cs.CR': 'Computer Science – Cryptography and Security',
    'cs.CV': 'Computer Science – Computer Vision and Pattern Recognition',
    'cs.CY': 'Computer Science – Computers and Society',
    'cs.DB': 'Computer Science – Databases',
    'cs.DC': 'Computer Science – Distributed Computing',
    'cs.DL': 'Computer Science – Digital Libraries',
    'cs.DM': 'Computer Science – Discrete Mathematics',
    'cs.DS': 'Computer Science – Data Structures and Algorithms',
    'cs.ET': 'Computer Science – Emerging Technologies',
    'cs.FL': 'Computer Science – Formal Languages',
    'cs.GL': 'Computer Science – General Literature',
    'cs.GR': 'Computer Science – Graphics',
    'cs.GT': 'Computer Science – Game Theory',
    'cs.HC': 'Computer Science – Human-Computer Interaction',
    'cs.IR': 'Computer Science – Information Retrieval',
    'cs.IT': 'Computer Science – Information Theory',
    'cs.LG': 'Computer Science – Machine Learning',
    'cs.LO': 'Computer Science – Logic in Computer Science',
    'cs.MA': 'Computer Science – Multiagent Systems',
    'cs.MM': 'Computer Science – Multimedia',
    'cs.MS': 'Computer Science – Mathematical Software',
    'cs.NA': 'Computer Science – Numerical Analysis',
    'cs.NE': 'Computer Science – Neural and Evolutionary Computing',
    'cs.NI': 'Computer Science – Networking and Internet Architecture',
    'cs.OH': 'Computer Science – Other Computer Science',
    'cs.OS': 'Computer Science – Operating Systems',
    'cs.PF': 'Computer Science – Performance',
    'cs.PL': 'Computer Science – Programming Languages',
    'cs.RO': 'Computer Science – Robotics',
    'cs.SC': 'Computer Science – Symbolic Computation',
    'cs.SD': 'Computer Science – Sound',
    'cs.SE': 'Computer Science – Software Engineering',
    'cs.SI': 'Computer Science – Social and Information Networks',
    'cs.SY': 'Computer Science – Systems and Control',

    # ---------- Physics ----------
    'acc-phys': 'Accelerator Physics',
    'physics.acc-ph': 'Physics – Accelerator Physics',
    'physics.ao-ph': 'Physics – Atmospheric and Oceanic Physics',
    'physics.app-ph': 'Physics – Applied Physics',
    'physics.atm-clus': 'Physics – Atomic and Molecular Clusters',
    'physics.atom-ph': 'Physics – Atomic Physics',
    'physics.bio-ph': 'Physics – Biological Physics',
    'physics.chem-ph': 'Physics – Chemical Physics',
    'physics.class-ph': 'Physics – Classical Physics',
    'physics.comp-ph': 'Physics – Computational Physics',
    'physics.data-an': 'Physics – Data Analysis, Statistics and Probability',
    'physics.ed-ph': 'Physics – Education',
    'physics.flu-dyn': 'Physics – Fluid Dynamics',
    'physics.gen-ph': 'Physics – General Physics',
    'physics.geo-ph': 'Physics – Geophysics',
    'physics.hist-ph': 'Physics – History and Philosophy of Physics',
    'physics.ins-det': 'Physics – Instrumentation and Detectors',
    'physics.med-ph': 'Physics – Medical Physics',
    'physics.optics': 'Physics – Optics',
    'physics.plasm-ph': 'Physics – Plasma Physics',
    'physics.pop-ph': 'Physics – Popular Physics',
    'physics.soc-ph': 'Physics – Physics and Society',
    'physics.space-ph': 'Physics – Space Physics',
    
    # High Energy Physics
    'hep-ex': 'High Energy Physics – Experiment',
    'hep-lat': 'High Energy Physics – Lattice',
    'hep-ph': 'High Energy Physics – Phenomenology',
    'hep-th': 'High Energy Physics – Theory',
    
    # General Relativity and Cosmology
    'gr-qc': 'General Relativity and Quantum Cosmology',
    
    # Nuclear Physics
    'nucl-ex': 'Nuclear Physics – Experiment',
    'nucl-th': 'Nuclear Physics – Theory',
    
    # Condensed Matter
    'cond-mat': 'Condensed Matter',
    'cond-mat.dis-nn': 'Condensed Matter – Disordered Systems and Neural Networks',
    'cond-mat.mes-hall': 'Condensed Matter – Mesoscopic Systems and Quantum Hall Effect',
    'cond-mat.mtrl-sci': 'Condensed Matter – Materials Science',
    'cond-mat.other': 'Condensed Matter – Other',
    'cond-mat.quant-gas': 'Condensed Matter – Quantum Gases',
    'cond-mat.soft': 'Condensed Matter – Soft Condensed Matter',
    'cond-mat.stat-mech': 'Condensed Matter – Statistical Mechanics',
    'cond-mat.str-el': 'Condensed Matter – Strongly Correlated Electrons',
    'cond-mat.supr-con': 'Condensed Matter – Superconductivity',
    
    # Astrophysics
    'astro-ph': 'Astrophysics',
    'astro-ph.CO': 'Astrophysics – Cosmology and Extragalactic Astrophysics',
    'astro-ph.EP': 'Astrophysics – Earth and Planetary Astrophysics',
    'astro-ph.GA': 'Astrophysics – Galaxy Astrophysics',
    'astro-ph.HE': 'Astrophysics – High Energy Astrophysical Phenomena',
    'astro-ph.IM': 'Astrophysics – Instrumentation and Methods for Astrophysics',
    'astro-ph.SR': 'Astrophysics – Solar and Stellar Astrophysics',
    
    # Quantum Physics
    'quant-ph': 'Quantum Physics',
    'atom-ph': 'Atomic Physics',

    # ---------- Mathematics ----------
    'math.AC': 'Mathematics – Commutative Algebra',
    'math.AG': 'Mathematics – Algebraic Geometry',
    'math.AP': 'Mathematics – Analysis of PDEs',
    'math.AT': 'Mathematics – Algebraic Topology',
    'math.CA': 'Mathematics – Classical Analysis and ODEs',
    'math.CO': 'Mathematics – Combinatorics',
    'math.CT': 'Mathematics – Category Theory',
    'math.CV': 'Mathematics – Complex Variables',
    'math.DG': 'Mathematics – Differential Geometry',
    'math.DS': 'Mathematics – Dynamical Systems',
    'math.FA': 'Mathematics – Functional Analysis',
    'math.GM': 'Mathematics – General Mathematics',
    'math.GN': 'Mathematics – General Topology',
    'math.GR': 'Mathematics – Group Theory',
    'math.GT': 'Mathematics – Geometric Topology',
    'math.HO': 'Mathematics – History and Overview',
    'math.IT': 'Mathematics – Information Theory',
    'math.KT': 'Mathematics – K-Theory and Homology',
    'math.LO': 'Mathematics – Logic',
    'math.MG': 'Mathematics – Metric Geometry',
    'math.MP': 'Mathematics – Mathematical Physics',
    'math.NA': 'Mathematics – Numerical Analysis',
    'math.NT': 'Mathematics – Number Theory',
    'math.OA': 'Mathematics – Operator Algebras',
    'math.OC': 'Mathematics – Optimization and Control',
    'math.PR': 'Mathematics – Probability',
    'math.QA': 'Mathematics – Quantum Algebra',
    'math.RA': 'Mathematics – Rings and Algebras',
    'math.RT': 'Mathematics – Representation Theory',
    'math.SG': 'Mathematics – Symplectic Geometry',
    'math.SP': 'Mathematics – Spectral Theory',
    'math.ST': 'Mathematics – Statistics Theory',
    
    # Mathematical Physics
    'math-ph': 'Mathematical Physics',

    # ---------- Statistics ----------
    'stat.AP': 'Statistics – Applications',
    'stat.CO': 'Statistics – Computation',
    'stat.ME': 'Statistics – Methodology',
    'stat.ML': 'Statistics – Machine Learning',
    'stat.OT': 'Statistics – Other Statistics',
    'stat.TH': 'Statistics – Theory',

    # ---------- Quantitative Biology ----------
    'q-bio': 'Quantitative Biology',
    'q-bio.BM': 'Quantitative Biology – Biomolecules',
    'q-bio.CB': 'Quantitative Biology – Cell Behavior',
    'q-bio.GN': 'Quantitative Biology – Genomics',
    'q-bio.MN': 'Quantitative Biology – Molecular Networks',
    'q-bio.NC': 'Quantitative Biology – Neurons and Cognition',
    'q-bio.OT': 'Quantitative Biology – Other',
    'q-bio.PE': 'Quantitative Biology – Populations and Evolution',
    'q-bio.QM': 'Quantitative Biology – Quantitative Methods',
    'q-bio.SC': 'Quantitative Biology – Subcellular Processes',
    'q-bio.TO': 'Quantitative Biology – Tissues and Organs',

    # ---------- Quantitative Finance ----------
    'q-fin.CP': 'Quantitative Finance – Computational Finance',
    'q-fin.EC': 'Quantitative Finance – Economics',
    'q-fin.GN': 'Quantitative Finance – General Finance',
    'q-fin.MF': 'Quantitative Finance – Mathematical Finance',
    'q-fin.PM': 'Quantitative Finance – Portfolio Management',
    'q-fin.PR': 'Quantitative Finance – Pricing of Securities',
    'q-fin.RM': 'Quantitative Finance – Risk Management',
    'q-fin.ST': 'Quantitative Finance – Statistical Finance',
    'q-fin.TR': 'Quantitative Finance – Trading and Market Microstructure',

    # ---------- Nonlinear Sciences ----------
    'nlin.AO': 'Nonlinear Sciences – Adaptation and Self-Organizing Systems',
    'nlin.CD': 'Nonlinear Sciences – Chaotic Dynamics',
    'nlin.CG': 'Nonlinear Sciences – Cellular Automata and Lattice Gases',
    'nlin.PS': 'Nonlinear Sciences – Pattern Formation and Solitons',
    'nlin.SI': 'Nonlinear Sciences – Exactly Solvable and Integrable Systems',

    # ---------- Economics ----------
    'econ.EM': 'Economics – Econometrics',
    'econ.GN': 'Economics – General Economics',
    'econ.TH': 'Economics – Theoretical Economics',

    # ---------- Electrical Engineering and Systems Science ----------
    'eess.AS': 'Electrical Engineering and Systems Science – Audio and Speech Processing',
    'eess.IV': 'Electrical Engineering and Systems Science – Image and Video Processing',
    'eess.SP': 'Electrical Engineering and Systems Science – Signal Processing',
    'eess.SY': 'Electrical Engineering and Systems Science – Systems and Control',

    # ---------- Other Categories ----------
    'adap-org': 'Adaptation, Noise, and Self-Organizing Systems',
    'alg-geom': 'Algebraic Geometry',
    'ao-sci': 'Astrophysics and Ocean Sciences',
    'bayes-an': 'Bayesian Analysis',
    'chao-dyn': 'Chaotic Dynamics',
    'chem-ph': 'Chemical Physics',
    'cmp-lg': 'Computation and Language',
    'comp-gas': 'Computational Geometry',
    'dg-ga': 'Differential Geometry',
    'funct-an': 'Functional Analysis',
    'mtrl-th': 'Materials Theory',
    'patt-sol': 'Pattern Formation and Solitons',
    'plasm-ph': 'Plasma Physics',
    'q-alg': 'Quantum Algebra and Topology',
    'solv-int': 'Exactly Solvable and Integrable Systems',
    'supr-con': 'Superconductivity',
}

def map_category(cat_string):
    if pd.isna(cat_string):
        return 'Other'
    cats = cat_string.split()
    mapped = [category_map.get(c, 'Other') for c in cats]
    return ', '.join(sorted(set(mapped)))

df['mapped_category'] = df['categories'].apply(map_category)
df = df[df['mapped_category'] != 'Other']
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2866364 entries, 0 to 2866786
Data columns (total 9 columns):
 #   Column           Dtype         
---  ------           -----         
 0   id               object        
 1   title            object        
 2   abstract         object        
 3   authors          object        
 4   categories       object        
 5   update_date      datetime64[ns]
 6   clean_title      object        
 7   clean_abstract   object        
 8   mapped_category  object        
dtypes: datetime64[ns](1), object(8)
memory usage: 218.7+ MB


In [13]:
df.head()

Unnamed: 0,id,title,abstract,authors,categories,update_date,clean_title,clean_abstract,mapped_category
0,704.0001,Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",hep-ph,2008-11-26,calculation of prompt diphoton production cros...,a fully differential calculation in perturbati...,High Energy Physics – Phenomenology
1,704.0002,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-...",Ileana Streinu and Louis Theran,math.CO cs.CG,2008-12-13,sparsity certifying graph decompositions,we describe a new algorithm the pebble game wi...,"Computer Science – Computational Geometry, Mat..."
2,704.0003,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...,Hongjun Pan,physics.gen-ph,2008-01-13,the evolution of the earth moon system based o...,the evolution of earth moon system is describe...,Physics – General Physics
3,704.0004,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,David Callan,math.CO,2007-05-23,a determinant of stirling cycle numbers counts...,we show that a determinant of stirling cycle n...,Mathematics – Combinatorics
4,704.0005,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...,Wael Abu-Shammala and Alberto Torchinsky,math.CA math.FA,2013-10-15,from dyadic to,in this paper we show how to compute the norm ...,"Mathematics – Classical Analysis and ODEs, Mat..."


In [18]:
frac = 200000 / len(df)  

# Stratified random sampling
sample = (
    df.groupby('mapped_category', group_keys=False)
      .apply(lambda x: x.sample(frac=min(1, frac), random_state=42))
      .sort_index() 
)

  .apply(lambda x: x.sample(frac=min(1, frac), random_state=42))


In [19]:
sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 194065 entries, 32 to 2866770
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   id               194065 non-null  object        
 1   title            194065 non-null  object        
 2   abstract         194065 non-null  object        
 3   authors          194065 non-null  object        
 4   categories       194065 non-null  object        
 5   update_date      194065 non-null  datetime64[ns]
 6   clean_title      194065 non-null  object        
 7   clean_abstract   194065 non-null  object        
 8   mapped_category  194065 non-null  object        
dtypes: datetime64[ns](1), object(8)
memory usage: 14.8+ MB


In [20]:
sample.rename(columns={'mapped_category': 'category'}, inplace=True)
sample.rename(columns={'categories':'category_code'}, inplace=True)

In [21]:
sample.head()

Unnamed: 0,id,title,abstract,authors,category_code,update_date,clean_title,clean_abstract,category
32,704.0033,Convergence of the discrete dipole approximati...,We performed a rigorous theoretical converge...,"Maxim A. Yurkin, Valeri P. Maltsev, Alfons G. ...",physics.optics physics.comp-ph,2022-03-31,convergence of the discrete dipole approximati...,we performed a rigorous theoretical convergenc...,"Physics – Computational Physics, Physics – Optics"
44,704.0045,Evolution of solitary waves and undular bores ...,This paper considers the propagation of shal...,"G.A. El, R.H.J. Grimshaw, A.M. Kamchatnov",nlin.PS nlin.SI,2007-09-23,evolution of solitary waves and undular bores ...,this paper considers the propagation of shallo...,Nonlinear Sciences – Exactly Solvable and Inte...
47,704.0048,Inference on white dwarf binary systems using ...,We report on the analysis of selected single...,"Alexander Stroeer, John Veitch, Christian Roev...",gr-qc astro-ph,2008-11-26,inference on white dwarf binary systems using ...,we report on the analysis of selected single s...,"Astrophysics, General Relativity and Quantum C..."
105,704.0106,Multiple Parton Scattering in Nuclei: Quark-qu...,Modifications to quark and antiquark fragmen...,"Andreas Schafer, Xin-Nian Wang and Ben-Wei Zhang",hep-ph nucl-th,2008-11-26,multiple parton scattering in nuclei quark qua...,modifications to quark and antiquark fragmenta...,"High Energy Physics – Phenomenology, Nuclear P..."
147,704.0148,Reexamination of spin decoherence in semicondu...,The longitudinal and transversal spin decohe...,"J. H. Jiang, Y. Y. Wang, and M. W. Wu",cond-mat.mtrl-sci,2008-01-20,reexamination of spin decoherence in semicondu...,the longitudinal and transversal spin decohere...,Condensed Matter – Materials Science


In [None]:
sample.to_csv('arxiv_processed.csv', index=False)