In [1]:
import pandas as pd
import numpy as np
import vaex

import glob

import sqlalchemy as sqla
import types
import json

In [2]:
%%capture
from tqdm.notebook import trange, tqdm
tqdm.pandas()

In [3]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
# Replace the first with the second category
CATEGORY_ALIASES = {
    'math.MP': 'math-ph',
    'stat.TH': 'math.ST',
    'math.IT': 'cs.IT',
    'q-fin.EC': 'econ.GN',
    'cs.SY': 'eess.SY',
    'cs.NA': 'math.NA'
}

# Used subject categories, with aliases removed.
ACTIVE_CATEGORIES = [
    'astro-ph.CO','astro-ph.EP','astro-ph.GA',
    'astro-ph.HE','astro-ph.IM','astro-ph.SR',
    'cond-mat.dis-nn','cond-mat.mes-hall','cond-mat.mtrl-sci',
    'cond-mat.other','cond-mat.quant-gas','cond-mat.soft',
    'cond-mat.stat-mech','cond-mat.str-el','cond-mat.supr-con',
    'cs.AI','cs.AR','cs.CC','cs.CE','cs.CG','cs.CL','cs.CR',
    'cs.CV','cs.CY','cs.DB','cs.DC','cs.DL','cs.DM','cs.DS',
    'cs.ET','cs.FL','cs.GL','cs.GR','cs.GT','cs.HC','cs.IR',
    'cs.IT','cs.LG','cs.LO','cs.MA','cs.MM','cs.MS','cs.NE',
    'cs.NI','cs.OH','cs.OS','cs.PF','cs.PL','cs.RO','cs.SC',
    'cs.SD','cs.SE','cs.SI',
    'econ.EM','econ.GN','econ.TH',
    'eess.AS','eess.IV','eess.SP','eess.SY',
    'gr-qc',
    'hep-ex','hep-lat','hep-ph','hep-th',
    'math-ph','math.AC','math.AG','math.AP',
    'math.AT','math.CA','math.CO','math.CT',
    'math.CV','math.DG','math.DS','math.FA',
    'math.GM','math.GN','math.GR','math.GT',
    'math.HO','math.KT','math.LO','math.MG',
    'math.NA','math.NT','math.OA','math.OC',
    'math.PR','math.QA','math.RA','math.RT',
    'math.SG','math.SP','math.ST',
    'nlin.AO','nlin.CD','nlin.CG','nlin.PS','nlin.SI',
    'nucl-ex','nucl-th',
    'physics.acc-ph','physics.ao-ph','physics.app-ph',
    'physics.atm-clus','physics.atom-ph','physics.bio-ph',
    'physics.chem-ph','physics.class-ph','physics.comp-ph',
    'physics.data-an','physics.ed-ph','physics.flu-dyn',
    'physics.gen-ph','physics.geo-ph','physics.hist-ph',
    'physics.ins-det','physics.med-ph','physics.optics',
    'physics.plasm-ph','physics.pop-ph','physics.soc-ph',
    'physics.space-ph',
    'q-bio.BM','q-bio.CB','q-bio.GN','q-bio.MN','q-bio.NC',
    'q-bio.OT','q-bio.PE','q-bio.QM','q-bio.SC','q-bio.TO',
    'q-fin.CP','q-fin.GN','q-fin.MF','q-fin.PM',
    'q-fin.PR','q-fin.RM','q-fin.ST','q-fin.TR',
    'quant-ph',
    'stat.AP','stat.CO','stat.ME','stat.ML','stat.OT',
    'dg-ga', 'astro-ph'
]

CAT_REMAPPER = {x:x for x in ACTIVE_CATEGORIES} | CATEGORY_ALIASES


len(CATEGORY_ALIASES)
len(ACTIVE_CATEGORIES)
len(CAT_REMAPPER)



6

151

157

In [18]:
with open("/home/cjc73/.credentials/arxiv_mysql") as creds:
    #print(creds.read())
    cns = types.SimpleNamespace(**json.load(creds))


In [19]:
server_name = "arxiv-db-rep2.c94unvnkztba.us-east-1.rds.amazonaws.com"
db_name = "arXiv"
engine = sqla.create_engine(
    f"mysql+pymysql://{cns.user}:{cns.password}@{server_name}/{db_name}"
)
connection = engine.connect()
medadata = sqla.MetaData()


In [20]:
cmd = """
SELECT
    COUNT(DISTINCT m.paper_id) as countPapers
FROM `arXiv_metadata` m
where 
    created >= '2015-01-01 00:00:00'
    and created < '2016-01-01 00:00:00'
    and m.is_current = 1 
    and m.is_withdrawn = 0
;
"""
with engine.connect() as con:
    rs = con.execute(cmd)
    print(rs.fetchall())


[(101070,)]


In [53]:
cmd = """
select 
    m.paper_id,
    m.version,
    case when instr(paper_id, '/')=0 then substr(paper_id,1,4) 
        else substr(paper_id,instr(paper_id, '/')+1,4)
    end yymm,
    m.created,
    replace(m.abstract, '\t',' ') abstract
from arXiv_metadata m
where 
    created >= '2015-01-01 00:00:00'
    and created < '2016-01-01 00:00:00'
    and m.is_current = 1 
    and m.is_withdrawn = 0
#order by 3,1
limit 20
;
"""

In [100]:
list(range(5))

[0, 1, 2, 3, 4]

In [101]:
cmd = """
select 
    m.paper_id,
    m.version,
    case when instr(paper_id, '/')=0 then substr(paper_id,1,4) 
        else substr(paper_id,instr(paper_id, '/')+1,4)
    end yymm,
    m.created,
    m.title,
    m.abs_categories,
    replace(m.abstract, '\t',' ') abstract
from arXiv_metadata m
where 
    created >= '{}-01-01 00:00:00'
    and created < '{}-01-01 00:00:00'
    and m.is_current = 1 
    and m.is_withdrawn = 0
#order by 3,1
#limit 20
;
"""

#years = [2015+i for i in range(8)]
years = [2010+i for i in range(5)]
intervals = list(zip(years, years[1:]))

for from_yr, to_yr in tqdm(intervals):
    with engine.connect() as con, open(f"./data/fulltext/{from_yr}.json", 'w') as outfile:
        rs = con.execution_options(
            stream_results=True,
            max_row_buffer=500,
        ).execute(cmd.format(from_yr, to_yr))

        for row in tqdm(rs):
            paper_dict = dict(zip(row.keys(), row))
            paper_dict['created'] = paper_dict['created'].isoformat()
            _ = outfile.write(
                json.dumps(paper_dict, ensure_ascii=False)
            )
            _ = outfile.write('\n')

  0%|          | 0/4 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [23]:
# get more examples of these:
small_cats = [x.strip() for x in """
dg-ga
cs.GL
econ.TH
nlin.CG
cs.OS
astro-ph
stat.OT
q-bio.OT
q-bio.SC
econ.EM
q-fin.TR
q-bio.CB
cs.MS
physics.atm-clus
q-fin.CP
cs.PF
q-fin.PM
q-bio.TO
q-fin.RM
cs.SC
econ.GN
physics.pop-ph
q-fin.MF
q-fin.PR
cs.AR
eess.AS
q-bio.GN
q-fin.ST
q-fin.GN
cs.GR
cs.MA
cs.ET
cs.MM
""".strip().split()]

cmd = """
select 
    m.paper_id,
    m.version,
    case when instr(paper_id, '/')=0 then substr(paper_id,1,4) 
        else substr(paper_id,instr(paper_id, '/')+1,4)
    end yymm,
    m.created,
    m.title,
    m.abs_categories,
    replace(m.abstract, '\t',' ') abstract
from arXiv_metadata m
where
    m.abs_categories like '{}%%'
    and created < '2010-01-01 00:00:00'
    and m.is_current = 1 
    and m.is_withdrawn = 0
#order by 3,1
#limit 20
;
""".strip()

# note all pre2010
intervals = list(zip(years, years[1:]))

for cat in tqdm(small_cats):
    with engine.connect() as con, open(f"./data/fulltext/2000_2009.json", 'a') as outfile:
        rs = con.execution_options(
            stream_results=True,
            max_row_buffer=500,
        ).execute(cmd.format(cat))

        for row in tqdm(rs):
            paper_dict = dict(zip(row.keys(), row))
            paper_dict['created'] = paper_dict['created'].isoformat()
            _ = outfile.write(
                json.dumps(paper_dict, ensure_ascii=False)
            )
            _ = outfile.write('\n')

  0%|          | 0/33 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

KeyboardInterrupt: 

In [44]:
# use vaex to convert the year-json files into HDF5 format
# so they can be memory mapped and jointly loaded
json_files = glob.glob("./data/fulltext/*.json")


joptions = types.SimpleNamespace(
    orient='records', 
    copy_index=False,
    lines=True,
    dtype=str,
)

for jfile in tqdm(json_files):
    tmp_df = vaex.from_json(jfile, **vars(joptions))
    tmp_df['created'] = tmp_df['created'].astype('datetime64')
    tmp_df['prime_category'] = tmp_df.abs_categories.str.split().apply(lambda x: x[0])
    tmp_df['prime_category'] = tmp_df['prime_category'].map(CAT_REMAPPER, default_value='unknown')
    tmp_df = tmp_df[tmp_df['prime_category'] != 'unknown']
    tmp_df['major_category'] = tmp_df['prime_category'].str.split('.').apply(lambda x: x[0])
    
    tmp_df.export(jfile+'.arrow')


  0%|          | 0/12 [00:00<?, ?it/s]

In [5]:
all_df = vaex.open("./data/fulltext/*.arrow")
pre2020_df = all_df[all_df.created < np.datetime64("2020-01-01T00:00:00")]
testpost2020 = all_df[all_df.created >= np.datetime64("2020-01-01T00:00:00")]
#pre2020_df.head()
pre2020_df.shape
testpost2020.shape
#all_df.shape
#all_df.dtypes
#all_df.head()

pre2020_df.shuffle(random_state=42).export("./data/fulltext/pre2020_shuffled.hdf5")

(1034817, 9)

(404527, 9)

In [26]:
tt_df = vaex.open("./data/fulltext/pre2020_shuffled.hdf5")
#train_df, test_df = tt.df.ml

In [27]:
tt_df.ml.train_test_split?

[0;31mSignature:[0m
[0mtt_df[0m[0;34m.[0m[0mml[0m[0;34m.[0m[0mtrain_test_split[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mtest_size[0m[0;34m=[0m[0;36m0.2[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstrings[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvirtual[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mverbose[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Will split the DataFrame in train and test part, assuming it is shuffled.

:param test_size: The fractional size of the test set.
:param strings: If True, the output DataFrames will also contain string columns, if any.
:param virtual: If True, the output DataFrames will also contain virtual contain, if any.
[0;31mFile:[0m      ~/miniconda3/envs/cforge/lib/python3.9/site-packages/vaex/ml/__init__.py
[0;31mType:[0m      method


In [8]:
vaex.settings.display.max_rows = 200
cat_count_df = pre2020_df.groupby(['major_category', 'prime_category'], agg = vaex.agg.count('paper_id')).sort(['paper_id_count'])
to_get = cat_count_df[cat_count_df['paper_id_count'] < 1000]['prime_category'].values
for value in to_get:
    print(value)

cat_count_df
pre2020_df.groupby('major_category', agg = vaex.agg.count('paper_id')).sort('paper_id_count')
vaex.settings.display.max_rows = 30

cs.GL
econ.TH
cs.OS
stat.OT
nlin.CG
econ.EM
q-bio.OT
q-bio.SC
q-fin.TR
dg-ga
q-fin.CP
q-fin.PM
cs.MS
cs.PF
q-fin.RM
econ.GN
q-bio.CB
q-bio.TO
q-fin.MF
eess.AS
cs.SC
q-fin.PR
cs.AR
physics.pop-ph
cs.GR
cs.ET
physics.atm-clus
q-fin.GN
cs.MA


#,major_category,prime_category,paper_id_count
0,cs,cs.GL,77
1,econ,econ.TH,122
2,cs,cs.OS,276
3,stat,stat.OT,331
4,nlin,nlin.CG,378
5,econ,econ.EM,414
6,q-bio,q-bio.OT,452
7,q-bio,q-bio.SC,519
8,q-fin,q-fin.TR,542
9,dg-ga,dg-ga,557


#,major_category,paper_id_count
0,dg-ga,557
1,econ,1168
2,nucl-ex,4397
3,hep-lat,5274
4,q-fin,5765
5,nlin,7340
6,hep-ex,7860
7,nucl-th,11251
8,eess,11583
9,q-bio,13117


In [5]:
## Build a stratified sample for major cats

train_ls = []
test_ls = []

TARGET_CLASS_SIZE = 2000
RAND_STATE = 42

tt_df = vaex.open(
    "./data/fulltext/pre2020_shuffled.hdf5"
).shuffle(random_state=RAND_STATE)

major_cats = tt_df['major_category'].unique()

for major_cat in tqdm(major_cats):
    temp_df = tt_df[tt_df['major_category']==major_cat].copy()
    
    if temp_df.shape[0] <= TARGET_CLASS_SIZE:
        temp_train, temp_test = temp_df.extract().ml.train_test_split(test_size=.2)
        train_ls.append(temp_train)
        test_ls.append(temp_test)

    else: #temp_df.shape[0] > TARGET_CLASS_SIZE:
        minor_cats = temp_df['prime_category'].unique()
        #get minor_cat counts:
        count_df = (
            temp_df.groupby(
                'prime_category', 
                agg=vaex.agg.count('paper_id')
            ).sort('paper_id_count')
        )
        # get as balanced a sample as possible
        minor_cat_len = len(minor_cats)
        target_sub_size = round(TARGET_CLASS_SIZE / minor_cat_len)
        sub_train_list = []
        sub_test_list = []
        
        # for each sub cat, get the target size or max possible. 
        # Distribute the extra demand to larger cats
        for minor_cat_count in count_df.iterrows():
            idx = minor_cat_count[0]
            minor_cat = minor_cat_count[1]['prime_category']
            paper_count = minor_cat_count[1]['paper_id_count']
            temp_sub = temp_df[temp_df['prime_category'] == minor_cat].extract()
            if paper_count > target_sub_size:
                temp_sub = temp_sub.sample(
                    n=target_sub_size,
                    random_state=RAND_STATE
                )
            else: #we have extra demand
                sub_cats_left = (minor_cat_len - idx + 1)
                target_delta = round((target_sub_size-paper_count)/sub_cats_left)
                target_sub_size += target_delta
            
            
            temp_sub_train, temp_sub_test = temp_sub.ml.train_test_split(test_size=.2)
            sub_train_list.append(temp_sub_train)
            sub_test_list.append(temp_sub_test)
        
        train_ls.append(vaex.concat(sub_train_list))
        test_ls.append(vaex.concat(sub_test_list))
    
train_df = vaex.concat(train_ls)
test_df = vaex.concat(test_ls)

train_df.shape
test_df.shape

train_df.export("./data/fulltext/train_major_cats.hdf5", progress=True)
test_df.export("./data/fulltext/test_major_cats.hdf5", progress=True)

  0%|          | 0/21 [00:00<?, ?it/s]



(31786, 9)

(7938, 9)

export(hdf5) [########################################] 100.00% elapsed time  :   580.41s =  9.7m =  0.2h    
export(hdf5) [########################################] 100.00% elapsed time  :   508.34s =  8.5m =  0.1h    
 

In [6]:
## Build a stratified sample for prime cats (sub groups)

train_ls = []
test_ls = []

TARGET_CLASS_SIZE = 1000
RAND_STATE = 31

tt_df = vaex.open(
    "./data/fulltext/pre2020_shuffled.hdf5"
).shuffle(random_state=RAND_STATE)

prime_cats = tt_df['prime_category'].unique()

for prime_cat in tqdm(prime_cats):
    temp_df = tt_df[tt_df['prime_category']==prime_cat].copy()
    temp_train, temp_test = temp_df[:TARGET_CLASS_SIZE].extract().ml.train_test_split(test_size=.2)
    train_ls.append(temp_train)
    test_ls.append(temp_test)
    
train_df = vaex.concat(train_ls)
test_df = vaex.concat(test_ls)

train_df.shape
test_df.shape

train_df.export("./data/fulltext/train_minor_cats.hdf5", progress=True)
test_df.export("./data/fulltext/test_minor_cats.hdf5", progress=True)

  0%|          | 0/151 [00:00<?, ?it/s]

(112026, 9)

(28002, 9)

export(hdf5) [########################################] 100.00% elapsed time  :   197.43s =  3.3m =  0.1h   
export(hdf5) [########################################] 100.00% elapsed time  :    44.93s =  0.7m =  0.0h  
 

In [31]:
len(temp_df[:100000000])
len(temp_df[:10])

414

10

In [33]:
train_df.shape
test_df.shape

(112026, 9)

(112026, 9)

In [8]:
train_df = vaex.open("./data/fulltext/train_major_cats.hdf5")
test_df = vaex.open("./data/fulltext/test_major_cats.hdf5")

In [9]:
vaex.settings.display.max_rows = 200
cat_count_df = train_df.groupby(['major_category', 'prime_category'], agg = vaex.agg.count('paper_id')).sort(['paper_id_count'])
cat_count_df
train_df.groupby('major_category', agg = vaex.agg.count('paper_id')).sort('paper_id_count')
vaex.settings.display.max_rows = 30

#,major_category,prime_category,paper_id_count
0,cs,cs.SC,21
1,cs,cs.CG,21
2,cs,cs.DC,21
3,cs,cs.CL,21
4,cs,cs.FL,21
5,cs,cs.IR,21
6,cs,cs.NI,21
7,cs,cs.AR,21
8,cs,cs.DM,21
9,cs,cs.LO,21


#,major_category,paper_id_count
0,dg-ga,446
1,econ,716
2,math,780
3,physics,792
4,cs,798
5,astro-ph,798
6,hep-lat,800
7,q-bio,800
8,hep-ph,800
9,stat,800


(29, {'prime_category': 'math.AP', 'paper_id_count': 21792})

In [50]:
foo_df = pre2020_df[pre2020_df['major_category']=='dg-ga'].sample(n=1000)


ValueError: Cannot take a larger sample than population when 'replace=False'

In [11]:
abs_df = vaex.from_json(
    './data/fulltext/2015.json',
    orient='records', 
    copy_index=False,
    lines=True,
)
abs_df

#,paper_id,version,yymm,created,title,abstract
0,1501.00223,1,1501,2015-01-01T00:01:03,$\mathbb{K}$-uniruled sets in affine geometry,'The main goal of this thesis is to study $\\math...
1,1501.00227,1,1501,2015-01-01T00:20:13,'Global solvability of 3D inhomogeneous Navier-S...,"'In this paper, we consider the three-dimensiona..."
2,1412.8505,2,1412,2015-01-01T00:38:26,Unphysical diagonal modular invariants,'A modular invariant for a chiral conformal fiel...
3,1307.5933,2,1307,2015-01-01T01:00:44,Brick Walls for Black Holes in AdS/CFT,"""We study the 't Hooft's brick wall model for bl..."
4,1501.00229,1,1501,2015-01-01T01:05:49,'The construction and deformation of Hom-Novikov...,'We study a twisted generalization of Novikov su...
...,...,...,...,...,...,...
101079,1611.03741,1,1611,2015-10-06T20:15:12,'Meshfree elastoplastic solid for nonsmooth mult...,'A method for simulation of elastoplastic solids...
101080,1611.05688,1,1611,2015-12-16T16:25:27,'The Tragedy of Your Upstairs Neighbors: Is the ...,'A commonly expressed concern about the rise of ...
101081,1702.04241,1,1702,2015-11-19T11:38:42,'Detection of Slang Words in e-Data using semi-S...,'The proposed algorithmic approach deals with fi...
101082,1702.04966,1,1702,2015-12-27T20:51:34,'Integration of QoS aspects in the Cloud Computi...,'Cloud Computing is a business model revolution ...


In [113]:
abs_df.ml.train_test_split?

[0;31mSignature:[0m
[0mabs_df[0m[0;34m.[0m[0mml[0m[0;34m.[0m[0mtrain_test_split[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mtest_size[0m[0;34m=[0m[0;36m0.2[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstrings[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvirtual[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mverbose[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Will split the DataFrame in train and test part, assuming it is shuffled.

:param test_size: The fractional size of the test set.
:param strings: If True, the output DataFrames will also contain string columns, if any.
:param virtual: If True, the output DataFrames will also contain virtual contain, if any.
[0;31mFile:[0m      ~/miniconda3/envs/cforge/lib/python3.9/site-packages/vaex/ml/__init__.py
[0;31mType:[0m      method


In [9]:
pd.read_json('./data/fulltext/2015.json', lines=True)

Unnamed: 0,paper_id,version,yymm,created,title,abstract
0,1501.00223,1,1501,2015-01-01T00:01:03,$\mathbb{K}$-uniruled sets in affine geometry,The main goal of this thesis is to study $\mat...
1,1501.00227,1,1501,2015-01-01T00:20:13,Global solvability of 3D inhomogeneous Navier-...,"In this paper, we consider the three-dimension..."
2,1412.8505,2,1412,2015-01-01T00:38:26,Unphysical diagonal modular invariants,A modular invariant for a chiral conformal fie...
3,1307.5933,2,1307,2015-01-01T01:00:44,Brick Walls for Black Holes in AdS/CFT,We study the 't Hooft's brick wall model for b...
4,1501.00229,1,1501,2015-01-01T01:05:49,The construction and deformation of Hom-Noviko...,We study a twisted generalization of Novikov s...
...,...,...,...,...,...,...
101079,1611.03741,1,1611,2015-10-06T20:15:12,Meshfree elastoplastic solid for nonsmooth mul...,A method for simulation of elastoplastic solid...
101080,1611.05688,1,1611,2015-12-16T16:25:27,The Tragedy of Your Upstairs Neighbors: Is the...,A commonly expressed concern about the rise of...
101081,1702.04241,1,1702,2015-11-19T11:38:42,Detection of Slang Words in e-Data using semi-...,The proposed algorithmic approach deals with f...
101082,1702.04966,1,1702,2015-12-27T20:51:34,Integration of QoS aspects in the Cloud Comput...,Cloud Computing is a business model revolution...


In [20]:
def foo(a=1, b=2):
    print(a, b)

args = types.SimpleNamespace(a=10, b=20)
foo(**vars(args))

10 20


In [18]:
dir(args)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'a',
 'b']

In [116]:
all_df.dtypes

paper_id                  string
version                    int64
yymm                       int64
created           datetime64[ns]
title                     string
abs_categories            string
abstract                  string
prime_category            string
major_category            string
dtype: object

Timestamp('2020-01-01 00:00:00')