In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [None]:
# Location of the raw arXiv dump; it is parsed as one JSON record per line.
ARXIV_JSON_PATH = 'F:/ML/PaperClassification/Data/arxiv.json'
data = pd.read_json(ARXIV_JSON_PATH, lines=True)

In [None]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,hep-ph,,A fully differential calculation in perturba...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2008-11-26,"[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,..."
1,704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,,math.CO cs.CG,http://arxiv.org/licenses/nonexclusive-distrib...,"We describe a new algorithm, the $(k,\ell)$-...","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2008-12-13,"[[Streinu, Ileana, ], [Theran, Louis, ]]"
2,704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,,physics.gen-ph,,The evolution of Earth-Moon system is descri...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2008-01-13,"[[Pan, Hongjun, ]]"
3,704.0004,David Callan,David Callan,A determinant of Stirling cycle numbers counts...,11 pages,,,,math.CO,,We show that a determinant of Stirling cycle...,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2007-05-23,"[[Callan, David, ]]"
4,704.0005,Alberto Torchinsky,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,,math.CA math.FA,,In this paper we show how to compute the $\L...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2013-10-15,"[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]"


In [None]:
# Keep only the fields used below: the two text fields and the category tags.
kept_columns = ['title', 'abstract', 'categories']
data = data[kept_columns]

In [None]:
# Generalize categories, e.g. "math.CA math.FA" is generalized to ["math"].
def _general_categories(category_string):
    """Return the distinct top-level categories of one paper.

    ``category_string`` is a whitespace-separated list of category tags such
    as ``"math.CO cs.CG"``. Each tag is cut at its first dot and duplicates
    are removed while keeping first-seen order, giving ``["math", "cs"]``.
    """
    prefixes = (tag.split('.')[0] for tag in category_string.split())
    # dict.fromkeys de-duplicates while preserving insertion order. It replaces
    # pd.unique, which recent pandas releases deprecate for plain-list input.
    return list(dict.fromkeys(prefixes))


data['general_category'] = data['categories'].map(_general_categories)
# Rebind instead of dropping in place so the cell does not mutate the frame
# object that earlier cells displayed.
data = data.drop(columns=['categories'])

In [None]:
# Merge paper title and abstract into a single text field.
# An explicit space separator is used so the last word of the title can never
# fuse with the first word of the abstract, regardless of how the abstract
# happens to start.
merged_text = data['title'].str.cat(data['abstract'], sep=' ')
# Flatten embedded line breaks. Only the new column is processed, rather than
# regex-scanning every column of the frame (including ones dropped below).
data['Text'] = merged_text.str.replace(r'\n', ' ', regex=True)
data = data.drop(columns=['title', 'abstract'])

In [None]:
import re

# Matches either an inline LaTeX span ($...$) or a run of digits together with
# the single non-word character that immediately precedes it.
_NOISE_PATTERN = re.compile(r'(\$[^\$]+\$)|([^\w][\d]+)')


def _strip_noise(raw_text):
    """Lower-case ``raw_text`` and delete every match of ``_NOISE_PATTERN``."""
    return _NOISE_PATTERN.sub('', raw_text.lower())


data['Text'] = [_strip_noise(entry) for entry in data['Text']]

In [None]:
# Preview the first five rows after text cleaning.
data.head(n=5)

Unnamed: 0,general_category,Text
0,[hep-ph],calculation of prompt diphoton production cros...
1,"[math, cs]",sparsity-certifying graph decompositions we d...
2,[physics],the evolution of the earth-moon system based o...
3,[math],a determinant of stirling cycle numbers counts...
4,[math],from dyadic to in this paper we show how to...


In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Fit the binarizer on the per-paper category collections and encode them in
# the same step; the fitted `mlb` is kept because its classes_ are used later.
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(data['general_category'])

In [None]:
# Wrap the encoded label matrix in a DataFrame with one column per class found
# by the binarizer. Reusing data's index keeps each label row tied to the paper
# it was computed from, so index-aligned operations on the two frames cannot
# mismatch rows even if `data` does not carry a default 0..n-1 index.
labels = pd.DataFrame(labels, columns=mlb.classes_, index=data.index)

In [None]:
# Append the per-class label columns to the right of the existing columns.
data = pd.concat(objs=[data, labels], axis='columns')

In [None]:
# Preview the first five rows with the label columns attached.
data.head(n=5)

Unnamed: 0,general_category,Text,acc-phys,adap-org,alg-geom,ao-sci,astro-ph,atom-ph,bayes-an,chao-dyn,...,patt-sol,physics,plasm-ph,q-alg,q-bio,q-fin,quant-ph,solv-int,stat,supr-con
0,[hep-ph],calculation of prompt diphoton production cros...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"[math, cs]",sparsity-certifying graph decompositions we d...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,[physics],the evolution of the earth-moon system based o...,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,[math],a determinant of stirling cycle numbers counts...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,[math],from dyadic to in this paper we show how to...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Sampling

In [None]:
# Work on a 40% random subset of the full arXiv frame to reduce training time;
# the fixed seed makes the draw repeatable.
SAMPLE_FRACTION = 0.4
SAMPLE_SEED = 1
sample_data = data.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)

In [None]:
# The eleven general categories kept as classification targets. Declared once
# so the column selection and the row filter below cannot drift apart.
TARGET_CATEGORIES = ['astro-ph', 'cond-mat', 'cs', 'gr-qc', 'hep-ph', 'hep-th',
                     'math', 'math-ph', 'physics', 'quant-ph', 'stat']

# Number of target categories each sampled paper is tagged with.
target_hits = sample_data[TARGET_CATEGORIES].sum(axis=1)

# Keep the text plus the target columns, and only the papers that carry at
# least one target category. No temporary helper column is added to the frame.
sample_data = sample_data.loc[target_hits > 0, ['Text'] + TARGET_CATEGORIES]

In [None]:
# Preview the first five rows of the filtered sample.
sample_data.head(n=5)

Unnamed: 0,Text,astro-ph,cond-mat,cs,gr-qc,hep-ph,hep-th,math,math-ph,physics,quant-ph,stat
1218012,"e-values: calibration, combination, and applic...",0,0,0,0,0,0,1,0,0,0,1
1338946,stochastic thermodynamics of system with conti...,0,1,0,0,0,0,0,0,0,0,0
1319133,chiral interface states and related quantized ...,0,1,0,0,0,0,0,0,0,0,0
1347104,low-frequency unsteadiness mechanisms in shock...,0,0,0,0,0,0,0,0,1,0,0
1686513,"b_{d,s}->rho, omega, k*, phi decay form factor...",0,0,0,0,1,0,0,0,0,0,0


In [None]:
# Write the filtered sample to disk, leaving the row index out of the file.
sample_data.to_csv(path_or_buf='SampledArxiv.csv', index=False)