In [1]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
!python -m pip install dask
!python -m pip install 'fsspec>=0.3.3'

Collecting fsspec>=0.3.3
[?25l  Downloading https://files.pythonhosted.org/packages/a5/8b/1df260f860f17cb08698170153ef7db672c497c1840dcc8613ce26a8a005/fsspec-0.8.4-py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 2.4MB/s 
[?25hInstalling collected packages: fsspec
Successfully installed fsspec-0.8.4


In [None]:
import numpy as np 
import pandas as pd
import dask.bag as db
import json

In [None]:
lines=db.read_text('/content/drive/My Drive/arXiv/metadata.json')
lines 

dask.bag<bag-from-delayed, npartitions=1>

In [None]:
records=lines.map(lambda x:json.loads(x))
records.take(4)

({'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with data from the Fermilab Tevatron, and predictions are made for\nmore detailed tests with CDF and DO data. Predictions are shown for\ndistributions of diphoton pairs produced at the energy of the Large Hadron\nCollider (LHC). Distributions of the diphoton pairs from the decay of a Higgs\nboson are contrasted with those produced from QCD processes at the LHC, showing\nthat enhanced sensitivity to the signal can be obtained with judicious\nselection 

In [None]:
records_count=records.count()
print("Number of Records in ArXiv Data is ",records_count.compute())

Number of Records in ArXiv Data is  1767485


In [None]:
ai_category_list=['cs.LG','cs.AI','cs.CV']
ai_docs = (records.filter(lambda x:any(ele in x['categories'] for ele in ai_category_list)==True))
print("Total Papers published in AI&ML ",ai_docs.count().compute())

Total Papers published in AI&ML  115955


In [None]:
ai_docs.take(1)

({'abstract': '  The intelligent acoustic emission locator is described in Part I, while Part\nII discusses blind source separation, time delay estimation and location of two\nsimultaneously active continuous acoustic emission sources.\n  The location of acoustic emission on complicated aircraft frame structures is\na difficult problem of non-destructive testing. This article describes an\nintelligent acoustic emission source locator. The intelligent locator comprises\na sensor antenna and a general regression neural network, which solves the\nlocation problem based on learning from examples. Locator performance was\ntested on different test specimens. Tests have shown that the accuracy of\nlocation depends on sound velocity and attenuation in the specimen, the\ndimensions of the tested area, and the properties of stored data. The location\naccuracy achieved by the intelligent locator is comparable to that obtained by\nthe conventional triangulation method, while the applicability of t

In [None]:
extract_latest_version_year=lambda x:x['versions'][-1]["created"].split(" ")[3]

In [None]:
ai_docs_by_year=ai_docs.map(extract_latest_version_year).frequencies().to_dataframe(columns=['submission_year','num_submissions']).compute()

In [None]:
df = df.filter(df.year>2018)

In [None]:
get_metadata = lambda x: {'id': x['id'].replace('\n',' '),
                  'title': x['title'].replace('\n',''),
                  'authors': x['authors'].replace('\n',' '),
                  'submitter' : x['submitter'],
                  'category':x['categories'].replace('\n',' '),
                 'version':x['versions'][-1]['created'].replace('\n',' '),
                 'date':x['versions'][-1]["created"],
                 'year':int(x['versions'][-1]["created"].split(" ")[3])}

In [None]:
docs = ai_docs.map(get_metadata)

In [None]:
data_after_2018=docs.filter(lambda x:x['year']>2018)

In [None]:
data_after_2018.count().compute()

61425

In [None]:
data_after_2018.take(1)

({'authors': 'Andrea Montanari, Federico Ricci-Tersenghi and Guilhem Semerjian',
  'category': 'cs.AI cond-mat.dis-nn cond-mat.stat-mech cs.CC',
  'date': 'Tue, 4 Jun 2019 11:43:45 GMT',
  'id': '0709.1667',
  'submitter': 'Federico Ricci-Tersenghi',
  'title': 'Solving Constraint Satisfaction Problems through Belief  Propagation-guided decimation',
  'version': 'Tue, 4 Jun 2019 11:43:45 GMT',
  'year': 2019},)

In [None]:
df = data_after_2018.to_dataframe().compute()

In [None]:
df.head()

Unnamed: 0,id,title,authors,submitter,category,version,date,year
0,709.1667,Solving Constraint Satisfaction Problems throu...,"Andrea Montanari, Federico Ricci-Tersenghi and...",Federico Ricci-Tersenghi,cs.AI cond-mat.dis-nn cond-mat.stat-mech cs.CC,"Tue, 4 Jun 2019 11:43:45 GMT","Tue, 4 Jun 2019 11:43:45 GMT",2019
1,804.4451,Dependence Structure Estimation via Copula,Jian Ma and Zengqi Sun,Jian Ma,cs.LG cs.IR stat.ME,"Sat, 7 Sep 2019 00:29:28 GMT","Sat, 7 Sep 2019 00:29:28 GMT",2019
2,811.2551,Modeling Cultural Dynamics,Liane Gabora,Liane Gabora,cs.MA cs.AI q-bio.NC,"Tue, 9 Jul 2019 20:25:22 GMT","Tue, 9 Jul 2019 20:25:22 GMT",2019
3,1001.1401,Incorporating characteristics of human creativ...,Steve DiPaola and Liane Gabora,Liane Gabora,cs.AI cs.NE q-bio.NC,"Tue, 9 Jul 2019 18:54:45 GMT","Tue, 9 Jul 2019 18:54:45 GMT",2019
4,1005.1518,Recognizability of Individual Creative Style W...,Liane Gabora,Liane Gabora,cs.AI,"Tue, 9 Jul 2019 19:56:00 GMT","Tue, 9 Jul 2019 19:56:00 GMT",2019


In [None]:
df?

In [None]:
df.dtypes

id                   object
title                object
authors              object
submitter            object
category             object
date         datetime64[ns]
year                  int64
dtype: object

In [None]:
len(df)

61425

In [None]:
 df['authors'] = df['authors'].str.replace("\\","").str.replace("and",",").str.split(",")

In [None]:
df['authors']=df['authors'].apply(lambda x: [y.strip() for y in x])

In [None]:
 df['category'] = df['category'].str.split(" ")

In [None]:
import datetime 

datetime.datetime.strptime('Mon Feb 15 2010', '%a %b %d %Y').strftime('%d/%m/%Y')
'15/02/2010'

'15/02/2010'

In [None]:
df['date'] = df['date'].str[5:16]
df['date'] =  pd.to_datetime(df['date'])

In [None]:
df.drop('version', axis=1, inplace=True)

In [None]:
df.head() 

Unnamed: 0,id,title,authors,submitter,category,date,year
0,709.1667,Solving Constraint Satisfaction Problems throu...,"[Andrea Montanari, Federico Ricci-Tersenghi, G...",Federico Ricci-Tersenghi,"[cs.AI, cond-mat.dis-nn, cond-mat.stat-mech, c...",2019-06-04,2019
1,804.4451,Dependence Structure Estimation via Copula,"[Jian Ma, Zengqi Sun]",Jian Ma,"[cs.LG, cs.IR, stat.ME]",2019-09-07,2019
2,811.2551,Modeling Cultural Dynamics,[Liane Gabora],Liane Gabora,"[cs.MA, cs.AI, q-bio.NC]",2019-07-09,2019
3,1001.1401,Incorporating characteristics of human creativ...,"[Steve DiPaola, Liane Gabora]",Liane Gabora,"[cs.AI, cs.NE, q-bio.NC]",2019-07-09,2019
4,1005.1518,Recognizability of Individual Creative Style W...,[Liane Gabora],Liane Gabora,[cs.AI],2019-07-09,2019


In [None]:
df.loc[df.id=='2005.14223']

Unnamed: 0,id,title,authors,submitter,category,date,year
48041,2005.14223,Empathic AI Painter: A Computational Creativit...,"[Ozge Nilay Yalcin, Nouf Abukhodair, Steve Di...","\""Ozge Yal\c{c}{\i}n Nilay","[cs.AI, cs.HC]",2020-05-28,2020


In [None]:
#df.to_dict('records')

In [None]:
similarities_transformers = pd.read_csv('/content/drive/My Drive/arXiv/transformers_edges.csv',dtype='object')

In [None]:
similarities_transformers.dtypes

Weight    object
Source    object
Target    object
dtype: object

In [None]:
citations = pd.read_csv('/content/drive/My Drive/arXiv/citations_network_edges.csv', dtype='object')

In [None]:
#target_cited_by_source

In [None]:
citations.dtypes

Target    object
Source    object
dtype: object

In [None]:
citations.Target.value_counts()

1706.06083    1282
1912.01703    1106
1509.02971    1071
1801.04381    1055
1703.10593     804
              ... 
1901.00520       1
1907.08088       1
2006.03695       1
2008.03989       1
1908.00877       1
Name: Target, Length: 47267, dtype: int64

In [None]:
topics = pd.read_csv('/content/drive/My Drive/arXiv/extracted_topics.csv')

In [None]:
topics.head()

Unnamed: 0,id,Dominant_Topic,title
0,709.1667,20,Solving Constraint Satisfaction Problems throu...
1,804.4451,54,Dependence Structure Estimation via Copula
2,811.2551,37,Modeling Cultural Dynamics
3,1001.1401,26,Incorporating characteristics of human creativ...
4,1005.1518,26,Recognizability of Individual Creative Style W...


In [None]:
main = df.merge(topics, on='id')

In [None]:
citations.columns

Index(['Target', 'Source'], dtype='object')

In [None]:
citations.dtypes

Target    object
Source    object
dtype: object

In [None]:
citations.head(10)

Unnamed: 0,Target,Source
0,2002.03494,1905.04241
1,2002.03494,1805.09901
2,2002.03494,1901.09749
3,2002.03494,1909.03977
4,2002.06205,2002.07696
5,2002.06205,2004.07126
6,2002.06205,1606.08415
7,2002.06205,1908.05161
8,2002.06205,2002.02315
9,2002.04147,1805.08657


In [None]:
%%time
citations_as_list = citations.groupby('Target')['Source'].apply(list).reset_index(name='cited_by')

CPU times: user 1.31 s, sys: 9.71 ms, total: 1.32 s
Wall time: 1.32 s


In [None]:
citations_as_list.columns

Index(['Target', 'cited_by'], dtype='object')

In [None]:
len(df) - len(citations_as_list)

14158

In [None]:
pipe_df = main.merge(citations_as_list, how='left', left_on='id', right_on='Target').drop(['Target','title_y'], axis=1)

In [None]:
similarities_transformers.columns

Index(['Weight', 'Source', 'Target'], dtype='object')

In [None]:
%%time
similarities_as_list = similarities_transformers.groupby('Source')['Target'].apply(list).reset_index(name='similar_to')

CPU times: user 795 ms, sys: 4.44 ms, total: 800 ms
Wall time: 803 ms


In [None]:
similarities_as_list.head()

Unnamed: 0,Source,similar_to
0,1106.0665,"[2002.04090, 1912.10600, 1909.12238, 2008.0238..."
1,1106.0666,"[2006.11274, 1905.09710]"
2,1107.2699,"[1912.05539, 1911.12410, 2006.08831, 1905.0412..."
3,1202.0515,[1910.06893]
4,1207.3772,"[1910.04371, 1912.03927]"


In [None]:
pipe_df_2 = pipe_df.merge(similarities_as_list, how='left', left_on='id', right_on='Source').drop(['Source'], axis=1)

In [None]:
len(df)-len(similarities_as_list)

32446

In [None]:
pipe_df_2.isna().sum()

id                    0
title_x               0
authors               0
submitter             0
category              0
date                  0
year                  0
Dominant_Topic        0
cited_by          14158
similar_to        32446
dtype: int64

In [None]:
pipe_df_2.rename({'title_x':'title'}, axis=1, inplace=True)

In [None]:
pipe_df_2.head()

Unnamed: 0,id,title,authors,submitter,category,date,year,Dominant_Topic,cited_by,similar_to
0,709.1667,Solving Constraint Satisfaction Problems throu...,"[Andrea Montanari, Federico Ricci-Tersenghi, G...",Federico Ricci-Tersenghi,"[cs.AI, cond-mat.dis-nn, cond-mat.stat-mech, c...",2019-06-04,2019,20,[1903.01969],
1,804.4451,Dependence Structure Estimation via Copula,"[Jian Ma, Zengqi Sun]",Jian Ma,"[cs.LG, cs.IR, stat.ME]",2019-09-07,2019,54,[2005.14025],
2,811.2551,Modeling Cultural Dynamics,[Liane Gabora],Liane Gabora,"[cs.MA, cs.AI, q-bio.NC]",2019-07-09,2019,37,"[1310.3781, 1308.5032, 1309.7407]",
3,1001.1401,Incorporating characteristics of human creativ...,"[Steve DiPaola, Liane Gabora]",Liane Gabora,"[cs.AI, cs.NE, q-bio.NC]",2019-07-09,2019,26,"[1610.02478, 2005.14223, 1309.7407, 1308.5032,...",
4,1005.1518,Recognizability of Individual Creative Style W...,[Liane Gabora],Liane Gabora,[cs.AI],2019-07-09,2019,26,,


In [None]:
pipe_df_2['date'] = df['date'].astype(str)

In [None]:
main_dict = pipe_df_2.to_dict('records')

In [None]:
json_object = json.dumps(main_dict)   

In [None]:
with open("/content/drive/My Drive/arXiv/data.json", "w") as outfile:  
    json.dump(main_dict, outfile) 

In [None]:
print(json.dumps(main_dict[25], indent=2))

{
  "id": "1309.0213",
  "title": "Learning to Rank for Blind Image Quality Assessment",
  "authors": [
    "Fei Gao",
    "Dacheng Tao",
    "Xinbo Gao",
    "Xuelong Li"
  ],
  "submitter": "Fei Gao",
  "category": [
    "cs.CV"
  ],
  "date": "2019-04-24",
  "year": 2019,
  "Dominant_Topic": 36,
  "cited_by": [
    "2005.13983",
    "1904.08632",
    "1902.06285",
    "1904.06505",
    "1911.10566",
    "1907.00516"
  ],
  "similar_to": [
    "1904.06505",
    "2006.14780",
    "1906.10169",
    "2005.13983"
  ]
}


In [2]:

import json

with open('/content/drive/My Drive/arXiv/data.json') as f:
  data = json.load(f)


In [9]:
def cleanNullTerms(d):
   return {
      k:v
      for k, v in d.items()
      if v is not None
   }


for d in data:
  cleanNullTerms(d)

In [12]:
import numpy as np 

res = {k:v for k,v in data.items() if v is not np.nan}

AttributeError: ignored