# Data Gathering

In [1]:
# We'll make use of DBLP dataset namely DBLP-Citation-network V12 from https://www.aminer.org/citation

# Steps:
# 1. Unzip dblp.v12.7z -> dblp.v12.json 
# 2. Convert json array content into one item per line
# jq -cn --stream "fromstream(1|truncate_stream(inputs))" dblp.v12.json > dblp.v12.jsonl
# 3. Split jsonl file into smaller chunks
# split -l 10000 dblp.v12.jsonl
# 4. Loop through chunk files and read data into dataframe

In [2]:
# import required dependencies
import json
import glob
import ast
import re

import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('display.float_format', lambda x: '%.2f' % x)
from tqdm.notebook import tqdm
tqdm.pandas()

In [3]:
CHUNK_DATA_PATH = "data/dblp/chunks"
CHUNK_DATA_PATH

'data/dblp/chunks'

In [4]:
chunk_files = glob.glob(f"{CHUNK_DATA_PATH}/*")
len(chunk_files)

490

In [5]:
chunk_files[:10]

['data/dblp/chunks/pc',
 'data/dblp/chunks/sl',
 'data/dblp/chunks/pd',
 'data/dblp/chunks/sk',
 'data/dblp/chunks/sb',
 'data/dblp/chunks/pm',
 'data/dblp/chunks/se',
 'data/dblp/chunks/pj',
 'data/dblp/chunks/px',
 'data/dblp/chunks/sp']

In [6]:
sample_df = None

with open('data/dblp/chunks/pc') as f:
    sample_df = pd.DataFrame(json.loads(line) for line in f)

sample_df

Unnamed: 0,id,authors,title,year,n_citation,page_start,page_end,doc_type,publisher,volume,issue,doi,references,indexed_abstract,fos,venue
0,2873507692,"[{'name': 'Ioannis Delis', 'org': 'Department ...",Characterization of whole-body muscle activity...,2018,0,44,,Conference,ACM,,,10.1145/3200947.3201006,"[1973012362, 1975079024, 2008605701, 206338057...","{'IndexLength': 182, 'InvertedIndex': {'role.'...","[{'name': 'Muscle activity', 'w': 0}, {'name':...",{'raw': 'Hellenic Conference on Artificial Int...
1,2873533962,"[{'name': 'Ferdi Tekce', 'org': 'Elektronik ve...",The effect of codebook design on the BER perfo...,2018,0,1,4,Conference,IEEE,,,10.1109/siu.2018.8404779,"[2008104692, 2042519026, 2054692642, 211076718...","{'IndexLength': 159, 'InvertedIndex': {'been':...","[{'name': 'Computer vision', 'w': 0.41825}, {'...",{'raw': 'Signal Processing and Communications ...
2,2873535434,"[{'name': 'Li Cheng', 'org': 'School of Electr...",An Optimized Infrared Detection Strategy for D...,2018,0,38137,38146,Journal,Institute of Electrical and Electronics Engine...,6,,10.1109/access.2018.2854221,[2793612255],"{'IndexLength': 195, 'InvertedIndex': {'been':...","[{'name': 'Heat flux', 'w': 0.45008}, {'name':...","{'raw': 'IEEE Access', 'id': 2485537415, 'type..."
3,2873541924,"[{'name': 'Dov Danon', 'id': 2230089554}, {'na...",Unsupervised Natural Image Patch Learning,2018,0,,,Repository,,,,,"[2078790577, 2550791380, 2950187998]","{'IndexLength': 171, 'InvertedIndex': {'Clearl...","[{'name': 'Convergence (routing)', 'w': 0.4440...",{'raw': 'arXiv: Computer Vision and Pattern Re...
4,2873550248,"[{'name': 'Qijing Yu', 'org': 'Psychology Depa...",Age-associated increase in mnemonic strategy u...,2018,2,162,169,Journal,Academic Press,181,,10.1016/j.neuroimage.2018.07.008,"[2049056222, 2101135654, 2783148479]","{'IndexLength': 219, 'InvertedIndex': {'(PFC),...","[{'name': 'Semantic clustering', 'w': 0}, {'na...","{'raw': 'NeuroImage', 'id': 103225281, 'type':..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2884790315,"[{'name': 'Fabio G. Guerrero', 'org': 'School ...",An interactive approach for illustrating a pro...,2018,0,2282,2293,Journal,"John Wiley & Sons, Ltd",26,6,10.1002/cae.22041,"[1995875735, 1998324813, 2024400985, 203890527...",,"[{'name': 'Information theory', 'w': 0.49065},...",{'raw': 'Computer Applications in Engineering ...
9996,2884790510,"[{'name': 'Manish Sharma', 'org': 'Department ...",An automated diagnosis of depression using thr...,2018,18,508,520,Journal,Elsevier BV,52,,10.1016/j.cogsys.2018.07.010,"[1596717185, 2027927824, 2087347434, 209903329...","{'IndexLength': 279, 'InvertedIndex': {'optima...","[{'name': 'Orthogonal wavelet', 'w': 0.48532},...","{'raw': 'Cognitive Systems Research', 'id': 96..."
9997,2884790907,"[{'name': 'Ryan Aldrich', 'id': 2884488651}, {...",Low-Cost Radar for Object Tracking in Autonomo...,2018,3,1,5,Conference,IEEE,,,10.1109/vtcspring.2018.8417751,,"{'IndexLength': 168, 'InvertedIndex': {'distor...","[{'name': 'Radar engineering details', 'w': 0....","{'raw': 'Vehicular Technology Conference', 'id..."
9998,2884791133,"[{'name': 'Xiaoping Sun', 'org': 'Laboratory o...",Summarization of Scientific Paper Through Rein...,2018,6,40611,40625,Journal,Institute of Electrical and Electronics Engine...,6,,10.1109/access.2018.2856530,"[850029481, 1525595230, 1972028996, 1972719750...","{'IndexLength': 255, 'InvertedIndex': {'Top-k'...","[{'name': 'Information system', 'w': 0.46617},...","{'raw': 'IEEE Access', 'id': 2485537415, 'type..."


In [7]:
sample_df.filter(['id', 'title', 'authors', 'n_citation', 'year', 'doc_type', 'publisher', 'venue', 'references'])

Unnamed: 0,id,title,authors,n_citation,year,doc_type,publisher,venue,references
0,2873507692,Characterization of whole-body muscle activity...,"[{'name': 'Ioannis Delis', 'org': 'Department ...",0,2018,Conference,ACM,{'raw': 'Hellenic Conference on Artificial Int...,"[1973012362, 1975079024, 2008605701, 206338057..."
1,2873533962,The effect of codebook design on the BER perfo...,"[{'name': 'Ferdi Tekce', 'org': 'Elektronik ve...",0,2018,Conference,IEEE,{'raw': 'Signal Processing and Communications ...,"[2008104692, 2042519026, 2054692642, 211076718..."
2,2873535434,An Optimized Infrared Detection Strategy for D...,"[{'name': 'Li Cheng', 'org': 'School of Electr...",0,2018,Journal,Institute of Electrical and Electronics Engine...,"{'raw': 'IEEE Access', 'id': 2485537415, 'type...",[2793612255]
3,2873541924,Unsupervised Natural Image Patch Learning,"[{'name': 'Dov Danon', 'id': 2230089554}, {'na...",0,2018,Repository,,{'raw': 'arXiv: Computer Vision and Pattern Re...,"[2078790577, 2550791380, 2950187998]"
4,2873550248,Age-associated increase in mnemonic strategy u...,"[{'name': 'Qijing Yu', 'org': 'Psychology Depa...",2,2018,Journal,Academic Press,"{'raw': 'NeuroImage', 'id': 103225281, 'type':...","[2049056222, 2101135654, 2783148479]"
...,...,...,...,...,...,...,...,...,...
9995,2884790315,An interactive approach for illustrating a pro...,"[{'name': 'Fabio G. Guerrero', 'org': 'School ...",0,2018,Journal,"John Wiley & Sons, Ltd",{'raw': 'Computer Applications in Engineering ...,"[1995875735, 1998324813, 2024400985, 203890527..."
9996,2884790510,An automated diagnosis of depression using thr...,"[{'name': 'Manish Sharma', 'org': 'Department ...",18,2018,Journal,Elsevier BV,"{'raw': 'Cognitive Systems Research', 'id': 96...","[1596717185, 2027927824, 2087347434, 209903329..."
9997,2884790907,Low-Cost Radar for Object Tracking in Autonomo...,"[{'name': 'Ryan Aldrich', 'id': 2884488651}, {...",3,2018,Conference,IEEE,"{'raw': 'Vehicular Technology Conference', 'id...",
9998,2884791133,Summarization of Scientific Paper Through Rein...,"[{'name': 'Xiaoping Sun', 'org': 'Laboratory o...",6,2018,Journal,Institute of Electrical and Electronics Engine...,"{'raw': 'IEEE Access', 'id': 2485537415, 'type...","[850029481, 1525595230, 1972028996, 1972719750..."


In [8]:
# Create utility function to generate dataset and keep relevant columns

def get_df(files):
    """
    Generate dataframe for list
    of files
    """
    frames = []

    for file in tqdm(files):

        df = None
        with open(file) as f:
            df = pd.DataFrame(json.loads(line) for line in f).filter(
                [
                    'id', 
                    'title', 
                    'authors', 
                    'n_citation', 
                    'year',
                    'doc_type', 
                    'publisher',
                    'venue', 
                    'references'
                ]
            )

        frames.append(df)
        
    return pd.concat(frames)

In [9]:
df = get_df(chunk_files)
df

  0%|          | 0/490 [00:00<?, ?it/s]

Unnamed: 0,id,title,authors,n_citation,year,doc_type,publisher,venue,references
0,2873507692,Characterization of whole-body muscle activity...,"[{'name': 'Ioannis Delis', 'org': 'Department ...",0,2018,Conference,ACM,{'raw': 'Hellenic Conference on Artificial Int...,"[1973012362, 1975079024, 2008605701, 206338057..."
1,2873533962,The effect of codebook design on the BER perfo...,"[{'name': 'Ferdi Tekce', 'org': 'Elektronik ve...",0,2018,Conference,IEEE,{'raw': 'Signal Processing and Communications ...,"[2008104692, 2042519026, 2054692642, 211076718..."
2,2873535434,An Optimized Infrared Detection Strategy for D...,"[{'name': 'Li Cheng', 'org': 'School of Electr...",0,2018,Journal,Institute of Electrical and Electronics Engine...,"{'raw': 'IEEE Access', 'id': 2485537415, 'type...",[2793612255]
3,2873541924,Unsupervised Natural Image Patch Learning,"[{'name': 'Dov Danon', 'id': 2230089554}, {'na...",0,2018,Repository,,{'raw': 'arXiv: Computer Vision and Pattern Re...,"[2078790577, 2550791380, 2950187998]"
4,2873550248,Age-associated increase in mnemonic strategy u...,"[{'name': 'Qijing Yu', 'org': 'Psychology Depa...",2,2018,Journal,Academic Press,"{'raw': 'NeuroImage', 'id': 103225281, 'type':...","[2049056222, 2101135654, 2783148479]"
...,...,...,...,...,...,...,...,...,...
9995,2963831124,A Dynamic Window Neural Network for CCG Supert...,"[{'name': 'Huijia Wu', 'id': 2531734576}, {'na...",0,2016,Conference,,{'raw': 'National Conference on Artificial Int...,
9996,2963831130,Efficient implementation of quantum circuits w...,"[{'name': 'Stephen Brierley', 'org': 'DAMTP, C...",3,2017,Journal,"Rinton Press, Incorporated","{'raw': 'Quantum Information & Computation', '...",[2401457073]
9997,2963831170,A Dataset for Building Code-Mixed Goal Oriente...,"[{'name': 'Suman Banerjee', 'id': 2808453915},...",0,2018,Conference,,{'raw': 'International Conference on Computati...,
9998,2963831230,Modal operators and toric ideals,"[{'name': 'Riccardo Camerlo', 'org': 'Dipartim...",0,2019,Journal,Oxford University Press (OUP),"{'raw': 'Journal of Logic and Computation', 'i...",[2610670723]


In [10]:
# Replace NaN withe empty strings

df.fillna("", inplace=True)
df

Unnamed: 0,id,title,authors,n_citation,year,doc_type,publisher,venue,references
0,2873507692,Characterization of whole-body muscle activity...,"[{'name': 'Ioannis Delis', 'org': 'Department ...",0,2018,Conference,ACM,{'raw': 'Hellenic Conference on Artificial Int...,"[1973012362, 1975079024, 2008605701, 206338057..."
1,2873533962,The effect of codebook design on the BER perfo...,"[{'name': 'Ferdi Tekce', 'org': 'Elektronik ve...",0,2018,Conference,IEEE,{'raw': 'Signal Processing and Communications ...,"[2008104692, 2042519026, 2054692642, 211076718..."
2,2873535434,An Optimized Infrared Detection Strategy for D...,"[{'name': 'Li Cheng', 'org': 'School of Electr...",0,2018,Journal,Institute of Electrical and Electronics Engine...,"{'raw': 'IEEE Access', 'id': 2485537415, 'type...",[2793612255]
3,2873541924,Unsupervised Natural Image Patch Learning,"[{'name': 'Dov Danon', 'id': 2230089554}, {'na...",0,2018,Repository,,{'raw': 'arXiv: Computer Vision and Pattern Re...,"[2078790577, 2550791380, 2950187998]"
4,2873550248,Age-associated increase in mnemonic strategy u...,"[{'name': 'Qijing Yu', 'org': 'Psychology Depa...",2,2018,Journal,Academic Press,"{'raw': 'NeuroImage', 'id': 103225281, 'type':...","[2049056222, 2101135654, 2783148479]"
...,...,...,...,...,...,...,...,...,...
9995,2963831124,A Dynamic Window Neural Network for CCG Supert...,"[{'name': 'Huijia Wu', 'id': 2531734576}, {'na...",0,2016,Conference,,{'raw': 'National Conference on Artificial Int...,
9996,2963831130,Efficient implementation of quantum circuits w...,"[{'name': 'Stephen Brierley', 'org': 'DAMTP, C...",3,2017,Journal,"Rinton Press, Incorporated","{'raw': 'Quantum Information & Computation', '...",[2401457073]
9997,2963831170,A Dataset for Building Code-Mixed Goal Oriente...,"[{'name': 'Suman Banerjee', 'id': 2808453915},...",0,2018,Conference,,{'raw': 'International Conference on Computati...,
9998,2963831230,Modal operators and toric ideals,"[{'name': 'Riccardo Camerlo', 'org': 'Dipartim...",0,2019,Journal,Oxford University Press (OUP),"{'raw': 'Journal of Logic and Computation', 'i...",[2610670723]


In [11]:
def clean_title(title):
    """
    Clean title
    
    :param title: input title
    :return: clean title
    """
    charsToMatch = "[${}\r^]"
    
    words = title.split(" ")
    words = [
        word for word in words
            if not any([char in word for char in charsToMatch])
    ]
    return " ".join(words)

In [12]:
sample_title = '${\rm H}_{infty}$ Output Tracking Control of Discrete-Time Nonlinear Systems via Standard Neural Network Models'
sample_title

'${\rm H}_{infty}$ Output Tracking Control of Discrete-Time Nonlinear Systems via Standard Neural Network Models'

In [13]:
clean_title(sample_title)

'Output Tracking Control of Discrete-Time Nonlinear Systems via Standard Neural Network Models'

In [14]:
sample_title = 'Fully Integrated On-Chip Coil in 0.13 $mu {\rm m}$ CMOS for Wireless Power Transfer Through Biological Media'
sample_title

'Fully Integrated On-Chip Coil in 0.13 $mu {\rm m}$ CMOS for Wireless Power Transfer Through Biological Media'

In [15]:
clean_title(sample_title)

'Fully Integrated On-Chip Coil in 0.13 CMOS for Wireless Power Transfer Through Biological Media'

In [16]:
sample_title = 'Construction of cyclic DNA codes over the Ring $\x1a_4[u]/langle u^2-1 \rangle $ Based on the deletion distance'
sample_title

'Construction of cyclic DNA codes over the Ring $\x1a_4[u]/langle u^2-1 \rangle $ Based on the deletion distance'

In [17]:
clean_title(sample_title)

'Construction of cyclic DNA codes over the Ring Based on the deletion distance'

In [18]:
df['clean_title'] = df['title'].progress_apply(lambda x: clean_title(x))
df

  0%|          | 0/4894081 [00:00<?, ?it/s]

Unnamed: 0,id,title,authors,n_citation,year,doc_type,publisher,venue,references,clean_title
0,2873507692,Characterization of whole-body muscle activity...,"[{'name': 'Ioannis Delis', 'org': 'Department ...",0,2018,Conference,ACM,{'raw': 'Hellenic Conference on Artificial Int...,"[1973012362, 1975079024, 2008605701, 206338057...",Characterization of whole-body muscle activity...
1,2873533962,The effect of codebook design on the BER perfo...,"[{'name': 'Ferdi Tekce', 'org': 'Elektronik ve...",0,2018,Conference,IEEE,{'raw': 'Signal Processing and Communications ...,"[2008104692, 2042519026, 2054692642, 211076718...",The effect of codebook design on the BER perfo...
2,2873535434,An Optimized Infrared Detection Strategy for D...,"[{'name': 'Li Cheng', 'org': 'School of Electr...",0,2018,Journal,Institute of Electrical and Electronics Engine...,"{'raw': 'IEEE Access', 'id': 2485537415, 'type...",[2793612255],An Optimized Infrared Detection Strategy for D...
3,2873541924,Unsupervised Natural Image Patch Learning,"[{'name': 'Dov Danon', 'id': 2230089554}, {'na...",0,2018,Repository,,{'raw': 'arXiv: Computer Vision and Pattern Re...,"[2078790577, 2550791380, 2950187998]",Unsupervised Natural Image Patch Learning
4,2873550248,Age-associated increase in mnemonic strategy u...,"[{'name': 'Qijing Yu', 'org': 'Psychology Depa...",2,2018,Journal,Academic Press,"{'raw': 'NeuroImage', 'id': 103225281, 'type':...","[2049056222, 2101135654, 2783148479]",Age-associated increase in mnemonic strategy u...
...,...,...,...,...,...,...,...,...,...,...
9995,2963831124,A Dynamic Window Neural Network for CCG Supert...,"[{'name': 'Huijia Wu', 'id': 2531734576}, {'na...",0,2016,Conference,,{'raw': 'National Conference on Artificial Int...,,A Dynamic Window Neural Network for CCG Supert...
9996,2963831130,Efficient implementation of quantum circuits w...,"[{'name': 'Stephen Brierley', 'org': 'DAMTP, C...",3,2017,Journal,"Rinton Press, Incorporated","{'raw': 'Quantum Information & Computation', '...",[2401457073],Efficient implementation of quantum circuits w...
9997,2963831170,A Dataset for Building Code-Mixed Goal Oriente...,"[{'name': 'Suman Banerjee', 'id': 2808453915},...",0,2018,Conference,,{'raw': 'International Conference on Computati...,,A Dataset for Building Code-Mixed Goal Oriente...
9998,2963831230,Modal operators and toric ideals,"[{'name': 'Riccardo Camerlo', 'org': 'Dipartim...",0,2019,Journal,Oxford University Press (OUP),"{'raw': 'Journal of Logic and Computation', 'i...",[2610670723],Modal operators and toric ideals


In [19]:
df['n_citation'].value_counts()

0       1356051
1        556543
2        391241
3        295222
4        232290
         ...   
4338          1
6666          1
2716          1
2850          1
3637          1
Name: n_citation, Length: 3071, dtype: int64

In [20]:
df['n_citation'].describe()

count   4894081.00
mean         16.80
std         115.82
min           0.00
25%           0.00
50%           3.00
75%          11.00
max       48327.00
Name: n_citation, dtype: float64

In [21]:
df[df['n_citation'].isnull()]

Unnamed: 0,id,title,authors,n_citation,year,doc_type,publisher,venue,references,clean_title


In [22]:
df

Unnamed: 0,id,title,authors,n_citation,year,doc_type,publisher,venue,references,clean_title
0,2873507692,Characterization of whole-body muscle activity...,"[{'name': 'Ioannis Delis', 'org': 'Department ...",0,2018,Conference,ACM,{'raw': 'Hellenic Conference on Artificial Int...,"[1973012362, 1975079024, 2008605701, 206338057...",Characterization of whole-body muscle activity...
1,2873533962,The effect of codebook design on the BER perfo...,"[{'name': 'Ferdi Tekce', 'org': 'Elektronik ve...",0,2018,Conference,IEEE,{'raw': 'Signal Processing and Communications ...,"[2008104692, 2042519026, 2054692642, 211076718...",The effect of codebook design on the BER perfo...
2,2873535434,An Optimized Infrared Detection Strategy for D...,"[{'name': 'Li Cheng', 'org': 'School of Electr...",0,2018,Journal,Institute of Electrical and Electronics Engine...,"{'raw': 'IEEE Access', 'id': 2485537415, 'type...",[2793612255],An Optimized Infrared Detection Strategy for D...
3,2873541924,Unsupervised Natural Image Patch Learning,"[{'name': 'Dov Danon', 'id': 2230089554}, {'na...",0,2018,Repository,,{'raw': 'arXiv: Computer Vision and Pattern Re...,"[2078790577, 2550791380, 2950187998]",Unsupervised Natural Image Patch Learning
4,2873550248,Age-associated increase in mnemonic strategy u...,"[{'name': 'Qijing Yu', 'org': 'Psychology Depa...",2,2018,Journal,Academic Press,"{'raw': 'NeuroImage', 'id': 103225281, 'type':...","[2049056222, 2101135654, 2783148479]",Age-associated increase in mnemonic strategy u...
...,...,...,...,...,...,...,...,...,...,...
9995,2963831124,A Dynamic Window Neural Network for CCG Supert...,"[{'name': 'Huijia Wu', 'id': 2531734576}, {'na...",0,2016,Conference,,{'raw': 'National Conference on Artificial Int...,,A Dynamic Window Neural Network for CCG Supert...
9996,2963831130,Efficient implementation of quantum circuits w...,"[{'name': 'Stephen Brierley', 'org': 'DAMTP, C...",3,2017,Journal,"Rinton Press, Incorporated","{'raw': 'Quantum Information & Computation', '...",[2401457073],Efficient implementation of quantum circuits w...
9997,2963831170,A Dataset for Building Code-Mixed Goal Oriente...,"[{'name': 'Suman Banerjee', 'id': 2808453915},...",0,2018,Conference,,{'raw': 'International Conference on Computati...,,A Dataset for Building Code-Mixed Goal Oriente...
9998,2963831230,Modal operators and toric ideals,"[{'name': 'Riccardo Camerlo', 'org': 'Dipartim...",0,2019,Journal,Oxford University Press (OUP),"{'raw': 'Journal of Logic and Computation', 'i...",[2610670723],Modal operators and toric ideals


In [23]:
df = df[['id', 'clean_title', 'authors', 'n_citation', 'year', 'doc_type', 'publisher', 'venue', 'references']]
df

Unnamed: 0,id,clean_title,authors,n_citation,year,doc_type,publisher,venue,references
0,2873507692,Characterization of whole-body muscle activity...,"[{'name': 'Ioannis Delis', 'org': 'Department ...",0,2018,Conference,ACM,{'raw': 'Hellenic Conference on Artificial Int...,"[1973012362, 1975079024, 2008605701, 206338057..."
1,2873533962,The effect of codebook design on the BER perfo...,"[{'name': 'Ferdi Tekce', 'org': 'Elektronik ve...",0,2018,Conference,IEEE,{'raw': 'Signal Processing and Communications ...,"[2008104692, 2042519026, 2054692642, 211076718..."
2,2873535434,An Optimized Infrared Detection Strategy for D...,"[{'name': 'Li Cheng', 'org': 'School of Electr...",0,2018,Journal,Institute of Electrical and Electronics Engine...,"{'raw': 'IEEE Access', 'id': 2485537415, 'type...",[2793612255]
3,2873541924,Unsupervised Natural Image Patch Learning,"[{'name': 'Dov Danon', 'id': 2230089554}, {'na...",0,2018,Repository,,{'raw': 'arXiv: Computer Vision and Pattern Re...,"[2078790577, 2550791380, 2950187998]"
4,2873550248,Age-associated increase in mnemonic strategy u...,"[{'name': 'Qijing Yu', 'org': 'Psychology Depa...",2,2018,Journal,Academic Press,"{'raw': 'NeuroImage', 'id': 103225281, 'type':...","[2049056222, 2101135654, 2783148479]"
...,...,...,...,...,...,...,...,...,...
9995,2963831124,A Dynamic Window Neural Network for CCG Supert...,"[{'name': 'Huijia Wu', 'id': 2531734576}, {'na...",0,2016,Conference,,{'raw': 'National Conference on Artificial Int...,
9996,2963831130,Efficient implementation of quantum circuits w...,"[{'name': 'Stephen Brierley', 'org': 'DAMTP, C...",3,2017,Journal,"Rinton Press, Incorporated","{'raw': 'Quantum Information & Computation', '...",[2401457073]
9997,2963831170,A Dataset for Building Code-Mixed Goal Oriente...,"[{'name': 'Suman Banerjee', 'id': 2808453915},...",0,2018,Conference,,{'raw': 'International Conference on Computati...,
9998,2963831230,Modal operators and toric ideals,"[{'name': 'Riccardo Camerlo', 'org': 'Dipartim...",0,2019,Journal,Oxford University Press (OUP),"{'raw': 'Journal of Logic and Computation', 'i...",[2610670723]


In [24]:
df.rename(columns={'clean_title': 'title'}, inplace=True)
df

Unnamed: 0,id,title,authors,n_citation,year,doc_type,publisher,venue,references
0,2873507692,Characterization of whole-body muscle activity...,"[{'name': 'Ioannis Delis', 'org': 'Department ...",0,2018,Conference,ACM,{'raw': 'Hellenic Conference on Artificial Int...,"[1973012362, 1975079024, 2008605701, 206338057..."
1,2873533962,The effect of codebook design on the BER perfo...,"[{'name': 'Ferdi Tekce', 'org': 'Elektronik ve...",0,2018,Conference,IEEE,{'raw': 'Signal Processing and Communications ...,"[2008104692, 2042519026, 2054692642, 211076718..."
2,2873535434,An Optimized Infrared Detection Strategy for D...,"[{'name': 'Li Cheng', 'org': 'School of Electr...",0,2018,Journal,Institute of Electrical and Electronics Engine...,"{'raw': 'IEEE Access', 'id': 2485537415, 'type...",[2793612255]
3,2873541924,Unsupervised Natural Image Patch Learning,"[{'name': 'Dov Danon', 'id': 2230089554}, {'na...",0,2018,Repository,,{'raw': 'arXiv: Computer Vision and Pattern Re...,"[2078790577, 2550791380, 2950187998]"
4,2873550248,Age-associated increase in mnemonic strategy u...,"[{'name': 'Qijing Yu', 'org': 'Psychology Depa...",2,2018,Journal,Academic Press,"{'raw': 'NeuroImage', 'id': 103225281, 'type':...","[2049056222, 2101135654, 2783148479]"
...,...,...,...,...,...,...,...,...,...
9995,2963831124,A Dynamic Window Neural Network for CCG Supert...,"[{'name': 'Huijia Wu', 'id': 2531734576}, {'na...",0,2016,Conference,,{'raw': 'National Conference on Artificial Int...,
9996,2963831130,Efficient implementation of quantum circuits w...,"[{'name': 'Stephen Brierley', 'org': 'DAMTP, C...",3,2017,Journal,"Rinton Press, Incorporated","{'raw': 'Quantum Information & Computation', '...",[2401457073]
9997,2963831170,A Dataset for Building Code-Mixed Goal Oriente...,"[{'name': 'Suman Banerjee', 'id': 2808453915},...",0,2018,Conference,,{'raw': 'International Conference on Computati...,
9998,2963831230,Modal operators and toric ideals,"[{'name': 'Riccardo Camerlo', 'org': 'Dipartim...",0,2019,Journal,Oxford University Press (OUP),"{'raw': 'Journal of Logic and Computation', 'i...",[2610670723]


# Preprocessing

In [25]:
df[df['venue'].isnull()]

Unnamed: 0,id,title,authors,n_citation,year,doc_type,publisher,venue,references


In [26]:
# Check 'doc_type'

df['doc_type'].value_counts()

Conference     2245013
Journal        1919348
                499441
Repository      214762
Book             12823
BookChapter       1614
Patent            1062
Dataset             18
Name: doc_type, dtype: int64

In [27]:
# Check 'venue' sample

df['venue'].iloc[0]

{'raw': 'Hellenic Conference on Artificial Intelligence',
 'id': 1124396182,
 'type': 'C'}

In [28]:
# Create utility function to extract venue id and name

def extract_venue_id(data):
    """
    Extract venue id
    """
    venue_id = ""
    
    if data:
        
        if 'id' in data:
            venue_id = data["id"]
    
    return venue_id

def extract_venue_string(data):
    """
    Extract venue string
    """
    venue_str = ""
    
    if data:
        
        if 'raw' in data:
            venue_str = data["raw"]
    
    return venue_str

In [29]:
extract_venue_id(df['venue'].iloc[0])

1124396182

In [30]:
extract_venue_string(df['venue'].iloc[0])

'Hellenic Conference on Artificial Intelligence'

In [31]:
df['venue_id'] = df['venue'].progress_apply(lambda x: extract_venue_id(x))
df

  0%|          | 0/4894081 [00:00<?, ?it/s]

Unnamed: 0,id,title,authors,n_citation,year,doc_type,publisher,venue,references,venue_id
0,2873507692,Characterization of whole-body muscle activity...,"[{'name': 'Ioannis Delis', 'org': 'Department ...",0,2018,Conference,ACM,{'raw': 'Hellenic Conference on Artificial Int...,"[1973012362, 1975079024, 2008605701, 206338057...",1124396182
1,2873533962,The effect of codebook design on the BER perfo...,"[{'name': 'Ferdi Tekce', 'org': 'Elektronik ve...",0,2018,Conference,IEEE,{'raw': 'Signal Processing and Communications ...,"[2008104692, 2042519026, 2054692642, 211076718...",2735422400
2,2873535434,An Optimized Infrared Detection Strategy for D...,"[{'name': 'Li Cheng', 'org': 'School of Electr...",0,2018,Journal,Institute of Electrical and Electronics Engine...,"{'raw': 'IEEE Access', 'id': 2485537415, 'type...",[2793612255],2485537415
3,2873541924,Unsupervised Natural Image Patch Learning,"[{'name': 'Dov Danon', 'id': 2230089554}, {'na...",0,2018,Repository,,{'raw': 'arXiv: Computer Vision and Pattern Re...,"[2078790577, 2550791380, 2950187998]",2597175965
4,2873550248,Age-associated increase in mnemonic strategy u...,"[{'name': 'Qijing Yu', 'org': 'Psychology Depa...",2,2018,Journal,Academic Press,"{'raw': 'NeuroImage', 'id': 103225281, 'type':...","[2049056222, 2101135654, 2783148479]",103225281
...,...,...,...,...,...,...,...,...,...,...
9995,2963831124,A Dynamic Window Neural Network for CCG Supert...,"[{'name': 'Huijia Wu', 'id': 2531734576}, {'na...",0,2016,Conference,,{'raw': 'National Conference on Artificial Int...,,1184914352
9996,2963831130,Efficient implementation of quantum circuits w...,"[{'name': 'Stephen Brierley', 'org': 'DAMTP, C...",3,2017,Journal,"Rinton Press, Incorporated","{'raw': 'Quantum Information & Computation', '...",[2401457073],41034432
9997,2963831170,A Dataset for Building Code-Mixed Goal Oriente...,"[{'name': 'Suman Banerjee', 'id': 2808453915},...",0,2018,Conference,,{'raw': 'International Conference on Computati...,,1169674987
9998,2963831230,Modal operators and toric ideals,"[{'name': 'Riccardo Camerlo', 'org': 'Dipartim...",0,2019,Journal,Oxford University Press (OUP),"{'raw': 'Journal of Logic and Computation', 'i...",[2610670723],47057106


In [32]:
df['venue_str'] = df['venue'].progress_apply(lambda x: extract_venue_string(x))
df

  0%|          | 0/4894081 [00:00<?, ?it/s]

Unnamed: 0,id,title,authors,n_citation,year,doc_type,publisher,venue,references,venue_id,venue_str
0,2873507692,Characterization of whole-body muscle activity...,"[{'name': 'Ioannis Delis', 'org': 'Department ...",0,2018,Conference,ACM,{'raw': 'Hellenic Conference on Artificial Int...,"[1973012362, 1975079024, 2008605701, 206338057...",1124396182,Hellenic Conference on Artificial Intelligence
1,2873533962,The effect of codebook design on the BER perfo...,"[{'name': 'Ferdi Tekce', 'org': 'Elektronik ve...",0,2018,Conference,IEEE,{'raw': 'Signal Processing and Communications ...,"[2008104692, 2042519026, 2054692642, 211076718...",2735422400,Signal Processing and Communications Applicati...
2,2873535434,An Optimized Infrared Detection Strategy for D...,"[{'name': 'Li Cheng', 'org': 'School of Electr...",0,2018,Journal,Institute of Electrical and Electronics Engine...,"{'raw': 'IEEE Access', 'id': 2485537415, 'type...",[2793612255],2485537415,IEEE Access
3,2873541924,Unsupervised Natural Image Patch Learning,"[{'name': 'Dov Danon', 'id': 2230089554}, {'na...",0,2018,Repository,,{'raw': 'arXiv: Computer Vision and Pattern Re...,"[2078790577, 2550791380, 2950187998]",2597175965,arXiv: Computer Vision and Pattern Recognition
4,2873550248,Age-associated increase in mnemonic strategy u...,"[{'name': 'Qijing Yu', 'org': 'Psychology Depa...",2,2018,Journal,Academic Press,"{'raw': 'NeuroImage', 'id': 103225281, 'type':...","[2049056222, 2101135654, 2783148479]",103225281,NeuroImage
...,...,...,...,...,...,...,...,...,...,...,...
9995,2963831124,A Dynamic Window Neural Network for CCG Supert...,"[{'name': 'Huijia Wu', 'id': 2531734576}, {'na...",0,2016,Conference,,{'raw': 'National Conference on Artificial Int...,,1184914352,National Conference on Artificial Intelligence
9996,2963831130,Efficient implementation of quantum circuits w...,"[{'name': 'Stephen Brierley', 'org': 'DAMTP, C...",3,2017,Journal,"Rinton Press, Incorporated","{'raw': 'Quantum Information & Computation', '...",[2401457073],41034432,Quantum Information & Computation
9997,2963831170,A Dataset for Building Code-Mixed Goal Oriente...,"[{'name': 'Suman Banerjee', 'id': 2808453915},...",0,2018,Conference,,{'raw': 'International Conference on Computati...,,1169674987,International Conference on Computational Ling...
9998,2963831230,Modal operators and toric ideals,"[{'name': 'Riccardo Camerlo', 'org': 'Dipartim...",0,2019,Journal,Oxford University Press (OUP),"{'raw': 'Journal of Logic and Computation', 'i...",[2610670723],47057106,Journal of Logic and Computation


In [33]:
# Check sample authors from dataset

df['authors'].iloc[0]

[{'name': 'Ioannis Delis',
  'org': 'Department of Biomedical Sciences, University of Leeds, Leeds, UK',
  'id': 1898054021},
 {'name': 'Pauline Hilt',
  'org': 'Fondazione Istituto Italiano di Tecnologia, Centro di Neurofisiologia traslazionale c/o sezione Fisiologia Umana, Ferrara, Italy',
  'id': 2645550909},
 {'name': 'Thierry Pozzo',
  'org': 'Fondazione Istituto Italiano di Tecnologia, Centro di Neurofisiologia traslazionale c/o sezione Fisiologia Umana, Ferrara, Italy',
  'id': 531415370},
 {'name': 'Stefano Panzeri',
  'org': 'Neural Computation Laboratory, Center for Neuroscience and Cognitive Systems@UniTn, Istituto Italiano di Tecnologia, Rovereto (TN), Italy',
  'id': 322723101},
 {'name': 'Bastien Berret',
  'org': 'Univ. Paris-Sud, Université Paris-Saclay, Orsay Cedex, France',
  'id': 79947125}]

In [34]:
# How many unique authors are in the original dataset?

authors_lst = df['authors'].tolist()
len(authors_lst)

4894081

In [35]:
[
    author['id'] for author in tqdm(authors_lst[0])
]

  0%|          | 0/5 [00:00<?, ?it/s]

[1898054021, 2645550909, 531415370, 322723101, 79947125]

In [36]:
authors_lst[:2]

[[{'name': 'Ioannis Delis',
   'org': 'Department of Biomedical Sciences, University of Leeds, Leeds, UK',
   'id': 1898054021},
  {'name': 'Pauline Hilt',
   'org': 'Fondazione Istituto Italiano di Tecnologia, Centro di Neurofisiologia traslazionale c/o sezione Fisiologia Umana, Ferrara, Italy',
   'id': 2645550909},
  {'name': 'Thierry Pozzo',
   'org': 'Fondazione Istituto Italiano di Tecnologia, Centro di Neurofisiologia traslazionale c/o sezione Fisiologia Umana, Ferrara, Italy',
   'id': 531415370},
  {'name': 'Stefano Panzeri',
   'org': 'Neural Computation Laboratory, Center for Neuroscience and Cognitive Systems@UniTn, Istituto Italiano di Tecnologia, Rovereto (TN), Italy',
   'id': 322723101},
  {'name': 'Bastien Berret',
   'org': 'Univ. Paris-Sud, Université Paris-Saclay, Orsay Cedex, France',
   'id': 79947125}],
 [{'name': 'Ferdi Tekce',
   'org': 'Elektronik ve Haberleşme Mühendisliği, Bölümü Yildiz Teknik Üniversitesi, İstanbul/Türkiye',
   'id': 2865315720},
  {'name':

In [37]:
[
    [author['id'] for author in authors] for authors in authors_lst[:2]
]

[[1898054021, 2645550909, 531415370, 322723101, 79947125],
 [2865315720, 2312549210, 2033699703, 2134904855]]

In [38]:
author_ids_lst = [
    [author['id'] for author in authors] for authors in tqdm(authors_lst)
]
author_ids_lst[:3]

  0%|          | 0/4894081 [00:00<?, ?it/s]

[[1898054021, 2645550909, 531415370, 322723101, 79947125],
 [2865315720, 2312549210, 2033699703, 2134904855],
 [2903536698, 2097180593, 2756219407, 2814008799]]

In [39]:
author_ids = [author_id for sublist in author_ids_lst for author_id in sublist]
author_ids[:20]

[1898054021,
 2645550909,
 531415370,
 322723101,
 79947125,
 2865315720,
 2312549210,
 2033699703,
 2134904855,
 2903536698,
 2097180593,
 2756219407,
 2814008799,
 2230089554,
 564112320,
 2104333588,
 9988965,
 2581165408,
 2811603551,
 2856558135]

In [40]:
unique_author_ids = list(set(author_ids))
len(unique_author_ids)

4398138

In [41]:
unique_author_ids[:10]

[2810183681,
 2407530497,
 2231369732,
 2709520392,
 41943048,
 2013265928,
 2105540619,
 2248146956,
 2432696333,
 2818572301]

In [42]:
author_name_lst = [
    [author['name'] for author in authors] for authors in tqdm(authors_lst)
]
author_name_lst[:3]

  0%|          | 0/4894081 [00:00<?, ?it/s]

[['Ioannis Delis',
  'Pauline Hilt',
  'Thierry Pozzo',
  'Stefano Panzeri',
  'Bastien Berret'],
 ['Ferdi Tekce', 'Evren Catak', 'Umut Engin Ayten', 'Lutfiye Durak-Ata'],
 ['Li Cheng', 'Ruijin Liao', 'Lijun Yang', 'Fuzeng Zhang']]

In [43]:
author_names = [author_name for sublist in author_name_lst for author_name in sublist]
author_names[:20]

['Ioannis Delis',
 'Pauline Hilt',
 'Thierry Pozzo',
 'Stefano Panzeri',
 'Bastien Berret',
 'Ferdi Tekce',
 'Evren Catak',
 'Umut Engin Ayten',
 'Lutfiye Durak-Ata',
 'Li Cheng',
 'Ruijin Liao',
 'Lijun Yang',
 'Fuzeng Zhang',
 'Dov Danon',
 'Hadar Averbuch-Elor',
 'Ohad Fried',
 'Daniel Cohen-Or',
 'Qijing Yu',
 'Dana M. McCall',
 'Roya Homayouni']

In [44]:
unique_author_names = list(set(author_names))
len(unique_author_names)

3290430

In [45]:
authors_lst[:2]

[[{'name': 'Ioannis Delis',
   'org': 'Department of Biomedical Sciences, University of Leeds, Leeds, UK',
   'id': 1898054021},
  {'name': 'Pauline Hilt',
   'org': 'Fondazione Istituto Italiano di Tecnologia, Centro di Neurofisiologia traslazionale c/o sezione Fisiologia Umana, Ferrara, Italy',
   'id': 2645550909},
  {'name': 'Thierry Pozzo',
   'org': 'Fondazione Istituto Italiano di Tecnologia, Centro di Neurofisiologia traslazionale c/o sezione Fisiologia Umana, Ferrara, Italy',
   'id': 531415370},
  {'name': 'Stefano Panzeri',
   'org': 'Neural Computation Laboratory, Center for Neuroscience and Cognitive Systems@UniTn, Istituto Italiano di Tecnologia, Rovereto (TN), Italy',
   'id': 322723101},
  {'name': 'Bastien Berret',
   'org': 'Univ. Paris-Sud, Université Paris-Saclay, Orsay Cedex, France',
   'id': 79947125}],
 [{'name': 'Ferdi Tekce',
   'org': 'Elektronik ve Haberleşme Mühendisliği, Bölümü Yildiz Teknik Üniversitesi, İstanbul/Türkiye',
   'id': 2865315720},
  {'name':

In [46]:
[
    authors for sublist in authors_lst[:10] for authors in sublist
]

[{'name': 'Ioannis Delis',
  'org': 'Department of Biomedical Sciences, University of Leeds, Leeds, UK',
  'id': 1898054021},
 {'name': 'Pauline Hilt',
  'org': 'Fondazione Istituto Italiano di Tecnologia, Centro di Neurofisiologia traslazionale c/o sezione Fisiologia Umana, Ferrara, Italy',
  'id': 2645550909},
 {'name': 'Thierry Pozzo',
  'org': 'Fondazione Istituto Italiano di Tecnologia, Centro di Neurofisiologia traslazionale c/o sezione Fisiologia Umana, Ferrara, Italy',
  'id': 531415370},
 {'name': 'Stefano Panzeri',
  'org': 'Neural Computation Laboratory, Center for Neuroscience and Cognitive Systems@UniTn, Istituto Italiano di Tecnologia, Rovereto (TN), Italy',
  'id': 322723101},
 {'name': 'Bastien Berret',
  'org': 'Univ. Paris-Sud, Université Paris-Saclay, Orsay Cedex, France',
  'id': 79947125},
 {'name': 'Ferdi Tekce',
  'org': 'Elektronik ve Haberleşme Mühendisliği, Bölümü Yildiz Teknik Üniversitesi, İstanbul/Türkiye',
  'id': 2865315720},
 {'name': 'Evren Catak',
  'o

In [47]:
authors = [
    author for sublist in authors_lst for author in sublist
]
len(authors)

14934850

In [48]:
authors[:3]

[{'name': 'Ioannis Delis',
  'org': 'Department of Biomedical Sciences, University of Leeds, Leeds, UK',
  'id': 1898054021},
 {'name': 'Pauline Hilt',
  'org': 'Fondazione Istituto Italiano di Tecnologia, Centro di Neurofisiologia traslazionale c/o sezione Fisiologia Umana, Ferrara, Italy',
  'id': 2645550909},
 {'name': 'Thierry Pozzo',
  'org': 'Fondazione Istituto Italiano di Tecnologia, Centro di Neurofisiologia traslazionale c/o sezione Fisiologia Umana, Ferrara, Italy',
  'id': 531415370}]

In [49]:
author_df = pd.DataFrame(authors)
author_df

Unnamed: 0,name,org,id
0,Ioannis Delis,"Department of Biomedical Sciences, University ...",1898054021
1,Pauline Hilt,"Fondazione Istituto Italiano di Tecnologia, Ce...",2645550909
2,Thierry Pozzo,"Fondazione Istituto Italiano di Tecnologia, Ce...",531415370
3,Stefano Panzeri,"Neural Computation Laboratory, Center for Neur...",322723101
4,Bastien Berret,"Univ. Paris-Sud, Université Paris-Saclay, Orsa...",79947125
...,...,...,...
14934845,Riccardo Camerlo,"Dipartimento di Matematica, Università di Geno...",1046033706
14934846,Giovanni Pistone,"de Castro Statistics, Collegio Carlo Alberto, ...",2028717393
14934847,Fabio Rapallo,Dipartimento di Scienze e Innovazione Tecnolog...,47534353
14934848,Steven J. Miller,"Department of Mathematics & Statistics, Willia...",2137280055


In [50]:
# Re-order column names

author_df = author_df[['id', 'name', 'org']]
author_df

Unnamed: 0,id,name,org
0,1898054021,Ioannis Delis,"Department of Biomedical Sciences, University ..."
1,2645550909,Pauline Hilt,"Fondazione Istituto Italiano di Tecnologia, Ce..."
2,531415370,Thierry Pozzo,"Fondazione Istituto Italiano di Tecnologia, Ce..."
3,322723101,Stefano Panzeri,"Neural Computation Laboratory, Center for Neur..."
4,79947125,Bastien Berret,"Univ. Paris-Sud, Université Paris-Saclay, Orsa..."
...,...,...,...
14934845,1046033706,Riccardo Camerlo,"Dipartimento di Matematica, Università di Geno..."
14934846,2028717393,Giovanni Pistone,"de Castro Statistics, Collegio Carlo Alberto, ..."
14934847,47534353,Fabio Rapallo,Dipartimento di Scienze e Innovazione Tecnolog...
14934848,2137280055,Steven J. Miller,"Department of Mathematics & Statistics, Willia..."


In [51]:
len(unique_author_ids)

4398138

In [52]:
author_df.drop_duplicates(subset=['id'], keep='first')

Unnamed: 0,id,name,org
0,1898054021,Ioannis Delis,"Department of Biomedical Sciences, University ..."
1,2645550909,Pauline Hilt,"Fondazione Istituto Italiano di Tecnologia, Ce..."
2,531415370,Thierry Pozzo,"Fondazione Istituto Italiano di Tecnologia, Ce..."
3,322723101,Stefano Panzeri,"Neural Computation Laboratory, Center for Neur..."
4,79947125,Bastien Berret,"Univ. Paris-Sud, Université Paris-Saclay, Orsa..."
...,...,...,...
14934729,2962813519,Je-Kwang Ryu,
14934753,2964296278,Boyuan Kong,Dept. of Electrical Engineering and Computer S...
14934805,2963126023,Fahem Kebair,
14934827,2473832003,Asmaa H. Elsaid,"Information Systems Department, Faculty of Com..."


In [53]:
# Remove duplicates

author_df = author_df.drop_duplicates(subset=['id'], keep='first')
author_df

Unnamed: 0,id,name,org
0,1898054021,Ioannis Delis,"Department of Biomedical Sciences, University ..."
1,2645550909,Pauline Hilt,"Fondazione Istituto Italiano di Tecnologia, Ce..."
2,531415370,Thierry Pozzo,"Fondazione Istituto Italiano di Tecnologia, Ce..."
3,322723101,Stefano Panzeri,"Neural Computation Laboratory, Center for Neur..."
4,79947125,Bastien Berret,"Univ. Paris-Sud, Université Paris-Saclay, Orsa..."
...,...,...,...
14934729,2962813519,Je-Kwang Ryu,
14934753,2964296278,Boyuan Kong,Dept. of Electrical Engineering and Computer S...
14934805,2963126023,Fahem Kebair,
14934827,2473832003,Asmaa H. Elsaid,"Information Systems Department, Faculty of Com..."


In [54]:
author_df.fillna("", inplace=True)
author_df

Unnamed: 0,id,name,org
0,1898054021,Ioannis Delis,"Department of Biomedical Sciences, University ..."
1,2645550909,Pauline Hilt,"Fondazione Istituto Italiano di Tecnologia, Ce..."
2,531415370,Thierry Pozzo,"Fondazione Istituto Italiano di Tecnologia, Ce..."
3,322723101,Stefano Panzeri,"Neural Computation Laboratory, Center for Neur..."
4,79947125,Bastien Berret,"Univ. Paris-Sud, Université Paris-Saclay, Orsa..."
...,...,...,...
14934729,2962813519,Je-Kwang Ryu,
14934753,2964296278,Boyuan Kong,Dept. of Electrical Engineering and Computer S...
14934805,2963126023,Fahem Kebair,
14934827,2473832003,Asmaa H. Elsaid,"Information Systems Department, Faculty of Com..."


In [55]:
author_df.to_csv('data/dblp/author.csv', index=False)

In [56]:
df

Unnamed: 0,id,title,authors,n_citation,year,doc_type,publisher,venue,references,venue_id,venue_str
0,2873507692,Characterization of whole-body muscle activity...,"[{'name': 'Ioannis Delis', 'org': 'Department ...",0,2018,Conference,ACM,{'raw': 'Hellenic Conference on Artificial Int...,"[1973012362, 1975079024, 2008605701, 206338057...",1124396182,Hellenic Conference on Artificial Intelligence
1,2873533962,The effect of codebook design on the BER perfo...,"[{'name': 'Ferdi Tekce', 'org': 'Elektronik ve...",0,2018,Conference,IEEE,{'raw': 'Signal Processing and Communications ...,"[2008104692, 2042519026, 2054692642, 211076718...",2735422400,Signal Processing and Communications Applicati...
2,2873535434,An Optimized Infrared Detection Strategy for D...,"[{'name': 'Li Cheng', 'org': 'School of Electr...",0,2018,Journal,Institute of Electrical and Electronics Engine...,"{'raw': 'IEEE Access', 'id': 2485537415, 'type...",[2793612255],2485537415,IEEE Access
3,2873541924,Unsupervised Natural Image Patch Learning,"[{'name': 'Dov Danon', 'id': 2230089554}, {'na...",0,2018,Repository,,{'raw': 'arXiv: Computer Vision and Pattern Re...,"[2078790577, 2550791380, 2950187998]",2597175965,arXiv: Computer Vision and Pattern Recognition
4,2873550248,Age-associated increase in mnemonic strategy u...,"[{'name': 'Qijing Yu', 'org': 'Psychology Depa...",2,2018,Journal,Academic Press,"{'raw': 'NeuroImage', 'id': 103225281, 'type':...","[2049056222, 2101135654, 2783148479]",103225281,NeuroImage
...,...,...,...,...,...,...,...,...,...,...,...
9995,2963831124,A Dynamic Window Neural Network for CCG Supert...,"[{'name': 'Huijia Wu', 'id': 2531734576}, {'na...",0,2016,Conference,,{'raw': 'National Conference on Artificial Int...,,1184914352,National Conference on Artificial Intelligence
9996,2963831130,Efficient implementation of quantum circuits w...,"[{'name': 'Stephen Brierley', 'org': 'DAMTP, C...",3,2017,Journal,"Rinton Press, Incorporated","{'raw': 'Quantum Information & Computation', '...",[2401457073],41034432,Quantum Information & Computation
9997,2963831170,A Dataset for Building Code-Mixed Goal Oriente...,"[{'name': 'Suman Banerjee', 'id': 2808453915},...",0,2018,Conference,,{'raw': 'International Conference on Computati...,,1169674987,International Conference on Computational Ling...
9998,2963831230,Modal operators and toric ideals,"[{'name': 'Riccardo Camerlo', 'org': 'Dipartim...",0,2019,Journal,Oxford University Press (OUP),"{'raw': 'Journal of Logic and Computation', 'i...",[2610670723],47057106,Journal of Logic and Computation


In [57]:
# Keep relevant columns

df = df[['id', 'title', 'authors', 'n_citation', 'year', 'doc_type', 'publisher', 'venue_id', 'venue_str', 'references']]
df

Unnamed: 0,id,title,authors,n_citation,year,doc_type,publisher,venue_id,venue_str,references
0,2873507692,Characterization of whole-body muscle activity...,"[{'name': 'Ioannis Delis', 'org': 'Department ...",0,2018,Conference,ACM,1124396182,Hellenic Conference on Artificial Intelligence,"[1973012362, 1975079024, 2008605701, 206338057..."
1,2873533962,The effect of codebook design on the BER perfo...,"[{'name': 'Ferdi Tekce', 'org': 'Elektronik ve...",0,2018,Conference,IEEE,2735422400,Signal Processing and Communications Applicati...,"[2008104692, 2042519026, 2054692642, 211076718..."
2,2873535434,An Optimized Infrared Detection Strategy for D...,"[{'name': 'Li Cheng', 'org': 'School of Electr...",0,2018,Journal,Institute of Electrical and Electronics Engine...,2485537415,IEEE Access,[2793612255]
3,2873541924,Unsupervised Natural Image Patch Learning,"[{'name': 'Dov Danon', 'id': 2230089554}, {'na...",0,2018,Repository,,2597175965,arXiv: Computer Vision and Pattern Recognition,"[2078790577, 2550791380, 2950187998]"
4,2873550248,Age-associated increase in mnemonic strategy u...,"[{'name': 'Qijing Yu', 'org': 'Psychology Depa...",2,2018,Journal,Academic Press,103225281,NeuroImage,"[2049056222, 2101135654, 2783148479]"
...,...,...,...,...,...,...,...,...,...,...
9995,2963831124,A Dynamic Window Neural Network for CCG Supert...,"[{'name': 'Huijia Wu', 'id': 2531734576}, {'na...",0,2016,Conference,,1184914352,National Conference on Artificial Intelligence,
9996,2963831130,Efficient implementation of quantum circuits w...,"[{'name': 'Stephen Brierley', 'org': 'DAMTP, C...",3,2017,Journal,"Rinton Press, Incorporated",41034432,Quantum Information & Computation,[2401457073]
9997,2963831170,A Dataset for Building Code-Mixed Goal Oriente...,"[{'name': 'Suman Banerjee', 'id': 2808453915},...",0,2018,Conference,,1169674987,International Conference on Computational Ling...,
9998,2963831230,Modal operators and toric ideals,"[{'name': 'Riccardo Camerlo', 'org': 'Dipartim...",0,2019,Journal,Oxford University Press (OUP),47057106,Journal of Logic and Computation,[2610670723]


In [58]:
df.to_csv('data/dblp/data.csv', index=False)