In [4]:
import pandas as pd

### Function for filling dictionaries

In [23]:
def analyse_part_bd(part, author_id_name, author_articles, author_n_citations, article_citations):
    '''
    Arguments (also a return, except pandas df):
        part - pandas data frame which is a part of the whole dataset
        arguments are also return (except pandas df)!
        author_id_name -  dict: {'author_id': 'author_name'}
        author_articles - dict: {'author_id': ['aothor's articles id's]}
        author_n_citations - dict: {'author_id': [n_citations_i, ...]}
        article_citations - dict: {'article_id': ['id's of articels^ which used 'article_id' as reference]}
    '''
    for ind, article in part.iterrows():  # iterating over all rows (articles)
        article_id = article['_id']  # id of this article
        n_citation = article['n_citation']  # number of citations
        authors = article['authors']
        if isinstance(authors, list):
            for author in authors:  # see all authors of the article
                # print(author)
                if bool(author):  # if author is not empty {}
                    if 'name' in author.keys():
                        author_name = author['name']
                    else:  # if author doesn't have key 'name'
                        author_name = None
                    if '_id' in author.keys():
                        author_id = author['_id']
                    else:  # if author doesn't have key '_id'
                        if 'orgid' in author.keys():  # some authors has key 'orgig' instead of '_id'
                            author_id = author['orgid']
                        else:
                            if author_name is None:  # there is no any keys: 'name', '_id', 'orgig'
                                # print(author)  # print damaged author
                                continue  # this author is damaged: look at the NEXT author!
                            else:
                                author_id = author_name.replace(" ", "")  # create key from name

                    # author_id_name: {'author_id': 'author_name'}
                    if (not author_id in author_id_name.keys()) or (author_id_name[author_id] is None):  # add author
                        author_id_name[author_id] = author_name

                    # author_articles: {'author_id': [list of articles id's]}
                    # author_n_citations: {'author_id': [list of n_citations]}
                    if not author_id in author_articles.keys():
                        author_articles[author_id] = []  # create key and add article to the author
                        author_n_citations[author_id] = []  # ceate list and add n_citations for the article
                    if not article_id in author_articles[author_id]:
                        author_articles[author_id].append(article_id)  # add article to the author
                        author_n_citations[author_id].append(n_citation)

            # go through referencies
            # article_citations: {'article_id': [list of articles id's which used it as reference]}
            if False:  # skip for now - seems like too long operations!
                if not article_id in article_citations.keys():
                    article_citations[article_id] = []
                for ref_id in article['references']:
                    if not ref_id in article_citations.keys():
                        article_citations[ref_id] = []  # this article (article_id) cited the ref_id
                    if not article_id in article_citations[ref_id]:
                        article_citations[ref_id].append(article_id)
        else:
            # print(authors)
            articles_no_author.append(article_id)  # add to list 
            # print(f"ValueError: {article['authors']=}")
    return (author_id_name, author_articles, author_n_citations, article_citations)

### Creation of dictionaries to store specific data:

In [19]:
author_id_name = {}  # author's name by id
author_articles = {}  # authors articles by id (key: author_id, value: list of articles id)

author_n_citations = {}  # key: author_id, value: list of int(n_citations) of all his articles!
article_citations = {}  # article's citations by article's id (key: article_id, value: list of articles id)

articles_no_author = []  # articles with NaN author 

In [8]:
n_parts = 108

In [5]:
from ipywidgets import IntProgress
from IPython.display import display
import time

In [27]:
progress_bar = IntProgress(min=0, max=n_parts) # instantiate the bar

### Going through all batches and filling the dictionaries:

In [28]:
display(progress_bar)
for ind_part in range(n_parts):
    part_i = pd.read_json(f"data/dblpv13_{ind_part}.json")
    (author_id_name, author_articles, author_n_citations, article_citations) = \
                analyse_part_bd(part_i, author_id_name, author_articles, author_n_citations, article_citations)
    progress_bar.value = ind_part

IntProgress(value=0, max=108)

ValueError: Expected object or value

**For some reason, a couple of batches were not processed!**

In [30]:
len(author_id_name.keys())

4362827

In [2]:
import json

### Saving the dictionaries

In [35]:
with open('author_id_name.json', 'w') as fp:
    json.dump(author_id_name, fp)

In [36]:
with open('author_n_citations.json', 'w') as fp:
    json.dump(author_n_citations, fp)

In [37]:
with open('author_articles.json', 'w') as fp:
    json.dump(author_articles, fp)

In [23]:
part.loc[0]

_id                                    53e9982cb7602d97020524cd
title                                  Persistent OSPF Attacks.
authors       [{'_id': '53f463d9dabfaedf4363bdf5', 'name': '...
venue         {'_id': '555037607cea80f954183648', 'raw': 'ND...
year                                                       2012
keywords                                                     []
fos           [Open Shortest Path First, Eavesdropping, Man-...
n_citation                                                 27.0
lang                                                         en
pdf                                                            
url           [http://www.internetsociety.org/persistent-osp...
references    [53e9a33db7602d9702c48b6f, 53e99ffcb7602d97028...
page_start                                                  NaN
page_end                                                    NaN
volume                                                      NaN
issue                                   

## Studying connections

Let's form arrays of points to find the relationship between the average _**h-index**_ of the authors of the article and its _**n_citations**_

Reading an already created dictionary of _**h-idexes**_:

In [27]:
with open('author_id_hindex.json') as json_file:  # load file
    # key: author_id, value: list of int(n_citations) of all his articles!
    author_id_hindex_json = json.load(json_file)

In [28]:
citations_lst = []
average_hindex_lst = []
articles_no_author = []

In [29]:
def connection_citations_hindex(part, author_id_hindex_json, citations, average_hindex_lst, articles_no_author):
    '''
    Arguments (also a return, except pandas df):
        author_n_citations_json - dict: {'author_id': [n_citations_i, ...]}
        citations - list of articles n_citations
        average_hindex_lst - list of average h-index of the authors h-indeces
    '''
    for ind, article in part.iterrows():  # iterating over all rows (articles)
        article_id = article['_id']  # id of this article
        n_citation = article['n_citation']  # number of citations
        authors = article['authors']
        n_authors = 0
        h_index_sum = 0
        if isinstance(authors, list):
            for author in authors:  # see all authors of the article
                if bool(author):  # if author is not empty {}
                    if 'name' in author.keys():
                        author_name = author['name']
                    else:  # if author doesn't have key 'name'
                        author_name = None
                    if '_id' in author.keys():
                        author_id = author['_id']
                    else:  # if author doesn't have key '_id'
                        if 'orgid' in author.keys():  # some authors has key 'orgig' instead of '_id'
                            author_id = author['orgid']
                        else:
                            if author_name is None:  # there is no any keys: 'name', '_id', 'orgig'
                                n_authors += 1  # print damaged author
                                continue  # this author is damaged: look at the NEXT author!
                            else:
                                author_id = author_name.replace(" ", "")  # create key from name
                    
                    n_authors += 1
                    author_h_index = author_id_hindex_json[author_id]['h-index']
                    h_index_sum += author_h_index
        else:
            articles_no_author.append(article_id)  # add to list 

        citations.append(n_citation)
        if n_authors > 0:
            average_hindex_lst.append(h_index_sum / n_authors)
        else:
            average_hindex_lst.append(0)
            
    return (citations, average_hindex_lst, articles_no_author)

In [30]:
progress_bar2 = IntProgress(min=0, max=n_parts) # instantiate the bar

In [31]:
display(progress_bar2)
for ind_part in range(n_parts):
    part_i = pd.read_json(f"data/dblpv13_{ind_part}.json")
    (citations_lst, average_hindex_lst, articles_no_author) = \
                connection_citations_hindex(part_i, author_id_hindex_json, citations_lst, average_hindex_lst, articles_no_author)
    progress_bar2.value = ind_part

IntProgress(value=0, max=108)

ValueError: Expected object or value

**The same problem again!**

In [32]:
len(citations_lst)

5350000

In [33]:
len(average_hindex_lst)

5350000

In [48]:
max(average_hindex_lst)

176.0

In [44]:
import math

In [46]:
with open('av_hindex_n_citations.txt', 'w') as f:
    for i in range(len(average_hindex_lst)):
        f.write(str(round(average_hindex_lst[i], 3)))
        f.write('\t')
        if math.isnan(citations_lst[i]):
            f.write('0')
#             print('nan')
        else:
            f.write(str(citations_lst[i]))
        f.write('\n')