# Creation of the indices of proximity: second version

Importing the necessary libraries.

In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm, Normalize
from matplotlib.ticker import MaxNLocator
import seaborn as sns
from scipy import stats
import re
import nltk
import missingno as msno
from tqdm import tqdm
import json
import statistics
import math
import time

nltk.download("stopwords", quiet=True)
from nltk.corpus import stopwords
from gensim.models import Phrases
from collections import Counter

# enabling Jupyter Lab to include the output of our plots directly in this notebook
%matplotlib inline


# used to avoid blurry output plots in Jupyter Notebooks
%config InLineBackend.figure_format = "retina"

pd.options.mode.chained_assignment = None

Downloading the files I will use.

In [2]:
# Load the pre-processed "extended" dataset produced by the exploratory-analysis step.
# The context manager guarantees the file is closed even if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code -- only load files produced by this project.
with open('../exploratory_analysis/data_exploratory_analysis/df_full_extended','rb') as infile_df_full_extended:
    df_full_extended = pickle.load(infile_df_full_extended)

In [3]:
# Load the full (unfiltered) dataset produced by the variable-creation step.
# The context manager guarantees the file is closed even if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code -- only load files produced by this project.
with open('../creation_data_and_variables/data_creation_variables/df_full','rb') as infile_df_full:
    df_full = pickle.load(infile_df_full)

# **Part 1 -- Processing the data**

<div class="alert-info">
1.1 Preparation dataset "df_full_extended"
</div>

I convert the score columns of my dataframe "df_full_extended" to floats for my further computations.

In [4]:
# Cast both score columns to float in one pass; all other columns keep their dtype.
df_full_extended = df_full_extended.astype(
    {'cosine_similarity': float, 'score_concepts': float}
)

I check the type of variables I have here.

In [5]:
# Display the dtype of every column to confirm the float casts above took effect.
df_full_extended.dtypes

paper                              object
keyword                            object
cosine_similarity                 float64
title                              object
publication_date                   object
abstract                           object
year                                int64
month                              object
author                             object
referenced_works                   object
concepts                           object
score_concepts                    float64
yearly_H_index_notincremental     float64
yearly_H_index_incremental        float64
monthly_H_index_incremental       float64
monthly_H_index_notincremental    float64
dtype: object

Everything is fine just as expected. Now I compute the mean of my numerical variables, over months and years, as an example.

In [6]:
# Mean of every numeric column per (year, month).
# numeric_only=True is required on pandas >= 2.0, where averaging the object
# columns (paper, keyword, title, ...) raises a TypeError instead of dropping them.
df_full_extended.groupby(['year','month']).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,cosine_similarity,score_concepts,yearly_H_index_notincremental,yearly_H_index_incremental,monthly_H_index_incremental,monthly_H_index_notincremental
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2012,April,0.333883,0.024248,0.569220,0.569220,0.257392,0.202285
2012,August,0.343405,0.023505,0.662949,0.662949,0.501859,0.353160
2012,December,0.338644,0.022436,0.343470,0.343470,0.343470,0.212868
2012,February,0.327935,0.021750,0.431507,0.431507,0.280822,0.273973
2012,January,0.347939,0.026209,0.394850,0.394850,0.197425,0.197425
...,...,...,...,...,...,...,...
2022,March,0.340837,0.025016,0.575432,0.897593,0.669630,0.159568
2022,May,0.342882,0.025851,0.345319,0.504796,0.374096,0.094845
2022,November,0.342772,0.020264,0.441697,0.771184,0.767337,0.173536
2022,October,0.341273,0.024905,0.395794,0.636472,0.621406,0.163528


I now want to put the score of attribution to concepts as columns instead of one column for each paper.
I create an auxiliary dataframe to reach this aim.

In [7]:
# Long-format helper: one row per distinct (paper, concept, score) triple.
dfhelp = (
    df_full_extended
    .loc[:, ['paper','concepts','score_concepts']]
    .drop_duplicates()
)

In [8]:
# Reshape long -> wide: one row per paper, one column per concept holding its score.
# Like the set_index/unstack form, this raises if a (paper, concept) pair occurs twice.
dfhelp = dfhelp.pivot(index='paper', columns='concepts', values='score_concepts').reset_index()

In [9]:
# Display the wide table (one row per paper, one score column per concept).
dfhelp

concepts,paper,Authentication,Biometrics,Blockchain,Differential Privacy,Digital rights management,Digital signature,Disk Encryption,Distributed algorithm,Electronic voting,...,Post-quantum cryptography,Public-key cryptography,Quantum cryptography,Quantum key distribution,Random number generation,Symmetric-key algorithm,Threshold cryptosystem,Trusted Computing,Tunneling protocol,Zero-knowlegde proof
0,https://openalex.org/W1000206743,0.0,0.000000,0.0,0.0,0.0,0.821057,0.0,0.0,0.0,...,0.0,0.595266,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,https://openalex.org/W1000603767,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.747658,0.0,0.0,0.0,0.0,0.0,0.0
2,https://openalex.org/W1002126369,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,https://openalex.org/W100226608,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.701117,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,https://openalex.org/W1002797211,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86974,https://openalex.org/W99586052,0.0,0.000000,0.0,0.0,0.0,0.476391,0.0,0.0,0.0,...,0.0,0.505444,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
86975,https://openalex.org/W996094322,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.250971,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
86976,https://openalex.org/W997299233,0.0,0.631994,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
86977,https://openalex.org/W998997404,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.631673,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


Now I want to merge this dataframe with another frame, from which the columns 'concepts', 'score_concepts' and 'referenced_works' were deleted, such that I obtain one big dataframe with the concepts as columns.

In [10]:
# Paper-level information without the per-concept and per-reference columns,
# de-duplicated so each remaining combination of values appears once.
dftomerge = (
    df_full_extended
    .drop(columns=['concepts','score_concepts','referenced_works'])
    .drop_duplicates()
)

In [11]:
# Attach the wide concept scores to the paper-level rows (inner join on the paper id).
dfintermed = dftomerge.merge(dfhelp, on='paper', how='inner')
dfintermed

Unnamed: 0,paper,keyword,cosine_similarity,title,publication_date,abstract,year,month,author,yearly_H_index_notincremental,...,Post-quantum cryptography,Public-key cryptography,Quantum cryptography,Quantum key distribution,Random number generation,Symmetric-key algorithm,Threshold cryptosystem,Trusted Computing,Tunneling protocol,Zero-knowlegde proof
0,https://openalex.org/W1000206743,verif,0.3090,Modified RSA Digital Signature Scheme for Data...,2014-11-18,As we know that digital signature is an authe...,2014,November,https://openalex.org/A3039217926,0.0,...,0.0,0.595266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,https://openalex.org/W1000206743,hierarchi,0.2980,Modified RSA Digital Signature Scheme for Data...,2014-11-18,As we know that digital signature is an authe...,2014,November,https://openalex.org/A3039217926,0.0,...,0.0,0.595266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,https://openalex.org/W1000206743,microprocessor,0.2864,Modified RSA Digital Signature Scheme for Data...,2014-11-18,As we know that digital signature is an authe...,2014,November,https://openalex.org/A3039217926,0.0,...,0.0,0.595266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,https://openalex.org/W1000206743,verifi,0.2821,Modified RSA Digital Signature Scheme for Data...,2014-11-18,As we know that digital signature is an authe...,2014,November,https://openalex.org/A3039217926,0.0,...,0.0,0.595266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,https://openalex.org/W1000206743,hierarch,0.2562,Modified RSA Digital Signature Scheme for Data...,2014-11-18,As we know that digital signature is an authe...,2014,November,https://openalex.org/A3039217926,0.0,...,0.0,0.595266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1544719,https://openalex.org/W999914091,wavelength,0.3866,Indexing for Image Retrieval: A Machine Learni...,2013-03-09,"AbstractIn this paper, we explore the use of ...",2013,March,https://openalex.org/A2210425818,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1544720,https://openalex.org/W999914091,photon,0.3747,Indexing for Image Retrieval: A Machine Learni...,2013-03-09,"AbstractIn this paper, we explore the use of ...",2013,March,https://openalex.org/A2210425818,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1544721,https://openalex.org/W999914091,quantum,0.3536,Indexing for Image Retrieval: A Machine Learni...,2013-03-09,"AbstractIn this paper, we explore the use of ...",2013,March,https://openalex.org/A2210425818,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1544722,https://openalex.org/W999914091,multiplex,0.3057,Indexing for Image Retrieval: A Machine Learni...,2013-03-09,"AbstractIn this paper, we explore the use of ...",2013,March,https://openalex.org/A2210425818,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


I've got everything except the column 'referenced_works'. Again, I do a merge to obtain the final dataframe I am aiming at.

In [12]:
# One row per paper holding its list of referenced works (first occurrence kept).
dfref = (
    df_full_extended
    .loc[:, ['paper','referenced_works']]
    .drop_duplicates(subset=['paper'])
)

In [13]:
# Bring the reference lists back next to the keyword / concept information.
dfall = dfref.merge(dfintermed, on='paper', how='inner')
dfall

Unnamed: 0,paper,referenced_works,keyword,cosine_similarity,title,publication_date,abstract,year,month,author,...,Post-quantum cryptography,Public-key cryptography,Quantum cryptography,Quantum key distribution,Random number generation,Symmetric-key algorithm,Threshold cryptosystem,Trusted Computing,Tunneling protocol,Zero-knowlegde proof
0,https://openalex.org/W1000206743,"[https://openalex.org/W2116665839, https://ope...",verif,0.3090,Modified RSA Digital Signature Scheme for Data...,2014-11-18,As we know that digital signature is an authe...,2014,November,https://openalex.org/A3039217926,...,0.0,0.595266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,https://openalex.org/W1000206743,"[https://openalex.org/W2116665839, https://ope...",hierarchi,0.2980,Modified RSA Digital Signature Scheme for Data...,2014-11-18,As we know that digital signature is an authe...,2014,November,https://openalex.org/A3039217926,...,0.0,0.595266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,https://openalex.org/W1000206743,"[https://openalex.org/W2116665839, https://ope...",microprocessor,0.2864,Modified RSA Digital Signature Scheme for Data...,2014-11-18,As we know that digital signature is an authe...,2014,November,https://openalex.org/A3039217926,...,0.0,0.595266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,https://openalex.org/W1000206743,"[https://openalex.org/W2116665839, https://ope...",verifi,0.2821,Modified RSA Digital Signature Scheme for Data...,2014-11-18,As we know that digital signature is an authe...,2014,November,https://openalex.org/A3039217926,...,0.0,0.595266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,https://openalex.org/W1000206743,"[https://openalex.org/W2116665839, https://ope...",hierarch,0.2562,Modified RSA Digital Signature Scheme for Data...,2014-11-18,As we know that digital signature is an authe...,2014,November,https://openalex.org/A3039217926,...,0.0,0.595266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1544719,https://openalex.org/W999914091,[https://openalex.org/W2085511467],wavelength,0.3866,Indexing for Image Retrieval: A Machine Learni...,2013-03-09,"AbstractIn this paper, we explore the use of ...",2013,March,https://openalex.org/A2210425818,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1544720,https://openalex.org/W999914091,[https://openalex.org/W2085511467],photon,0.3747,Indexing for Image Retrieval: A Machine Learni...,2013-03-09,"AbstractIn this paper, we explore the use of ...",2013,March,https://openalex.org/A2210425818,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1544721,https://openalex.org/W999914091,[https://openalex.org/W2085511467],quantum,0.3536,Indexing for Image Retrieval: A Machine Learni...,2013-03-09,"AbstractIn this paper, we explore the use of ...",2013,March,https://openalex.org/A2210425818,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1544722,https://openalex.org/W999914091,[https://openalex.org/W2085511467],multiplex,0.3057,Indexing for Image Retrieval: A Machine Learni...,2013-03-09,"AbstractIn this paper, we explore the use of ...",2013,March,https://openalex.org/A2210425818,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


I explode the column 'referenced_works', in order to have the complete dataframe I am aiming at.

In [14]:
# One row per (paper, keyword, referenced work): each list in 'referenced_works' is
# expanded into separate rows. The index labels of the source rows are repeated
# (see the duplicated 0, 0, 0, 1, 1 ... labels in the output below).
df_final = dfall.explode('referenced_works')

Now I have my final dataframe, which I will use to compute my indices of proximity.

In [15]:
# Display the exploded frame used for all index computations.
df_final

Unnamed: 0,paper,referenced_works,keyword,cosine_similarity,title,publication_date,abstract,year,month,author,...,Post-quantum cryptography,Public-key cryptography,Quantum cryptography,Quantum key distribution,Random number generation,Symmetric-key algorithm,Threshold cryptosystem,Trusted Computing,Tunneling protocol,Zero-knowlegde proof
0,https://openalex.org/W1000206743,https://openalex.org/W2116665839,verif,0.3090,Modified RSA Digital Signature Scheme for Data...,2014-11-18,As we know that digital signature is an authe...,2014,November,https://openalex.org/A3039217926,...,0.0,0.595266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,https://openalex.org/W1000206743,https://openalex.org/W2182097415,verif,0.3090,Modified RSA Digital Signature Scheme for Data...,2014-11-18,As we know that digital signature is an authe...,2014,November,https://openalex.org/A3039217926,...,0.0,0.595266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,https://openalex.org/W1000206743,https://openalex.org/W2281564060,verif,0.3090,Modified RSA Digital Signature Scheme for Data...,2014-11-18,As we know that digital signature is an authe...,2014,November,https://openalex.org/A3039217926,...,0.0,0.595266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,https://openalex.org/W1000206743,https://openalex.org/W2116665839,hierarchi,0.2980,Modified RSA Digital Signature Scheme for Data...,2014-11-18,As we know that digital signature is an authe...,2014,November,https://openalex.org/A3039217926,...,0.0,0.595266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,https://openalex.org/W1000206743,https://openalex.org/W2182097415,hierarchi,0.2980,Modified RSA Digital Signature Scheme for Data...,2014-11-18,As we know that digital signature is an authe...,2014,November,https://openalex.org/A3039217926,...,0.0,0.595266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1544719,https://openalex.org/W999914091,https://openalex.org/W2085511467,wavelength,0.3866,Indexing for Image Retrieval: A Machine Learni...,2013-03-09,"AbstractIn this paper, we explore the use of ...",2013,March,https://openalex.org/A2210425818,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1544720,https://openalex.org/W999914091,https://openalex.org/W2085511467,photon,0.3747,Indexing for Image Retrieval: A Machine Learni...,2013-03-09,"AbstractIn this paper, we explore the use of ...",2013,March,https://openalex.org/A2210425818,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1544721,https://openalex.org/W999914091,https://openalex.org/W2085511467,quantum,0.3536,Indexing for Image Retrieval: A Machine Learni...,2013-03-09,"AbstractIn this paper, we explore the use of ...",2013,March,https://openalex.org/A2210425818,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1544722,https://openalex.org/W999914091,https://openalex.org/W2085511467,multiplex,0.3057,Indexing for Image Retrieval: A Machine Learni...,2013-03-09,"AbstractIn this paper, we explore the use of ...",2013,March,https://openalex.org/A2210425818,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Last, I save this dataframe so that I do not have to redo these computations each time I need it.

In [16]:
# Persist the frame so later sessions can reload it instead of rebuilding it.
# The path is relative to the notebook's working directory.
# NOTE(review): presumably the 'data_indices' folder must already exist -- confirm before a fresh run.
df_final.to_pickle('data_indices/df_computations_indices')

<div class="alert-info">
1.2 Preparation auxiliary dataset for references
</div>

Now I want to do the same with df_full to use it as source of information for the cited papers. In fact, I deleted many papers from the set of papers I am studying, because there were no referenced works, several ids for the same paper, etc. All these papers or ids are still part of the set of referenced_works. For this reason, I need the full dataset to have information about these papers.

I rename one column, to make everything more uniform among my datasets.

In [17]:
# Rename the identifier column so both datasets share the key name "paper".
# Rebinding instead of inplace=True avoids mutating the loaded object in place.
df_full = df_full.rename(columns={"id": "paper"})

Again, I will reshape my dataset, putting the concepts as columns that directly contain the score of attribution.

In [18]:
# Long-format helper for the full dataset: one row per distinct (paper, concept, score) triple.
dfhelp = (
    df_full
    .loc[:, ['paper','concepts','score_concepts']]
    .drop_duplicates()
)

In [19]:
# Scores must be numeric before they are spread into per-concept columns.
dfhelp = dfhelp.astype({'score_concepts': float})

In [20]:
# Reshape long -> wide: one row per paper, one column per concept holding its score.
# Like the set_index/unstack form, this raises if a (paper, concept) pair occurs twice.
dfhelp = dfhelp.pivot(index='paper', columns='concepts', values='score_concepts').reset_index()

In [21]:
# Display the wide concept-score table built from the full dataset.
dfhelp

concepts,paper,Authentication protocole,Biometrics,Blockchain,Differential Privacy,Digital rights management,Digital signature,Disk Encryption,Distributed algorithm,Electronic voting,...,Post-quantum cryptography,Public-key cryptography,Quantum cryptography,Quantum key distribution,Random number generation,Symmetric-key algorithm,Threshold cryptosystem,Trusted Computing,Tunneling protocol,Zero-knowlegde proof
0,https://openalex.org/W1000018889,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.430285,0.0,0.000000
1,https://openalex.org/W100004108,0.000000,0.746300,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
2,https://openalex.org/W1000101879,0.431697,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
3,https://openalex.org/W1000206743,0.000000,0.000000,0.0,0.0,0.0,0.821057,0.0,0.0,0.0,...,0.0,0.595266,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
4,https://openalex.org/W1000558944,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.255347,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199842,https://openalex.org/W998997404,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.631673,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
199843,https://openalex.org/W99938045,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.775669
199844,https://openalex.org/W999389294,0.000000,0.029723,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
199845,https://openalex.org/W999405428,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.423747,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000


In [22]:
# Paper-level information of the full dataset without the per-concept columns, de-duplicated.
dftomerge = (
    df_full
    .drop(columns=['concepts','score_concepts'])
    .drop_duplicates()
)

In [23]:
# Auxiliary lookup table for cited papers: paper-level rows joined with their concept scores.
df_auxiliary_ref = dftomerge.merge(dfhelp, on='paper', how='inner')
df_auxiliary_ref

Unnamed: 0,paper,title,publication_date,author,referenced_works,abstract,year,month,Authentication protocole,Biometrics,...,Post-quantum cryptography,Public-key cryptography,Quantum cryptography,Quantum key distribution,Random number generation,Symmetric-key algorithm,Threshold cryptosystem,Trusted Computing,Tunneling protocol,Zero-knowlegde proof
0,https://openalex.org/W1498573661,A Secure Biometrics-Based Multi-Server Authent...,2015-06-01,https://openalex.org/A1921362044,https://openalex.org/W1481832011,"Recently, in 2014, He and Wang proposed a rob...",2015,June,0.592982,0.779361,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1,https://openalex.org/W1498573661,A Secure Biometrics-Based Multi-Server Authent...,2015-06-01,https://openalex.org/A1921362044,https://openalex.org/W1931880689,"Recently, in 2014, He and Wang proposed a rob...",2015,June,0.592982,0.779361,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2,https://openalex.org/W1498573661,A Secure Biometrics-Based Multi-Server Authent...,2015-06-01,https://openalex.org/A1921362044,https://openalex.org/W1993920884,"Recently, in 2014, He and Wang proposed a rob...",2015,June,0.592982,0.779361,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
3,https://openalex.org/W1498573661,A Secure Biometrics-Based Multi-Server Authent...,2015-06-01,https://openalex.org/A1921362044,https://openalex.org/W2005461933,"Recently, in 2014, He and Wang proposed a rob...",2015,June,0.592982,0.779361,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
4,https://openalex.org/W1498573661,A Secure Biometrics-Based Multi-Server Authent...,2015-06-01,https://openalex.org/A1921362044,https://openalex.org/W2018300500,"Recently, in 2014, He and Wang proposed a rob...",2015,June,0.592982,0.779361,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2484635,https://openalex.org/W748367845,Verifiability in e-Auction protocols & Brandt'...,2013-03-17,https://openalex.org/A2777384365,,An electronic auction protocol will only be u...,2013,March,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.419473
2484636,https://openalex.org/W748367845,Verifiability in e-Auction protocols & Brandt'...,2013-03-17,https://openalex.org/A2023056027,,An electronic auction protocol will only be u...,2013,March,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.419473
2484637,https://openalex.org/W748367845,Verifiability in e-Auction protocols & Brandt'...,2013-03-17,https://openalex.org/A3206034141,,An electronic auction protocol will only be u...,2013,March,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.419473
2484638,https://openalex.org/W952216897,Sporadic Solutions to Zero-One Exclusion Tasks,2014-07-08,https://openalex.org/A2796886382,,Zero-one exclusion is a family of distributed...,2014,July,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.426501


This is now done. I check how many papers are affiliated to no technologies at all. They will not count in my computations, since I can not attribute them to any technologies.

<div class="alert-info">
1.3 Checking papers attributed to no technologies at all
</div>

In [26]:
# Technologies (concept columns) for which proximity indices are computed.
# The first entry is 'Authentication' -- the column name used in df_final and, after the
# rename cell below, in dfhelp / df_auxiliary_ref. The former 'Authentication protocole'
# entry made the filtering loop fail with a KeyError on a fresh Restart & Run All.
# 'Zero-knowlegde proof' is spelled exactly as the column appears in the data.
# NOTE(review): 'Differential Privacy' and 'Disk Encryption' exist as columns but are not
# listed here -- confirm that leaving them out is intentional.
list_concepts = [
    'Authentication', 'Biometrics', 'Blockchain', 'Digital rights management',
    'Digital signature', 'Distributed algorithm', 'Electronic voting', 'Functional encryption',
    'Hardware acceleration', 'Hardware security module', 'Hash function', 'Homomorphic encryption',
    'Identity management', 'Key management', 'Link encryption', 'Post-quantum cryptography',
    'Public-key cryptography', 'Quantum key distribution', 'Quantum cryptography',
    'Random number generation', 'Symmetric-key algorithm', 'Threshold cryptosystem',
    'Trusted Computing', 'Tunneling protocol', 'Zero-knowlegde proof',
]

In [31]:
# Align the column name with df_final, which calls this concept 'Authentication'.
# Rebinding instead of inplace=True avoids mutating the frames in place.
dfhelp = dfhelp.rename(columns={'Authentication protocole': 'Authentication'})
df_auxiliary_ref = df_auxiliary_ref.rename(columns={'Authentication protocole': 'Authentication'})

In [32]:
# Keep only the papers whose score is exactly 0 for every technology in list_concepts.
has_no_technology = (dfhelp[list_concepts] == 0).all(axis=1)
myinfodataframe = dfhelp.loc[has_no_technology]
myinfodataframe

concepts,paper,Authentication,Biometrics,Blockchain,Differential Privacy,Digital rights management,Digital signature,Disk Encryption,Distributed algorithm,Electronic voting,...,Post-quantum cryptography,Public-key cryptography,Quantum cryptography,Quantum key distribution,Random number generation,Symmetric-key algorithm,Threshold cryptosystem,Trusted Computing,Tunneling protocol,Zero-knowlegde proof
9,https://openalex.org/W1002055276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40,https://openalex.org/W1007878853,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
51,https://openalex.org/W1010237670,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
61,https://openalex.org/W1012583890,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64,https://openalex.org/W1013095302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199778,https://openalex.org/W985309937,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199798,https://openalex.org/W989745980,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199811,https://openalex.org/W992966919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199817,https://openalex.org/W994197007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# Distinct ids of the papers that are linked to none of the listed technologies.
list_weird_papers = list(set(myinfodataframe['paper'].tolist()))

In [34]:
# Report how many papers carry no technology attribution.
print(f'There are {len(list_weird_papers)} papers attributed to no technologies at all.')

There are 16034 papers attributed to no technologies at all.


I do have all the data I need. I define the functions I will use and then, I will compute the indices of proximity.

# **Part 2 -- Function for the computation of the indices**

For each author and for each month, the function 'importance_author_t1_t2' takes the author's h-index (incremental or not, depending on the condition) plus one, the average attribution to technology 1 and 2 of the papers published during this month, and the average number of times this author appears in t1 and t2 during this month.\
This function returns '(hindex + 1) * attribution_to_t1_t2 * average_time_in_t1_2', which should represent the importance of the author for this link of technologies during this month. The function 'importance_author_t1' is the single-technology counterpart.

In [35]:
def importance_author_t1_t2(author,df,tech1,tech2,condition):
    """Importance of an author for the pair of technologies (tech1, tech2).

    Parameters
    ----------
    author : value compared against the 'author' column of ``df``.
    df : DataFrame with the columns 'author', 'paper', ``tech1``, ``tech2`` and the
        monthly h-index column selected by ``condition``.
        NOTE(review): presumably already restricted to a single month -- this
        function does not filter on time itself.
    tech1, tech2 : names of the two technology score columns.
    condition : 'incremental' or 'nonincremental'; selects
        'monthly_H_index_incremental' or 'monthly_H_index_notincremental'.

    Returns
    -------
    (h_index + 1) * mean attribution to the two technologies * mean number of
    distinct papers of the author linked to each technology.
    The result is NaN when the author has no row with a non-zero score for one
    of the technologies (mean of an empty list).

    Raises
    ------
    ValueError
        If ``condition`` is not one of the two supported values (previously this
        surfaced as an UnboundLocalError further down).
    IndexError
        If ``author`` does not occur in ``df``.
    """
    if condition == 'incremental':
        hindex_column = 'monthly_H_index_incremental'
    elif condition == 'nonincremental':
        hindex_column = 'monthly_H_index_notincremental'
    else:
        raise ValueError(
            "condition must be 'incremental' or 'nonincremental', got " + repr(condition)
        )

    myinfo = df.loc[df['author']==author]

    # rows of this author that are linked (non-zero score) to each technology
    dftech1 = myinfo.loc[myinfo[tech1]!=0]
    dftech2 = myinfo.loc[myinfo[tech2]!=0]

    # number of distinct papers of this author related to tech 1 / tech 2
    times_author_in_t1 = len(set(dftech1.paper.tolist()))
    times_author_in_t2 = len(set(dftech2.paper.tolist()))

    attribution_tech1 = dftech1[tech1].tolist()
    attribution_tech2 = dftech2[tech2].tolist()

    # the h-index is read from the author's first row; +1 keeps authors with an
    # h-index of 0 from being multiplied out to zero
    hindex = myinfo[hindex_column].tolist()[0]+1

    attribution_to_t1_t2 = (np.mean(attribution_tech1)+np.mean(attribution_tech2))/2
    average_time_in_t1_2 = (times_author_in_t1+times_author_in_t2)/2

    return hindex*attribution_to_t1_t2*average_time_in_t1_2

In [36]:
def importance_author_t1(author,df,tech1,condition):
    """Importance of an author for a single technology.

    Parameters
    ----------
    author : value compared against the 'author' column of ``df``.
    df : DataFrame with the columns 'author', 'paper', ``tech1`` and the monthly
        h-index column selected by ``condition``.
    tech1 : name of the technology score column.
    condition : 'incremental' or 'nonincremental'; selects
        'monthly_H_index_incremental' or 'monthly_H_index_notincremental'.

    Returns
    -------
    (h_index + 1) * mean attribution to ``tech1`` * number of distinct papers of
    the author with a non-zero ``tech1`` score. NaN when the author has no such row.

    Raises
    ------
    ValueError
        If ``condition`` is not one of the two supported values (previously this
        surfaced as an UnboundLocalError further down).
    IndexError
        If ``author`` does not occur in ``df``.
    """
    if condition == 'incremental':
        hindex_column = 'monthly_H_index_incremental'
    elif condition == 'nonincremental':
        hindex_column = 'monthly_H_index_notincremental'
    else:
        raise ValueError(
            "condition must be 'incremental' or 'nonincremental', got " + repr(condition)
        )

    myinfo = df.loc[df['author']==author]
    # rows of this author that are linked (non-zero score) to the technology
    dftech1 = myinfo.loc[myinfo[tech1]!=0]

    attribution_tech1 = dftech1[tech1].tolist()

    # the h-index is read from the author's first row; +1 keeps authors with an
    # h-index of 0 from being multiplied out to zero
    hindex = myinfo[hindex_column].tolist()[0]+1

    attribution_to_t1 = np.mean(attribution_tech1)
    # number of distinct papers of this author related to tech 1
    times_author_in_t1 = len(set(dftech1.paper.tolist()))

    return hindex*attribution_to_t1*times_author_in_t1

In a very similar fashion as done for authors, for each keyword during each month, the function 'importance_keywords_t1_t2' takes the average cosine similarity of the keyword over all the times it appears in t1 and t2 during this month, the average attribution to technology 1 and 2 of the papers published during this month in which the keyword appears, and the average number of times this keyword appears in t1 and t2 during this month.\
This function returns 'average_cosine_similarity * attribution_to_t1_t2 * average_time_in_t1_2', which should represent the importance of the keyword for this link of technologies during this month. The function 'importance_keywords_t1' is the single-technology counterpart.

In [37]:
def importance_keywords_t1_t2(keyword,df,tech1,tech2):
    """Importance of a keyword for the pair of technologies (tech1, tech2).

    For each technology, only the rows of ``df`` carrying ``keyword`` and a
    non-zero score for that technology are considered. The result is

        mean cosine similarity * mean attribution * mean number of distinct papers

    where each factor is the average of the per-technology values. It is NaN when
    no row qualifies for one of the technologies (mean of an empty list).
    """
    rows_for_keyword = df.loc[df['keyword']==keyword]

    def _summary(tech):
        # (distinct papers, mean score, mean cosine similarity) of the rows linked to `tech`
        linked = rows_for_keyword.loc[rows_for_keyword[tech]!=0]
        return (
            len(set(linked['paper'].tolist())),
            np.mean(linked[tech].tolist()),
            np.mean(linked['cosine_similarity'].tolist()),
        )

    papers_t1, score_t1, cosine_t1 = _summary(tech1)
    papers_t2, score_t2, cosine_t2 = _summary(tech2)

    mean_cosine = (cosine_t1+cosine_t2)/2
    mean_score = (score_t1+score_t2)/2
    mean_papers = (papers_t1+papers_t2)/2

    return mean_cosine*mean_score*mean_papers

In [38]:
def importance_keywords_t1(keyword,df,tech1):
    """Importance of a keyword for a single technology.

    Uses every row of ``df`` whose 'keyword' equals ``keyword`` (no filtering on
    the ``tech1`` score happens here) and returns

        mean cosine similarity * mean ``tech1`` score * number of distinct papers.
    """
    rows = df[df['keyword']==keyword]

    mean_cosine = np.mean(rows['cosine_similarity'].tolist())
    mean_score = np.mean(rows[tech1].tolist())
    n_papers = len(set(rows['paper'].tolist()))

    return mean_cosine*mean_score*n_papers

We now come to the part about citations. This is slightly more difficult and we have 4 functions in total.

The function below computes for a paper and a technology 1 and 2, the sum of the mean of attribution to technology 1 and 2 for all the referenced works related to 2 with respect to my paper which is attributed to technology 1.\
In other words, it gives: sum((attr_to_t1 + attr_to_t2_ref_x)/2) for all ref_x in referenced_works of my paper, where attr_to_t1 is the score of attribution of my paper to t1 and attr_to_t2_ref_x is the score of attribution to t2 of each referenced work.

In [39]:
def importance_cit_tech(paper,df,tech1,tech2,df_ref_help):
    """Citation-based link between ``paper`` (seen through tech1) and tech2.

    Reads the paper's ``tech1`` score from its first row in ``df``, collects the
    distinct entries of its 'referenced_works' column and sums the contribution
    that ``info_ref_tech`` returns for each of them, looked up in ``df_ref_help``.

    Raises IndexError if ``paper`` does not occur in ``df``.
    """
    rows_of_paper = df.loc[df['paper']==paper]

    own_score_t1 = rows_of_paper[tech1].tolist()[0]
    distinct_refs = list(set(rows_of_paper['referenced_works'].tolist()))

    contributions = [
        info_ref_tech(ref, df_ref_help, own_score_t1, tech2)
        for ref in distinct_refs
    ]
    return sum(contributions)

The function below is an auxiliary function. Given a referenced paper 'ref', the score of attribution to technology 1 of the paper which cites 'ref', and a technology 'tech2', it returns (attr_to_t1 + attr_to_t2_ref_x)/2 as explained above if the referenced work is connected to 'tech2', and zero otherwise.

In [82]:
def info_ref_tech(ref,df,attribution_paper_t1,tech2):
    """Contribution of one referenced work to the citation index.

    Looks up ``str(ref)`` in the 'paper' column of ``df`` and reads the ``tech2``
    score of the first matching row. Returns 0 when the reference is not in
    ``df`` or its ``tech2`` score is 0; otherwise the mean of
    ``attribution_paper_t1`` and that score.
    """
    matches = df.loc[df['paper']==str(ref)]

    # reference not part of the dataset: it cannot contribute
    if len(matches)==0:
        return 0

    ref_score_t2 = matches[tech2].tolist()[0]
    if ref_score_t2==0:
        return 0

    return (attribution_paper_t1+ref_score_t2)/2

__Definition main functions for the computation of my indices__

I now define the function that will properly compute the indices of proximity, for the given dataframes.
All these functions do the same thing: they compute a list of indices of proximity for all combinations of technologies during a specific month, taking the sub-dataframe that contains all the required information for that month of that year. They then return this list of indices of proximity, for each month.

The function below computes the index of proximity regarding keywords.

In [41]:
def create_indices_keywords(dfinfos,listconcepts):
    """Compute the keyword-based proximity index for every ordered pair of technologies.

    Parameters
    ----------
    dfinfos : pandas.DataFrame
        Rows of one (year, month) group. Must contain the columns ``keyword``,
        ``year``, ``month`` and one score column per entry of ``listconcepts``.
    listconcepts : list of str
        Names of the technology score columns. Pairs are enumerated with a
        nested loop (tech1 outer, tech2 inner).

    Returns
    -------
    list
        One value per ordered pair (tech1, tech2), in nested-loop order:

        * ``np.nan`` when no row has a non-zero score for tech1 or for tech2;
        * ``0`` when the two technologies share no keyword;
        * otherwise the sum of ``importance_keywords_t1_t2`` over the shared
          keywords divided by the sum of ``importance_keywords_t1`` over the
          distinct keywords of tech1.

    Notes
    -----
    The technologies are taken from the ``listconcepts`` argument rather than
    from a global variable, so the function only depends on its inputs.
    ``importance_keywords_t1_t2`` and ``importance_keywords_t1`` are defined
    elsewhere in the notebook. A timing message is printed for the month.
    """
    start = time.time()
    listindices= []

    for tech1 in listconcepts:
        # the rows attributed to tech1 do not depend on tech2: filter them once
        dftech1 =dfinfos.loc[dfinfos[tech1]!=0]

        for tech2 in listconcepts:
            dftech2 =dfinfos.loc[dfinfos[tech2]!=0]

            if len(dftech2) == 0 or len(dftech1)==0:
                # no data for at least one technology: the index is left blank
                index_keywords=np.nan
            else:
                keywords_tech1 = dftech1.keyword.tolist()
                keywords_tech2 = dftech2.keyword.tolist()

                common_keywords = list(set(keywords_tech1) & set(keywords_tech2))
                keywords_t1 = list(set(keywords_tech1))

                if len(common_keywords)==0:
                    index_keywords = 0
                else:
                    # rows (of any technology) that use one of the shared keywords
                    df_tech_1_2_common_keywords = dfinfos.loc[dfinfos.keyword.isin(common_keywords)]

                    list_comp_sum_keywords_t1_t2 = [importance_keywords_t1_t2(kw, df_tech_1_2_common_keywords,tech1,tech2) for kw in common_keywords]
                    list_comp_sum_keywords_t1 = [importance_keywords_t1(kw, dftech1,tech1) for kw in keywords_t1]
                    index_keywords = sum(list_comp_sum_keywords_t1_t2)/sum(list_comp_sum_keywords_t1)

            listindices.append(index_keywords)

    end = time.time()
    # all rows of the group share the same year and month: read them from the first row
    year = dfinfos.year.tolist()[0]
    month = dfinfos.month.tolist()[0]
    print('Indices-keywords of proximity for '+str(year)+' in '+str(month)+' were computed in '+str(round(end-start,2))+' seconds.')

    return listindices

The function below computes the index of proximity regarding collaboration.

In [42]:
def create_indices_colab(dfinfos,list_concepts):
    """Compute the collaboration-based proximity indices for every ordered pair of technologies.

    Parameters
    ----------
    dfinfos : pandas.DataFrame
        Rows of one (year, month) group. Must contain the columns ``author``,
        ``year``, ``month`` and one score column per entry of ``list_concepts``.
    list_concepts : list of str
        Names of the technology score columns, enumerated with a nested loop
        (tech1 outer, tech2 inner).

    Returns
    -------
    list of [index_colab_notincrem, index_colab_increm]
        One pair per ordered (tech1, tech2):

        * ``[nan, nan]`` when no row has a non-zero score for tech1 or tech2;
        * ``[0, 0]`` when the two technologies share no author;
        * otherwise two ratios built from ``importance_author_t1_t2`` (summed
          over the shared authors) and ``importance_author_t1``, once with the
          mode ``'nonincremental'`` and once with ``'incremental'``.

    Notes
    -----
    ``importance_author_t1_t2`` and ``importance_author_t1`` are defined
    elsewhere in the notebook. A timing message is printed for the month.

    NOTE(review): both denominators iterate over the author lists as read from
    the rows (an author appearing on several rows is visited several times),
    and the incremental denominator is computed on tech2 while the
    non-incremental one is computed on tech1. Confirm that both are intended.
    """
    t_begin = time.time()
    pair_results = []

    for tech1 in list_concepts:
        for tech2 in list_concepts:

            rows_t1 = dfinfos.loc[dfinfos[tech1]!=0]
            rows_t2 = dfinfos.loc[dfinfos[tech2]!=0]

            # no data for at least one technology: both indices are left blank
            if len(rows_t2) == 0 or len(rows_t1)==0:
                pair_results.append([np.nan,np.nan])
                continue

            authors_tech1 = rows_t1.author.tolist()
            authors_tech2 = rows_t2.author.tolist()
            shared_authors = list(set(authors_tech1) & set(authors_tech2))

            # nobody published in both technologies this month
            if len(shared_authors)==0:
                pair_results.append([0,0])
                continue

            all_authors = list(set(authors_tech1) | set(authors_tech2))
            rows_all_authors = dfinfos.loc[dfinfos.author.isin(all_authors)]

            # index based on the non-incremental h-index mode
            numer_notincrem = sum([importance_author_t1_t2(a, rows_all_authors,tech1,tech2,'nonincremental') for a in shared_authors])
            denom_notincrem = sum([importance_author_t1(a, rows_t1,tech1,'nonincremental') for a in authors_tech1])
            index_colab_notincrem = numer_notincrem/denom_notincrem

            # index based on the incremental h-index mode
            numer_increm = sum([importance_author_t1_t2(a, rows_all_authors,tech1,tech2,'incremental') for a in shared_authors])
            denom_increm = sum([importance_author_t1(a, rows_t2,tech2,'incremental') for a in authors_tech2])
            index_colab_increm = numer_increm/denom_increm

            pair_results.append([index_colab_notincrem,index_colab_increm])

    t_finish = time.time()
    # all rows of the group share the same year and month: read them from the first row
    year = dfinfos.year.tolist()[0]
    month = dfinfos.month.tolist()[0]
    print('Indices-colab of proximity for '+str(year)+' in '+str(month)+' were computed in '+str(round(t_finish-t_begin,2))+' seconds.')

    return pair_results

The function below computes the index of proximity regarding citations.

In [43]:
def create_indices_cit(dfinfos,listconcepts,df_ref_help):
    """Compute the citation-based proximity indices for every ordered pair of technologies.

    Parameters
    ----------
    dfinfos : pandas.DataFrame
        Rows of one (year, month) group. Must contain the columns ``paper``,
        ``referenced_works``, ``year``, ``month`` and one score column per
        entry of ``listconcepts``.
    listconcepts : list of str
        Names of the technology score columns, enumerated with a nested loop
        (tech1 outer, tech2 inner).
    df_ref_help : pandas.DataFrame
        Lookup table for referenced works, passed on to ``importance_cit_tech``.

    Returns
    -------
    list of [index_cit_1_2, index_cit_2_1]
        One pair per ordered (tech1, tech2):

        * ``[nan, nan]`` when no row has a non-zero score for tech1 or tech2;
        * ``index_cit_1_2`` is ``0`` when no tech1 paper of the month references
          a tech2 paper of the month, otherwise the sum of
          ``importance_cit_tech`` over the tech1 papers that do;
        * ``index_cit_2_1`` is the same quantity with the roles swapped.

        These values are plain sums; no normalisation is applied here.

    Notes
    -----
    The technologies are taken from the ``listconcepts`` argument rather than
    from a global variable, so the function only depends on its inputs.
    A timing message is printed for the month.
    """
    start = time.time()
    listindices= []

    for tech1 in listconcepts:
        # the rows attributed to tech1 do not depend on tech2: filter them once
        dftech1 =dfinfos.loc[dfinfos[tech1]!=0]

        for tech2 in listconcepts:
            dftech2 =dfinfos.loc[dfinfos[tech2]!=0]

            if len(dftech2) == 0 or len(dftech1)==0:
                # no data for at least one technology: both indices are left blank
                index_cit_1_2 = np.nan
                index_cit_2_1 = np.nan
            else:
                papers_cit_t1 = list(set(dftech1.paper.tolist()))
                papers_cit_t2 = list(set(dftech2.paper.tolist()))

                # tech1 papers with at least one reference to a tech2 paper of the month
                df_t1_ref_t2 = dftech1.loc[dftech1['referenced_works'].isin(papers_cit_t2)]
                papers_cit_t1_t2 = list(set(df_t1_ref_t2.paper.tolist()))

                # tech2 papers with at least one reference to a tech1 paper of the month
                df_t2_ref_t1 = dftech2.loc[dftech2['referenced_works'].isin(papers_cit_t1)]
                papers_cit_t2_t1 = list(set(df_t2_ref_t1.paper.tolist()))

                if len(papers_cit_t1_t2)==0:
                    index_cit_1_2 = 0
                else:
                    index_cit_1_2 = sum([importance_cit_tech(p, dftech1,tech1,tech2,df_ref_help) for p in papers_cit_t1_t2])

                if len(papers_cit_t2_t1)==0:
                    index_cit_2_1 = 0
                else:
                    index_cit_2_1 = sum([importance_cit_tech(p, dftech2,tech2,tech1,df_ref_help) for p in papers_cit_t2_t1])

            listindices.append([index_cit_1_2,index_cit_2_1])

    end = time.time()
    # all rows of the group share the same year and month: read them from the first row
    year = dfinfos.year.tolist()[0]
    month = dfinfos.month.tolist()[0]

    print('Indices-cit of proximity for '+str(year)+' in '+str(month)+' were computed in '+str(round(end-start,2))+' seconds.')

    return listindices

# **Part 3 -- Proximity indices based on keywords**

<div class="alert-info">
3.1. Computation of the indices
</div>

In [44]:
# Technology columns for which the proximity indices are computed.
# NOTE(review): the saved outputs below show a column/concept named
# 'Authentication'; confirm that 'Authentication protocole' matches the actual
# column name of the dataframes, otherwise the column lookup will fail.
list_concepts = [
    'Authentication protocole',
    'Biometrics',
    'Blockchain',
    'Digital rights management',
    'Digital signature',
    'Distributed algorithm',
    'Electronic voting',
    'Functional encryption',
    'Hardware acceleration',
    'Hardware security module',
    'Hash function',
    'Homomorphic encryption',
    'Identity management',
    'Key management',
    'Link encryption',
    'Post-quantum cryptography',
    'Public-key cryptography',
    'Quantum key distribution',
    'Quantum cryptography',
    'Random number generation',
    'Symmetric-key algorithm',
    'Threshold cryptosystem',
    'Trusted Computing',
    'Tunneling protocol',
    'Zero-knowlegde proof',
]

I select only the information I need, to reduce my database.

In [45]:
# Keep only what the keyword indices need, then remove the duplicated rows
# that appear once the other columns are gone.
df_key = (
    df_final
    .drop(columns=[
        'referenced_works',
        'title',
        'publication_date',
        'abstract',
        'author',
        'yearly_H_index_notincremental',
        'yearly_H_index_incremental',
        'monthly_H_index_incremental',
        'monthly_H_index_notincremental',
    ])
    .drop_duplicates()
)

With the function below, I compute a dataframe of indices based on keywords for all months and all combinations of technologies.

In [46]:
# One list of keyword indices per (year, month) group, stored in a one-column frame.
indices_key = (
    df_key
    .groupby(['year','month'])
    .apply(lambda month_rows: create_indices_keywords(month_rows,list_concepts))
    .to_frame()
)

Indices-keywords of proximity for 2012 in April were computed in 15.12 seconds.
Indices-keywords of proximity for 2012 in August were computed in 11.5 seconds.
Indices-keywords of proximity for 2012 in December were computed in 23.45 seconds.
Indices-keywords of proximity for 2012 in February were computed in 4.04 seconds.
Indices-keywords of proximity for 2012 in January were computed in 3.08 seconds.
Indices-keywords of proximity for 2012 in July were computed in 13.69 seconds.
Indices-keywords of proximity for 2012 in June were computed in 10.66 seconds.
Indices-keywords of proximity for 2012 in March were computed in 8.13 seconds.
Indices-keywords of proximity for 2012 in May were computed in 11.46 seconds.
Indices-keywords of proximity for 2012 in November were computed in 17.47 seconds.
Indices-keywords of proximity for 2012 in October were computed in 19.26 seconds.
Indices-keywords of proximity for 2012 in September were computed in 22.13 seconds.
Indices-keywords of proximity 

Indices-keywords of proximity for 2020 in June were computed in 58.42 seconds.
Indices-keywords of proximity for 2020 in March were computed in 41.39 seconds.
Indices-keywords of proximity for 2020 in May were computed in 52.11 seconds.
Indices-keywords of proximity for 2020 in November were computed in 58.63 seconds.
Indices-keywords of proximity for 2020 in October were computed in 83.6 seconds.
Indices-keywords of proximity for 2020 in September were computed in 59.28 seconds.
Indices-keywords of proximity for 2021 in April were computed in 59.0 seconds.
Indices-keywords of proximity for 2021 in August were computed in 61.44 seconds.
Indices-keywords of proximity for 2021 in December were computed in 54.04 seconds.
Indices-keywords of proximity for 2021 in February were computed in 47.46 seconds.
Indices-keywords of proximity for 2021 in January were computed in 30.82 seconds.
Indices-keywords of proximity for 2021 in July were computed in 77.62 seconds.
Indices-keywords of proximit

<div class="alert-info">
3.2. Restructuration of the dataframe
</div>

We now aim to turn the dataframe 'indices_key' into a dataframe as we want it (with columns and so on).

In [47]:
# Name the value column and turn the (year, month) index levels into columns.
# 'month' is moved out first and 'year' second, so 'year' ends up as the first column.
indices_key = (
    indices_key
    .rename(columns={0: "index_keyword"})
    .reset_index(level=['month'])
    .reset_index(level=['year'])
)

In [48]:
# Display the frame: one row per (year, month), each holding the list of keyword indices.
indices_key

Unnamed: 0,year,month,index_keyword
0,2012,April,"[1.0, 0.3330929145900216, nan, 0.0804865893049..."
1,2012,August,"[1.0, 0.5805385060564239, nan, nan, nan, 0.104..."
2,2012,December,"[1.0, 0.641932657840741, nan, 0.01276007006910..."
3,2012,February,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
4,2012,January,"[1.0, 0.8925331992313279, nan, nan, nan, nan, ..."
...,...,...,...
127,2022,March,"[1.0, 1.0864632778982648, 3.7354337046829937, ..."
128,2022,May,"[1.0, 1.0666825965716717, 4.151600527089536, 0..."
129,2022,November,"[1.0, 0.4313936962385364, 1.2933832391514775, ..."
130,2022,October,"[1.0, 0.9496540972600807, 3.307016794472871, 0..."


We need to create lists for the concepts for the columns we want to create.

In [49]:
# All ordered pairs of concepts, in the nested-loop order (first concept outer,
# second concept inner).
concept_pairs = [(first, second) for first in list_concepts for second in list_concepts]

# flat list alternating first and second concept of every pair
list_combination = [name for pair in concept_pairs for name in pair]
end = len(list_combination)-1

# first and second concept of every pair, as two parallel lists
first_listconcept = [first for first, _ in concept_pairs]
second_listconcept = [second for _, second in concept_pairs]

In [50]:
# Display the frame again; it has not been modified since the previous display.
indices_key

Unnamed: 0,year,month,index_keyword
0,2012,April,"[1.0, 0.3330929145900216, nan, 0.0804865893049..."
1,2012,August,"[1.0, 0.5805385060564239, nan, nan, nan, 0.104..."
2,2012,December,"[1.0, 0.641932657840741, nan, 0.01276007006910..."
3,2012,February,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
4,2012,January,"[1.0, 0.8925331992313279, nan, nan, nan, nan, ..."
...,...,...,...
127,2022,March,"[1.0, 1.0864632778982648, 3.7354337046829937, ..."
128,2022,May,"[1.0, 1.0666825965716717, 4.151600527089536, 0..."
129,2022,November,"[1.0, 0.4313936962385364, 1.2933832391514775, ..."
130,2022,October,"[1.0, 0.9496540972600807, 3.307016794472871, 0..."


We now explode the column 'index_keyword' and add the columns about the concepts.

In [51]:
# Number of (year, month) periods, derived from the data instead of a hard-coded
# constant: the concept lists must be repeated exactly once per period so that
# their total length matches the number of exploded rows.
n_periods_key = len(indices_key[['year','month']].drop_duplicates())

# One row per (period, pair of concepts); the values of each list are laid out
# in the same nested-loop order as first_listconcept / second_listconcept.
indices_key= indices_key.explode('index_keyword')
indices_key['concept1']=n_periods_key*first_listconcept
indices_key['concept2']=n_periods_key*second_listconcept

We are done with our last modification. We just verify and save the dataframe for later.

In [52]:
# Display the exploded frame: one row per (year, month, concept1, concept2).
indices_key

Unnamed: 0,year,month,index_keyword,concept1,concept2
0,2012,April,1.0,Authentication,Authentication
0,2012,April,0.333093,Authentication,Biometrics
0,2012,April,,Authentication,Blockchain
0,2012,April,0.080487,Authentication,Digital rights management
0,2012,April,0.593677,Authentication,Digital signature
...,...,...,...,...,...
131,2022,September,0.080302,Zero-knowlegde proof,Symmetric-key algorithm
131,2022,September,,Zero-knowlegde proof,Threshold cryptosystem
131,2022,September,0.125157,Zero-knowlegde proof,Trusted Computing
131,2022,September,,Zero-knowlegde proof,Tunneling protocol


In [53]:
# Save the keyword indices; the path is relative to the notebook's working directory.
indices_key.to_pickle('data_indices/indices_key_normalized')

# **Part 4 -- Proximity indices based on collaboration**

<div class="alert-info">
4.1. Computation of the indices
</div>

I select only the information I need, to reduce my database.

In [54]:
# Keep only what the collaboration indices need, then remove the duplicated rows
# that appear once the other columns are gone.
df_colab = (
    df_final
    .drop(columns=[
        'referenced_works',
        'title',
        'publication_date',
        'abstract',
        'keyword',
        'cosine_similarity',
    ])
    .drop_duplicates()
)

With the function below, I compute a dataframe of indices based on collaboration for all months and all combinations of technologies.

In [55]:
# One list of [not incremental, incremental] index pairs per (year, month) group,
# stored in a one-column frame.
indices_colab = (
    df_colab
    .groupby(['year','month'])
    .apply(lambda month_rows: create_indices_colab(month_rows,list_concepts))
    .to_frame()
)

Indices-colab of proximity for 2012 in April were computed in 4.43 seconds.
Indices-colab of proximity for 2012 in August were computed in 3.11 seconds.
Indices-colab of proximity for 2012 in December were computed in 6.88 seconds.
Indices-colab of proximity for 2012 in February were computed in 1.23 seconds.
Indices-colab of proximity for 2012 in January were computed in 1.58 seconds.
Indices-colab of proximity for 2012 in July were computed in 4.71 seconds.
Indices-colab of proximity for 2012 in June were computed in 2.6 seconds.
Indices-colab of proximity for 2012 in March were computed in 2.09 seconds.
Indices-colab of proximity for 2012 in May were computed in 3.54 seconds.
Indices-colab of proximity for 2012 in November were computed in 6.29 seconds.
Indices-colab of proximity for 2012 in October were computed in 5.83 seconds.
Indices-colab of proximity for 2012 in September were computed in 7.41 seconds.
Indices-colab of proximity for 2013 in April were computed in 5.5 seconds.


Indices-colab of proximity for 2020 in October were computed in 281.29 seconds.
Indices-colab of proximity for 2020 in September were computed in 133.33 seconds.
Indices-colab of proximity for 2021 in April were computed in 129.17 seconds.
Indices-colab of proximity for 2021 in August were computed in 136.69 seconds.
Indices-colab of proximity for 2021 in December were computed in 106.71 seconds.
Indices-colab of proximity for 2021 in February were computed in 132.1 seconds.
Indices-colab of proximity for 2021 in January were computed in 111.97 seconds.
Indices-colab of proximity for 2021 in July were computed in 148.8 seconds.
Indices-colab of proximity for 2021 in June were computed in 152.99 seconds.
Indices-colab of proximity for 2021 in March were computed in 168.14 seconds.
Indices-colab of proximity for 2021 in May were computed in 179.32 seconds.
Indices-colab of proximity for 2021 in November were computed in 121.62 seconds.
Indices-colab of proximity for 2021 in October were 

<div class="alert-info">
4.2. Restructuration of the dataframe
</div>

We now aim to turn the dataframe 'indices_colab' into a dataframe as we want it (with columns and so on).

In [56]:
# Name the value column and turn the (year, month) index levels into columns.
# 'month' is moved out first and 'year' second, so 'year' ends up as the first column.
indices_colab = (
    indices_colab
    .rename(columns={0: "indices"})
    .reset_index(level=['month'])
    .reset_index(level=['year'])
)

In [57]:
# Display the frame: one row per (year, month), each holding a list of index pairs.
indices_colab

Unnamed: 0,year,month,indices
0,2012,April,"[[0.9999999999999998, 0.9999999999999998], [0,..."
1,2012,August,"[[1.0000000000000002, 1.0000000000000002], [0...."
2,2012,December,"[[0.9999999999999999, 1.0000000000000004], [0...."
3,2012,February,"[[nan, nan], [nan, nan], [nan, nan], [nan, nan..."
4,2012,January,"[[0.9083671681705072, 0.9083671681705072], [0...."
...,...,...,...
127,2022,March,"[[0.9414605364102283, 0.8335131675890325], [0...."
128,2022,May,"[[0.98344013067591, 0.9881916632877544], [0.13..."
129,2022,November,"[[1.0, 1.0000000000000002], [0, 0], [0, 0], [n..."
130,2022,October,"[[0.9999999999999994, 1.0000000000000004], [0,..."


We need to create lists for the concepts for the columns we want to create.

In [58]:
# All ordered pairs of concepts, in the nested-loop order (first concept outer,
# second concept inner).
concept_pairs = [(first, second) for first in list_concepts for second in list_concepts]

# flat list alternating first and second concept of every pair
list_combination = [name for pair in concept_pairs for name in pair]
end = len(list_combination)-1

# first and second concept of every pair, as two parallel lists
first_listconcept = [first for first, _ in concept_pairs]
second_listconcept = [second for _, second in concept_pairs]

We now explode the column 'indices' and add the columns about the concepts.

In [59]:
# Number of (year, month) periods, derived from the data instead of a hard-coded
# constant: the concept lists must be repeated exactly once per period so that
# their total length matches the number of exploded rows.
n_periods_colab = len(indices_colab[['year','month']].drop_duplicates())

# One row per (period, pair of concepts); each row keeps its
# [not incremental, incremental] pair in the 'indices' column.
indices_colab= indices_colab.explode('indices')
indices_colab['concept1']= n_periods_colab*first_listconcept
indices_colab['concept2']= n_periods_colab*second_listconcept

In [60]:
# Display the exploded frame: one row per (year, month, concept1, concept2).
indices_colab

Unnamed: 0,year,month,indices,concept1,concept2
0,2012,April,"[0.9999999999999998, 0.9999999999999998]",Authentication,Authentication
0,2012,April,"[0, 0]",Authentication,Biometrics
0,2012,April,"[nan, nan]",Authentication,Blockchain
0,2012,April,"[0, 0]",Authentication,Digital rights management
0,2012,April,"[0, 0]",Authentication,Digital signature
...,...,...,...,...,...
131,2022,September,"[0, 0]",Zero-knowlegde proof,Symmetric-key algorithm
131,2022,September,"[nan, nan]",Zero-knowlegde proof,Threshold cryptosystem
131,2022,September,"[0, 0]",Zero-knowlegde proof,Trusted Computing
131,2022,September,"[nan, nan]",Zero-knowlegde proof,Tunneling protocol


We take all the indices, which is a list of lists.

In [61]:
# every element is a two-item list: [not incremental index, incremental index]
allindices = indices_colab['indices'].tolist()

In [62]:
# Position of the last element of allindices.
# NOTE(review): this value is not read by the following cells before `end` is
# reassigned further down; the cell looks like a leftover.
end = len(allindices)-1

We do a list of the indices based on not incremental monthly h-indices and the ones based on incremental monthly h-indices.

In [63]:
# split the pairs into two parallel lists: first items and second items
indices1 = []
indices2 = []
for pair in allindices:
    indices1.append(pair[0])
    indices2.append(pair[1])

We update our dataframe adding the wanted information and eliminating the other.

In [64]:
# replace the column of pairs by one column per index
indices_colab = (
    indices_colab
    .drop(columns='indices')
    .assign(index_colab_notincrem=indices1, index_colab_increm=indices2)
)

We are done with our last modification. We just verify and save the dataframe for later.

In [65]:
# Display the final collaboration frame, with one column per index.
indices_colab

Unnamed: 0,year,month,concept1,concept2,index_colab_notincrem,index_colab_increm
0,2012,April,Authentication,Authentication,1.0,1.0
0,2012,April,Authentication,Biometrics,0.0,0.0
0,2012,April,Authentication,Blockchain,,
0,2012,April,Authentication,Digital rights management,0.0,0.0
0,2012,April,Authentication,Digital signature,0.0,0.0
...,...,...,...,...,...,...
131,2022,September,Zero-knowlegde proof,Symmetric-key algorithm,0.0,0.0
131,2022,September,Zero-knowlegde proof,Threshold cryptosystem,,
131,2022,September,Zero-knowlegde proof,Trusted Computing,0.0,0.0
131,2022,September,Zero-knowlegde proof,Tunneling protocol,,


In [66]:
# Save the collaboration indices; the path is relative to the notebook's working directory.
indices_colab.to_pickle('data_indices/indices_colab_normalized')

# **Part 5 -- Proximity indices based on citations**

<div class="alert-info">
5.1. Computation of the indices
</div>

I select only the information I need, to reduce my database.

In [67]:
# Keep only what the citation indices need, then remove the duplicated rows
# that appear once the other columns are gone.
df_cit = (
    df_final
    .drop(columns=[
        'keyword',
        'author',
        'cosine_similarity',
        'title',
        'publication_date',
        'abstract',
        'yearly_H_index_notincremental',
        'yearly_H_index_incremental',
        'monthly_H_index_incremental',
        'monthly_H_index_notincremental',
    ])
    .drop_duplicates()
)

I select only the information I need from my auxiliary dataframe for the referenced works, to reduce my database.

In [68]:
# Lookup table for the referenced works: drop the columns that the citation
# indices do not read, then remove the duplicated rows.
df_ref_help = (
    df_auxiliary_ref
    .drop(columns=['year','month','author','title','publication_date','abstract'])
    .drop_duplicates()
)
df_ref_help

Unnamed: 0,paper,referenced_works,Authentication,Biometrics,Blockchain,Differential Privacy,Digital rights management,Digital signature,Disk Encryption,Distributed algorithm,...,Post-quantum cryptography,Public-key cryptography,Quantum cryptography,Quantum key distribution,Random number generation,Symmetric-key algorithm,Threshold cryptosystem,Trusted Computing,Tunneling protocol,Zero-knowlegde proof
0,https://openalex.org/W1498573661,https://openalex.org/W1481832011,0.592982,0.779361,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1,https://openalex.org/W1498573661,https://openalex.org/W1931880689,0.592982,0.779361,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2,https://openalex.org/W1498573661,https://openalex.org/W1993920884,0.592982,0.779361,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
3,https://openalex.org/W1498573661,https://openalex.org/W2005461933,0.592982,0.779361,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
4,https://openalex.org/W1498573661,https://openalex.org/W2018300500,0.592982,0.779361,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2484592,https://openalex.org/W4310022506,https://openalex.org/W4210655227,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.793215
2484629,https://openalex.org/W4310384508,,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.800074
2484630,https://openalex.org/W59628553,,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.978435
2484634,https://openalex.org/W748367845,,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.419473


With the function below, I compute a dataframe of indices based on citations for all months and all combinations of technologies.

In [69]:
# Display the input frame of the citation indices: one row per (paper, referenced work).
df_cit

Unnamed: 0,paper,referenced_works,year,month,Authentication,Biometrics,Blockchain,Differential Privacy,Digital rights management,Digital signature,...,Post-quantum cryptography,Public-key cryptography,Quantum cryptography,Quantum key distribution,Random number generation,Symmetric-key algorithm,Threshold cryptosystem,Trusted Computing,Tunneling protocol,Zero-knowlegde proof
0,https://openalex.org/W1000206743,https://openalex.org/W2116665839,2014,November,0.0,0.000000,0.0,0.0,0.0,0.821057,...,0.0,0.595266,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
0,https://openalex.org/W1000206743,https://openalex.org/W2182097415,2014,November,0.0,0.000000,0.0,0.0,0.0,0.821057,...,0.0,0.595266,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
0,https://openalex.org/W1000206743,https://openalex.org/W2281564060,2014,November,0.0,0.000000,0.0,0.0,0.0,0.821057,...,0.0,0.595266,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
15,https://openalex.org/W1000603767,https://openalex.org/W2063231592,2013,February,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.747658,0.0,0.0,0.0,0.0,0.0,0.0
15,https://openalex.org/W1000603767,https://openalex.org/W2082249447,2013,February,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.747658,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1544679,https://openalex.org/W996094322,https://openalex.org/W1487450316,2013,November,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.250971,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1544679,https://openalex.org/W996094322,https://openalex.org/W1554262711,2013,November,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.250971,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1544694,https://openalex.org/W997299233,https://openalex.org/W2803380720,2021,January,0.0,0.631994,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1544704,https://openalex.org/W998997404,https://openalex.org/W2014022156,2015,December,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.631673,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [83]:
# One list of [t1 -> t2, t2 -> t1] index pairs per (year, month) group,
# stored in a one-column frame.
indices_cit = (
    df_cit
    .groupby(['year','month'])
    .apply(lambda month_rows: create_indices_cit(month_rows,list_concepts,df_ref_help))
    .to_frame()
)

Indices-cit of proximity for 2012 in April were computed in 3.06 seconds.
Indices-cit of proximity for 2012 in August were computed in 9.24 seconds.
Indices-cit of proximity for 2012 in December were computed in 2.78 seconds.
Indices-cit of proximity for 2012 in February were computed in 2.21 seconds.
Indices-cit of proximity for 2012 in January were computed in 2.34 seconds.
Indices-cit of proximity for 2012 in July were computed in 1.62 seconds.
Indices-cit of proximity for 2012 in June were computed in 1.9 seconds.
Indices-cit of proximity for 2012 in March were computed in 2.15 seconds.
Indices-cit of proximity for 2012 in May were computed in 13.65 seconds.
Indices-cit of proximity for 2012 in November were computed in 1.95 seconds.
Indices-cit of proximity for 2012 in October were computed in 6.03 seconds.
Indices-cit of proximity for 2012 in September were computed in 4.66 seconds.
Indices-cit of proximity for 2013 in April were computed in 3.01 seconds.
Indices-cit of proximity

Indices-cit of proximity for 2021 in August were computed in 103.8 seconds.
Indices-cit of proximity for 2021 in December were computed in 59.78 seconds.
Indices-cit of proximity for 2021 in February were computed in 117.57 seconds.
Indices-cit of proximity for 2021 in January were computed in 207.47 seconds.
Indices-cit of proximity for 2021 in July were computed in 196.78 seconds.
Indices-cit of proximity for 2021 in June were computed in 60.31 seconds.
Indices-cit of proximity for 2021 in March were computed in 108.21 seconds.
Indices-cit of proximity for 2021 in May were computed in 114.42 seconds.
Indices-cit of proximity for 2021 in November were computed in 48.75 seconds.
Indices-cit of proximity for 2021 in October were computed in 138.79 seconds.
Indices-cit of proximity for 2021 in September were computed in 123.48 seconds.
Indices-cit of proximity for 2022 in April were computed in 14.04 seconds.
Indices-cit of proximity for 2022 in August were computed in 30.93 seconds.
Ind

<div class="alert-info">
5.2. Restructuration of the dataframe
</div>

We now aim to turn the dataframe 'indices_cit' into a dataframe as we want it (with columns and so on).

In [85]:
# Name the value column and turn the (year, month) index levels into columns.
# 'month' is moved out first and 'year' second, so 'year' ends up as the first column.
indices_cit = (
    indices_cit
    .rename(columns={0: "indices"})
    .reset_index(level=['month'])
    .reset_index(level=['year'])
)

In [86]:
# Display the frame: one row per (year, month), each holding a list of index pairs.
indices_cit

Unnamed: 0,year,month,indices
0,2012,April,"[[0, 0], [0, 0], [nan, nan], [0, 0], [0, 0], [..."
1,2012,August,"[[0, 0], [0, 0], [nan, nan], [nan, nan], [nan,..."
2,2012,December,"[[1.501524625, 1.501524625], [0, 0], [nan, nan..."
3,2012,February,"[[nan, nan], [nan, nan], [nan, nan], [nan, nan..."
4,2012,January,"[[6.904018749999999, 6.904018749999999], [0, 0..."
...,...,...,...
127,2022,March,"[[0, 0], [0, 0], [0, 0], [nan, nan], [0, 0], [..."
128,2022,May,"[[0, 0], [0, 0], [0, 1.432623085], [0, 0], [0,..."
129,2022,November,"[[0, 0], [0, 0], [0, 0], [nan, nan], [0, 0], [..."
130,2022,October,"[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0..."


We need to create lists for the concepts for the columns we want to create.

In [87]:
# All ordered pairs of concepts, in the nested-loop order (first concept outer,
# second concept inner).
concept_pairs = [(first, second) for first in list_concepts for second in list_concepts]

# flat list alternating first and second concept of every pair
list_combination = [name for pair in concept_pairs for name in pair]
end = len(list_combination)-1

# first and second concept of every pair, as two parallel lists
first_listconcept = [first for first, _ in concept_pairs]
second_listconcept = [second for _, second in concept_pairs]

We now explode the column 'indices' and add the columns about the concepts.

In [88]:
# Number of (year, month) periods, derived from the data instead of a hard-coded
# constant: the concept lists must be repeated exactly once per period so that
# their total length matches the number of exploded rows.
n_periods_cit = len(indices_cit[['year','month']].drop_duplicates())

# One row per (period, pair of concepts); each row keeps its
# [t1 -> t2, t2 -> t1] pair in the 'indices' column.
indices_cit= indices_cit.explode('indices')
indices_cit['concept1']=n_periods_cit*first_listconcept
indices_cit['concept2']=n_periods_cit*second_listconcept

We take all the indices, which is a list of lists.

In [89]:
# every element is a two-item list: [index t1 -> t2, index t2 -> t1]
allindices = indices_cit['indices'].tolist()

We split it into two lists: the citation indices from technology 1 to technology 2, and the citation indices from technology 2 to technology 1.

In [90]:
# split the pairs into two parallel lists: first items and second items
indices1 = []
indices2 = []
for pair in allindices:
    indices1.append(pair[0])
    indices2.append(pair[1])

We update our dataframe adding the wanted information and eliminating the other.

In [91]:
# the column of pairs is no longer needed once it has been split into two lists
indices_cit = indices_cit.drop(columns='indices')

In [92]:
# one column per direction of citation
indices_cit = indices_cit.assign(
    index_cit_t1_t2=indices1,
    index_cit_t2_t1=indices2,
)

We are done with our last modification. We just verify and save the dataframe for later.

In [93]:
# Display the final citation frame, with one column per direction of citation.
indices_cit

Unnamed: 0,year,month,concept1,concept2,index_cit_t1_t2,index_cit_t2_t1
0,2012,April,Authentication,Authentication,0.0,0.0
0,2012,April,Authentication,Biometrics,0.0,0.0
0,2012,April,Authentication,Blockchain,,
0,2012,April,Authentication,Digital rights management,0.0,0.0
0,2012,April,Authentication,Digital signature,0.0,0.0
...,...,...,...,...,...,...
131,2022,September,Zero-knowlegde proof,Symmetric-key algorithm,0.0,0.0
131,2022,September,Zero-knowlegde proof,Threshold cryptosystem,,
131,2022,September,Zero-knowlegde proof,Trusted Computing,0.0,0.0
131,2022,September,Zero-knowlegde proof,Tunneling protocol,,


In [94]:
# Save the citation indices; the path is relative to the notebook's working directory.
# These indices are plain sums (see create_indices_cit), hence 'notnormalized'.
indices_cit.to_pickle('data_indices/indices_cit_notnormalized')

# **Part 6 -- Merging the data altogether**

In [95]:
# Display the citation indices before merging.
indices_cit

Unnamed: 0,year,month,concept1,concept2,index_cit_t1_t2,index_cit_t2_t1
0,2012,April,Authentication,Authentication,0.0,0.0
0,2012,April,Authentication,Biometrics,0.0,0.0
0,2012,April,Authentication,Blockchain,,
0,2012,April,Authentication,Digital rights management,0.0,0.0
0,2012,April,Authentication,Digital signature,0.0,0.0
...,...,...,...,...,...,...
131,2022,September,Zero-knowlegde proof,Symmetric-key algorithm,0.0,0.0
131,2022,September,Zero-knowlegde proof,Threshold cryptosystem,,
131,2022,September,Zero-knowlegde proof,Trusted Computing,0.0,0.0
131,2022,September,Zero-knowlegde proof,Tunneling protocol,,


In [96]:
# Display the keyword indices before merging.
indices_key

Unnamed: 0,year,month,index_keyword,concept1,concept2
0,2012,April,1.0,Authentication,Authentication
0,2012,April,0.333093,Authentication,Biometrics
0,2012,April,,Authentication,Blockchain
0,2012,April,0.080487,Authentication,Digital rights management
0,2012,April,0.593677,Authentication,Digital signature
...,...,...,...,...,...
131,2022,September,0.080302,Zero-knowlegde proof,Symmetric-key algorithm
131,2022,September,,Zero-knowlegde proof,Threshold cryptosystem
131,2022,September,0.125157,Zero-knowlegde proof,Trusted Computing
131,2022,September,,Zero-knowlegde proof,Tunneling protocol


In [97]:
# Display the collaboration indices before merging.
indices_colab

Unnamed: 0,year,month,concept1,concept2,index_colab_notincrem,index_colab_increm
0,2012,April,Authentication,Authentication,1.0,1.0
0,2012,April,Authentication,Biometrics,0.0,0.0
0,2012,April,Authentication,Blockchain,,
0,2012,April,Authentication,Digital rights management,0.0,0.0
0,2012,April,Authentication,Digital signature,0.0,0.0
...,...,...,...,...,...,...
131,2022,September,Zero-knowlegde proof,Symmetric-key algorithm,0.0,0.0
131,2022,September,Zero-knowlegde proof,Threshold cryptosystem,,
131,2022,September,Zero-knowlegde proof,Trusted Computing,0.0,0.0
131,2022,September,Zero-knowlegde proof,Tunneling protocol,,


In [98]:
# Add the keyword index to the citation indices, matching rows on the period and
# the pair of concepts; the right join keeps every row of indices_key.
dfintermed = indices_cit.merge(
    indices_key,
    how='right',
    on=['year','month','concept1','concept2'],
)
dfintermed

Unnamed: 0,year,month,concept1,concept2,index_cit_t1_t2,index_cit_t2_t1,index_keyword
0,2012,April,Authentication,Authentication,0.0,0.0,1.0
1,2012,April,Authentication,Biometrics,0.0,0.0,0.333093
2,2012,April,Authentication,Blockchain,,,
3,2012,April,Authentication,Digital rights management,0.0,0.0,0.080487
4,2012,April,Authentication,Digital signature,0.0,0.0,0.593677
...,...,...,...,...,...,...,...
82495,2022,September,Zero-knowlegde proof,Symmetric-key algorithm,0.0,0.0,0.080302
82496,2022,September,Zero-knowlegde proof,Threshold cryptosystem,,,
82497,2022,September,Zero-knowlegde proof,Trusted Computing,0.0,0.0,0.125157
82498,2022,September,Zero-knowlegde proof,Tunneling protocol,,,


In [99]:
# Add the collaboration indices, matching rows on the period and the pair of
# concepts; the right join keeps every row of indices_colab.
dfindices = dfintermed.merge(
    indices_colab,
    how='right',
    on=['year','month','concept1','concept2'],
)
dfindices

Unnamed: 0,year,month,concept1,concept2,index_cit_t1_t2,index_cit_t2_t1,index_keyword,index_colab_notincrem,index_colab_increm
0,2012,April,Authentication,Authentication,0.0,0.0,1.0,1.0,1.0
1,2012,April,Authentication,Biometrics,0.0,0.0,0.333093,0.0,0.0
2,2012,April,Authentication,Blockchain,,,,,
3,2012,April,Authentication,Digital rights management,0.0,0.0,0.080487,0.0,0.0
4,2012,April,Authentication,Digital signature,0.0,0.0,0.593677,0.0,0.0
...,...,...,...,...,...,...,...,...,...
82495,2022,September,Zero-knowlegde proof,Symmetric-key algorithm,0.0,0.0,0.080302,0.0,0.0
82496,2022,September,Zero-knowlegde proof,Threshold cryptosystem,,,,,
82497,2022,September,Zero-knowlegde proof,Trusted Computing,0.0,0.0,0.125157,0.0,0.0
82498,2022,September,Zero-knowlegde proof,Tunneling protocol,,,,,


In [100]:
# Save the merged indices; the path is relative to the notebook's working directory.
# NOTE(review): the file name says 'normalized', but the citation columns come from
# the frame saved as 'indices_cit_notnormalized' — confirm the name is intended.
dfindices.to_pickle('data_indices/dfindices_normalized')