# Creation of the indices of proximity: second version

Importing the necessary libraries.

In [30]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm, Normalize
from matplotlib.ticker import MaxNLocator
import seaborn as sns
from scipy import stats
import re
import nltk
import missingno as msno
from tqdm import tqdm
import json
import statistics
import math
import time

nltk.download("stopwords", quiet=True)
from nltk.corpus import stopwords
from gensim.models import Phrases
from collections import Counter

# enabling Jupyter Lab to include the output of our plots directly in this notebook
%matplotlib inline


# used to avoid blurry output plots in Jupyter Notebooks
%config InLineBackend.figure_format = "retina"

pd.options.mode.chained_assignment = None

Loading the pickled dataframes I will use.

In [31]:
# Load the extended dataframe. The context manager closes the file even when
# unpickling raises, which the manual open()/close() pair did not guarantee.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('../exploratory_analysis/data_exploratory_analysis/df_full_extended','rb') as infile_df_full_extended:
    df_full_extended = pickle.load(infile_df_full_extended)

In [32]:
# Load the full dataframe. The context manager closes the file even when
# unpickling raises, which the manual open()/close() pair did not guarantee.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('../creation_data_and_variables/data_creation_variables/df_full','rb') as infile_df_full:
    df_full = pickle.load(infile_df_full)

# **Part 1 -- Processing the data**

<div class="alert-info">
1.1 Preparation dataset "df_full_extended"
</div>

I cast the two score columns of the dataframe "df_full_extended" to float for the computations that follow.

In [33]:
# Both score columns are needed as floats by the computations further down.
for numeric_col in ('cosine_similarity', 'score_concepts'):
    df_full_extended[numeric_col] = df_full_extended[numeric_col].astype(float)

I check the dtypes of the columns.

In [34]:
# Show the dtype of every column to confirm the two casts to float took effect.
df_full_extended.dtypes

paper                              object
keyword                            object
cosine_similarity                 float64
title                              object
publication_date                   object
abstract                           object
year                                int64
month                              object
author                             object
referenced_works                   object
concepts                           object
score_concepts                    float64
yearly_H_index_notincremental     float64
yearly_H_index_incremental        float64
monthly_H_index_incremental       float64
monthly_H_index_notincremental    float64
dtype: object

Everything is fine just as expected. Now I compute the mean of my numerical variables, over months and years, as an example.

In [35]:
# Average the numeric columns per (year, month).
# numeric_only=True is required on pandas >= 2.0: the frame also holds object
# columns (title, abstract, ...) and the default no longer drops them silently,
# so a bare .mean() raises a TypeError there.
df_full_extended.groupby(['year','month']).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,cosine_similarity,score_concepts,yearly_H_index_notincremental,yearly_H_index_incremental,monthly_H_index_incremental,monthly_H_index_notincremental
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2002,April,0.336798,0.018752,0.325359,0.325359,0.162679,0.090909
2002,August,0.355786,0.019191,0.494118,0.494118,0.282353,0.200000
2002,December,0.348507,0.023473,0.308458,0.308458,0.308458,0.189055
2002,February,0.359337,0.025975,0.540541,0.540541,0.081081,0.027027
2002,January,0.354044,0.031817,0.302885,0.302885,0.086538,0.086538
...,...,...,...,...,...,...,...
2022,March,0.342014,0.027173,0.698236,1.147621,0.862750,0.181592
2022,May,0.341304,0.027530,0.380178,0.573609,0.419654,0.100426
2022,November,0.342051,0.025757,0.397678,0.725012,0.723638,0.180600
2022,October,0.344576,0.025700,0.441514,0.768125,0.736073,0.175695


I now want one column per concept holding the score of attribution, instead of a single pair of columns 'concepts' / 'score_concepts' with one row per paper and concept.
I create an auxiliary dataframe to achieve this.

In [36]:
# Distinct (paper, concept, score) triples of the extended frame.
dfhelp = df_full_extended.loc[:, ['paper', 'concepts', 'score_concepts']].drop_duplicates()

In [37]:
# Wide layout: one row per paper, one column per concept, cell = attribution score.
dfhelp = dfhelp.pivot(index='paper', columns='concepts', values='score_concepts').reset_index()

In [38]:
# Display the pivoted frame: one row per paper, one score column per concept.
dfhelp

concepts,paper,Authentication protocole,Biometrics,Blockchain,Differential Privacy,Digital rights management,Digital signature,Disk Encryption,Distributed algorithm,Electronic voting,...,Post-quantum cryptography,Public-key cryptography,Quantum cryptography,Quantum key distribution,Random number generation,Symmetric-key algorithm,Threshold cryptosystem,Trusted Computing,Tunneling protocol,Zero-knowlegde proof
0,https://openalex.org/W1000018889,0.0,0.0000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.430285,0.0,0.000000
1,https://openalex.org/W100004108,0.0,0.7463,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
2,https://openalex.org/W1000206743,0.0,0.0000,0.0,0.0,0.000000,0.821057,0.0,0.0,0.0,...,0.0,0.595266,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
3,https://openalex.org/W1000368397,0.0,0.0000,0.0,0.0,0.626143,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
4,https://openalex.org/W1000558944,0.0,0.0000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.255347,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135443,https://openalex.org/W998997404,0.0,0.0000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.631673,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
135444,https://openalex.org/W99938045,0.0,0.0000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.775669
135445,https://openalex.org/W999405428,0.0,0.0000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.423747,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
135446,https://openalex.org/W999768700,0.0,0.0000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.200094,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000


Now I want to merge this dataframe with another frame, from which the columns 'concepts', 'score_concepts' and 'referenced_works' were deleted, so that I obtain one big dataframe with the concepts as columns.

In [39]:
# Everything except the concept and reference columns, without repeated rows.
dftomerge = (
    df_full_extended
    .drop(columns=['concepts', 'score_concepts', 'referenced_works'])
    .drop_duplicates()
)

In [40]:
# Attach the per-concept score columns to every remaining row of the same paper.
dfintermed = dftomerge.merge(dfhelp, how='inner', on='paper')
dfintermed

Unnamed: 0,paper,keyword,cosine_similarity,title,publication_date,abstract,year,month,author,yearly_H_index_notincremental,...,Post-quantum cryptography,Public-key cryptography,Quantum cryptography,Quantum key distribution,Random number generation,Symmetric-key algorithm,Threshold cryptosystem,Trusted Computing,Tunneling protocol,Zero-knowlegde proof
0,https://openalex.org/W1000018889,enforc,0.3738,Remote Policy Enforcement for Trusted Applicat...,2013-12-04,"Both in the cloud and mobile environments, a ...",2013,December,https://openalex.org/A2107438709,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.430285,0.0,0.0
1,https://openalex.org/W1000018889,servic,0.3339,Remote Policy Enforcement for Trusted Applicat...,2013-12-04,"Both in the cloud and mobile environments, a ...",2013,December,https://openalex.org/A2107438709,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.430285,0.0,0.0
2,https://openalex.org/W1000018889,framework,0.3110,Remote Policy Enforcement for Trusted Applicat...,2013-12-04,"Both in the cloud and mobile environments, a ...",2013,December,https://openalex.org/A2107438709,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.430285,0.0,0.0
3,https://openalex.org/W1000018889,trust,0.2798,Remote Policy Enforcement for Trusted Applicat...,2013-12-04,"Both in the cloud and mobile environments, a ...",2013,December,https://openalex.org/A2107438709,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.430285,0.0,0.0
4,https://openalex.org/W1000018889,smartphon,0.2749,Remote Policy Enforcement for Trusted Applicat...,2013-12-04,"Both in the cloud and mobile environments, a ...",2013,December,https://openalex.org/A2107438709,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.430285,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2213914,https://openalex.org/W999914091,keywordsimag,0.4641,Indexing for Image Retrieval: A Machine Learni...,2013-03-09,"AbstractIn this paper, we explore the use of ...",2013,March,https://openalex.org/A2210425818,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2213915,https://openalex.org/W999914091,retrievalhash,0.4246,Indexing for Image Retrieval: A Machine Learni...,2013-03-09,"AbstractIn this paper, we explore the use of ...",2013,March,https://openalex.org/A2210425818,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2213916,https://openalex.org/W999914091,index,0.4054,Indexing for Image Retrieval: A Machine Learni...,2013-03-09,"AbstractIn this paper, we explore the use of ...",2013,March,https://openalex.org/A2210425818,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2213917,https://openalex.org/W999914091,algorithm,0.3436,Indexing for Image Retrieval: A Machine Learni...,2013-03-09,"AbstractIn this paper, we explore the use of ...",2013,March,https://openalex.org/A2210425818,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


I've got everything except the column 'referenced_works'. Again, I do a merge to obtain the final dataframe I am aiming at.

In [41]:
# One row per paper carrying its 'referenced_works' value (first occurrence kept).
dfref = df_full_extended.loc[:, ['paper', 'referenced_works']].drop_duplicates(subset=['paper'])

In [42]:
# Bring 'referenced_works' back next to the keyword rows and concept columns.
dfall = dfref.merge(dfintermed, how='inner', on='paper')
dfall

Unnamed: 0,paper,referenced_works,keyword,cosine_similarity,title,publication_date,abstract,year,month,author,...,Post-quantum cryptography,Public-key cryptography,Quantum cryptography,Quantum key distribution,Random number generation,Symmetric-key algorithm,Threshold cryptosystem,Trusted Computing,Tunneling protocol,Zero-knowlegde proof
0,https://openalex.org/W1000018889,"[https://openalex.org/W1542792105, https://ope...",enforc,0.3738,Remote Policy Enforcement for Trusted Applicat...,2013-12-04,"Both in the cloud and mobile environments, a ...",2013,December,https://openalex.org/A2107438709,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.430285,0.0,0.0
1,https://openalex.org/W1000018889,"[https://openalex.org/W1542792105, https://ope...",servic,0.3339,Remote Policy Enforcement for Trusted Applicat...,2013-12-04,"Both in the cloud and mobile environments, a ...",2013,December,https://openalex.org/A2107438709,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.430285,0.0,0.0
2,https://openalex.org/W1000018889,"[https://openalex.org/W1542792105, https://ope...",framework,0.3110,Remote Policy Enforcement for Trusted Applicat...,2013-12-04,"Both in the cloud and mobile environments, a ...",2013,December,https://openalex.org/A2107438709,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.430285,0.0,0.0
3,https://openalex.org/W1000018889,"[https://openalex.org/W1542792105, https://ope...",trust,0.2798,Remote Policy Enforcement for Trusted Applicat...,2013-12-04,"Both in the cloud and mobile environments, a ...",2013,December,https://openalex.org/A2107438709,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.430285,0.0,0.0
4,https://openalex.org/W1000018889,"[https://openalex.org/W1542792105, https://ope...",smartphon,0.2749,Remote Policy Enforcement for Trusted Applicat...,2013-12-04,"Both in the cloud and mobile environments, a ...",2013,December,https://openalex.org/A2107438709,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.430285,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2213914,https://openalex.org/W999914091,"[https://openalex.org/W1968781463, https://ope...",keywordsimag,0.4641,Indexing for Image Retrieval: A Machine Learni...,2013-03-09,"AbstractIn this paper, we explore the use of ...",2013,March,https://openalex.org/A2210425818,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2213915,https://openalex.org/W999914091,"[https://openalex.org/W1968781463, https://ope...",retrievalhash,0.4246,Indexing for Image Retrieval: A Machine Learni...,2013-03-09,"AbstractIn this paper, we explore the use of ...",2013,March,https://openalex.org/A2210425818,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2213916,https://openalex.org/W999914091,"[https://openalex.org/W1968781463, https://ope...",index,0.4054,Indexing for Image Retrieval: A Machine Learni...,2013-03-09,"AbstractIn this paper, we explore the use of ...",2013,March,https://openalex.org/A2210425818,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2213917,https://openalex.org/W999914091,"[https://openalex.org/W1968781463, https://ope...",algorithm,0.3436,Indexing for Image Retrieval: A Machine Learni...,2013-03-09,"AbstractIn this paper, we explore the use of ...",2013,March,https://openalex.org/A2210425818,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


I explode the column 'referenced_works', in order to have the complete dataframe I am aiming at.

In [43]:
# Give every element of 'referenced_works' its own row. explode() repeats the
# original row label on each produced row, so the index of df_final is not unique.
df_final = dfall.explode('referenced_works')

I now have the final dataframe that I will use to compute my indices of proximity.

In [44]:
# Display the exploded frame (one row per referenced work of each original row).
df_final

Unnamed: 0,paper,referenced_works,keyword,cosine_similarity,title,publication_date,abstract,year,month,author,...,Post-quantum cryptography,Public-key cryptography,Quantum cryptography,Quantum key distribution,Random number generation,Symmetric-key algorithm,Threshold cryptosystem,Trusted Computing,Tunneling protocol,Zero-knowlegde proof
0,https://openalex.org/W1000018889,https://openalex.org/W1542792105,enforc,0.3738,Remote Policy Enforcement for Trusted Applicat...,2013-12-04,"Both in the cloud and mobile environments, a ...",2013,December,https://openalex.org/A2107438709,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.430285,0.0,0.0
0,https://openalex.org/W1000018889,https://openalex.org/W1821135345,enforc,0.3738,Remote Policy Enforcement for Trusted Applicat...,2013-12-04,"Both in the cloud and mobile environments, a ...",2013,December,https://openalex.org/A2107438709,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.430285,0.0,0.0
0,https://openalex.org/W1000018889,https://openalex.org/W2055456945,enforc,0.3738,Remote Policy Enforcement for Trusted Applicat...,2013-12-04,"Both in the cloud and mobile environments, a ...",2013,December,https://openalex.org/A2107438709,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.430285,0.0,0.0
0,https://openalex.org/W1000018889,https://openalex.org/W2145187482,enforc,0.3738,Remote Policy Enforcement for Trusted Applicat...,2013-12-04,"Both in the cloud and mobile environments, a ...",2013,December,https://openalex.org/A2107438709,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.430285,0.0,0.0
1,https://openalex.org/W1000018889,https://openalex.org/W1542792105,servic,0.3339,Remote Policy Enforcement for Trusted Applicat...,2013-12-04,"Both in the cloud and mobile environments, a ...",2013,December,https://openalex.org/A2107438709,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.430285,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2213918,https://openalex.org/W999914091,https://openalex.org/W2038276547,imagemean,0.3342,Indexing for Image Retrieval: A Machine Learni...,2013-03-09,"AbstractIn this paper, we explore the use of ...",2013,March,https://openalex.org/A2210425818,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2213918,https://openalex.org/W999914091,https://openalex.org/W2085511467,imagemean,0.3342,Indexing for Image Retrieval: A Machine Learni...,2013-03-09,"AbstractIn this paper, we explore the use of ...",2013,March,https://openalex.org/A2210425818,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2213918,https://openalex.org/W999914091,https://openalex.org/W2099907898,imagemean,0.3342,Indexing for Image Retrieval: A Machine Learni...,2013-03-09,"AbstractIn this paper, we explore the use of ...",2013,March,https://openalex.org/A2210425818,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2213918,https://openalex.org/W999914091,https://openalex.org/W2115467209,imagemean,0.3342,Indexing for Image Retrieval: A Machine Learni...,2013-03-09,"AbstractIn this paper, we explore the use of ...",2013,March,https://openalex.org/A2210425818,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


Lastly, I save this dataframe, so that I do not have to redo the computations each time I need it.

In [45]:
# Persist the exploded frame so it can be reloaded later instead of being rebuilt.
# NOTE(review): assumes the 'data_indices' directory already exists; confirm.
df_final.to_pickle('data_indices/df_computations_indices')

<div class="alert-info">
1.2 Preparation auxiliary dataset for references
</div>

Now I want to do the same with df_full to use it as source of information for the cited papers. In fact, I deleted many papers from the set of papers I am studying, because there were no referenced works, several ids for the same paper, etc. All these papers or ids are still part of the set of referenced_works. For this reason, I need the full dataset to have information about these papers.

I rename one column, to make everything more uniform among my datasets.

In [46]:
# Name the key column 'paper', as in df_full_extended. This mutates df_full in place.
df_full.rename(columns={"id": "paper"}, inplace=True)

Again, I will reshape my dataset so that each concept becomes a column that directly holds the score of attribution.

In [47]:
# Distinct (paper, concept, score) triples of the full frame.
dfhelp = df_full.loc[:, ['paper', 'concepts', 'score_concepts']].drop_duplicates()

In [48]:
# Cast the scores to float before pivoting them into one column per concept.
dfhelp['score_concepts']=dfhelp['score_concepts'].astype(float)

In [49]:
# Wide layout: one row per paper, one column per concept, cell = attribution score.
dfhelp = dfhelp.pivot(index='paper', columns='concepts', values='score_concepts').reset_index()

In [50]:
# Display the pivoted frame built from df_full: one row per paper, one score column per concept.
dfhelp

concepts,paper,Authentication protocole,Biometrics,Blockchain,Differential Privacy,Digital rights management,Digital signature,Disk Encryption,Distributed algorithm,Electronic voting,...,Post-quantum cryptography,Public-key cryptography,Quantum cryptography,Quantum key distribution,Random number generation,Symmetric-key algorithm,Threshold cryptosystem,Trusted Computing,Tunneling protocol,Zero-knowlegde proof
0,https://openalex.org/W1000018889,0.000000,0.0000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.430285,0.0,0.0
1,https://openalex.org/W100004108,0.000000,0.7463,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0
2,https://openalex.org/W1000101879,0.431697,0.0000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0
3,https://openalex.org/W1000145799,0.000000,0.0000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.336889,0.000000,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0
4,https://openalex.org/W1000206743,0.000000,0.0000,0.0,0.0,0.0,0.821057,0.0,0.0,0.0,...,0.0,0.595266,0.000000,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288696,https://openalex.org/W999574700,0.000000,0.0000,0.0,0.0,0.0,0.644459,0.0,0.0,0.0,...,0.0,0.641884,0.000000,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0
288697,https://openalex.org/W99960847,0.000000,0.0000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.865002,0.60725,0.0,0.0,0.0,0.000000,0.0,0.0
288698,https://openalex.org/W999768700,0.000000,0.0000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.200094,0.000000,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0
288699,https://openalex.org/W999914091,0.000000,0.0000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0


In [51]:
# Everything except the two concept columns, without repeated rows.
dftomerge = (
    df_full
    .drop(columns=['concepts', 'score_concepts'])
    .drop_duplicates()
)

In [52]:
# Lookup frame for cited papers: metadata rows plus one score column per concept.
df_auxiliary_ref = dftomerge.merge(dfhelp, how='inner', on='paper')
df_auxiliary_ref

Unnamed: 0,paper,title,publication_date,author,referenced_works,abstract,year,month,Authentication protocole,Biometrics,...,Post-quantum cryptography,Public-key cryptography,Quantum cryptography,Quantum key distribution,Random number generation,Symmetric-key algorithm,Threshold cryptosystem,Trusted Computing,Tunneling protocol,Zero-knowlegde proof
0,https://openalex.org/W1635413415,The Kerberos Network Authentication Service (V5),2005-07-01,https://openalex.org/A2974782682,,This document gives an overview and specifica...,2005,July,0.473908,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1,https://openalex.org/W1635413415,The Kerberos Network Authentication Service (V5),2005-07-01,https://openalex.org/A2120412456,,This document gives an overview and specifica...,2005,July,0.473908,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2,https://openalex.org/W2132391275,Diameter Base Protocol,2003-09-01,https://openalex.org/A2431458385,,The Diameter base protocol is intended to pro...,2003,September,0.535038,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
3,https://openalex.org/W2132391275,Diameter Base Protocol,2003-09-01,https://openalex.org/A2556980190,,The Diameter base protocol is intended to pro...,2003,September,0.535038,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
4,https://openalex.org/W2132391275,Diameter Base Protocol,2003-09-01,https://openalex.org/A2674829090,,The Diameter base protocol is intended to pro...,2003,September,0.535038,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4139712,https://openalex.org/W98729159,Zero-knowledge against quantum attacks (Prelim...,2005-01-01,https://openalex.org/A2984226480,https://openalex.org/W389490173,This paperprovesthat several interactiveproof...,2005,January,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.903534
4139713,https://openalex.org/W98729159,Zero-knowledge against quantum attacks (Prelim...,2005-01-01,https://openalex.org/A2984226480,https://openalex.org/W1549361339,This paperprovesthat several interactiveproof...,2005,January,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.903534
4139714,https://openalex.org/W98729159,Zero-knowledge against quantum attacks (Prelim...,2005-01-01,https://openalex.org/A2984226480,https://openalex.org/W1594537107,This paperprovesthat several interactiveproof...,2005,January,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.903534
4139715,https://openalex.org/W98729159,Zero-knowledge against quantum attacks (Prelim...,2005-01-01,https://openalex.org/A2984226480,https://openalex.org/W2069170136,This paperprovesthat several interactiveproof...,2005,January,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.903534


This is now done. I check how many papers are affiliated to no technologies at all. They will not count in my computations, since I can not attribute them to any technologies.

<div class="alert-info">
1.3 Checking papers attributed to no technologies at all
</div>

In [53]:
# Technologies (column names of the pivoted frames) for which indices are computed.
# NOTE(review): the pivoted frames also contain the columns 'Differential Privacy'
# and 'Disk Encryption', which are not listed here. Confirm the omission is
# intentional, since any check based on this list ignores those two columns.
list_concepts= ['Authentication protocole','Biometrics','Blockchain','Digital rights management'
,'Digital signature','Distributed algorithm','Electronic voting','Functional encryption',
'Hardware acceleration','Hardware security module','Hash function','Homomorphic encryption','Identity management',
'Key management','Link encryption','Post-quantum cryptography','Public-key cryptography','Quantum key distribution',
'Quantum cryptography','Random number generation','Symmetric-key algorithm','Threshold cryptosystem',
'Trusted Computing','Tunneling protocol','Zero-knowlegde proof']

In [54]:
# Keep only the papers whose score is exactly 0 for every concept in list_concepts.
no_tech_mask = (dfhelp[list_concepts] == 0).all(axis=1)
myinfodataframe = dfhelp.loc[no_tech_mask]
myinfodataframe

concepts,paper,Authentication protocole,Biometrics,Blockchain,Differential Privacy,Digital rights management,Digital signature,Disk Encryption,Distributed algorithm,Electronic voting,...,Post-quantum cryptography,Public-key cryptography,Quantum cryptography,Quantum key distribution,Random number generation,Symmetric-key algorithm,Threshold cryptosystem,Trusted Computing,Tunneling protocol,Zero-knowlegde proof
18,https://openalex.org/W1002055276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22,https://openalex.org/W100250255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60,https://openalex.org/W1006036159,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66,https://openalex.org/W100660255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
85,https://openalex.org/W1007878853,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288662,https://openalex.org/W996029897,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
288669,https://openalex.org/W996653913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
288684,https://openalex.org/W998412290,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
288686,https://openalex.org/W99880915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
# Distinct ids of the papers that have a zero score for every listed concept.
list_weird_papers = list(set(myinfodataframe.paper.tolist()))

In [56]:
# Report how many papers carry no attribution to any listed technology.
print(f'There are {len(list_weird_papers)} papers attributed to no technologies at all.')

There are 22392 papers attributed to no technologies at all.


I do have all the data I need. I define the functions I will use and then, I will compute the indices of proximity.

# **Part 2 -- Function for the computation of the indices**

For each author and for each month, the function 'importance_author_t1_t2' takes the author's monthly H-index plus one (incremental or not, depending on 'condition'), the average attribution to technologies 1 and 2 of the author's papers published during this month, and the average number of distinct papers of this author in t1 and t2 during this month.\
This function returns 'hindex * attribution_to_t1_t2 * average_time_in_t1_2', which should represent the importance of the author for this pair of technologies during this month. The function 'importance_author_t1' computes the analogous quantity for a single technology.

In [57]:
def importance_author_t1_t2(author,df,tech1,tech2,condition):
    """Weight of one author for the pair of technologies (tech1, tech2).

    Parameters
    ----------
    author
        Value compared against ``df['author']``.
    df : pandas.DataFrame
        Must provide the columns 'author', 'paper', ``tech1``, ``tech2`` and the
        monthly H-index column selected by ``condition``.
        NOTE(review): presumably already restricted to one month by the caller; confirm.
    tech1, tech2 : str
        Names of the two technology columns; a score of 0 means "not attributed".
    condition : {'incremental', 'nonincremental'}
        Selects 'monthly_H_index_incremental' or 'monthly_H_index_notincremental'.

    Returns
    -------
    float
        ``(H-index + 1) * mean attribution * mean number of distinct papers``,
        where both means are taken over tech1 and tech2. The result is NaN when
        the author has no row with a non-zero score for one of the technologies.

    Raises
    ------
    ValueError
        If ``condition`` is not one of the two supported values (previously this
        surfaced as an UnboundLocalError on ``hindex``).
    IndexError
        If ``author`` does not appear in ``df``.
    """
    if condition == 'incremental':
        hindex_column = 'monthly_H_index_incremental'
    elif condition == 'nonincremental':
        hindex_column = 'monthly_H_index_notincremental'
    else:
        raise ValueError(
            "condition must be 'incremental' or 'nonincremental', got " + repr(condition)
        )

    myinfo = df.loc[df['author']==author]

    # rows of this author with a non-zero attribution to each technology
    dftech1 = myinfo.loc[myinfo[tech1]!=0]
    dftech2 = myinfo.loc[myinfo[tech2]!=0]

    # number of distinct papers of this author related to tech 1 / tech 2
    times_author_in_t1 = len(set(dftech1.paper.tolist()))
    times_author_in_t2 = len(set(dftech2.paper.tolist()))

    attribution_tech1 = dftech1[tech1].tolist()
    attribution_tech2 = dftech2[tech2].tolist()

    # The H-index is read from the author's first row and shifted by one
    # (presumably so that an H-index of 0 does not cancel the product).
    hindex = myinfo[hindex_column].tolist()[0]+1

    attribution_to_t1_t2 = (np.mean(attribution_tech1)+np.mean(attribution_tech2))/2
    average_time_in_t1_2 = (times_author_in_t1+times_author_in_t2)/2

    return hindex*attribution_to_t1_t2*average_time_in_t1_2

In [58]:
def importance_author_t1(author,df,tech1,condition):
    """Weight of one author for a single technology.

    Parameters
    ----------
    author
        Value compared against ``df['author']``.
    df : pandas.DataFrame
        Must provide the columns 'author', 'paper', ``tech1`` and the monthly
        H-index column selected by ``condition``.
        NOTE(review): presumably already restricted to one month by the caller; confirm.
    tech1 : str
        Name of the technology column; a score of 0 means "not attributed".
    condition : {'incremental', 'nonincremental'}
        Selects 'monthly_H_index_incremental' or 'monthly_H_index_notincremental'.

    Returns
    -------
    float
        ``(H-index + 1) * mean attribution to tech1 * number of distinct papers``.
        The result is NaN when the author has no row with a non-zero tech1 score.

    Raises
    ------
    ValueError
        If ``condition`` is not one of the two supported values (previously this
        surfaced as an UnboundLocalError on ``hindex``).
    IndexError
        If ``author`` does not appear in ``df``.
    """
    if condition == 'incremental':
        hindex_column = 'monthly_H_index_incremental'
    elif condition == 'nonincremental':
        hindex_column = 'monthly_H_index_notincremental'
    else:
        raise ValueError(
            "condition must be 'incremental' or 'nonincremental', got " + repr(condition)
        )

    myinfo = df.loc[df['author']==author]
    # rows of this author with a non-zero attribution to tech1
    dftech1 = myinfo.loc[myinfo[tech1]!=0]

    attribution_tech1 = dftech1[tech1].tolist()

    # The H-index is read from the author's first row and shifted by one
    # (presumably so that an H-index of 0 does not cancel the product).
    hindex = myinfo[hindex_column].tolist()[0]+1

    attribution_to_t1 = np.mean(attribution_tech1)
    # number of distinct papers of this author related to tech 1
    times_author_in_t1 = len(set(dftech1.paper.tolist()))

    return hindex*attribution_to_t1*times_author_in_t1

In a very similar fashion to what is done for authors, for each keyword during each month, the function 'importance_keywords_t1_t2' takes the average cosine similarity of the keyword over all the rows where it appears in t1 and in t2 during this month, the average attribution to technologies 1 and 2 of the papers published during this month that have this keyword, and the average number of distinct papers in t1 and t2 in which this keyword appears during this month.\
This function returns 'average_cosine_similarity * attribution_to_t1_t2 * average_time_in_t1_2', which should represent the importance of the keyword for this pair of technologies during this month. The function 'importance_keywords_t1' computes the analogous quantity for a single technology.

In [59]:
def importance_keywords_t1_t2(keyword,df,tech1,tech2):
    """Weight of one keyword for the pair of technologies (tech1, tech2).

    For each technology, only the rows of ``keyword`` with a non-zero score in
    that technology's column are considered. The result is the product of
    three averages taken over the two technologies: mean cosine similarity,
    mean attribution score and number of distinct papers.

    ``df`` must provide the columns 'keyword', 'paper', 'cosine_similarity',
    ``tech1`` and ``tech2``. The result is NaN when the keyword has no row
    with a non-zero score for one of the technologies.
    """
    rows_for_keyword = df.loc[df['keyword'] == keyword]

    def _per_technology(tech):
        # (distinct papers, mean score, mean cosine similarity) over the rows linked to `tech`
        linked = rows_for_keyword.loc[rows_for_keyword[tech] != 0]
        distinct_papers = len(list(set(linked.paper.tolist())))
        mean_score = np.mean(linked[tech].tolist())
        mean_cosine = np.mean(linked.cosine_similarity.tolist())
        return distinct_papers, mean_score, mean_cosine

    papers_1, score_1, cosine_1 = _per_technology(tech1)
    papers_2, score_2, cosine_2 = _per_technology(tech2)

    cosine_avg = (cosine_1 + cosine_2) / 2
    score_avg = (score_1 + score_2) / 2
    papers_avg = (papers_1 + papers_2) / 2

    return cosine_avg * score_avg * papers_avg

In [60]:
def importance_keywords_t1(keyword,df,tech1):
    """Weight of one keyword for a single technology.

    Uses every row of ``df`` whose 'keyword' equals ``keyword`` (no filtering
    on the ``tech1`` score happens here) and returns
    ``mean cosine similarity * mean tech1 score * number of distinct papers``.

    ``df`` must provide the columns 'keyword', 'paper', 'cosine_similarity'
    and ``tech1``. The result is NaN when the keyword does not occur in ``df``.
    """
    keyword_rows = df[df['keyword'] == keyword]

    distinct_papers = set(keyword_rows['paper'].tolist())
    mean_cosine = np.mean(keyword_rows['cosine_similarity'].tolist())
    mean_score = np.mean(keyword_rows[tech1].tolist())

    return mean_cosine * mean_score * len(distinct_papers)

We now come to the part about citations. This is slightly more difficult and we have 4 functions in total.

The function below computes, for a paper attributed to technology 1, a sum over its referenced works that are attributed to technology 2: each such reference contributes the mean of the paper's attribution score to technology 1 and the reference's attribution score to technology 2.\
In other words, it gives: sum((attr_to_t1 + attr_to_t2_ref_x)/2) over all ref_x in referenced_works of my paper that are attributed to t2, where attr_to_t1 is the score of attribution of my paper to t1 and attr_to_t2_ref_x is the score of attribution to t2 of each referenced work.

In [61]:
def importance_cit_tech(paper,df,tech1,tech2,df_ref_help):
    """Citation weight of ``paper`` (through tech1) towards tech2.

    Looks up the rows of ``paper`` in ``df``, reads its ``tech1`` score from the
    first of them, and adds up the value returned by ``info_ref_tech`` for every
    distinct entry of its 'referenced_works' column, using ``df_ref_help`` as
    the lookup frame for the cited works.

    Raises IndexError when ``paper`` does not appear in ``df``.
    """
    paper_rows = df.loc[df['paper'] == paper]

    # score of the citing paper for tech1, taken from its first row
    score_paper_t1 = paper_rows[tech1].tolist()[0]

    total = 0
    for cited in list(set(paper_rows['referenced_works'].tolist())):
        total += info_ref_tech(cited, df_ref_help, score_paper_t1, tech2)

    return total

The function below is an auxiliary function. Given a referenced paper 'ref', the score of attribution to technology 1 of the paper that cites 'ref', and a technology 'tech2', it returns (attr_to_t1 + attr_to_t2_ref_x)/2 as explained above if the referenced work is attributed to 'tech2', and zero otherwise.

In [62]:
def info_ref_tech(ref,df,attribution_paper_t1,tech2):
    """Contribution of one cited work to the citation weight.

    ``ref`` is converted to ``str`` and matched against ``df['paper']``; the
    ``tech2`` score is read from the first matching row.

    Returns 0 when the cited work is absent from ``df`` or its ``tech2`` score
    is 0, otherwise the mean of ``attribution_paper_t1`` and that score.
    """
    matches = df.loc[df['paper'] == str(ref)]

    if len(matches) == 0:
        # the cited work is not part of the lookup frame
        return 0

    score_ref_t2 = matches[tech2].tolist()[0]
    if score_ref_t2 == 0:
        return 0

    return (attribution_paper_t1 + score_ref_t2) / 2

__Definition main functions for the computation of my indices__

I now define the functions that actually compute the indices of proximity for the given dataframes.
All these functions work the same way: they take the sub-dataframe containing all the required information for one month of one specific year, compute the index of proximity for every combination of technologies during that month, and return this list of indices.

The function below computes the index of proximity regarding keywords.

In [63]:
def create_indices_keywords(dfinfos,listconcepts):
    """Compute the keyword-based proximity index for every ordered pair of technologies.

    Parameters
    ----------
    dfinfos : pandas.DataFrame
        Rows of one (year, month) group. It must contain the columns 'year',
        'month', 'keyword' and one column per entry of ``listconcepts``; a
        non-zero value in a technology column marks the row as attributed to
        that technology.
    listconcepts : list of str
        Technology column names. Every ordered pair (tech1, tech2), including
        tech1 == tech2, is evaluated in nested-loop order.

    Returns
    -------
    list
        One value per ordered pair: ``np.nan`` when either technology has no
        row in ``dfinfos``, ``0`` when the two technologies share no keyword,
        otherwise the sum of ``importance_keywords_t1_t2`` over the shared
        keywords divided by the sum of ``importance_keywords_t1`` over all
        distinct keywords of tech1.

    Side effects
    ------------
    Prints the elapsed time together with the year and month read from the
    first row of ``dfinfos``.
    """
    start = time.time()
    listindices = []

    # Iterate over the argument, so the function does not depend on a
    # notebook-level variable being defined.
    for tech1 in listconcepts:
        # The tech1 selection does not depend on tech2: compute it once per tech1.
        dftech1 = dfinfos.loc[dfinfos[tech1] != 0]
        keywords_t1 = set(dftech1.keyword.tolist())

        for tech2 in listconcepts:
            dftech2 = dfinfos.loc[dfinfos[tech2] != 0]

            if len(dftech1) == 0 or len(dftech2) == 0:
                # No data for at least one technology: the index is left blank.
                index_keywords = np.nan
            else:
                common_keywords = list(keywords_t1 & set(dftech2.keyword.tolist()))

                if len(common_keywords) == 0:
                    index_keywords = 0
                else:
                    # Only rows carrying a shared keyword are needed for the numerator.
                    df_tech_1_2_common_keywords = dfinfos.loc[dfinfos.keyword.isin(common_keywords)]

                    sum_keywords_t1_t2 = sum(
                        importance_keywords_t1_t2(keyword, df_tech_1_2_common_keywords, tech1, tech2)
                        for keyword in common_keywords
                    )
                    sum_keywords_t1 = sum(
                        importance_keywords_t1(keyword, dftech1, tech1)
                        for keyword in keywords_t1
                    )
                    index_keywords = sum_keywords_t1_t2 / sum_keywords_t1

            listindices.append(index_keywords)

    end = time.time()
    year = dfinfos.year.tolist()[0]
    month = dfinfos.month.tolist()[0]
    print('Indices-keywords of proximity for '+str(year)+' in '+str(month)+' were computed in '+str(round(end-start,2))+' seconds.')

    return listindices

The function below computes the index of proximity regarding collaboration.

In [64]:
def create_indices_colab(dfinfos,list_concepts):
    """Compute the two collaboration-based proximity indices for every ordered pair of technologies.

    Parameters
    ----------
    dfinfos : pandas.DataFrame
        Rows of one (year, month) group. It must contain the columns 'year',
        'month', 'author' and one column per entry of ``list_concepts``; a
        non-zero value in a technology column marks the row as attributed to
        that technology.
    list_concepts : list of str
        Technology column names. Every ordered pair (tech1, tech2), including
        tech1 == tech2, is evaluated in nested-loop order.

    Returns
    -------
    list of [index_colab_notincrem, index_colab_increm]
        One pair per ordered technology pair: ``[nan, nan]`` when either
        technology has no row, ``[0, 0]`` when they share no author, otherwise
        two ratios built from ``importance_author_t1_t2`` (numerator, over the
        shared authors) and ``importance_author_t1`` (denominator), with the
        mode 'nonincremental' and 'incremental' respectively.

    Side effects
    ------------
    Prints the elapsed time together with the year and month read from the
    first row of ``dfinfos``.
    """
    t_begin = time.time()
    listindices = []

    for tech1 in list_concepts:
        for tech2 in list_concepts:
            rows_t1 = dfinfos.loc[dfinfos[tech1] != 0]
            rows_t2 = dfinfos.loc[dfinfos[tech2] != 0]

            # No data for at least one technology: both indices are left blank.
            if len(rows_t1) == 0 or len(rows_t2) == 0:
                listindices.append([np.nan, np.nan])
                continue

            authors_tech1 = rows_t1.author.tolist()
            authors_tech2 = rows_t2.author.tolist()
            shared_authors = list(set(authors_tech1) & set(authors_tech2))

            if len(shared_authors) == 0:
                listindices.append([0, 0])
                continue

            all_authors = list(set(authors_tech1) | set(authors_tech2))
            rows_all_authors = dfinfos.loc[dfinfos.author.isin(all_authors)]

            # NOTE(review): both denominators iterate over the raw author lists,
            # so an author with several rows is counted several times, whereas
            # create_indices_keywords sums over distinct keywords -- confirm
            # that this is intended.
            index_colab_notincrem = (
                sum(importance_author_t1_t2(author, rows_all_authors, tech1, tech2, 'nonincremental')
                    for author in shared_authors)
                / sum(importance_author_t1(author, rows_t1, tech1, 'nonincremental')
                      for author in authors_tech1)
            )

            # NOTE(review): the incremental denominator is built from tech2
            # (rows, column and authors) while the non-incremental one uses
            # tech1 -- confirm that this asymmetry is intended.
            index_colab_increm = (
                sum(importance_author_t1_t2(author, rows_all_authors, tech1, tech2, 'incremental')
                    for author in shared_authors)
                / sum(importance_author_t1(author, rows_t2, tech2, 'incremental')
                      for author in authors_tech2)
            )

            listindices.append([index_colab_notincrem, index_colab_increm])

    t_finish = time.time()
    year = dfinfos.year.tolist()[0]
    month = dfinfos.month.tolist()[0]
    print('Indices-colab of proximity for '+str(year)+' in '+str(month)+' were computed in '+str(round(t_finish-t_begin,2))+' seconds.')

    return listindices

The function below computes the index of proximity regarding citations.

In [65]:
def create_indices_cit(dfinfos,listconcepts,df_ref_help):
    """Compute the two citation-based proximity indices for every ordered pair of technologies.

    Parameters
    ----------
    dfinfos : pandas.DataFrame
        Rows of one (year, month) group. It must contain the columns 'year',
        'month', 'paper', 'referenced_works' and one column per entry of
        ``listconcepts``; a non-zero value in a technology column marks the
        row as attributed to that technology.
    listconcepts : list of str
        Technology column names. Every ordered pair (tech1, tech2), including
        tech1 == tech2, is evaluated in nested-loop order.
    df_ref_help : pandas.DataFrame
        Lookup frame forwarded unchanged to ``importance_cit_tech``.

    Returns
    -------
    list of [index_cit_1_2, index_cit_2_1]
        One pair per ordered technology pair: ``[nan, nan]`` when either
        technology has no row in ``dfinfos``. Otherwise ``index_cit_1_2`` is the
        sum of ``importance_cit_tech`` over the tech1 papers that reference a
        tech2 paper of ``dfinfos`` (``0`` when there is none), and
        ``index_cit_2_1`` is the same with the roles swapped.

    Side effects
    ------------
    Prints the elapsed time together with the year and month read from the
    first row of ``dfinfos``.
    """
    start = time.time()
    listindices = []

    # Iterate over the argument, so the function does not depend on a
    # notebook-level variable being defined.
    for tech1 in listconcepts:
        # The tech1 selection does not depend on tech2: compute it once per tech1.
        dftech1 = dfinfos.loc[dfinfos[tech1] != 0]
        papers_cit_t1 = list(set(dftech1.paper.tolist()))

        for tech2 in listconcepts:
            dftech2 = dfinfos.loc[dfinfos[tech2] != 0]

            if len(dftech1) == 0 or len(dftech2) == 0:
                # No data for at least one technology: both indices are left blank.
                index_cit_1_2 = np.nan
                index_cit_2_1 = np.nan
            else:
                papers_cit_t2 = list(set(dftech2.paper.tolist()))

                # tech1 papers that reference at least one tech2 paper, and vice versa
                df_t1_ref_t2 = dftech1.loc[dftech1['referenced_works'].isin(papers_cit_t2)]
                papers_cit_t1_t2 = set(df_t1_ref_t2.paper.tolist())

                df_t2_ref_t1 = dftech2.loc[dftech2['referenced_works'].isin(papers_cit_t1)]
                papers_cit_t2_t1 = set(df_t2_ref_t1.paper.tolist())

                # sum() over no paper is 0, which is the index when nothing is cited across.
                index_cit_1_2 = sum(
                    importance_cit_tech(paper, dftech1, tech1, tech2, df_ref_help)
                    for paper in papers_cit_t1_t2
                )
                index_cit_2_1 = sum(
                    importance_cit_tech(paper, dftech2, tech2, tech1, df_ref_help)
                    for paper in papers_cit_t2_t1
                )

            listindices.append([index_cit_1_2, index_cit_2_1])

    end = time.time()
    year = dfinfos.year.tolist()[0]
    month = dfinfos.month.tolist()[0]

    print('Indices-cit of proximity for '+str(year)+' in '+str(month)+' were computed in '+str(round(end-start,2))+' seconds.')

    return listindices

# **Part 3 -- Proximity indices based on keywords**

<div class="alert-info">
3.1. Computation of the indices
</div>

In [66]:
# Technology columns for which proximity indices are computed.
# The spellings must match the dataframe column names exactly.
list_concepts = [
    'Authentication protocole',
    'Biometrics',
    'Blockchain',
    'Digital rights management',
    'Digital signature',
    'Distributed algorithm',
    'Electronic voting',
    'Functional encryption',
    'Hardware acceleration',
    'Hardware security module',
    'Hash function',
    'Homomorphic encryption',
    'Identity management',
    'Key management',
    'Link encryption',
    'Post-quantum cryptography',
    'Public-key cryptography',
    'Quantum key distribution',
    'Quantum cryptography',
    'Random number generation',
    'Symmetric-key algorithm',
    'Threshold cryptosystem',
    'Trusted Computing',
    'Tunneling protocol',
    'Zero-knowlegde proof',
]

I select only the information I need, to reduce my database.

In [67]:
# Keep only what the keyword index needs, then collapse the rows that became identical.
df_key = (
    df_final
    .drop(columns=[
        'referenced_works',
        'title',
        'publication_date',
        'abstract',
        'author',
        'yearly_H_index_notincremental',
        'yearly_H_index_incremental',
        'monthly_H_index_incremental',
        'monthly_H_index_notincremental',
    ])
    .drop_duplicates()
)

With the function below, I compute a dataframe of indices based on keywords for all months and all combinations of technologies.

In [68]:
# One list of keyword indices per (year, month) group; `list_concepts` is forwarded to the function by `apply`.
indices_key = (
    df_key
    .groupby(['year', 'month'])
    .apply(create_indices_keywords, list_concepts)
    .to_frame()
)

Indices-keywords of proximity for 2002 in April were computed in 0.44 seconds.
Indices-keywords of proximity for 2002 in August were computed in 0.56 seconds.
Indices-keywords of proximity for 2002 in December were computed in 2.45 seconds.
Indices-keywords of proximity for 2002 in February were computed in 0.7 seconds.
Indices-keywords of proximity for 2002 in January were computed in 0.69 seconds.
Indices-keywords of proximity for 2002 in July were computed in 0.68 seconds.
Indices-keywords of proximity for 2002 in June were computed in 0.7 seconds.
Indices-keywords of proximity for 2002 in March were computed in 0.58 seconds.
Indices-keywords of proximity for 2002 in May were computed in 0.66 seconds.
Indices-keywords of proximity for 2002 in November were computed in 1.54 seconds.
Indices-keywords of proximity for 2002 in October were computed in 0.98 seconds.
Indices-keywords of proximity for 2002 in September were computed in 0.86 seconds.
Indices-keywords of proximity for 2003 i

Indices-keywords of proximity for 2010 in June were computed in 17.71 seconds.
Indices-keywords of proximity for 2010 in March were computed in 13.54 seconds.
Indices-keywords of proximity for 2010 in May were computed in 16.09 seconds.
Indices-keywords of proximity for 2010 in November were computed in 13.74 seconds.
Indices-keywords of proximity for 2010 in October were computed in 17.55 seconds.
Indices-keywords of proximity for 2010 in September were computed in 16.03 seconds.
Indices-keywords of proximity for 2011 in April were computed in 13.04 seconds.
Indices-keywords of proximity for 2011 in August were computed in 13.42 seconds.
Indices-keywords of proximity for 2011 in December were computed in 23.21 seconds.
Indices-keywords of proximity for 2011 in February were computed in 10.72 seconds.
Indices-keywords of proximity for 2011 in January were computed in 12.55 seconds.
Indices-keywords of proximity for 2011 in July were computed in 17.63 seconds.
Indices-keywords of proxim

Indices-keywords of proximity for 2019 in April were computed in 23.31 seconds.
Indices-keywords of proximity for 2019 in August were computed in 25.01 seconds.
Indices-keywords of proximity for 2019 in December were computed in 36.43 seconds.
Indices-keywords of proximity for 2019 in February were computed in 16.06 seconds.
Indices-keywords of proximity for 2019 in January were computed in 22.0 seconds.
Indices-keywords of proximity for 2019 in July were computed in 33.54 seconds.
Indices-keywords of proximity for 2019 in June were computed in 27.84 seconds.
Indices-keywords of proximity for 2019 in March were computed in 20.88 seconds.
Indices-keywords of proximity for 2019 in May were computed in 24.61 seconds.
Indices-keywords of proximity for 2019 in November were computed in 27.44 seconds.
Indices-keywords of proximity for 2019 in October were computed in 30.99 seconds.
Indices-keywords of proximity for 2019 in September were computed in 26.89 seconds.
Indices-keywords of proximi

<div class="alert-info">
3.2. Restructuration of the dataframe
</div>

We now aim to turn the dataframe 'indices_key' into a dataframe as we want it (with columns and so on).

In [69]:
# Name the column of per-month index lists and turn the (year, month) group keys into ordinary columns.
indices_key = (
    indices_key
    .rename(columns={0: "index_keyword"})
    .reset_index(level=['year', 'month'])
)

In [70]:
indices_key

Unnamed: 0,year,month,index_keyword
0,2002,April,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
1,2002,August,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
2,2002,December,"[1.0000000000000002, 0.133449812834919, 0, nan..."
3,2002,February,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
4,2002,January,"[1.0, 0, nan, 0, 0.17564659286920653, nan, nan..."
...,...,...,...
247,2022,March,"[1.0, 0.6778312246874492, 3.6997596911521518, ..."
248,2022,May,"[1.0, 0.7748343728877309, 3.906280410615206, 0..."
249,2022,November,"[1.0, 0.5292589970172329, 3.4808163886272676, ..."
250,2022,October,"[1.0, 0.8646786971897975, 2.303954344763335, 0..."


We need to create lists for the concepts for the columns we want to create.

In [71]:
# Flattened (tech1, tech2) pairs, in the nested-loop order used by the index functions.
list_combination = [
    tech
    for tech1 in list_concepts
    for tech2 in list_concepts
    for tech in (tech1, tech2)
]

end = len(list_combination) - 1
# Even positions hold the first concept of each pair, odd positions the second.
first_listconcept = list_combination[0::2]
second_listconcept = list_combination[1::2]

In [72]:
indices_key

Unnamed: 0,year,month,index_keyword
0,2002,April,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
1,2002,August,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
2,2002,December,"[1.0000000000000002, 0.133449812834919, 0, nan..."
3,2002,February,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
4,2002,January,"[1.0, 0, nan, 0, 0.17564659286920653, nan, nan..."
...,...,...,...
247,2022,March,"[1.0, 0.6778312246874492, 3.6997596911521518, ..."
248,2022,May,"[1.0, 0.7748343728877309, 3.906280410615206, 0..."
249,2022,November,"[1.0, 0.5292589970172329, 3.4808163886272676, ..."
250,2022,October,"[1.0, 0.8646786971897975, 2.303954344763335, 0..."


We now explode the column 'index_keyword' and add the columns about the concepts.

In [73]:
# Number of (year, month) periods, derived from the data instead of the
# hard-coded 12*21, so the cell keeps working when the covered time span changes.
n_periods_key = indices_key[['year', 'month']].drop_duplicates().shape[0]

# One row per (period, technology pair); the pairs repeat in the same order for every period.
indices_key = indices_key.explode('index_keyword')
indices_key['concept1'] = n_periods_key * first_listconcept
indices_key['concept2'] = n_periods_key * second_listconcept

We are done with our last modification. We just verify and save the dataframe for later.

In [74]:
indices_key

Unnamed: 0,year,month,index_keyword,concept1,concept2
0,2002,April,,Authentication protocole,Authentication protocole
0,2002,April,,Authentication protocole,Biometrics
0,2002,April,,Authentication protocole,Blockchain
0,2002,April,,Authentication protocole,Digital rights management
0,2002,April,,Authentication protocole,Digital signature
...,...,...,...,...,...
251,2022,September,0.304064,Zero-knowlegde proof,Symmetric-key algorithm
251,2022,September,,Zero-knowlegde proof,Threshold cryptosystem
251,2022,September,0.1581,Zero-knowlegde proof,Trusted Computing
251,2022,September,,Zero-knowlegde proof,Tunneling protocol


In [75]:
# Persist the keyword indices for later use.
pd.to_pickle(indices_key, 'data_indices/indices_key_normalized')

# **Part 4 -- Proximity indices based on collaboration**

<div class="alert-info">
4.1. Computation of the indices
</div>

I select only the information I need, to reduce my database.

In [76]:
# Keep only what the collaboration index needs, then collapse the rows that became identical.
df_colab = (
    df_final
    .drop(columns=[
        'referenced_works',
        'title',
        'publication_date',
        'abstract',
        'keyword',
        'cosine_similarity',
    ])
    .drop_duplicates()
)

With the function below, I compute a dataframe of indices based on collaboration for all months and all combinations of technologies.

In [77]:
# One list of collaboration index pairs per (year, month) group; `list_concepts` is forwarded to the function by `apply`.
indices_colab = (
    df_colab
    .groupby(['year', 'month'])
    .apply(create_indices_colab, list_concepts)
    .to_frame()
)

Indices-colab of proximity for 2002 in April were computed in 0.47 seconds.
Indices-colab of proximity for 2002 in August were computed in 0.5 seconds.
Indices-colab of proximity for 2002 in December were computed in 2.46 seconds.
Indices-colab of proximity for 2002 in February were computed in 0.63 seconds.
Indices-colab of proximity for 2002 in January were computed in 0.71 seconds.
Indices-colab of proximity for 2002 in July were computed in 0.57 seconds.
Indices-colab of proximity for 2002 in June were computed in 0.58 seconds.
Indices-colab of proximity for 2002 in March were computed in 0.58 seconds.
Indices-colab of proximity for 2002 in May were computed in 0.72 seconds.
Indices-colab of proximity for 2002 in November were computed in 1.6 seconds.
Indices-colab of proximity for 2002 in October were computed in 1.02 seconds.
Indices-colab of proximity for 2002 in September were computed in 0.77 seconds.
Indices-colab of proximity for 2003 in April were computed in 1.9 seconds.
I

Indices-colab of proximity for 2010 in October were computed in 27.88 seconds.
Indices-colab of proximity for 2010 in September were computed in 26.58 seconds.
Indices-colab of proximity for 2011 in April were computed in 20.43 seconds.
Indices-colab of proximity for 2011 in August were computed in 19.11 seconds.
Indices-colab of proximity for 2011 in December were computed in 37.37 seconds.
Indices-colab of proximity for 2011 in February were computed in 12.37 seconds.
Indices-colab of proximity for 2011 in January were computed in 25.63 seconds.
Indices-colab of proximity for 2011 in July were computed in 27.21 seconds.
Indices-colab of proximity for 2011 in June were computed in 25.4 seconds.
Indices-colab of proximity for 2011 in March were computed in 23.65 seconds.
Indices-colab of proximity for 2011 in May were computed in 23.41 seconds.
Indices-colab of proximity for 2011 in November were computed in 31.12 seconds.
Indices-colab of proximity for 2011 in October were computed in

Indices-colab of proximity for 2019 in March were computed in 42.27 seconds.
Indices-colab of proximity for 2019 in May were computed in 58.67 seconds.
Indices-colab of proximity for 2019 in November were computed in 59.55 seconds.
Indices-colab of proximity for 2019 in October were computed in 76.91 seconds.
Indices-colab of proximity for 2019 in September were computed in 64.31 seconds.
Indices-colab of proximity for 2020 in April were computed in 48.02 seconds.
Indices-colab of proximity for 2020 in August were computed in 61.36 seconds.
Indices-colab of proximity for 2020 in December were computed in 116.05 seconds.
Indices-colab of proximity for 2020 in February were computed in 47.3 seconds.
Indices-colab of proximity for 2020 in January were computed in 74.72 seconds.
Indices-colab of proximity for 2020 in July were computed in 83.94 seconds.
Indices-colab of proximity for 2020 in June were computed in 78.08 seconds.
Indices-colab of proximity for 2020 in March were computed in 

<div class="alert-info">
4.2. Restructuration of the dataframe
</div>

We now aim to turn the dataframe 'indices_colab' into a dataframe as we want it (with columns and so on).

In [78]:
# Name the column of per-month index lists and turn the (year, month) group keys into ordinary columns.
indices_colab = (
    indices_colab
    .rename(columns={0: "indices"})
    .reset_index(level=['year', 'month'])
)

In [79]:
indices_colab

Unnamed: 0,year,month,indices
0,2002,April,"[[nan, nan], [nan, nan], [nan, nan], [nan, nan..."
1,2002,August,"[[nan, nan], [nan, nan], [nan, nan], [nan, nan..."
2,2002,December,"[[1.0, 1.0], [0, 0], [0, 0], [nan, nan], [0, 0..."
3,2002,February,"[[nan, nan], [nan, nan], [nan, nan], [nan, nan..."
4,2002,January,"[[1.0, 1.0], [0, 0], [nan, nan], [0, 0], [0, 0..."
...,...,...,...
247,2022,March,"[[0.9429491776545729, 0.8389429685517612], [0...."
248,2022,May,"[[0.9842490253617707, 0.9884691917478067], [0...."
249,2022,November,"[[1.0000000000000002, 1.0000000000000007], [0...."
250,2022,October,"[[0.9999999999999996, 1.0000000000000002], [0,..."


We need to create lists for the concepts for the columns we want to create.

In [80]:
# Flattened (tech1, tech2) pairs, in the nested-loop order used by the index functions.
list_combination = [
    tech
    for tech1 in list_concepts
    for tech2 in list_concepts
    for tech in (tech1, tech2)
]

end = len(list_combination) - 1
# Even positions hold the first concept of each pair, odd positions the second.
first_listconcept = list_combination[0::2]
second_listconcept = list_combination[1::2]

We now explode the column 'indices' and add the columns about the concepts.

In [81]:
# Number of (year, month) periods, derived from the data instead of the
# hard-coded 12*21, so the cell keeps working when the covered time span changes.
n_periods_colab = indices_colab[['year', 'month']].drop_duplicates().shape[0]

# One row per (period, technology pair); the pairs repeat in the same order for every period.
indices_colab = indices_colab.explode('indices')
indices_colab['concept1'] = n_periods_colab * first_listconcept
indices_colab['concept2'] = n_periods_colab * second_listconcept

In [82]:
indices_colab

Unnamed: 0,year,month,indices,concept1,concept2
0,2002,April,"[nan, nan]",Authentication protocole,Authentication protocole
0,2002,April,"[nan, nan]",Authentication protocole,Biometrics
0,2002,April,"[nan, nan]",Authentication protocole,Blockchain
0,2002,April,"[nan, nan]",Authentication protocole,Digital rights management
0,2002,April,"[nan, nan]",Authentication protocole,Digital signature
...,...,...,...,...,...
251,2022,September,"[0, 0]",Zero-knowlegde proof,Symmetric-key algorithm
251,2022,September,"[nan, nan]",Zero-knowlegde proof,Threshold cryptosystem
251,2022,September,"[0, 0]",Zero-knowlegde proof,Trusted Computing
251,2022,September,"[nan, nan]",Zero-knowlegde proof,Tunneling protocol


We take all the indices, which is a list of lists.

In [83]:
# `tolist()` already returns a plain Python list of the [notincrem, increm] pairs.
allindices = indices_colab['indices'].tolist()

In [84]:
# NOTE(review): `end` is not read again before it is reassigned further down;
# this looks like leftover code -- confirm before removing.
end = len(allindices)-1

We build one list of the indices based on non-incremental monthly h-indices and one list of the indices based on incremental monthly h-indices.

In [85]:
# Split each [notincrem, increm] pair into two parallel lists.
indices1 = list(map(lambda pair: pair[0], allindices))
indices2 = list(map(lambda pair: pair[1], allindices))

We update our dataframe adding the wanted information and eliminating the other.

In [86]:
# Replace the column of pairs by one column per index.
indices_colab = (
    indices_colab
    .drop(columns='indices')
    .assign(
        index_colab_notincrem=indices1,
        index_colab_increm=indices2,
    )
)

We are done with our last modification. We just verify and save the dataframe for later.

In [87]:
indices_colab

Unnamed: 0,year,month,concept1,concept2,index_colab_notincrem,index_colab_increm
0,2002,April,Authentication protocole,Authentication protocole,,
0,2002,April,Authentication protocole,Biometrics,,
0,2002,April,Authentication protocole,Blockchain,,
0,2002,April,Authentication protocole,Digital rights management,,
0,2002,April,Authentication protocole,Digital signature,,
...,...,...,...,...,...,...
251,2022,September,Zero-knowlegde proof,Symmetric-key algorithm,0.0,0.0
251,2022,September,Zero-knowlegde proof,Threshold cryptosystem,,
251,2022,September,Zero-knowlegde proof,Trusted Computing,0.0,0.0
251,2022,September,Zero-knowlegde proof,Tunneling protocol,,


In [88]:
# Persist the collaboration indices for later use.
pd.to_pickle(indices_colab, 'data_indices/indices_colab_normalized')

# **Part 5 -- Proximity indices based on citations**

<div class="alert-info">
5.1. Computation of the indices
</div>

I select only the information I need, to reduce my database.

In [89]:
# Keep only what the citation index needs, then collapse the rows that became identical.
df_cit = (
    df_final
    .drop(columns=[
        'keyword',
        'author',
        'cosine_similarity',
        'title',
        'publication_date',
        'abstract',
        'yearly_H_index_notincremental',
        'yearly_H_index_incremental',
        'monthly_H_index_incremental',
        'monthly_H_index_notincremental',
    ])
    .drop_duplicates()
)

I select only the information I need from my auxiliary dataframe for the referenced works, to reduce my database.

In [90]:
# Keep only the paper identifiers, references and technology scores of the auxiliary frame.
df_ref_help = (
    df_auxiliary_ref
    .drop(columns=[
        'year',
        'month',
        'author',
        'title',
        'publication_date',
        'abstract',
    ])
    .drop_duplicates()
)
df_ref_help

Unnamed: 0,paper,referenced_works,Authentication protocole,Biometrics,Blockchain,Differential Privacy,Digital rights management,Digital signature,Disk Encryption,Distributed algorithm,...,Post-quantum cryptography,Public-key cryptography,Quantum cryptography,Quantum key distribution,Random number generation,Symmetric-key algorithm,Threshold cryptosystem,Trusted Computing,Tunneling protocol,Zero-knowlegde proof
0,https://openalex.org/W1635413415,,0.473908,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2,https://openalex.org/W2132391275,,0.535038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
7,https://openalex.org/W2134011023,https://openalex.org/W77932805,0.411799,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
8,https://openalex.org/W2134011023,https://openalex.org/W2470325412,0.411799,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
11,https://openalex.org/W2160584199,https://openalex.org/W1514141030,0.612274,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4139712,https://openalex.org/W98729159,https://openalex.org/W389490173,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.903534
4139713,https://openalex.org/W98729159,https://openalex.org/W1549361339,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.903534
4139714,https://openalex.org/W98729159,https://openalex.org/W1594537107,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.903534
4139715,https://openalex.org/W98729159,https://openalex.org/W2069170136,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.903534


With the function below, I compute a dataframe of indices based on citations for all months and all combinations of technologies.

In [91]:
df_cit

Unnamed: 0,paper,referenced_works,year,month,Authentication protocole,Biometrics,Blockchain,Differential Privacy,Digital rights management,Digital signature,...,Post-quantum cryptography,Public-key cryptography,Quantum cryptography,Quantum key distribution,Random number generation,Symmetric-key algorithm,Threshold cryptosystem,Trusted Computing,Tunneling protocol,Zero-knowlegde proof
0,https://openalex.org/W1000018889,https://openalex.org/W1542792105,2013,December,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.430285,0.0,0.0
0,https://openalex.org/W1000018889,https://openalex.org/W1821135345,2013,December,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.430285,0.0,0.0
0,https://openalex.org/W1000018889,https://openalex.org/W2055456945,2013,December,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.430285,0.0,0.0
0,https://openalex.org/W1000018889,https://openalex.org/W2145187482,2013,December,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.430285,0.0,0.0
20,https://openalex.org/W100004108,https://openalex.org/W150310457,2012,September,0.0,0.7463,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2213909,https://openalex.org/W999914091,https://openalex.org/W2038276547,2013,March,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2213909,https://openalex.org/W999914091,https://openalex.org/W2085511467,2013,March,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2213909,https://openalex.org/W999914091,https://openalex.org/W2099907898,2013,March,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2213909,https://openalex.org/W999914091,https://openalex.org/W2115467209,2013,March,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


In [92]:
# One list of citation index pairs per (year, month) group; the extra arguments are forwarded to the function by `apply`.
indices_cit = (
    df_cit
    .groupby(['year', 'month'])
    .apply(create_indices_cit, list_concepts, df_ref_help)
    .to_frame()
)

Indices-cit of proximity for 2002 in April were computed in 0.38 seconds.
Indices-cit of proximity for 2002 in August were computed in 0.6 seconds.
Indices-cit of proximity for 2002 in December were computed in 2.81 seconds.
Indices-cit of proximity for 2002 in February were computed in 0.27 seconds.
Indices-cit of proximity for 2002 in January were computed in 0.27 seconds.
Indices-cit of proximity for 2002 in July were computed in 1.01 seconds.
Indices-cit of proximity for 2002 in June were computed in 0.76 seconds.
Indices-cit of proximity for 2002 in March were computed in 0.75 seconds.
Indices-cit of proximity for 2002 in May were computed in 0.99 seconds.
Indices-cit of proximity for 2002 in November were computed in 0.35 seconds.
Indices-cit of proximity for 2002 in October were computed in 1.82 seconds.
Indices-cit of proximity for 2002 in September were computed in 1.41 seconds.
Indices-cit of proximity for 2003 in April were computed in 1.16 seconds.
Indices-cit of proximity 

Indices-cit of proximity for 2011 in August were computed in 13.88 seconds.
Indices-cit of proximity for 2011 in December were computed in 70.36 seconds.
Indices-cit of proximity for 2011 in February were computed in 29.87 seconds.
Indices-cit of proximity for 2011 in January were computed in 26.29 seconds.
Indices-cit of proximity for 2011 in July were computed in 99.22 seconds.
Indices-cit of proximity for 2011 in June were computed in 31.34 seconds.
Indices-cit of proximity for 2011 in March were computed in 72.61 seconds.
Indices-cit of proximity for 2011 in May were computed in 116.22 seconds.
Indices-cit of proximity for 2011 in November were computed in 42.48 seconds.
Indices-cit of proximity for 2011 in October were computed in 19.26 seconds.
Indices-cit of proximity for 2011 in September were computed in 8.79 seconds.
Indices-cit of proximity for 2012 in April were computed in 119.59 seconds.
Indices-cit of proximity for 2012 in August were computed in 103.09 seconds.
Indices-

Indices-cit of proximity for 2020 in August were computed in 54.0 seconds.
Indices-cit of proximity for 2020 in December were computed in 103.41 seconds.
Indices-cit of proximity for 2020 in February were computed in 75.21 seconds.
Indices-cit of proximity for 2020 in January were computed in 94.86 seconds.
Indices-cit of proximity for 2020 in July were computed in 85.03 seconds.
Indices-cit of proximity for 2020 in June were computed in 100.38 seconds.
Indices-cit of proximity for 2020 in March were computed in 51.27 seconds.
Indices-cit of proximity for 2020 in May were computed in 70.39 seconds.
Indices-cit of proximity for 2020 in November were computed in 113.4 seconds.
Indices-cit of proximity for 2020 in October were computed in 85.57 seconds.
Indices-cit of proximity for 2020 in September were computed in 61.37 seconds.
Indices-cit of proximity for 2021 in April were computed in 80.65 seconds.
Indices-cit of proximity for 2021 in August were computed in 75.53 seconds.
Indices-c

<div class="alert-info">
5.2. Restructuration of the dataframe
</div>

We now aim to turn the dataframe 'indices_cit' into a dataframe as we want it (with columns and so on).

In [93]:
# Name the column of per-month index lists and turn the (year, month) group keys into ordinary columns.
indices_cit = (
    indices_cit
    .rename(columns={0: "indices"})
    .reset_index(level=['year', 'month'])
)

In [94]:
indices_cit

Unnamed: 0,year,month,indices
0,2002,April,"[[nan, nan], [nan, nan], [nan, nan], [nan, nan..."
1,2002,August,"[[nan, nan], [nan, nan], [nan, nan], [nan, nan..."
2,2002,December,"[[0, 0], [0, 0], [0, 0], [nan, nan], [0, 0], [..."
3,2002,February,"[[nan, nan], [nan, nan], [nan, nan], [nan, nan..."
4,2002,January,"[[0, 0], [0, 0], [nan, nan], [0, 0], [0, 0], [..."
...,...,...,...
247,2022,March,"[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0..."
248,2022,May,"[[0, 0], [0, 0], [0, 1.432623085], [0, 0], [0,..."
249,2022,November,"[[0, 0], [0, 0.419277], [0, 0], [nan, nan], [0..."
250,2022,October,"[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0..."


We need to create lists for the concepts for the columns we want to create.

In [95]:
# Flattened (tech1, tech2) pairs, in the nested-loop order used by the index functions.
list_combination = [
    tech
    for tech1 in list_concepts
    for tech2 in list_concepts
    for tech in (tech1, tech2)
]

end = len(list_combination) - 1
# Even positions hold the first concept of each pair, odd positions the second.
first_listconcept = list_combination[0::2]
second_listconcept = list_combination[1::2]

We now explode the column 'indices' and add the columns about the concepts.

In [96]:
# Number of (year, month) periods, derived from the data instead of the
# hard-coded 12*21, so the cell keeps working when the covered time span changes.
n_periods_cit = indices_cit[['year', 'month']].drop_duplicates().shape[0]

# One row per (period, technology pair); the pairs repeat in the same order for every period.
indices_cit = indices_cit.explode('indices')
indices_cit['concept1'] = n_periods_cit * first_listconcept
indices_cit['concept2'] = n_periods_cit * second_listconcept

We take all the indices, which is a list of lists.

In [97]:
# `tolist()` already returns a plain Python list of the [t1->t2, t2->t1] pairs.
allindices = indices_cit['indices'].tolist()

We build one list with the citation indices from technology 1 to technology 2 and one list with the citation indices from technology 2 to technology 1.

In [98]:
# Split each [t1->t2, t2->t1] pair into two parallel lists.
indices1 = list(map(lambda pair: pair[0], allindices))
indices2 = list(map(lambda pair: pair[1], allindices))

We update our dataframe adding the wanted information and eliminating the other.

In [99]:
# The column of pairs is no longer needed once it has been split into two lists.
indices_cit = indices_cit.drop(columns='indices')

In [100]:
# One column per citation direction.
indices_cit = indices_cit.assign(
    index_cit_t1_t2=indices1,
    index_cit_t2_t1=indices2,
)

We are done with our last modification. We just verify and save the dataframe for later.

In [101]:
indices_cit

Unnamed: 0,year,month,concept1,concept2,index_cit_t1_t2,index_cit_t2_t1
0,2002,April,Authentication protocole,Authentication protocole,,
0,2002,April,Authentication protocole,Biometrics,,
0,2002,April,Authentication protocole,Blockchain,,
0,2002,April,Authentication protocole,Digital rights management,,
0,2002,April,Authentication protocole,Digital signature,,
...,...,...,...,...,...,...
251,2022,September,Zero-knowlegde proof,Symmetric-key algorithm,0.0,0.0
251,2022,September,Zero-knowlegde proof,Threshold cryptosystem,,
251,2022,September,Zero-knowlegde proof,Trusted Computing,0.0,0.0
251,2022,September,Zero-knowlegde proof,Tunneling protocol,,


In [102]:
# Persist the citation-based indices frame as a pickle file (path relative to the notebook's working directory).
indices_cit.to_pickle('data_indices/indices_cit_notnormalized')

# **Part 6 -- Merging the data all together**

In [103]:
# Preview the citation-based indices frame, the left input of the first merge below.
indices_cit

Unnamed: 0,year,month,concept1,concept2,index_cit_t1_t2,index_cit_t2_t1
0,2002,April,Authentication protocole,Authentication protocole,,
0,2002,April,Authentication protocole,Biometrics,,
0,2002,April,Authentication protocole,Blockchain,,
0,2002,April,Authentication protocole,Digital rights management,,
0,2002,April,Authentication protocole,Digital signature,,
...,...,...,...,...,...,...
251,2022,September,Zero-knowlegde proof,Symmetric-key algorithm,0.0,0.0
251,2022,September,Zero-knowlegde proof,Threshold cryptosystem,,
251,2022,September,Zero-knowlegde proof,Trusted Computing,0.0,0.0
251,2022,September,Zero-knowlegde proof,Tunneling protocol,,


In [104]:
# Preview the keyword-based indices frame, the right input of the first merge below.
indices_key

Unnamed: 0,year,month,index_keyword,concept1,concept2
0,2002,April,,Authentication protocole,Authentication protocole
0,2002,April,,Authentication protocole,Biometrics
0,2002,April,,Authentication protocole,Blockchain
0,2002,April,,Authentication protocole,Digital rights management
0,2002,April,,Authentication protocole,Digital signature
...,...,...,...,...,...
251,2022,September,0.304064,Zero-knowlegde proof,Symmetric-key algorithm
251,2022,September,,Zero-knowlegde proof,Threshold cryptosystem
251,2022,September,0.1581,Zero-knowlegde proof,Trusted Computing
251,2022,September,,Zero-knowlegde proof,Tunneling protocol


In [105]:
# Preview the collaboration-based indices frame, the right input of the second merge below.
indices_colab

Unnamed: 0,year,month,concept1,concept2,index_colab_notincrem,index_colab_increm
0,2002,April,Authentication protocole,Authentication protocole,,
0,2002,April,Authentication protocole,Biometrics,,
0,2002,April,Authentication protocole,Blockchain,,
0,2002,April,Authentication protocole,Digital rights management,,
0,2002,April,Authentication protocole,Digital signature,,
...,...,...,...,...,...,...
251,2022,September,Zero-knowlegde proof,Symmetric-key algorithm,0.0,0.0
251,2022,September,Zero-knowlegde proof,Threshold cryptosystem,,
251,2022,September,Zero-knowlegde proof,Trusted Computing,0.0,0.0
251,2022,September,Zero-knowlegde proof,Tunneling protocol,,


In [106]:
# Attach the keyword-based index to the citation-based indices, matching rows on
# (year, month, concept1, concept2). how='right' keeps every row of indices_key.
# validate='one_to_one' makes pandas raise a MergeError if a key combination is
# duplicated in either frame, instead of silently multiplying rows.
dfintermed = pd.merge(
    indices_cit,
    indices_key,
    on=['year', 'month', 'concept1', 'concept2'],
    how='right',
    validate='one_to_one',
)
dfintermed

Unnamed: 0,year,month,concept1,concept2,index_cit_t1_t2,index_cit_t2_t1,index_keyword
0,2002,April,Authentication protocole,Authentication protocole,,,
1,2002,April,Authentication protocole,Biometrics,,,
2,2002,April,Authentication protocole,Blockchain,,,
3,2002,April,Authentication protocole,Digital rights management,,,
4,2002,April,Authentication protocole,Digital signature,,,
...,...,...,...,...,...,...,...
157495,2022,September,Zero-knowlegde proof,Symmetric-key algorithm,0.0,0.0,0.304064
157496,2022,September,Zero-knowlegde proof,Threshold cryptosystem,,,
157497,2022,September,Zero-knowlegde proof,Trusted Computing,0.0,0.0,0.1581
157498,2022,September,Zero-knowlegde proof,Tunneling protocol,,,


In [107]:
# Attach the collaboration-based indices to the intermediate frame, matching rows
# on (year, month, concept1, concept2). how='right' keeps every row of
# indices_colab. validate='one_to_one' makes pandas raise a MergeError if a key
# combination is duplicated in either frame, instead of silently multiplying rows.
dfindices = pd.merge(
    dfintermed,
    indices_colab,
    on=['year', 'month', 'concept1', 'concept2'],
    how='right',
    validate='one_to_one',
)
dfindices

Unnamed: 0,year,month,concept1,concept2,index_cit_t1_t2,index_cit_t2_t1,index_keyword,index_colab_notincrem,index_colab_increm
0,2002,April,Authentication protocole,Authentication protocole,,,,,
1,2002,April,Authentication protocole,Biometrics,,,,,
2,2002,April,Authentication protocole,Blockchain,,,,,
3,2002,April,Authentication protocole,Digital rights management,,,,,
4,2002,April,Authentication protocole,Digital signature,,,,,
...,...,...,...,...,...,...,...,...,...
157495,2022,September,Zero-knowlegde proof,Symmetric-key algorithm,0.0,0.0,0.304064,0.0,0.0
157496,2022,September,Zero-knowlegde proof,Threshold cryptosystem,,,,,
157497,2022,September,Zero-knowlegde proof,Trusted Computing,0.0,0.0,0.1581,0.0,0.0
157498,2022,September,Zero-knowlegde proof,Tunneling protocol,,,,,


In [108]:
# Persist the merged frame of all indices as a pickle file (path relative to the notebook's working directory).
# NOTE(review): the file name says "normalized", whereas the citation-based indices merged into this
# frame were saved earlier as 'indices_cit_notnormalized' — confirm that the name is intended.
dfindices.to_pickle('data_indices/dfindices_normalized')