# Creation dataframes for visualization purposes in the dataexploration

_Foreword_

The goal of this notebook is to create two pandas dataframe. One measuring the importance of a keyword in the field of encryption technologies at a certain time, while the other computes the average of cosine similarity of this keyword over all publications from 2002 to 2022.

Importing the necessary libraries.

In [1]:
import pickle
import pandas as pd
from tqdm import tqdm
import math

Importing the pandas dataframe dfkeywords with which I will do all the computations.

In [2]:
infile_keywords = open('data_creation_variables/dfkeywords','rb')
dfkeywords = pickle.load(infile_keywords)
infile_keywords.close()

<div class="alert-info">
1. Dataframe for the importance of keywords
</div>

The function below creates a pandas dataframe having in each row, for each keyword given as input, the importance of the keyword overall. It is measured summing the cosine similarities related to the occurence of each keyword up to a certain month/year or occuring during a certain month/year.

In [3]:
def compute_info_keywords(listkeywords, dfkeywords, my_years, my_months):
    dico_importance_keywords = {'keyword': [], 'year': [], 'month': [], 'importance_for_the_month': [],
                                'importance_to_the_month': [],
                                'importance_for_the_year': [], 'importance_to_the_year': []}
    
    #I choose only the rows related to my list of keywords.
    df_myinfos_keyword = dfkeywords.loc[dfkeywords['keyword'].isin(listkeywords)].copy()

    for keyword in listkeywords:
        #setting up the count to zero
        mypreviousyearimportance = 0
        myincrementalvalue = 0
        
        #I choose only the rows related to my keyword
        infokeyword = df_myinfos_keyword.loc[df_myinfos_keyword['keyword'] == keyword].copy()
        for year in my_years:
             #I choose only the rows related to my year
            infokeywords_year = infokeyword.loc[infokeyword['year'] == year].copy()
            for month in my_months:
                
                #I choose only the rows related to my month
                infokeywords_month = infokeywords_year.loc[infokeywords_year['month'] == month].copy()
                
                # well if there are no rows, no occurence of keywords this month, therefore we add 0.e
                if len(infokeywords_month) == 0:
                    cos_sim = 0
                else:
                    list_cos_sim = infokeywords_month.cosine_similarity.tolist()
                    cos_sim = sum(list_cos_sim)
                    # I think I should add them up all instead of taking only the first one right?
                    #cos_sim = list_cos_sim[0]
                    
                #I add all information to my dictionary
                myincrementalvalue = myincrementalvalue + cos_sim
                dico_importance_keywords['importance_to_the_month'].append(myincrementalvalue)
                dico_importance_keywords['importance_for_the_month'].append(cos_sim)
                dico_importance_keywords['keyword'].append(keyword)
                dico_importance_keywords['year'].append(year)
                dico_importance_keywords['month'].append(month)
                
                #except for the yearly information that I add only once a year
                if month == 'December':
                    # I add 12 times my incremental value
                    dico_importance_keywords['importance_to_the_year'] = \
                    dico_importance_keywords['importance_to_the_year'] + 12 * [myincrementalvalue]
                    
                    #counting the importance only for a year
                    importancefortheyear = myincrementalvalue - mypreviousyearimportance
                    dico_importance_keywords['importance_for_the_year'] = \
                    dico_importance_keywords['importance_for_the_year'] + 12 * [importancefortheyear]
                    mypreviousyearimportance = myincrementalvalue

    dfkeywords_importance = pd.DataFrame(dico_importance_keywords)
    return dfkeywords_importance

I define some variables and as in the other jupyter notebook, I run my function on subsets of my data.

In [4]:
# I create a list of all keywords I have in my dataset
mykeywords = list(set(dfkeywords.keyword.tolist()))

In [5]:
# I create here a dataframe where I have the importance of each keyword in this
dico_importance_keywords={'keyword':[],'year':[],'month':[],'importance_for_the_month':[],
                          'importance_to_the_month':[],
                         'importance_for_the_year':[],'importance_to_the_year':[]}

In [6]:
my_months = ['January','February','March','April','May','June',
             'July','August','September','October','November','December']

my_years = [2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022]

In [7]:
numberkeywords = len(mykeywords)

fractionnumber = 100

list_listkeywords = []

In [8]:
for i in tqdm(range(fractionnumber)):
    start = i*math.floor(numberkeywords/fractionnumber)
    end = (i+1)*math.floor(numberkeywords/fractionnumber)
    if i == fractionnumber-1:
        mylistkeywords= mykeywords[start:]
    else:
        mylistkeywords = mykeywords[start:end]
    list_listkeywords.append(mylistkeywords)

100%|████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 101803.50it/s]


In [9]:
listindex= [0,20,40,60,80,100]

In [10]:
items = ['keyword','year','month','importance_for_the_month',
'importance_to_the_month', 'importance_for_the_year','importance_to_the_year']

Here I run my function on subsets of my data, creating pandas dataframe and saving it.

In [None]:
for j in range(5):
    dico_importance_keywords = {'keyword': [], 'year': [], 'month': [], 'importance_for_the_month': [],
                                'importance_to_the_month': [],
                                'importance_for_the_year': [], 'importance_to_the_year': []}

    list_listkeywords_compute=list_listkeywords[listindex[j]:listindex[j+1]]
    list_df = list(map(lambda x: compute_info_keywords(x,dfkeywords,my_years,my_months), tqdm(list_listkeywords_compute)))
    for element in list_df:
        for item in items:
            mynewlist = element[item].tolist()
            dico_importance_keywords[item] = dico_importance_keywords[item] + mynewlist
    newfile = pd.DataFrame(dico_importance_keywords)
    newfile.to_pickle('data_creation_variables/df_importance_keywords'+str(j))

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [09:06<00:00, 27.34s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [07:50<00:00, 23.53s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [08:23<00:00, 25.19s/it]
 70%|█████████████████████████████████████████████████████████▍                        | 14/20 [06:26<02:48, 28.14s/it]

In [None]:
dico_importance_keywords = {'keyword': [], 'year': [], 'month': [], 'importance_for_the_month': [],
                            'importance_to_the_month': [],
                            'importance_for_the_year': [], 'importance_to_the_year': []}

I download all the pandas dataframes and I merge them in one big dataframe.

In [None]:
for j in range(5):
    with open('data_creation_variables/df_importance_keywords'+str(j), 'rb') as f:
        df_importance_keywords = pickle.load(f)
    for item in items:
        mynewlist = df_importance_keywords[item].tolist()
        dico_importance_keywords[item] = dico_importance_keywords[item] + mynewlist

In [None]:
dfkeywords_importance = pd.DataFrame(dico_importance_keywords)
dfkeywords_importance.to_pickle('data_creation_variables/dfkeywords_importance')

<div class="alert-info">
2. Dataframe with the average cosine similarity of keywords
</div>

In [None]:
mykeywords = list(set(dfkeywords.keyword.tolist()))

In [None]:
dico_average_cos_sim = {'keyword': [], 'average_sim': []}

I compute the average cosine similarity of each keyword and turn it into a pandas dataframe which I save.

In [None]:
for word in tqdm(mykeywords):
    info_word = dfkeywords.loc[dfkeywords['keyword'] == word].copy()
    info_cos_sim = info_word.cosine_similarity.tolist()
    #taking the average cosine similarity
    average_sim = sum(info_cos_sim) / len(info_cos_sim)
    dico_average_cos_sim['keyword'].append(word)
    dico_average_cos_sim['average_sim'].append(average_sim)

In [None]:
average_cos_sim_df = pd.DataFrame(dico_average_cos_sim)

In [None]:
average_cos_sim_df.to_pickle('data_creation_variables/df_average_cosim')