# Creation dataframe of keywords

_Foreword_

The goal of this notebook is to create a pandas dataframe, containing for each paper, the keywords extracted from the title and the abstract of the paper using keybert.

Importing the necessary libraries.

In [1]:
from myfunctions import get_dico_keywords, get_keyword,clean_text
import pandas as pd
import pickle
from tqdm import tqdm
import math

Importing df_full_cleaned, that was cleaned and processed by the file "dataexploration_full_data".

In [2]:
infile_data_full = open('../exploratory_analysis/data_exploratory_analysis/df_full_cleaned','rb')
df_full = pickle.load(infile_data_full)
infile_data_full.close()

We define some variables and the dictionary in which we will put all the information about the keywords.

In [3]:
myyears = [2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022]
mymonths = ['January', 'February', 'March', 'April', 'May', 'June', 'July',\
           'August', 'September', 'October', 'November', 'December']

In [4]:
items = ['paper','keyword','cosine_similarity','publication_date','year','month']

In [5]:
dicokeywords = {'paper':[],
                'keyword':[],
                'cosine_similarity':[],
                'publication_date':[],
                'year':[],
                'month':[]}

In [6]:
df_full = df_full[['paper','title','publication_date','year','month','abstract']]
df_full = df_full.drop_duplicates()

In [7]:
listpaper_first = list(set(df_full.paper.tolist()))

In [8]:
fractioning_number = 100

I extract all the keywords from all my papers and save them as lists.

In [9]:
for j in range(5):
    list_listkeywords = []
    list_listcosim = []
    
    #I first split the dataset in 5 subsets
    step = int(math.floor(len(listpaper_first)/ 5))
    start = j * step
    if j == 4:
        end = len(listpaper_first)
    else:
        end = (j + 1) * step
    listpaper = listpaper_first[start:end]
    
    #Then I split these subsets in even smaller subsets
    for i in tqdm(range(fractioning_number)):
        step = int(math.floor(len(listpaper) / fractioning_number))
        start = i*step
        end = (i+1)*step
        if i == fractioning_number-1:
            end = len(listpaper)
        restricted_listpaper=listpaper[start:end]
        
        #I choose only rows of my pandas dataframe of the paper I am analyzing
        mypapers =df_full[df_full['paper'].isin(restricted_listpaper)].copy()
        
        #I now extract the keywords
        for paper in restricted_listpaper:
            myinfos = mypapers.loc[mypapers['paper']==paper].copy()
            abstract = list(set(myinfos.abstract.tolist()))
            title = list(set(myinfos.title.tolist()))
            text = str(title[0] + abstract[0])

            text_c = clean_text(text)

            # this functions get_keyword take 99.99% of the running time for each loop
            mykeywords = get_keyword(text_c)

            numberkeywords = len(mykeywords)

            # actually mykeywords is a list of tuples (with the keyword and its cosine similarity which is a measure of 
            # the closeness of the keyword to the text)
            # I take every element of the list and then I take the
            # elements of the tuple I am interested in.
            listkeywords = []
            listcosim=[]
            for y in range(numberkeywords):
                infomykey = mykeywords[y]
                listkeywords.append(infomykey[0])
                listcosim.append(infomykey[1])
            list_listkeywords.append(listkeywords)
            list_listcosim.append(listcosim)
    
    # saving my list of keywords and cosine similarity
    with open("data_creation_variables/list_listkeywords"+str(j), "wb") as fp:  # Pickling
        pickle.dump(list_listkeywords, fp)
    with open("data_creation_variables/list_listcosim"+str(j), "wb") as fp:  # Pickling
        pickle.dump(list_listcosim, fp)

100%|██████████████████████████████████████████████████████████████████████████████| 100/100 [1:28:19<00:00, 53.00s/it]
100%|██████████████████████████████████████████████████████████████████████████████| 100/100 [1:23:40<00:00, 50.20s/it]
100%|██████████████████████████████████████████████████████████████████████████████| 100/100 [1:20:05<00:00, 48.06s/it]
100%|██████████████████████████████████████████████████████████████████████████████| 100/100 [1:19:12<00:00, 47.53s/it]
100%|██████████████████████████████████████████████████████████████████████████████| 100/100 [1:18:19<00:00, 46.99s/it]


In [10]:
# we now generate a list of keywords and cosine similarity out of all the lists which were created 
list_keywords = []
list_cosim = []
for j in range(5):
    with open("data_creation_variables/list_listkeywords"+str(j), 'rb') as f:
        list_keywords=list_keywords+pickle.load(f)
    with open("data_creation_variables/list_listcosim"+str(j), 'rb') as f:
        list_cosim = list_cosim + pickle.load(f)

In [11]:
df_full = df_full[['paper','publication_date','year','month']]

We define some functions that we will use to construct our pandas dataframe of keywords. It's quite clear what they do, they just take the publication date, year and month of each given paper.

In [12]:
def get_publication_date(paper,df_full):
    df = df_full.loc[df_full['paper']==paper].copy()
    listinfo = df.publication_date.tolist()
    return listinfo[0]

In [13]:
def get_year(paper,df_full):
    df = df_full.loc[df_full['paper']==paper].copy()
    listinfo = df.year.tolist()
    return listinfo[0]

In [14]:
def get_month(paper,df_full):
    df = df_full.loc[df_full['paper']==paper].copy()
    listinfo = df.month.tolist()
    return listinfo[0]

I create the list of information and so on and I just create the pandas dataframe out of it.

In [15]:
publication_list = list(map(lambda x: get_publication_date(x, df_full), tqdm(listpaper_first)))
month_list = list(map(lambda x: get_month(x, df_full), tqdm(listpaper_first)))
year_list = list(map(lambda x: get_year(x, df_full), tqdm(listpaper_first)))

100%|█████████████████████████████████████████████████████████████████████████| 131120/131120 [15:11<00:00, 143.80it/s]
100%|█████████████████████████████████████████████████████████████████████████| 131120/131120 [14:36<00:00, 149.56it/s]
100%|█████████████████████████████████████████████████████████████████████████| 131120/131120 [14:33<00:00, 150.10it/s]


In [16]:
dicokeywords['keyword']=list_keywords
dicokeywords['cosine_similarity']= list_cosim
dicokeywords['paper']=listpaper_first
dicokeywords['publication_date']=publication_list
dicokeywords['year']=year_list
dicokeywords['month']=month_list

In [17]:
dfkeywords = pd.DataFrame(dicokeywords)

In [18]:
# last but not least I explode the rows containing the keywords and 
#cosine similarity since there were lists of keywords and cosine similarity in these cells
dfkeywords =dfkeywords.explode(['keyword', 'cosine_similarity'])

In [19]:
dfkeywords.to_pickle('data_creation_variables/dfkeywords')