In [12]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.tokenize import sent_tokenize, word_tokenize
# Shared NLTK helpers created once for the whole notebook.
# `stemmer` is used by get_stemmed_words_list to reduce words to their Porter stems.
stemmer = nltk.stem.PorterStemmer()
# NOTE(review): `lemmatizer` is not referenced by any cell visible here — confirm it is still needed.
lemmatizer = nltk.WordNetLemmatizer()

In [2]:
# Read the three keyword CSV files from ./keywords into separate frames;
# they are concatenated into a single frame in the next cell.
df_1, df_2, df_3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        './keywords/scopus_1.csv',
        './keywords/scopus_2.csv',
        './keywords/scopus_3.csv',
    )
)

In [4]:
# Stack the three frames under a fresh 0..n-1 index and replace every missing
# value with an empty string so keyword cells can be split as plain text.
df = pd.concat([df_1, df_2, df_3], ignore_index=True).fillna("")

# Preprocessing

## Get all keywords

In [9]:
def get_all_keywords(df, columns = ['Author Keywords', 'Index Keywords']):
    """Collect every distinct keyword found in the given columns.

    Each cell is expected to be a string of keywords separated by ';'.
    Keywords are stripped of surrounding whitespace, empty entries are
    dropped, and duplicates are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the keyword columns; cells must be strings.
    columns : list of str
        Names of the columns to scan.

    Returns
    -------
    list of str
        The distinct keywords in sorted order. Sorting keeps the result
        identical across kernel restarts; the order of a plain
        ``list(set(...))`` of strings can change from run to run.
    """
    keywords = set()
    for col in columns:
        for cell in df[col]:
            keywords.update(part.strip() for part in cell.split(';'))
    # Empty cells and trailing separators produce '' entries; they are not keywords.
    keywords.discard('')
    return sorted(keywords)

In [10]:
# Distinct keywords drawn from the default columns of get_all_keywords
# ('Author Keywords' and 'Index Keywords') of the combined frame.
tokens_list = get_all_keywords(df)

## Combine keywords that share the same stems into a single keyword

In [15]:
# English stop words, loaded on first use and reused by later calls.
_STOPWORD_CACHE = {}


def get_stemmed_words_list(text):
    """Return the distinct Porter stems of the non-stop-words in ``text``.

    The text is lower-cased, every character that is not an ASCII letter,
    digit or whitespace is replaced by a space, the result is tokenized,
    English stop words are dropped and the remaining words are stemmed.

    Parameters
    ----------
    text : str
        A keyword or short phrase.

    Returns
    -------
    list of str
        The distinct stems in sorted order, so two phrases with the same
        stems always produce equal lists.
    """
    if 'english' not in _STOPWORD_CACHE:
        # Load the stop-word list once; a frozenset gives constant-time lookups
        # instead of re-reading the corpus for every token.
        _STOPWORD_CACHE['english'] = frozenset(nltk.corpus.stopwords.words('english'))
    stop_words = _STOPWORD_CACHE['english']

    cleaned = re.sub(r'[^a-zA-Z0-9\s]', ' ', text.lower())
    stems = {
        stemmer.stem(word)
        for word in word_tokenize(cleaned)
        if word not in stop_words
    }
    return sorted(stems)

In [21]:
def get_unique_keywords(tokens_list, stem_fn=None):
    """Group keywords that share the same set of stems under one root keyword.

    The first keyword seen for a given stem set becomes the root of that
    group; every later keyword with the same stem set is mapped to it.

    Parameters
    ----------
    tokens_list : iterable of str
        Keywords to group. Repeated entries are tolerated.
    stem_fn : callable, optional
        Function mapping a keyword to an iterable of hashable stems.
        Defaults to ``get_stemmed_words_list``.

    Returns
    -------
    (list of str, dict)
        ``roots`` — the root keywords that absorbed at least one variant,
        in first-seen order; and ``replacement_map`` — a dict mapping each
        variant keyword to its root.

    Notes
    -----
    Keywords with no stem-equivalent variant appear in neither return
    value. NOTE(review): confirm this is intended, since only the
    returned roots are available to later processing.
    """
    if stem_fn is None:
        stem_fn = get_stemmed_words_list

    root_by_stems = {}        # stem set -> first keyword seen with it
    replacement_map = {}      # variant keyword -> root keyword
    roots_with_variants = {}  # dict used as an insertion-ordered set

    for token in tokens_list:
        # A frozenset compares stems regardless of the order stem_fn returns them in.
        stem_key = frozenset(stem_fn(token))
        root = root_by_stems.setdefault(stem_key, token)
        # Skip exact repeats of the root: mapping a keyword to itself would
        # make any "follow the map until it ends" lookup loop forever.
        if token != root:
            replacement_map[token] = root
            roots_with_variants[root] = None

    return list(roots_with_variants), replacement_map

In [22]:
# unique_tokens_list: root keywords returned by the stem-based grouping;
# first_replacement_map: variant keyword -> root keyword, applied to the CSV later.
unique_tokens_list, first_replacement_map = get_unique_keywords(tokens_list)

## Find similar words based on cosine similarity score

In [29]:
from sklearn.metrics.pairwise import cosine_similarity

In [30]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model used below to encode the keywords;
# 'distilbert-base-nli-mean-tokens' is the model name passed to SentenceTransformer.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [33]:
# Encode every keyword in unique_tokens_list; the result feeds cosine_similarity below.
token_embeddings = model.encode(unique_tokens_list)

In [34]:
# Pairwise cosine similarity of the embeddings with themselves (single-argument form);
# sim[i][j] is read by create_similar_groups as the score between items i and j.
sim = cosine_similarity(token_embeddings)

In [55]:
def create_similar_groups(sim, threshold = 0.9):
    """List, for every row of a similarity matrix, the indices of its close neighbours.

    Parameters
    ----------
    sim : 2-D numpy array
        Similarity scores; ``sim[i][j]`` is the score between items i and j.
    threshold : float
        A neighbour is kept only when its score is strictly greater than this.

    Returns
    -------
    list of list
        One list per row holding the column indices (as produced by
        ``np.argsort``) whose score exceeds ``threshold``, ordered from most
        to least similar and excluding the row's own index.
    """
    groups = []
    for row_idx, row in enumerate(sim):
        # Negating the row makes argsort rank the highest scores first.
        ranked = np.argsort(-row)
        neighbours = []
        for col_idx in ranked:
            # Scores are visited in descending order, so the first one that
            # fails the threshold ends the scan for this row.
            if not sim[row_idx][col_idx] > threshold:
                break
            if col_idx != row_idx:
                neighbours.append(col_idx)
        groups.append(neighbours)
    return groups

In [56]:
# For each keyword index, the indices of other keywords scoring above the default 0.9 threshold.
groups = create_similar_groups(sim)

In [57]:
def get_similar_words_from_groups(unique_tokens_list, groups):
    """Translate index-based similarity groups into keyword-based ones.

    Parameters
    ----------
    unique_tokens_list : sequence of str
        Keywords; position ``k`` corresponds to ``groups[k]``.
    groups : sequence of sequences of int
        For each position, the positions of its similar keywords.

    Returns
    -------
    dict
        Maps each keyword to the list of keywords its group points at.
    """
    return {
        unique_tokens_list[position]: [unique_tokens_list[i] for i in members]
        for position, members in enumerate(groups)
    }
        

In [59]:
# Average number of similar words per keyword.
# The accumulator is deliberately not called `sum`: rebinding that name would
# hide the builtin sum() for every later cell in the kernel.
total_similar = 0
for elem in groups:
    total_similar += len(elem)
total_similar/len(groups)

2.526416295353278

In [60]:
# Same groups as `groups`, expressed as keyword -> list of similar keywords.
groups_words = get_similar_words_from_groups(unique_tokens_list, groups)

In [76]:
def create_replacement_map_from_sim_groups(groups_words):
    """Invert a keyword -> similar-keywords mapping.

    Parameters
    ----------
    groups_words : dict
        Maps a keyword to the list of keywords considered similar to it.

    Returns
    -------
    dict
        Maps every keyword that appears in some similarity list to the list
        of keywords that named it, in iteration order of ``groups_words``.
        Keywords that appear in no list are absent from the result.

    Notes
    -----
    Each entry holds several candidate replacements rather than one
    canonical root. NOTE(review): confirm that picking among several
    candidates is the intended way to consolidate keywords.
    """
    replacement_map = {}
    for word, sim_word_list in groups_words.items():
        for sim_word in sim_word_list:
            replacement_map.setdefault(sim_word, []).append(word)
    return replacement_map

In [77]:
# keyword -> list of candidate replacement keywords derived from the similarity groups.
replacement_map_with_multiple_options = create_replacement_map_from_sim_groups(groups_words)

In [78]:
# Average number of candidate replacements per keyword.
# The accumulator is deliberately not called `sum`: rebinding that name would
# hide the builtin sum() for every later cell in the kernel.
total_options = 0
for key in replacement_map_with_multiple_options.keys():
    total_options += len(replacement_map_with_multiple_options[key])
total_options/len(replacement_map_with_multiple_options)

4.302439024390244

## Replace keywords with the root keyword in csv

In [85]:
def replace_keywords_in_original_file(df, replacement_map, cols =['Index Keywords', 'Author Keywords'], rng=None):
    """Write replaced-keyword versions of the given columns into ``df``.

    For every column in ``cols`` a new column named ``col + ' New'`` is
    added. Each cell is split on ';', every keyword is stripped and looked
    up in ``replacement_map``, duplicates are removed keeping first-seen
    order, and the result is joined back with ';'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update. It is modified in place and also returned.
        Cells of the source columns must be strings.
    replacement_map : dict
        Maps a keyword either to a single replacement string or to a list
        of candidate replacements, one of which is chosen at random.
        A keyword mapped to an empty list is kept as is.
    cols : list of str
        Source columns to process.
    rng : optional
        Object exposing ``choice(sequence)`` (for example
        ``np.random.default_rng(seed)``) used to pick among candidates.
        Defaults to NumPy's global random state; pass a seeded generator
        for reproducible output.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns added.
    """
    chooser = np.random if rng is None else rng

    def _replacement_for(keyword):
        # Resolve one keyword to its replacement, or return it unchanged.
        if keyword not in replacement_map:
            return keyword
        target = replacement_map[keyword]
        if isinstance(target, list):
            # str() turns the NumPy string returned by choice() into a plain str.
            return str(chooser.choice(target)) if target else keyword
        return target

    for col in cols:
        rewritten = []
        for cell in df[col]:
            replaced = (_replacement_for(part.strip()) for part in cell.split(';'))
            # dict.fromkeys de-duplicates while keeping first-seen order, so the
            # output does not depend on set iteration order.
            rewritten.append(';'.join(dict.fromkeys(replaced)))
        # Assign the whole column at once instead of writing cell by cell.
        df[col + ' New'] = rewritten
    return df
    

In [86]:
# First pass: apply the stem-based map. The function adds '<col> New' columns to
# `df` itself and returns that same frame, so df_first and df are the same object.
df_first = replace_keywords_in_original_file(df, first_replacement_map)

In [99]:
# Second pass: apply the similarity-based map to the columns produced by the first pass.
# The function appends ' New' to each source column name, so the results land in
# 'Index Keywords New New' and 'Author Keywords New New' on the same frame.
df_second = replace_keywords_in_original_file(df_first, replacement_map_with_multiple_options, cols=['Index Keywords New', 'Author Keywords New'])