### 1. data load

In [1]:
import pandas as pd

# Load the three keyphrase spreadsheets. Judging by the df_ts preview below, each
# year has a phrase column (e.g. 2000) paired with a 'count<year>' column.
# NOTE(review): presumably "fws" = future-work sections, "abs" = abstracts and
# "titles" = paper titles -- confirm against the data source.
df_fws = pd.read_excel('data/Keyphrases/fws_keywords_less.xlsx')
df_abs = pd.read_excel('data/Keyphrases/abs_keywords.xlsx')
df_ts =  pd.read_excel('data/Keyphrases/titles_keywords.xlsx')
# Show the first rows of the title keyphrase table as the cell output.
df_ts.head()

Unnamed: 0,2000,count2000,2001,count2001,2002,count2002,2003,count2003,2004,count2004,...,2017,count2017,2018,count2018,2019,count2019,2020,count2020,2021,count2021
0,corpus,7,machine translation,5,statistical machine translation,4,statistical machine translation,7,statistical machine translation,9,...,neural machine translation,23,neural machine translation,50,neural machine translation,48,neural machine translation,31,language model,52
1,hidden markov model,3,corpus,4,word sense disambiguation,2,question,5,feature,4,...,attention,11,word embeddings,18,attention,27,language model,23,neural machine translation,43
2,word sense disambiguation,3,constraint,4,grammar,2,web,3,conditional random field,4,...,neural network,8,attention,13,text generation,20,bert,23,bert,31
3,empirical study,3,question,3,hmm,2,support vector machine,3,feature selection,4,...,language,7,text,12,question,18,self,18,transformer,26
4,information extraction,3,grammar,3,word,2,information extraction,3,active learning,3,...,word embeddings,7,question,12,text,17,transformer,18,text,20


In [2]:
years = range(2000,2022)

# Maps str(year) -> list of (keyphrase, combined count) tuples, sorted by count
# in descending order. Counts from the abstract table and the title table are
# added together for phrases that occur in both.
abs_titles_dict = {}

for year in years:
    year_label = str(year)
    merged_counts = {}

    # Abstracts first, then titles, so insertion order (and therefore the order of
    # equal-count phrases after the stable sort) matches the original two-pass loop.
    for frame in (df_abs, df_ts):
        # The year header may have been parsed as a string or as an int;
        # check explicitly instead of catching every exception.
        phrase_column = year_label if year_label in frame.columns else year
        phrases = frame[phrase_column]
        weights = frame['count' + year_label]

        for phrase, weight in zip(phrases, weights):
            # Columns shorter than the longest one are padded with NaN by pandas;
            # such cells are not keyphrases and would break the sort below.
            if pd.isna(phrase) or pd.isna(weight):
                continue
            merged_counts[phrase] = merged_counts.get(phrase, 0) + weight

    abs_titles_dict[year_label] = sorted(
        merged_counts.items(), key=lambda item: item[1], reverse=True
    )

In [6]:
# Number of distinct keyphrases collected for the year 2000 (abstracts + titles).
len(abs_titles_dict['2000'])

142

### 2. Keyword readjustment (`rejust`)

* Remove hyphens
* Merge abbreviations (acronyms) into their full form
* Remove meaningless modifier adjectives such as "more" and "different", so that e.g. "more classify" merges with "classify" and "more tagger" with "tagger"
* Remove meaningless suffixes such as "task", "technique" and "technologies"
* Merge variants such as "bert" and "bert model"
* Merge phrases that become identical after stemming
* Remove stopwords

Other variants, such as "dialogue system" and "dialog system", are merged manually after inspection.

#### 2.1 Helper functions

In [7]:
# Remove hyphens.
# Handles the three spaced forms: "word - word", "word- word" and "word -word".
import regex as re

# A hyphen together with any whitespace directly around it.
pattern = r'(\s*)-(\s*)'

def filterHyphen(words,values):
    """Merge hyphenated spellings of a phrase into the hyphen-free spelling.

    Every hyphen, together with the whitespace around it, is deleted
    ("pre - trained" -> "pretrained"). The count of the hyphenated phrase is
    added to the hyphen-free phrase, which is created if it does not exist yet.

    Args:
        words: sequence of phrases (strings).
        values: sequence of numeric counts, aligned with `words`.

    Returns:
        A `(phrases, counts)` pair of lists. Phrases without a hyphen keep their
        position; a newly created hyphen-free phrase is appended at the end.
    """
    totals = {}
    for word, value in zip(words, values):
        # Sum repeated phrases instead of letting the last one overwrite the rest.
        totals[word] = totals.get(word, 0) + value

    # Iterate over a snapshot: keys are added and removed inside the loop.
    for word in list(totals):
        newWord = re.sub(pattern,"",word)

        if newWord == word:
            continue

        # The right-hand side is evaluated first, so the hyphenated key is
        # removed before the merged total is stored.
        totals[newWord] = totals.get(newWord, 0) + totals.pop(word)

    return list(totals.keys()),list(totals.values())

In [8]:
# convert the word's acronyms into the original form
def combineByAcronym(words,values):
    """Fold an acronym's count into the multi-word phrase it abbreviates.

    For every phrase with at least two tokens, the acronym is built from the
    first character of each token ("machine translation" -> "mt"). If that
    acronym is itself a phrase in the input, its count is added to the full
    phrase and the acronym entry is removed. When several phrases share an
    acronym, the first one in input order receives the count.

    Args:
        words: sequence of phrases (strings).
        values: sequence of numeric counts, aligned with `words`.

    Returns:
        A `(phrases, counts)` pair of lists in input order, without the
        absorbed acronyms.
    """
    totals = {}
    for word, value in zip(words, values):
        # Sum repeated phrases instead of letting the last one overwrite the rest.
        totals[word] = totals.get(word, 0) + value

    # Iterate over a snapshot: acronym keys are removed inside the loop.
    for word in list(totals):
        # split() without an argument never yields empty tokens, so leading,
        # trailing or doubled spaces cannot trigger an IndexError on token[0].
        tokens = word.split()
        if len(tokens) < 2:
            continue

        acronym = "".join(token[0] for token in tokens)
        if acronym in totals:
            totals[word] += totals.pop(acronym)

    return list(totals.keys()),list(totals.values())

In [9]:
# remove meaningless part from words
adjs = ['more','different','most','much','better','other','such','new','noun','verb']
# 'approach' is the intended entry; the misspelled 'approch' is kept so that any
# data containing the typo is still handled as before.
suffix = ['model','task','technique','technologies','technology','approach','approch','system']


def _stripFirstListedToken(word, candidates):
    """Remove from `word` the first entry of `candidates` that occurs as a whole token.

    Candidates are tried in list order and only the first one found is removed
    (all of its occurrences). Matching is per whitespace-separated token, so
    'new' does not match inside 'news'. Returns the remaining tokens joined by
    single spaces, or None when no candidate occurs in `word`.
    """
    tokens = word.split()
    for candidate in candidates:
        if candidate in tokens:
            return " ".join(token for token in tokens if token != candidate)
    return None


def combineBySpecialWordAdj(words,values):
    """Merge phrases carrying a filler modifier into their modifier-free form.

    If a phrase contains one of `adjs` as a whole token and the phrase without
    that token is also present, the count is moved to the modifier-free phrase
    and the longer phrase is dropped ("more tagger" -> "tagger"). Phrases whose
    stripped form is absent (or empty) are left untouched.

    Args:
        words: sequence of phrases (strings).
        values: sequence of numeric counts, aligned with `words`.

    Returns:
        A `(phrases, counts)` pair of lists in input order.
    """
    totals = {}
    for word, value in zip(words, values):
        # Sum repeated phrases instead of letting the last one overwrite the rest.
        totals[word] = totals.get(word, 0) + value

    # Iterate over a snapshot: keys are removed inside the loop.
    for word in list(totals):
        stripped = _stripFirstListedToken(word, adjs)

        # An empty result means the phrase consisted of the modifier only.
        if stripped and stripped in totals:
            totals[stripped] += totals.pop(word)

    return list(totals.keys()),list(totals.values())

def combineBySpecialWordSuffix(words,values):
    """Merge a bare phrase into its variant that carries a generic suffix word.

    If a phrase contains one of `suffix` as a whole token and the phrase
    without that token is also present, the bare phrase's count is added to
    the longer phrase and the bare phrase is dropped ("bert" is folded into
    "bert model").

    NOTE(review): the merge direction is the opposite of
    combineBySpecialWordAdj (the longer phrase survives). It is kept as in the
    original code; be aware that filterStopwords later discards phrases
    containing the token 'task', which also discards counts merged here.

    Args:
        words: sequence of phrases (strings).
        values: sequence of numeric counts, aligned with `words`.

    Returns:
        A `(phrases, counts)` pair of lists in input order.
    """
    totals = {}
    for word, value in zip(words, values):
        # Sum repeated phrases instead of letting the last one overwrite the rest.
        totals[word] = totals.get(word, 0) + value

    # Iterate over a snapshot: keys are removed inside the loop.
    for word in list(totals):
        if word not in totals:
            # Already absorbed into a longer phrase earlier in this loop.
            continue

        stripped = _stripFirstListedToken(word, suffix)

        # An empty result means the phrase consisted of the suffix word only.
        if stripped and stripped in totals:
            totals[word] += totals.pop(stripped)

    return list(totals.keys()),list(totals.values())

In [10]:
from nltk.stem import SnowballStemmer  
# Module-level English Snowball stemmer, used by combineByStem.
snowball_stemmer = SnowballStemmer("english")  

def combineByStem(words,values):
    """Replace every phrase by its stemmed form and add up the counts per stem.

    Each phrase is split on single spaces, every token is passed through the
    module-level `snowball_stemmer`, and the stems are re-joined with spaces.
    Phrases that end up with the same stemmed form share one entry whose count
    is the sum of their counts.

    Args:
        words: sequence of phrases (strings).
        values: sequence of numeric counts, aligned with `words`.

    Returns:
        A `(stemmed_phrases, counts)` pair of lists, ordered by first
        appearance of each stemmed phrase.
    """
    # Lookup of the count for each phrase (a repeated phrase keeps its last count).
    count_by_phrase = {words[idx]: values[idx] for idx in range(len(words))}

    stemmed_totals = {}
    for idx in range(len(words)):
        original = words[idx]
        stemmed = " ".join(
            snowball_stemmer.stem(token) for token in original.split(" ")
        )

        weight = count_by_phrase[original]
        if stemmed not in stemmed_totals:
            stemmed_totals[stemmed] = weight
        else:
            stemmed_totals[stemmed] += weight

    return list(stemmed_totals.keys()),list(stemmed_totals.values())

In [11]:
# remove stopwords
# Stopword list taken from the 'word' column of the CSV; read by filterStopwords.
df_stopwords = pd.read_csv('data/raw/Stopwords.csv')
stopwords = df_stopwords['word'].tolist()

def filterStopwords(words,values):
    """Drop stopword phrases and phrases containing a generic filler token.

    A phrase is discarded when the whole phrase appears in the module-level
    `stopwords` list, or when any of its space-separated tokens is one of the
    filler words listed below. All other phrases are kept with their counts.

    Args:
        words: sequence of phrases (strings).
        values: sequence of counts, aligned with `words`.

    Returns:
        A `(phrases, counts)` pair of lists in input order.
    """
    filler_tokens = ['other','task','future','direction','improve','improvement','problem']

    kept = {}
    for position, phrase in enumerate(words):
        # Whole-phrase match against the stopword list.
        if phrase in stopwords:
            continue

        # Token-level match against the filler words.
        if any(token in filler_tokens for token in phrase.split(" ")):
            continue

        kept[phrase] = values[position]

    return list(kept),list(kept.values())


#### 2.2 Unified processing pipeline

In [16]:
def rejust(datas,values):
    """Run the full keyphrase clean-up pipeline and rank the result by count.

    The steps are applied in this order: hyphen removal, acronym merge,
    modifier merge, suffix merge, stopword filtering, stemming, and a second
    modifier merge on the stemmed phrases.

    Args:
        datas: list of phrases (strings).
        values: list of counts, aligned with `datas`.

    Returns:
        A list of `(phrase, count)` tuples sorted by count in descending order.
    """
    pipeline = (
        filterHyphen,
        combineByAcronym,
        combineBySpecialWordAdj,
        combineBySpecialWordSuffix,
        filterStopwords,
        combineByStem,
        combineBySpecialWordAdj,  # second pass, now on stemmed phrases
    )

    phrases, counts = datas, values
    for step in pipeline:
        phrases, counts = step(phrases, counts)

    ranked = list({phrases[idx]: counts[idx] for idx in range(len(phrases))}.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked


#### 2.3 Save data

In [40]:
import json

def SaveData(df,years,filename):
    """Clean the keyphrases of every year with `rejust` and write them to a JSON file.

    Args:
        df: DataFrame with one phrase column per year (labelled with the year
            as a string or an int) and a matching 'count<year>' column.
        years: iterable of years (strings or ints) to process.
        filename: path of the JSON file to write.

    Raises:
        KeyError: if a year has no phrase column or no 'count<year>' column.

    The JSON maps str(year) to a list of [phrase, count] pairs as returned by
    `rejust` (sorted by count, descending).
    """
    allDict = {}
    for year in years:
        # The year header may be stored as given, as a string, or as an int;
        # look it up explicitly instead of catching every exception.
        candidates = [year, str(year)]
        try:
            candidates.append(int(year))
        except (TypeError, ValueError):
            pass

        column = next((label for label in candidates if label in df.columns), None)
        if column is None:
            raise KeyError("no keyphrase column found for year %r" % (year,))

        data = df[column]
        value = df['count'+str(year)]

        # Columns shorter than the longest one are padded with NaN by pandas;
        # NaN phrases would make the regex step in rejust fail with a TypeError.
        present = data.notna() & value.notna()

        resDict = rejust(data[present].tolist(),value[present].tolist())
        allDict[str(year)] = resDict

    with open(filename,'w') as f:
        json.dump(allDict,f)

In [44]:
# Abstract keyphrases: process the year columns 2000 through 2021.
years = list(map(str, range(2000, 2022)))
SaveData(df_abs, years, 'abs_keywords_rejusted.json')

In [45]:
# df_fws keyphrases: process the year columns 2000 through 2020 (one year fewer
# than for df_abs).
years = list(map(str, range(2000, 2021)))
SaveData(df_fws, years, 'fws_keywords_rejusted.json')