### 1. Data load

In [None]:
import pandas as pd
import json

# Load the two raw spreadsheets: the future-work sentences (FWS) and the
# titles/abstracts.
df_fws = pd.read_excel('data/raw/future work sentence.xlsx')
df_abs = pd.read_excel('data/raw/title and abstract.xlsx')

In [None]:
# Preview the first rows of the future-work-sentence table.
df_fws.head()

In [None]:
# Preview the first rows of the title/abstract table.
df_abs.head()

In [None]:
# Show the distinct values that occur in the FWS_TYPE column.
types = df_fws['FWS_TYPE'].tolist()
set(types)

In [None]:
# Drop abstract rows whose 'abs' cell equals the string ' 0' (presumably the
# placeholder used for a missing abstract — TODO confirm against the raw file),
# and drop future-work sentences whose FWS_TYPE is 'other'.
df_abs = df_abs[df_abs['abs']!=' 0']
df_fws = df_fws[df_fws['FWS_TYPE']!='other']

##### 1.1 EDA

In [None]:
# Number of entries in the FWS column.
len(df_fws['FWS'].tolist())

In [None]:
# Count the distinct article IDs among the future-work sentences.
print("The number of the Articles in fws is {}".format(len(set(df_fws['ID']))))

In [None]:
# Count the rows of the abstract table (IDs are not de-duplicated here, unlike
# the FWS count, which uses set()).
print("The number of the Articles in abs is {}".format(len(df_abs['ID'])))

In [None]:
# Pull the text columns out as plain Python lists.
fws = df_fws['FWS'].tolist()
# NOTE: this assignment shadows the built-in abs() from here on.
abs = df_abs['abs'].tolist()
titles = df_abs['title'].tolist()

In [None]:
# Compare the average length (in space-separated tokens) of the three text sets.
import numpy as np

# Every future-work sentence is counted; abstracts and titles are
# de-duplicated with set() before their lengths are measured.
len_fw = [len(text.split(" ")) for text in fws]
len_ab = [len(text.split(" ")) for text in set(abs)]
len_title = [len(text.split(" ")) for text in set(titles)]

# Print the three averages in the order: FWS, abstracts, titles.
for lengths in (len_fw, len_ab, len_title):
    print(np.sum(np.array(lengths))/len(lengths))


* By comparison, the FWS are shorter and the abstracts are longer.
* So when we extract keywords from the FWS and the abstracts, we should control the number of keywords: for the FWS, the number should be smaller.

### 2. Keyword extraction

#### KeyBert

link: https://towardsdatascience.com/enhancing-keybert-keyword-extraction-results-with-keyphrasevectorizers-3796fa93f4db

github: https://github.com/MaartenGr/KeyBERT

In [None]:
!pip install keybert
!pip install keyphrase-vectorizers

In [None]:
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer

# KeyBERT extractor built with its default arguments.
kw_model = KeyBERT()
# NOTE(review): this instance is not referenced by the extract_keywords calls
# visible in this notebook; each of them constructs its own vectorizer.
vectorizer = KeyphraseCountVectorizer()

In [None]:
# Extract up to 3 keyphrases for each future-work sentence.
# NOTE(review): KeyBERT documents nr_candidates as applying only when
# use_maxsum=True — confirm whether it has any effect in this call.
keywords_fws = kw_model.extract_keywords(fws,vectorizer=KeyphraseCountVectorizer(),nr_candidates=2*3,top_n=3)

In [None]:
# Extract up to 12 keyphrases for each abstract (more than for the shorter texts).
# NOTE(review): KeyBERT documents nr_candidates as applying only when
# use_maxsum=True — confirm whether it has any effect in this call.
keywords_abs = kw_model.extract_keywords(abs,vectorizer=KeyphraseCountVectorizer(),nr_candidates=2*12,top_n=12)

In [None]:
# Extract up to 3 keyphrases for each title.
# NOTE(review): KeyBERT documents nr_candidates as applying only when
# use_maxsum=True — confirm whether it has any effect in this call.
keywords_titles = kw_model.extract_keywords(titles,vectorizer=KeyphraseCountVectorizer(),nr_candidates=2*3,top_n=3)

### 3. Simple preprocessing of the extracted keyphrases

#### 3.1 Readjust the keywords extracted by KeyBERT

* Target: remove meaningless words as far as possible and keep the most important ones
* Implementation:
  * (1) Set a threshold; words whose value is below the threshold are removed
  * (2) Within a collection, the words are arranged with the smaller values first; if a word with a smaller value is contained in a word with a larger value, the word with the smaller value is deleted

In [None]:
import nltk

# Fetch the NLTK resources 'wordnet' and 'omw-1.4' (presumably required by the
# WordNetLemmatizer used in the preprocessing step — confirm).
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
# Peek at the first three extraction results for the future-work sentences.
keywords_fws[:3]

In [None]:
from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance; filterByLemma reads it as a module-level global.
lemmatizer = WordNetLemmatizer()

def filterByValue(datas):
    """Keep, for each keyword list, the phrases scoring at least the mean score.

    Args:
        datas: Iterable of keyword lists. Each keyword is a ``(phrase, score)``
            pair (anything indexable with ``[0]`` and ``[1]``).

    Returns:
        A list with one entry per input keyword list. Each entry is the list
        of phrases (scores are dropped) whose score is greater than or equal
        to the mean score of that keyword list, in their original order. An
        empty keyword list yields an empty list.
    """
    nKeywords = []

    for elements in datas:
        if len(elements) == 0:
            # np.mean of an empty array is NaN and emits a RuntimeWarning;
            # there is nothing to filter, so skip the computation.
            nKeywords.append([])
            continue

        # The threshold is relative to this keyword list only.
        threshold = np.mean(np.array([element[1] for element in elements]))
        nKeywords.append(
            [element[0] for element in elements if element[1] >= threshold]
        )

    return nKeywords


def filterByLemma(datas):
    """Lemmatize every word of every keyphrase while keeping its score.

    Each phrase is split on single spaces, every piece is passed through the
    module-level ``lemmatizer``, and the pieces are re-joined with single
    spaces.

    Args:
        datas: Iterable of keyword lists whose items are ``(phrase, score)``
            pairs.

    Returns:
        A list with one entry per input keyword list, each a list of
        ``(lemmatized_phrase, score)`` tuples in the original order.
    """
    def lemmatize_phrase(phrase):
        # Word-by-word lemmatization; the spacing of the phrase is preserved.
        return " ".join(lemmatizer.lemmatize(token) for token in phrase.split(" "))

    return [
        [(lemmatize_phrase(item[0]), item[1]) for item in group]
        for group in datas
    ]
    
def filterBySingle(datas):
    """Drop keyphrases that are contained in a later keyphrase of the same list.

    Example: from ``human language, speech, synthetic speech, human language
    acquisition research, speech signal`` the phrases ``human language`` and
    ``speech`` are removed, because each occurs inside a later phrase.

    Known limitation: a shorter phrase is removed even when it is the main
    term elsewhere. For ``human language, language acquisition, human language
    acquisition research`` both of the first two phrases are dropped, although
    ``language acquisition`` may be the key term in other sections.

    Args:
        datas: Iterable of keyword lists whose items are ``(phrase, score)``
            pairs.

    Returns:
        A list with one entry per input keyword list, each a list of the
        surviving ``(phrase, score)`` tuples in their original order.
    """
    nKeywords = []

    for elements in datas:
        phrases = [element[0] for element in elements]
        kept = []

        for i, element in enumerate(elements):
            word = phrases[i]
            # The phrase without its last character catches plurals the
            # lemmatizer missed ("keyphrases" -> "keyphrase").
            stem = word[:-1]
            # Only phrases that come later in the list are checked.
            later = phrases[i + 1:]

            # An empty stem (one-character phrase) is a substring of every
            # string, so it must not count as a match on its own.
            contained = any(
                word in other or (stem != "" and stem in other)
                for other in later
            )

            if not contained:
                kept.append((word, element[1]))

        nKeywords.append(kept)

    return nKeywords

In [None]:
# Post-process the future-work keyphrases in three passes: lemmatize each
# phrase, drop phrases that occur inside a later phrase, then keep only the
# phrases scoring at least the mean of their list.
# NOTE: filterByValue returns plain strings, so the scores are gone after this cell.
f_keywords = filterByLemma(keywords_fws)
f_keywords = filterBySingle(f_keywords)
f_keywords = filterByValue(f_keywords)
f_keywords[:3]

#### 3.2 Combine these phrases based on year

In [None]:
def combine(datas,years):
    """Count how often each keyword occurs per year.

    Args:
        datas: Sequence of keyword lists, aligned by position with ``years``.
            A keyword may be a plain string (the output of filterByValue) or
            a ``(phrase, score)`` pair (the output of filterByLemma and
            filterBySingle).
        years: Sequence giving the year of each keyword list.

    Returns:
        A dict mapping every distinct year to a list of ``(keyword, count)``
        tuples sorted by count in descending order; keywords with equal counts
        keep the order in which they were first seen.

    Raises:
        IndexError: if ``datas`` has fewer entries than ``years``.
    """
    resDict = {year: {} for year in set(years)}

    for i, year in enumerate(years):
        counts = resDict[year]
        for element in datas[i]:
            # A bare string is counted whole: indexing it with [0] would
            # tally only its first character.
            word = element if isinstance(element, str) else element[0]
            counts[word] = counts.get(word, 0) + 1

    return {
        year: sorted(counts.items(), key=lambda item: item[1], reverse=True)
        for year, counts in resDict.items()
    }

In [None]:
# Run combine() on the filtered keywords, pairing each sentence's keyword list
# with the value of its 'Year' column.
f_dict = combine(f_keywords,df_fws['Year'].tolist())

#### 3.3 Save the data

In [None]:
def saveData(datas,years,filename):
    """Write the per-year keyword rankings produced by combine() to an Excel file.

    For every distinct year the sheet gets two columns: ``str(year)`` with the
    keywords and ``'count' + str(year)`` with their counts, in the order
    returned by combine().

    All columns must have the same number of rows, so every year's ranking is
    truncated to the length of the shortest one. A year without any keyword
    therefore produces a sheet with no data rows.

    Args:
        datas: Sequence of keyword lists, aligned by position with ``years``.
        years: Sequence giving the year of each keyword list.
        filename: Path of the Excel file to write.
    """
    resDict = combine(datas,years)

    # Use the true minimum length instead of a fixed sentinel, so the number
    # of rows is never capped by an arbitrary constant.
    min_len = min((len(ranking) for ranking in resDict.values()), default=0)

    columns = {}
    for year, ranking in resDict.items():
        top = ranking[:min_len]
        columns[str(year)] = [element[0] for element in top]
        columns['count'+str(year)] = [element[1] for element in top]

    # Build the frame in one step rather than inserting columns one by one.
    pd.DataFrame(columns).to_excel(filename)

In [None]:
# Run saveData() on the filtered future-work keywords, writing 'fws_keywords.xlsx'.
saveData(f_keywords,df_fws['Year'].tolist(),'fws_keywords.xlsx')