In [1]:
import pandas as pd
import torch
import warnings
warnings.filterwarnings("ignore")

# Load the spreadsheet and keep only rows whose English abstract is present
# and is not the '-' placeholder.
df = pd.read_excel('./data_2021-2024.xlsx')
has_abstract = df['摘要(译)(English)'].notna() & df['摘要(译)(English)'].ne('-')
df = df[has_abstract]
# df1 is a second name for the same frame (not a copy).
df1 = df
df

In [2]:
def truncate_text(text, max_length=510, redundancy=20):
    """Shorten ``text`` to at most ``max_length`` characters.

    The cut is placed, in order of preference:

    1. right after the last sentence terminator ('。' or '.') found in the
       first ``max_length`` characters;
    2. at the last whitespace character inside the final ``redundancy``
       characters of that prefix, so a word is not split;
    3. exactly at ``max_length``.

    Args:
        text: String to shorten.
        max_length: Maximum number of characters in the result.
        redundancy: Width of the look-back window used in step 2.

    Returns:
        ``text`` unchanged when it already fits, otherwise a prefix of
        ``text`` no longer than ``max_length`` characters.
    """
    if len(text) <= max_length:
        return text

    # Prefer ending on a complete sentence (Chinese or Latin full stop).
    sentence_end = max(
        text.rfind('。', 0, max_length),
        text.rfind('.', 0, max_length),
    )
    if sentence_end != -1:
        return text[:sentence_end + 1]

    # No terminator: always return the head of the text, never a slice taken
    # from around the cut point. Look back up to `redundancy` characters for
    # whitespace to break on.
    window_start = max(max_length - redundancy, 0)
    word_break = max(
        text.rfind(separator, window_start, max_length)
        for separator in (' ', '\t', '\n')
    )
    if word_break > 0:
        return text[:word_break].rstrip()
    return text[:max_length]

# Shorten every abstract with truncate_text, using a 512-character limit.
df1['摘要(译)(English)'] = df1['摘要(译)(English)'].map(
    lambda abstract: truncate_text(abstract, max_length=512)
)

In [3]:
# Keep only the abstract column. The explicit copy makes df1 an independent
# frame, so column assignments made on it afterwards do not operate on a
# slice of df (which pandas reports as SettingWithCopyWarning).
df1 = df1[['摘要(译)(English)']].copy()

In [4]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel

# All tensors and the encoder are placed on the CPU.
device = torch.device("cpu")

# Tokenizer and encoder are loaded from the same checkpoint identifier.
tokenizer = DistilBertTokenizer.from_pretrained(r'distilbert-base-nli-mean-tokens')
model = DistilBertModel.from_pretrained(r'distilbert-base-nli-mean-tokens').to(device)


def get_text_embedding(text, batch_size=32, pooling="cls"):
    """Encode one string or a sequence of strings with the module-level encoder.

    Inputs are tokenized and run through ``model`` in chunks of
    ``batch_size`` so memory use does not grow with the size of the corpus.

    Args:
        text: A single string or an iterable of strings.
        batch_size: Number of texts encoded per forward pass (must be >= 1).
        pooling: ``"cls"`` (default) returns the hidden state of the first
            token of each sequence; ``"mean"`` returns the attention-mask
            weighted average over all tokens.
            NOTE(review): the checkpoint name contains "mean-tokens", which
            suggests it was trained with mean pooling — confirm and consider
            switching the default.

    Returns:
        A tensor with one row per input text, on ``device``.

    Raises:
        ValueError: If ``batch_size`` is not positive or ``pooling`` is not
            one of ``"cls"`` / ``"mean"``.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if pooling not in ("cls", "mean"):
        raise ValueError("pooling must be 'cls' or 'mean'")

    texts = [text] if isinstance(text, str) else list(text)
    if not texts:
        # Nothing to encode: return an empty (0, hidden_size) tensor.
        return torch.empty((0, model.config.hidden_size), device=device)

    pooled_batches = []
    for start in range(0, len(texts), batch_size):
        inputs = tokenizer(
            texts[start:start + batch_size],
            padding=True,
            truncation=True,
            return_tensors="pt",
        ).to(device)
        with torch.no_grad():
            hidden = model(**inputs).last_hidden_state
        if pooling == "mean":
            # Zero out padding positions before averaging.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
        else:
            pooled = hidden[:, 0, :]
        pooled_batches.append(pooled)
    return torch.cat(pooled_batches, dim=0)

# Encode every abstract, then convert the result to a NumPy array.
text = df1['摘要(译)(English)'].tolist()
embeddings = get_text_embedding(text).detach().cpu().numpy()
print(embeddings.shape)

In [5]:
import umap
# Project the embeddings to three dimensions; the seed is fixed.
umap_model = umap.UMAP(random_state=2022, n_components=3)

umap_embeddings = umap_model.fit_transform(X=embeddings)
umap_embeddings.shape

In [6]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Elbow method: fit K-Means for each candidate k and record the inertia
# (sum of squared distances to the closest centroid).
k_values = range(1, 8)

SSE = []
for k in k_values:
    # A fixed seed keeps the curve identical from one run to the next.
    kmeans = KMeans(n_clusters=k, random_state=2024)
    kmeans.fit(umap_embeddings)
    SSE.append(kmeans.inertia_)

plt.figure(figsize=(8, 4))
plt.plot(k_values, SSE, 'o-')
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.title("Elbow Method For Optimal k")
plt.show()

In [7]:
from sklearn.cluster import KMeans

# Final model: three K-Means clusters on the UMAP projection, fixed seed.
cluster = KMeans(n_clusters=3, random_state=2024).fit(umap_embeddings)
labels = cluster.labels_

# Attach the cluster assignment to the full frame and to the abstract frame.
df['cluster'] = labels
df1['cluster'] = labels

# Independent copy: the text cleaning applied to df2 further down must not
# rewrite the abstracts held in df1.
df2 = df1.copy()

# Only the last expression of a cell is rendered, so the cluster sizes are
# placed last (the earlier mid-cell head()/value_counts() were never shown).
df1['cluster'].value_counts()

In [9]:
# Write df1 to an Excel workbook, leaving out the row index.
df1.to_excel(excel_writer='s5.xlsx', index=False)

In [14]:
import re
from nltk.corpus import stopwords
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop-word list as a frozenset, loading it once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def filter_text(text, stop_words=None):
    """Strip parentheticals, non-letters and stop words from ``text``.

    Steps:
      1. remove every ``(...)`` group;
      2. replace each character that is neither an ASCII letter nor
         whitespace with a space;
      3. drop words found in the stop-word set, compared case-insensitively
         so capitalized forms such as "The" are removed as well.

    Args:
        text: String to clean.
        stop_words: Optional iterable of words to drop. When omitted, the
            NLTK English stop-word list is used; it is loaded once and
            reused instead of being re-read for every word.

    Returns:
        The remaining words, in their original casing, joined by single
        spaces.
    """
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    if stop_words is None:
        blocked = _english_stopwords()
    else:
        blocked = {word.lower() for word in stop_words}
    return ' '.join(word for word in text.split() if word.lower() not in blocked)
# Cast each abstract to str, then clean it with filter_text.
df2['摘要(译)(English)'] = df2['摘要(译)(English)'].astype(str).map(filter_text)

In [15]:
# df3 is another name for df2; the cluster labels are (re)assigned on it.
df3 = df2
df3['cluster'] = cluster.labels_

# Rows assigned to cluster 0.
in_cluster0 = df3['cluster'].eq(0)
cluster0 = df3[in_cluster0]

In [16]:
from keybert import KeyBERT

_KEYBERT_MODEL = None


def get_top_n_words(text, top_n=10):
    """Extract the ``top_n`` single-word keywords of ``text`` with KeyBERT.

    The KeyBERT model ('all-mpnet-base-v2') is created on the first call and
    reused afterwards, instead of being rebuilt on every call.

    Args:
        text: Document to extract keywords from.
        top_n: Number of keywords requested.

    Returns:
        Whatever ``KeyBERT.extract_keywords`` returns for unigram
        keyphrases with no stop-word filtering.
    """
    global _KEYBERT_MODEL
    if _KEYBERT_MODEL is None:
        _KEYBERT_MODEL = KeyBERT(r'all-mpnet-base-v2')
    return _KEYBERT_MODEL.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 1),
        stop_words=None,
        top_n=top_n,
    )

# Concatenate all cluster-0 abstracts into one document, separated by spaces.
cluster0_abstracts = cluster0['摘要(译)(English)'].tolist()
merged_text = ' '.join(cluster0_abstracts)

# Ten keywords for the merged document.
keywords = get_top_n_words(merged_text, top_n=10)
keywords

In [17]:
# Split df3 by cluster label (0, 1, 2) ...
cluster0, cluster1, cluster2 = (df3[df3['cluster'] == label] for label in range(3))

# ... and join each cluster's abstracts into one space-separated string.
c0, c1, c2 = (
    ' '.join(subset['摘要(译)(English)'].tolist())
    for subset in (cluster0, cluster1, cluster2)
)

In [18]:
# Ten keywords per cluster, collected in cluster order.
df4 = [get_top_n_words(joined_text, 10) for joined_text in (c0, c1, c2)]
common_words = df4[-1]

# Transpose so that each cluster becomes a column.
df5 = pd.DataFrame(df4).T
df5.columns = ['cluster0', 'cluster1', 'cluster2']
df5

In [19]:
# Write the per-cluster keyword table to Excel, leaving out the row index.
df5.to_excel(excel_writer='result_2021-2024.xlsx', index=False)