In [27]:
import pandas as pd 
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

from sklearn.datasets import fetch_20newsgroups

df = pd.read_excel("savedrecs.xls")

print(df)

    Publication Type  Authors  Book Authors  Book Editors  Book Group Authors   
0                  J      NaN           NaN           NaN                 NaN  \
1                  C      NaN           NaN           NaN                 NaN   
2                  C      NaN           NaN           NaN                 NaN   
3                  C      NaN           NaN           NaN                 NaN   
4                  C      NaN           NaN           NaN                 NaN   
..               ...      ...           ...           ...                 ...   
695                C      NaN           NaN           NaN                 NaN   
696                J      NaN           NaN           NaN                 NaN   
697                J      NaN           NaN           NaN                 NaN   
698                C      NaN           NaN           NaN                 NaN   
699                C      NaN           NaN           NaN                 NaN   

     Author Full Names  Boo

In [28]:
df = df[["Article Title","Abstract"]]

In [29]:
docs = df["Abstract"].astype(str).tolist()
docs

['Maritime vessels equipped with visible and infrared cameras can complement other conventional sensors for object detection. However, application of computer vision techniques in maritime domain received attention only recently. The maritime environment offers its own unique requirements and challenges. Assessment of the quality of detections is a fundamental need in computer vision. However, the conventional assessment metrics suitable for usual object detection are deficient in the maritime setting. Thus, a large body of related work in computer vision appears inapplicable to the maritime setting at the first sight. We discuss the problem of defining assessment metrics suitable for maritime computer vision. We consider new bottom edge proximity metrics as assessment metrics for maritime computer vision. These metrics indicate that existing computer vision approaches are indeed promising for maritime computer vision and can play a foundational role in the emerging field of maritime c

In [30]:
type(docs)

list

In [31]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer()

In [32]:
model = BERTopic(
  embedding_model=embedding_model,        
  umap_model=umap_model,               
  hdbscan_model=hdbscan_model,              
  vectorizer_model=vectorizer_model, 
  ctfidf_model=ctfidf_model,
  nr_topics=50,
  n_gram_range=(1,2)       
)


In [33]:
topics, probabilities = model.fit_transform(docs)

In [34]:
topic_df = model.get_topic_info()
topic_df

Unnamed: 0,Topic,Count,Name
0,0,129,0_nan___
1,1,571,1_vision_computer_based_paper


In [40]:
get_topic = model.get_topic
get_topic(1)

[('vision', 0.09829921336888292),
 ('computer', 0.08904437265725755),
 ('based', 0.040366597993864724),
 ('paper', 0.03822021659227216),
 ('image', 0.035602446182882476),
 ('using', 0.0302605592985919),
 ('detection', 0.025884768266297212),
 ('algorithms', 0.02492948497032062),
 ('used', 0.02452760440652543),
 ('processing', 0.02355347387934571)]

In [39]:
zero=[]
number=[]
for i in range(len(topic_df)):
  zero.append('0')
  number.append(i-1)

topic_word_df=pd.DataFrame({
    'topic_num':number,
    'topic':topic_df['Name'],
    'w1':zero,
    'w2':zero,
    'w3':zero,
    'w4':zero,
    'w5':zero,
    'w6':zero,
    'w7':zero,
    'w8':zero,
    'w9':zero,
    'w10':zero
})

#각 토픽별 10개 단어 정리
col = topic_word_df.columns
for i in range(len(topic_word_df)):
  for j in range(10):
    topic_word_df[col[j+2]][i]=get_topic(i-1)[j][0]

TypeError: 'bool' object is not subscriptable