In [None]:
import pandas as pd
import numpy as np
import nltk
# Download the NLTK resources this notebook relies on (stop-word list,
# WordNet and its omw-1.4 companion data) before anything touches them.
for nltk_resource in ('stopwords', 'omw-1.4', 'wordnet'):
    nltk.download(nltk_resource)
# Shared WordNet lemmatizer, used by the preprocessing cell further down.
wn = nltk.WordNetLemmatizer()
from bertopic import BERTopic
from umap import UMAP
from dataclasses import asdict

from my_scientific_profile.database.papers import load_all_papers_from_s3
from my_scientific_profile.web_app.extensions import s3_client, S3_BUCKET

In [None]:
# Load every paper via the project helper, using the web app's configured
# S3 client and bucket (both imported from my_scientific_profile above).
papers = load_all_papers_from_s3(s3_client=s3_client, s3_bucket=S3_BUCKET)

In [None]:
# Flatten each paper into a plain dict, then into one row of a DataFrame.
# json_normalize is documented for a dict or a list of dicts, so the records
# are materialised as a list rather than handed over as a generator.
paper_records = [asdict(paper) for paper in papers]
df = pd.json_normalize(paper_records)
df.head()

In [None]:
# Non-null counts and dtypes for the three columns used downstream; this
# shows how many papers are missing an abstract.
df[["doi", "title", "abstract"]].info()

In [None]:
# Papers that have no abstract: select rows and columns in a single .loc call.
df.loc[df.abstract.isna(), ["title", "abstract"]]

In [None]:
# Keep only papers that have an abstract. reset_index() renumbers the rows
# and retains the original row labels in an "index" column.
has_abstract = df.abstract.notna()
df_clean = df.loc[has_abstract].reset_index()

In [None]:
# English stop-word list from the NLTK corpus downloaded in the first cell;
# the cell output is the number of entries.
stopwords = nltk.corpus.stopwords.words('english')
len(stopwords)

In [None]:
# Membership tests run once per token, so look words up in a set (O(1))
# instead of scanning the stop-word list each time. Built locally so this
# cell works whatever container type `stopwords` currently is.
stopword_set = set(stopwords)


def remove_stopwords(text: str) -> str:
    """Return `text` without tokens whose lowercase form is a stop word.

    Tokens are split on whitespace only, so punctuation stays attached to
    the neighbouring word.
    """
    return ' '.join(tok for tok in text.split() if tok.lower() not in stopword_set)


def lemmatize_tokens(text: str) -> str:
    """Lemmatize every whitespace-separated token of `text` with WordNet.

    Tokens that are themselves (case-sensitively) stop words are skipped.
    """
    return ' '.join(wn.lemmatize(tok) for tok in text.split() if tok not in stopword_set)


df_clean["abstract_without_stopwords"] = df_clean['abstract'].apply(remove_stopwords)
df_clean["abstract_lemmatized"] = df_clean['abstract_without_stopwords'].apply(lemmatize_tokens)

In [None]:
# Dimensionality reduction that feeds BERTopic's clustering step.
# NOTE(review): n_neighbors=2 and min_topic_size=2 are very small values,
# presumably chosen for a small corpus of abstracts — confirm before reuse.
umap_model = UMAP(
    n_neighbors=2,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    random_state=100,  # fixed seed so the projection is repeatable
)
topic_model = BERTopic(
    umap_model=umap_model,
    language="english",
    calculate_probabilities=True,
    min_topic_size=2,
)
# BERTopic documents its input as a list of strings, so pass a plain list
# instead of a pandas Series (order is unchanged, so topics still line up
# with the rows of df_clean).
documents = df_clean['abstract_lemmatized'].tolist()
topics, probabilities = topic_model.fit_transform(documents)

In [None]:
# Summary table of the fitted topics. Per BERTopic's docs, topic id -1
# collects documents treated as outliers.
topic_model.get_topic_info()

In [None]:
# Representative terms (with their scores) for topic 0.
topic_model.get_topic(0)

In [None]:
# Representative terms (with their scores) for topic 8.
# NOTE(review): hard-coded topic id — only meaningful if the fitted model
# actually produced a topic 8; check against get_topic_info().
topic_model.get_topic(8)

In [None]:
# Bar charts of the top terms, limited to 12 topics.
topic_model.visualize_barchart(top_n_topics=12)

In [None]:
# Interactive 2-D map of the topics (BERTopic's intertopic distance view).
topic_model.visualize_topics()

In [None]:
# Hierarchical clustering view showing how the topics relate to one another.
topic_model.visualize_hierarchy()

In [None]:
# Attach the fitted topic id of each document to its row. The model was
# fitted on df_clean's abstracts in row order, so positions line up.
df_clean["topic"] = topic_model.topics_

In [None]:
# Paper titles ordered by assigned topic id, to eyeball what each topic groups.
df_clean.loc[:, ["title", "topic"]].sort_values(by="topic")