In [None]:
import pandas as pd
import numpy as np
import nltk
# Download the NLTK resources this notebook relies on before anything uses them.
for nltk_resource in ("stopwords", "omw-1.4", "wordnet"):
    nltk.download(nltk_resource)

# Single lemmatizer instance, reused when the abstracts are normalised.
wn = nltk.WordNetLemmatizer()
from bertopic import BERTopic
from umap import UMAP
from dataclasses import asdict
import plotly.express as px
import plotly.graph_objects as go
import networkx as nx
from bokeh.io import show, output_notebook
from bokeh.models import (
    BoxZoomTool, Circle, HoverTool,
    Legend,
    MultiLine, Plot, Range1d, ResetTool,
    NodesAndLinkedEdges,
    OpenURL, TapTool,
)
from bokeh.palettes import Spectral4
from bokeh.plotting import figure, from_networkx
from bokeh.transform import linear_cmap
import matplotlib.pyplot as plt


from my_scientific_profile.database.papers import load_all_papers_from_s3
from my_scientific_profile.web_app.extensions import s3_client, S3_BUCKET

In [None]:
# Fetch every stored paper through the project's S3 loader.
papers = load_all_papers_from_s3(
    s3_client=s3_client,
    s3_bucket=S3_BUCKET,
)

In [None]:
# Convert each paper dataclass to a dict and flatten it into one row;
# nested fields end up as dotted column names (e.g. "journal.name").
paper_records = [asdict(paper) for paper in papers]
df = pd.json_normalize(paper_records)
df.head()

In [None]:
# Null counts and dtypes for the three text columns the analysis depends on.
df.loc[:, ["doi", "title", "abstract"]].info()

In [None]:
# Papers without an abstract: these cannot be fed to the topic model.
df.loc[df["abstract"].isna(), ["title", "abstract"]]

In [None]:
# Keep only papers that have an abstract. reset_index() (without drop) stores the
# original row labels in an "index" column, which later cells use as the paper id.
df_clean = df.loc[df["abstract"].notna()].reset_index()

In [None]:
# English stopword list from the NLTK corpus; the length is displayed as a quick check.
stopwords = nltk.corpus.stopwords.words('english')
len(stopwords)

In [None]:
# `stopwords` is a list, so each `in` test scans it linearly; a set gives the
# same membership answers in constant time per token.
stopword_set = set(stopwords)


def _drop_stopwords(text):
    """Return `text` without the whitespace-separated tokens whose lowercase form is a stopword."""
    return " ".join(
        token for token in text.split() if token.lower() not in stopword_set
    )


def _lemmatize_tokens(text):
    """Return `text` with every remaining whitespace-separated token lemmatized via `wn`.

    Tokens that are (case-sensitively) stopwords are skipped, mirroring the
    filter that was applied here before.
    """
    return " ".join(
        wn.lemmatize(token) for token in text.split() if token not in stopword_set
    )


df_clean["abstract_without_stopwords"] = df_clean["abstract"].apply(_drop_stopwords)
df_clean["abstract_lemmatized"] = df_clean["abstract_without_stopwords"].apply(_lemmatize_tokens)

In [None]:
# UMAP reducer handed to BERTopic below; random_state is fixed so repeated runs
# start from the same seed.
umap_model = UMAP(
    n_components=2,
    n_neighbors=2,
    metric="euclidean",
    min_dist=0.0,
    random_state=100,
)

In [None]:
# Configure the topic model with the custom UMAP reducer, then fit it on the
# lemmatized abstracts. fit() hands back the fitted model itself.
topic_model = BERTopic(
    top_n_words=10,
    min_topic_size=2,
    umap_model=umap_model,
)
topic_model = topic_model.fit(df_clean["abstract_lemmatized"])

In [None]:
# Tabular overview of the topics found by the fitted model.
topic_model.get_topic_info()

In [None]:
from sentence_transformers import SentenceTransformer

# Embed the same lemmatized abstracts the topic model was fitted on.
# encode() is given a plain list of strings: handing it a pandas Series only
# works while the Series happens to carry a 0..n-1 integer index.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(df_clean["abstract_lemmatized"].tolist())

In [None]:
# Build short " | "-separated labels from each topic's top three words and
# register them on the model as its custom labels.
label_options = {
    "nr_words": 3,
    "topic_prefix": False,
    "word_length": 15,
    "separator": " | ",
}
topic_labels = topic_model.generate_topic_labels(**label_options)
topic_model.set_topic_labels(topic_labels)

In [None]:
# Document map whose hover text is the paper title.
title_figure = topic_model.visualize_documents(
    docs=df_clean["title"],
    embeddings=embeddings,
    hide_annotations=False,
    custom_labels=True,
    title="Literature graph",
)
# update_traces() returns the figure, so it is still the cell's displayed value.
title_figure.update_traces(marker_size=20)

In [None]:
# Same document map, but with the paper's original row label as hover text so
# the plotted coordinates can be joined back to df_clean afterwards.
plotly_obj = topic_model.visualize_documents(
    docs=df_clean["index"],
    embeddings=embeddings,
    hide_annotations=False,
    custom_labels=True,
    title="Literature graph",
)
plotly_obj.update_traces(marker_size=20)
plotly_obj

In [None]:
# The topic column is required below; assigning it here keeps this cell
# runnable on a fresh kernel instead of depending on a later cell.
df_clean["topic"] = topic_model.topics_

# Each plotly trace carries the hover text (the paper's original row label,
# because docs=df_clean["index"] was plotted) alongside its x/y coordinates.
df_coord = pd.json_normalize(
    [
        {"paper_id": int(index), "x": x, "y": y}
        for trace in plotly_obj.data
        for index, x, y in zip(trace["hovertext"], trace["x"], trace["y"])
        if not np.isnan(index)
    ]
)

# paper_id holds labels from the "index" column, not row positions in df_clean
# (the two differ as soon as a paper without abstract was dropped), so look the
# rows up by label rather than with iloc.
paper_rows = df_clean.set_index("index").loc[df_coord["paper_id"].to_numpy()]

# NOTE(review): generate_topic_labels returns one label per topic in ascending
# topic-id order, so when an outlier topic (-1) exists the list position is not
# the topic id. Map ids to labels explicitly - confirm against the BERTopic docs.
topic_name_by_id = dict(zip(sorted(set(topic_model.topics_)), topic_labels))

df_coord["topic"] = paper_rows["topic"].to_numpy()
df_coord["topic_name"] = df_coord["topic"].map(topic_name_by_id)
# Titles longer than 50 characters are cut and suffixed with "..." for the tooltip.
df_coord["title"] = [
    title[:50] + "..." if len(title) > 50 else title
    for title in paper_rows["title"]
]
df_coord["doi"] = paper_rows["doi"].to_numpy()
df_coord["journal"] = paper_rows["journal.name"].to_numpy()
df_coord["year"] = paper_rows["year"].to_numpy()
df_coord.head()

In [None]:
# Interactive scatter of the document map: one marker per paper, coloured and
# grouped in the legend by topic, with paper metadata in the hover tooltip.
scatter_tooltips = [
    ("Topic", "@topic_name"),
    ("Title", "@title"),
    ("Ref", "@journal (@year)"),
    ("DOI", "@doi"),
]
plot = figure(
    width=800,
    height=800,
    title="Literature graph",
    x_axis_location=None,
    y_axis_location=None,
    tooltips=scatter_tooltips,
)
# Add the legend below the plot area *before* drawing, so the legend entries
# created by scatter() land in it rather than inside the plot.
plot.add_layout(Legend(), 'below')
plot.scatter(
    x="x",
    y="y",
    source=df_coord,
    size=20,
    alpha=0.7,
    line_width=0,
    legend_field="topic_name",
    fill_color=linear_cmap("topic", "Turbo256", 0, len(topic_labels)),
)
output_notebook()
show(plot)

In [None]:
# Attach the fitted topic id of every abstract (same row order as the fit input).
df_clean = df_clean.assign(topic=topic_model.topics_)
df_clean.head()

## NetworkX & Bokeh

In [None]:
# Paper nodes are numbered paper_offset + <original row label> so they do not
# collide with the topic nodes (assumes fewer than paper_offset topics).
paper_offset = 100

# NOTE(review): generate_topic_labels returns one label per topic in ascending
# topic-id order; with an outlier topic (-1) the list position therefore differs
# from the topic id. Resolve both label and colour position through the id.
topic_ids = sorted(set(topic_model.topics_))
topic_position = {topic_id: position for position, topic_id in enumerate(topic_ids)}
topic_label_by_id = dict(zip(topic_ids, topic_labels))

# One attribute dict per topic node; "id" is the normalised position that the
# colour mapper reads.
topic_attrs = {
    topic_id: {
        "topic": f"Topic: {label}",
        "id": (topic_position[topic_id] + 1) / len(topic_labels),
    }
    for topic_id, label in topic_label_by_id.items()
}
# Central hub node that every topic is attached to; it has no colour id.
topic_attrs = {
    **topic_attrs,
    len(topic_labels): {"key": "", "id": np.nan}
}

# One attribute dict per paper node. The key uses the "index" column (not the
# enumerate position) because that is how the graph cell numbers paper nodes.
paper_attrs = {
    paper_offset + paper_index: {
        "topic": topic_label_by_id[top],
        "title": f"{t[:40] + '...' if len(t) >40 else t}",
        "id": (topic_position[top] + 1) / len(topic_labels),
        "doi": doi,
    }
    for paper_index, t, top, doi in zip(
        df_clean["index"], df_clean["title"], df_clean["topic"], df_clean["doi"]
    )
}
attrs = {**topic_attrs, **paper_attrs}

In [None]:
# Graph layout: hub node -> topic nodes -> paper nodes.
topic_node_ids = list(topic_model.topic_labels_.keys())
hub_node = len(topic_labels)

G = nx.Graph()
G.add_nodes_from(topic_node_ids)
G.add_node(hub_node)
# Attach the hub to the topic ids that actually exist. Using range(len(...))
# would add a stray node and leave the outlier topic (-1) detached whenever
# the model produced one.
G.add_edges_from((hub_node, topic_id) for topic_id in topic_node_ids)

# Paper nodes are numbered by their original row label plus paper_offset.
G.add_nodes_from(df_clean["index"] + paper_offset)
nx.set_node_attributes(G, attrs)
G.add_edges_from(
    (paper_index + paper_offset, topic_id)
    for paper_index, topic_id in df_clean[["index", "topic"]].to_numpy()
)

In [None]:
# Interactive rendering of the hub/topic/paper graph with hover and selection
# highlighting of a node together with its linked edges.
graph_tooltips = [
    ("Topic", "@topic"),
    ("Title", "@title"),
    ("doi", "@doi"),
]
plot = figure(
    title="Graph Interaction Demo",
    width=500,
    height=500,
    x_range=(-1.2, 1.2),
    y_range=(-1.2, 1.2),
    x_axis_location=None,
    y_axis_location=None,
    tooltips=graph_tooltips,
)
plot.grid.grid_line_color = None

# Node positions come from networkx's spring layout, centred on the origin.
graph_renderer = from_networkx(G, nx.spring_layout, scale=1.0, center=(0, 0))

# Nodes: coloured by the normalised "id" attribute; distinct colours on
# selection and hover.
node_renderer = graph_renderer.node_renderer
node_renderer.glyph = Circle(
    size=15,
    fill_color=linear_cmap("id", "Set3_12", 1 / len(topic_labels), 1.0),
)
node_renderer.selection_glyph = Circle(size=15, fill_color=Spectral4[2])
node_renderer.hover_glyph = Circle(size=15, fill_color=Spectral4[1])

# Edges: light grey by default, highlighted with the matching node colours.
edge_renderer = graph_renderer.edge_renderer
edge_renderer.glyph = MultiLine(
    line_color="#CCCCCC",
    line_alpha=0.5,
    line_width=5.0,
)
edge_renderer.selection_glyph = MultiLine(line_color=Spectral4[2], line_width=5)
edge_renderer.hover_glyph = MultiLine(line_color=Spectral4[1], line_width=5)

# Selecting or hovering a node also activates the edges attached to it.
graph_renderer.selection_policy = NodesAndLinkedEdges()
graph_renderer.inspection_policy = NodesAndLinkedEdges()

plot.renderers.append(graph_renderer)

output_notebook()
show(plot)

## Wordcloud

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def _show_wordcloud(frequencies):
    """Draw a word -> weight mapping as a white-background word cloud and show it."""
    wc = WordCloud(background_color="white", max_words=1000)
    wc.generate_from_frequencies(frequencies)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.show()


def create_wordcloud(model, topic):
    """Show a word cloud for a single topic.

    Args:
        model: object exposing ``get_topic(topic)``, which yields
            ``(word, value)`` pairs (here the fitted topic model).
        topic: identifier of the topic to display.
    """
    _show_wordcloud(dict(model.get_topic(topic)))


def create_one_wordcloud(model):
    """Show one word cloud built from the words of all topics.

    Args:
        model: object exposing ``get_topics()``, a mapping whose values are
            iterables of ``(word, value)`` pairs.

    A word that appears in several topics keeps the value of the last topic
    iterated, since later entries overwrite earlier ones in the mapping.
    """
    frequencies = {
        word: value
        for entry in model.get_topics().values()
        for word, value in entry
    }
    _show_wordcloud(frequencies)


# Show wordcloud
create_wordcloud(topic_model, topic=0)

In [None]:
# Word cloud combining the words of every topic.
create_one_wordcloud(model=topic_model)