In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
wn = nltk.WordNetLemmatizer()
from bertopic import BERTopic
from umap import UMAP
from dataclasses import asdict
import plotly.express as px
import plotly.graph_objects as go
import networkx as nx
from bokeh.io import show, output_notebook
from bokeh.models import (
    BoxZoomTool, Circle, HoverTool,
    MultiLine, Plot, Range1d, ResetTool,
    NodesAndLinkedEdges,
    OpenURL, TapTool,
)
from bokeh.palettes import Spectral4
from bokeh.plotting import figure, from_networkx
from bokeh.transform import linear_cmap
import matplotlib.pyplot as plt


from my_scientific_profile.database.papers import load_all_papers_from_s3
from my_scientific_profile.database.aws_s3 import s3_client, S3_BUCKET

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tbereau/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/tbereau/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/tbereau/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [2]:
papers = load_all_papers_from_s3(s3_client=s3_client, s3_bucket=S3_BUCKET)

In [3]:
df = pd.json_normalize(asdict(p) for p in papers)
df.head()

Unnamed: 0,doi,title,publication_date,authors,citation_count,bib_entry,abstract,tldr,year,journal.name,...,journal.pages,journal.volume,open_access.is_open_access,open_access.open_access_status,open_access.landing_page_url,open_access.pdf_url,embedding.x,embedding.y,embedding.topic_number,embedding.topic_name
0,10.1021/acs.jctc.3c00201,Condensed-Phase Molecular Representation to Li...,2023-07-03 12:50:55,"[{'given': 'Bernadette', 'family': 'Mohr', 'af...",0,"@article{Mohr_2023,\n\tdoi = {10.1021/acs.jctc...",Molecular design requires systematic and broad...,,2023,Journal of Chemical Theory and Computation,...,,,True,hybrid,https://doi.org/10.1021/acs.jctc.3c00201,,0.249262,10.144167,1,| |
1,10.1039/D3BM00412K,Inverse design of viral infectivity-enhancing ...,2023-06-15 11:01:18,"[{'given': 'Kübra', 'family': 'Kaygisiz', 'aff...",0,"@article{Kaygisiz_2023,\n\tdoi = {10.1039/d3bm...",Amyloid-like nanofibers from self-assembling p...,These de novo sequences are the shortest activ...,2023,Biomaterials Science,...,,,True,hybrid,https://doi.org/10.1039/d3bm00412k,https://pubs.rsc.org/en/content/articlepdf/202...,-0.143314,10.587479,3,| |
2,10.1021/acsfoodscitech.2c00251,Identifying Sequential Residue Patterns in Bit...,2022-11-09 14:00:22,"[{'given': 'Arghya', 'family': 'Dutta', 'affil...",1,"@article{Dutta_2022,\n\tdoi = {10.1021/acsfood...","The primary structures of peptides, originatin...",This work proposes a method that coarse-grains...,2022,ACS Food Science &amp; Technology,...,1773-1780,2.0,True,hybrid,https://doi.org/10.1021/acsfoodscitech.2c00251,https://pubs.acs.org/doi/pdf/10.1021/acsfoodsc...,2.160865,10.472014,8,mtp | pc | electrostatics
3,10.1063/5.0104914,Broad chemical transferability in structure-ba...,2022-08-12 11:51:35,"[{'given': 'Kiran', 'family': 'Kanekal', 'affi...",2,"@article{Kanekal_2022,\n\tdoi = {10.1063/5.010...",Compared to top-down coarse-grained (CG) model...,,2022,The Journal of Chemical Physics,...,104102,157.0,True,hybrid,https://doi.org/10.1063/5.0104914,https://aip.scitation.org/doi/pdf/10.1063/5.01...,-0.313834,9.078019,9,peptide | umami | peptides
4,10.3389/fchem.2022.982757,Benchmarking coarse-grained models of organic ...,2022-09-09 05:40:47,"[{'given': 'Marc', 'family': 'Stieffenhofer', ...",2,"@article{Stieffenhofer_2022,\n\tdoi = {10.3389...",The potential of mean force is an effective co...,The reintroduced details enable force computat...,2022,Frontiers in Chemistry,...,,10.0,True,gold,https://doi.org/10.3389/fchem.2022.982757,https://www.frontiersin.org/articles/10.3389/f...,-0.35733,8.789751,9,peptide | umami | peptides


In [None]:
df.to_json("all_papers.json")
# df.to_csv("all_papers.csv")

In [4]:
df[["doi", "title", "abstract"]].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73 entries, 0 to 72
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   doi       73 non-null     object
 1   title     73 non-null     object
 2   abstract  58 non-null     object
dtypes: object(3)
memory usage: 1.8+ KB


In [5]:
df.loc[df.abstract.isna()][["title", "abstract"]]

Unnamed: 0,title,abstract
5,FAIR data enabling new horizons for materials ...,
8,Induced asymmetries in membranes,
10,Publisher’s Note: “Data-driven equation for dr...,
13,Computer simulations of lipid regulation by mo...,
14,Finite-size transitions in complex membranes,
27,Molecular dynamics trajectories for 630 coarse...,
35,Hoobas: A highly object-oriented builder for m...,
43,Efficient potential of mean force calculation ...,
48,An in-silico walker,
49,Concurrent parametrization against static and ...,


In [6]:
df_clean = df.loc[~df.abstract.isna()].reset_index()

In [7]:
stopwords = nltk.corpus.stopwords.words('english')
len(stopwords)

179

In [8]:
df_clean["abstract_without_stopwords"] = df_clean['abstract'].apply(lambda x: ' '.join([w for w in x.split() if w.lower() not in stopwords]))
df_clean["abstract_lemmatized"] = df_clean['abstract_without_stopwords'].apply(lambda x: ' '.join([wn.lemmatize(w) for w in x.split() if w not in stopwords]))

In [9]:
umap_model = UMAP(
    n_neighbors=2, 
    n_components=2, 
    min_dist=0.0, 
    metric='euclidean',
    random_state=100
)

In [10]:
topic_model = BERTopic(
    umap_model=umap_model,
    min_topic_size=2,
    top_n_words=10,
).fit(
    df_clean["abstract_lemmatized"], 
)

2023-07-04 18:26:44,740 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
2023-07-04 18:26:45,072 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device: cpu


In [11]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,8,-1_modeling_molecular_cg_structure,"[modeling, molecular, cg, structure, datadrive...",[Syndiotactic polystyrene (sPS) exhibit comple...
1,0,6,0_cardiolipin_permeability_coarsegrained_membrane,"[cardiolipin, permeability, coarsegrained, mem...",[Unraveling relation chemical structure small ...
2,1,5,1_sequence_dispersion_intermolecular_growth,"[sequence, dispersion, intermolecular, growth,...",[urgent need biomaterials support tissue heali...
3,2,5,2_state_reweighting_trajectory_dimensionality,"[state, reweighting, trajectory, dimensionalit...","[Discrete-space kinetic models, i.e., Markov s..."
4,3,5,3_model_transition_kinetic_simulation,"[model, transition, kinetic, simulation, struc...",[generic coarse-grained (CG) protein model pre...
5,4,5,4_mtp_pc_electrostatics_method,"[mtp, pc, electrostatics, method, electrostati...",[performance multipole (MTP) point charge (PC)...
6,5,4,5_peptide_umami_peptides_bitter,"[peptide, umami, peptides, bitter, pattern, re...",[Interfacial system core fascinating phenomeno...
7,6,4,6_model_cg_coarsegrained_correlation,"[model, cg, coarsegrained, correlation, struct...","[Compared top-down coarse-grained (CG) models,..."
8,7,4,7_ml_learning_space_chemical,"[ml, learning, space, chemical, threebody, pol...",[Designing advanced membrane material machine-...
9,8,3,8_time_ffcf_scale_simulations,"[time, ffcf, scale, simulations, infrared, sim...",[solvent dynamic around fluorinated acetonitri...


In [12]:
from sentence_transformers import SentenceTransformer

# Create embeddings from the documents
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(df_clean["abstract_lemmatized"])

2023-07-04 18:26:51,430 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2023-07-04 18:26:51,648 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device: cpu


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
topic_labels = topic_model.generate_topic_labels(
    nr_words=3,
    topic_prefix=False,
    word_length=15,
    separator=" | "
)
topic_model.set_topic_labels(topic_labels)

In [22]:
topic_model.visualize_documents(
    docs=df_clean["title"],  # df_clean["title"]
    embeddings=embeddings,
    hide_annotations=False,
    custom_labels=True,
    title="Literature graph",
).update_traces(marker_size=20)

In [23]:
plotly_obj = topic_model.visualize_documents(
    docs=df_clean["index"],  # df_clean["title"]
    embeddings=embeddings,
    hide_annotations=False,
    custom_labels=True,
    title="Literature graph",
).update_traces(marker_size=20)
plotly_obj

In [24]:
df_coord = pd.json_normalize([{"paper_id": int(index), "x":x, "y": y} for d in plotly_obj.data for index, x, y in zip(d["hovertext"], d["x"], d["y"]) if not np.isnan(index)])
df_coord["topic"] = df_clean.iloc[df_coord["paper_id"].values].topic.values
df_coord["topic_name"] = df_coord.apply(lambda x: topic_labels[int(x["topic"])], axis=1)
df_coord["title"] = df_coord.apply(lambda x: f"{df_clean.iloc[x['paper_id']].title[:50] + '...'}" if len(df_clean.iloc[x["paper_id"]].title) > 50 else df_clean.iloc[x["paper_id"]].title, axis=1)
df_coord["doi"] = df_clean.iloc[df_coord["paper_id"]].doi.values
df_coord["journal"] = df_clean.iloc[df_coord["paper_id"]]["journal.name"].values
df_coord["year"] = df_clean.iloc[df_coord["paper_id"]]["year"].values
df_coord.head()

IndexError: positional indexers are out-of-bounds

In [25]:
plot = figure(
    width=800, 
    height=800, 
    title="Literature graph", 
    x_axis_location=None, 
    y_axis_location=None, 
    tooltips=[
        ("Topic", "@topic_name"),
        ("Title", "@title"),
        ("Ref", "@journal (@year)"),
        ("DOI", "@doi"),
    ]
)
plot.add_layout(Legend(), 'below')
# plot.grid.grid_line_color = None
plot.scatter(
    x="x", 
    y="y", 
    source=df_coord, 
    size=20,
    alpha=0.7,
    line_width=0,
    legend_field="topic_name",
    fill_color=linear_cmap("topic", "Turbo256", 0, len(topic_labels))
)
output_notebook()
show(plot)

NameError: name 'Legend' is not defined

In [26]:
df_clean["topic"] = topic_model.topics_
df_clean.head()

Unnamed: 0,index,doi,title,publication_date,authors,citation_count,bib_entry,abstract,tldr,year,...,open_access.open_access_status,open_access.landing_page_url,open_access.pdf_url,embedding.x,embedding.y,embedding.topic_number,embedding.topic_name,abstract_without_stopwords,abstract_lemmatized,topic
0,0,10.1021/acs.jctc.3c00201,Condensed-Phase Molecular Representation to Li...,2023-07-03 12:50:55,"[{'given': 'Bernadette', 'family': 'Mohr', 'af...",0,"@article{Mohr_2023,\n\tdoi = {10.1021/acs.jctc...",Molecular design requires systematic and broad...,,2023,...,hybrid,https://doi.org/10.1021/acs.jctc.3c00201,,0.249262,10.144167,1,| |,Molecular design requires systematic broadly a...,Molecular design requires systematic broadly a...,0
1,1,10.1039/D3BM00412K,Inverse design of viral infectivity-enhancing ...,2023-06-15 11:01:18,"[{'given': 'Kübra', 'family': 'Kaygisiz', 'aff...",0,"@article{Kaygisiz_2023,\n\tdoi = {10.1039/d3bm...",Amyloid-like nanofibers from self-assembling p...,These de novo sequences are the shortest activ...,2023,...,hybrid,https://doi.org/10.1039/d3bm00412k,https://pubs.rsc.org/en/content/articlepdf/202...,-0.143314,10.587479,3,| |,Amyloid-like nanofibers self-assembling peptid...,Amyloid-like nanofibers self-assembling peptid...,1
2,2,10.1021/acsfoodscitech.2c00251,Identifying Sequential Residue Patterns in Bit...,2022-11-09 14:00:22,"[{'given': 'Arghya', 'family': 'Dutta', 'affil...",1,"@article{Dutta_2022,\n\tdoi = {10.1021/acsfood...","The primary structures of peptides, originatin...",This work proposes a method that coarse-grains...,2022,...,hybrid,https://doi.org/10.1021/acsfoodscitech.2c00251,https://pubs.acs.org/doi/pdf/10.1021/acsfoodsc...,2.160865,10.472014,8,mtp | pc | electrostatics,"primary structures peptides, originating food ...","primary structure peptides, originating food p...",5
3,3,10.1063/5.0104914,Broad chemical transferability in structure-ba...,2022-08-12 11:51:35,"[{'given': 'Kiran', 'family': 'Kanekal', 'affi...",2,"@article{Kanekal_2022,\n\tdoi = {10.1063/5.010...",Compared to top-down coarse-grained (CG) model...,,2022,...,hybrid,https://doi.org/10.1063/5.0104914,https://aip.scitation.org/doi/pdf/10.1063/5.01...,-0.313834,9.078019,9,peptide | umami | peptides,"Compared top-down coarse-grained (CG) models, ...","Compared top-down coarse-grained (CG) models, ...",6
4,4,10.3389/fchem.2022.982757,Benchmarking coarse-grained models of organic ...,2022-09-09 05:40:47,"[{'given': 'Marc', 'family': 'Stieffenhofer', ...",2,"@article{Stieffenhofer_2022,\n\tdoi = {10.3389...",The potential of mean force is an effective co...,The reintroduced details enable force computat...,2022,...,gold,https://doi.org/10.3389/fchem.2022.982757,https://www.frontiersin.org/articles/10.3389/f...,-0.35733,8.789751,9,peptide | umami | peptides,potential mean force effective coarse-grained ...,potential mean force effective coarse-grained ...,6


## Networkx & Bokeh

In [None]:
paper_offset = 100
topic_attrs = {
    k: {"topic": f"Topic: {v}", "id": (k+1) / len(topic_labels)} for k, v in enumerate(topic_labels)
}
topic_attrs = {
    **topic_attrs,
    len(topic_labels): {"key": "", "id": np.nan}
}
paper_attrs = {
    paper_offset + k: {
        "topic": topic_labels[top],
        "title": f"{t[:40] + '...' if len(t) >40 else t}",
        "id": (top+1) / len(topic_labels),
        "doi": doi,
    } for k, (t, top, doi) in enumerate(df_clean[["title", "topic", "doi"]].to_numpy())
}
attrs = {**topic_attrs, **paper_attrs}

In [None]:
G = nx.Graph()
G.add_nodes_from(topic_model.topic_labels_.keys())
G.add_node(len(topic_labels))
G.add_edges_from(
    [(len(topic_labels), i) for i in range(len(topic_labels))]
)
G.add_nodes_from(df_clean["index"] + paper_offset)
nx.set_node_attributes(G, attrs)
G.add_edges_from(
    [(i+paper_offset, t) for i, t in df_clean[["index", "topic"]].to_numpy()]
)

In [None]:
plot = figure(width=500, height=500, x_range=(-1.2, 1.2), y_range=(-1.2, 1.2),
              x_axis_location=None, y_axis_location=None, 
              title="Graph Interaction Demo", 
              tooltips=[
                  ("Topic", "@topic"),
                  ("Title", "@title"),
                  ("doi", "@doi"),
              ])
plot.grid.grid_line_color = None

graph_renderer = from_networkx(G, nx.spring_layout, scale=1., center=(0, 0))
graph_renderer.node_renderer.glyph = Circle(
    size=15, 
    fill_color=linear_cmap("id", "Set3_12", 1/len(topic_labels), 1.)
)
graph_renderer.node_renderer.selection_glyph = Circle(size=15, fill_color=Spectral4[2])
graph_renderer.node_renderer.hover_glyph = Circle(size=15, fill_color=Spectral4[1])

graph_renderer.edge_renderer.glyph = MultiLine(
    line_color="#CCCCCC", line_alpha=0.5, line_width=5.
)
graph_renderer.edge_renderer.selection_glyph = MultiLine(line_color=Spectral4[2], line_width=5)
graph_renderer.edge_renderer.hover_glyph = MultiLine(line_color=Spectral4[1], line_width=5)

graph_renderer.selection_policy = NodesAndLinkedEdges()
graph_renderer.inspection_policy = NodesAndLinkedEdges()

plot.renderers.append(graph_renderer)

output_notebook()
show(plot)

## Wordcloud

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def create_wordcloud(model, topic):
    text = {word: value for word, value in model.get_topic(topic)}
    wc = WordCloud(background_color="white", max_words=1000)
    wc.generate_from_frequencies(text)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.show()
    
def create_one_wordcloud(model):
    text = {word: value for entry in model.get_topics().values() for word, value in entry}
    wc = WordCloud(background_color="white", max_words=1000)
    wc.generate_from_frequencies(text)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.show()

# Show wordcloud
create_wordcloud(topic_model, topic=0)

In [None]:
create_one_wordcloud(topic_model)