# Calculation of coherence for Figure 2d

In [1]:
import pandas as pd
import json

In [2]:
import nltk
# Fetch the Punkt tokenizer data used by word_tokenize below.
# NOTE(review): newer NLTK releases reportedly look up 'punkt_tab' instead of
# 'punkt' — confirm against the installed NLTK version.
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# gensim provides the dictionary and the coherence metric used in sections 2 and 3.
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

[nltk_data] Downloading package punkt to /Users/yoshi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Root prefix for all inputs and outputs of this notebook. It is joined to
# 'data/...' and 'models/...' by plain string concatenation, so it must end
# with a path separator (or be empty for the current directory).
path = '***'

# 1. Data loading

In [4]:
# Read the bibliographic records (one row per article) into a DataFrame.
csv_path = path + 'data/Polymer2.csv'
df = pd.read_csv(csv_path)

# Show the (truncated) frame as the cell output for a quick sanity check.
df

Unnamed: 0.1,Unnamed: 0,Reference Type,Year,Title,Author,Journal,Abstract,Times Cited,ISSN
0,0,Journal Article,1990,"Arylidene Polymers .5. Synthesis, Characteriza...","Abdalla, M. A.",Journal of Macromolecular Science-Chemistry,"A new [(2-oxo-1,3-cyclopentanediylidene)bis(me...",,0022-233x
1,0,Journal Article,1990,"Apatite Coating on Ceramics, Metals and Polyme...","Abe, Y., Kokubo, T. and Yamamuro, T.",Journal of Materials Science-Materials in Medi...,A novel method of apatite coating is presented...,14.0,0957-4530
2,0,Journal Article,1990,New Aspects of Polymer Drugs,"Akashi, M. and Takemoto, K.",Advances in Polymer Science,This article reviews results on the studies of...,9.0,0065-3195
3,0,Journal Article,1990,Polymer Gels in Uniaxial Deformation,"Alexander, S. and Rabin, Y.",Journal of Physics-Condensed Matter,The recent analysis of osmotic pressure in a s...,1.0,0953-8984
4,0,Journal Article,1990,Glass-Transition in Polymers - Freezing of Rot...,"Allegra, G., Bignotti, F., Gargani, L. and Co...",Macromolecules,A nonlattice model based on space-filling crit...,,0024-9297
...,...,...,...,...,...,...,...,...,...
275733,0,Journal Article,2024,Corona: An effective polarization strategy of ...,"Yi, J., Ye, Z. W., Zhang, S. X., Song, Y. H.,...",Applied Energy,Piezoelectric nanogenerators (PENGs) are mushr...,,0306-2619
275734,0,Journal Article,2024,Electrospray deposited plant-based polymer nan...,"Yontar, A. K. and Çevik, S.",Progress in Organic Coatings,The present study was designed to apply the El...,,0300-9440
275735,0,Journal Article,2024,Shape- and polymer-considered simulation to un...,"Zeng, Y. C., Wang, H., Liang, D. F., Yuan, W....",Journal of Hazardous Materials,Environmental microplastics (MPs) constitute v...,,0304-3894
275736,0,Journal Article,2024,"High-efficient Ag(I) ion binding, Ag(0) nanopa...","Zhang, J., Pu, N., Li, M. L., Sang, W. H., He...",Separation and Purification Technology,Interest in functional silver has rapidly grow...,,1383-5866


In [5]:
# Preview the title column that section 2 tokenizes.
df['Title']

0         Arylidene Polymers .5. Synthesis, Characteriza...
1         Apatite Coating on Ceramics, Metals and Polyme...
2                              New Aspects of Polymer Drugs
3                      Polymer Gels in Uniaxial Deformation
4         Glass-Transition in Polymers - Freezing of Rot...
                                ...                        
275733    Corona: An effective polarization strategy of ...
275734    Electrospray deposited plant-based polymer nan...
275735    Shape- and polymer-considered simulation to un...
275736    High-efficient Ag(I) ion binding, Ag(0) nanopa...
275737    An interpenetrating polymer networks based on ...
Name: Title, Length: 275738, dtype: object

# 2. Calculation of coherence for titles

In [15]:
# Coherence (c_v) of the title-level topic words for each embedding model.
#
# Titles are lower-cased and tokenized with NLTK; the token lists are used both
# to build the gensim dictionary and as the reference texts for the metric.
# Missing titles (NaN after read_csv) are dropped and remaining values coerced
# to str, because calling .lower() on a non-string entry raises AttributeError.
title_texts = df['Title'].dropna().astype(str)
tokenized_docs = [word_tokenize(doc.lower()) for doc in title_texts]
dictionary = Dictionary(tokenized_docs)

model_name = ["BERT", "SciBERT", "MatSciBERT"]
coherence_list = []

# Loop-invariant: token -> id mapping used for the vocabulary membership test.
known_tokens = dictionary.token2id

for name in model_name:
    # Topic words of this model, stored as JSON (file has no extension).
    with open(path + "models/title/topic_words_" + name + "_title", "r", encoding="utf-8") as f:
        topic_words = json.load(f)

    # Keep only list-typed topics and, within each, only the words that are
    # present in the dictionary built from the titles.
    filtered_topic_words = [
        [word for word in topic if word in known_tokens]
        for topic in topic_words
        if isinstance(topic, list)
    ]

    # Discard topics left with no words after the vocabulary filter.
    filtered_topic_words = [topic for topic in filtered_topic_words if topic]

    coherence_model = CoherenceModel(
        topics=filtered_topic_words,
        texts=tokenized_docs,
        dictionary=dictionary,
        coherence='c_v'
    )

    score = coherence_model.get_coherence()
    print(f"Coherence score for {name}: {score}")
    coherence_list.append((name, score))

# Persist one row per model: (Model, Coherence).
pd.DataFrame(coherence_list, columns=["Model", "Coherence"]).to_csv(path + "models/title/coherence_title.csv", index=False)

Coherence score for BERT: 0.4402981375759279
Coherence score for SciBERT: 0.4606004303810137
Coherence score for MatSciBERT: 0.47871349070873087


# 3. Calculation of coherence for abstracts

In [17]:
# Coherence (c_v) of the abstract-level topic words for each embedding model.
#
# Abstracts are lower-cased and tokenized with NLTK; the token lists are used
# both to build the gensim dictionary and as the reference texts for the metric.
# Records without an abstract (NaN after read_csv) are dropped and remaining
# values coerced to str, because calling .lower() on a non-string entry raises
# AttributeError.
abstract_texts = df['Abstract'].dropna().astype(str)
tokenized_docs = [word_tokenize(doc.lower()) for doc in abstract_texts]
dictionary = Dictionary(tokenized_docs)

model_name = ["BERT", "SciBERT", "MatSciBERT"]
coherence_list = []

# Loop-invariant: token -> id mapping used for the vocabulary membership test.
known_tokens = dictionary.token2id

for name in model_name:
    # Topic words of this model, stored as JSON (file has no extension).
    with open(path + "models/abstract/topic_words_" + name + "_abstract", "r", encoding="utf-8") as f:
        topic_words = json.load(f)

    # Keep only list-typed topics and, within each, only the words that are
    # present in the dictionary built from the abstracts.
    filtered_topic_words = [
        [word for word in topic if word in known_tokens]
        for topic in topic_words
        if isinstance(topic, list)
    ]

    # Discard topics left with no words after the vocabulary filter.
    filtered_topic_words = [topic for topic in filtered_topic_words if topic]

    coherence_model = CoherenceModel(
        topics=filtered_topic_words,
        texts=tokenized_docs,
        dictionary=dictionary,
        coherence='c_v'
    )

    score = coherence_model.get_coherence()
    print(f"Coherence score for {name}: {score}")
    coherence_list.append((name, score))

# Persist one row per model: (Model, Coherence).
pd.DataFrame(coherence_list, columns=["Model", "Coherence"]).to_csv(path + "models/abstract/coherence_abstract.csv", index=False)

Coherence score for BERT: 0.4560397628500703
Coherence score for SciBERT: 0.5333310594604677
Coherence score for MatSciBERT: 0.54881815539565
