In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# The keywords are read from the last column of kws.csv, selected by position.
keyword_column = pd.read_csv('kws.csv').iloc[:, -1]
kw_list = keyword_column.values.tolist()

# Preview the first five keywords.
kw_list[:5]

['meteorology',
 'periodic table',
 'weather forecasting',
 'earthquakes',
 'stars']

In [3]:
# Candidate category labels. Every keyword is assigned to exactly one of these.
# The position of a label in this list is the index used to look it up after
# classification, so the order must stay stable.
# Commented-out entries are labels that are currently excluded.
science_regions = [
    # --- formal / systems ---
    'Mathematics and Logic',
    'Systems science',
    'Data science',
    # 'Information science',

    # --- natural sciences ---
    'Physics',
    'Chemistry',
    # 'Biology',
    'Earth science',
    'Astronomy',

    # --- social sciences ---
    'Economics',
    'Political science',
    # 'Sociology',
    'Psychology',
    'Anthropology',
    # 'Statistics',
    # 'Computer science',

    # --- engineering / applied ---
    'Systems engineering',
    'Data engineering',
    'Information engineering',
    'Engineering',
    'Agricultural science',
    'Medicine',
    # 'Pharmacy',
    # 'Accounting',
    # 'Business administration',
    # 'Finance',
    # 'Jurisprudence',
    'Pedagogy',
]

In [4]:

# One checkpoint name drives both the tokenizer and the encoder so they cannot
# get out of sync. To try the general-purpose model instead, set MODEL_NAME to
# 'bert-base-uncased'.
MODEL_NAME = 'allenai/scibert_scivocab_uncased'

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)

def get_bert_embeddings(text_list, max_length=None):
    """Embed each text as the mean of its last-layer token vectors.

    Uses the module-level ``tokenizer`` and ``model``. Each text is encoded
    on its own (batch of one), so no padding tokens enter the average; the
    mean runs over every token the tokenizer emits for that text.

    Args:
        text_list: Iterable of strings to embed.
        max_length: Maximum number of tokens kept per text. ``None`` (the
            default) resolves to ``model.config.max_position_embeddings``.
            An explicit limit is required for ``truncation=True`` to take
            effect: without one the tokenizer warns and does not truncate.

    Returns:
        ``np.ndarray`` with one row per input text. For empty input the
        result has shape ``(0, model.config.hidden_size)`` so it can still be
        passed to pairwise-similarity routines.
    """
    if max_length is None:
        max_length = model.config.max_position_embeddings

    embeddings = []
    with torch.no_grad():  # inference only; skip autograd bookkeeping
        for text in text_list:
            inputs = tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=max_length,
            )
            outputs = model(**inputs)
            # last_hidden_state is indexed (batch, token, hidden): average over
            # tokens, then drop only the batch axis.
            pooled = outputs.last_hidden_state.mean(dim=1).squeeze(0)
            embeddings.append(pooled.numpy())

    if not embeddings:
        # Keep the result two-dimensional even when there is nothing to embed.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)
    return np.array(embeddings)

# Embed the category labels and the keywords with the same encoder.
region_embeddings = get_bert_embeddings(science_regions)
keyword_embeddings = get_bert_embeddings(kw_list)

# similarities[i, j] = cosine similarity between keyword i and region j.
similarities = cosine_similarity(keyword_embeddings, region_embeddings)

# Each keyword goes to the region it is most similar to.
closest_regions = similarities.argmax(axis=1)

results = pd.DataFrame({'keyword': kw_list}).assign(
    category=[science_regions[region_idx] for region_idx in closest_regions],
    similarity_score=similarities.max(axis=1),
)

# Number of keywords assigned to each category, largest first.
category_counts = (
    results
    .groupby('category')
    .size()
    .sort_values(ascending=False)
)
print("\nKeywords per category:", category_counts, sep="\n")

# Persist the per-keyword assignments.
results.to_csv('keyword_classifications.csv', index=False)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



Keywords per category:
category
Physics                    346
Systems engineering        158
Earth science               95
Astronomy                   87
Chemistry                   84
Anthropology                66
Medicine                    56
Pedagogy                    47
Data engineering            37
Systems science             34
Economics                   30
Political science           30
Data science                29
Agricultural science        28
Psychology                  19
Engineering                 13
Mathematics and Logic       12
Information engineering      9
dtype: int64


In [5]:

# Report settings. The printed headings are derived from these values so the
# text cannot drift from the numbers actually used.
top_n = 10        # keywords listed per category
threshold = 0.5   # similarity below this is reported as low confidence

print(f"\nTop {top_n} keywords per category:")
for category in science_regions:
    # Keywords assigned to this category, most similar first.
    category_keywords = results[results['category'] == category].sort_values('similarity_score', ascending=False)
    print(f"\n{category}:")
    print(category_keywords['keyword'].head(top_n).tolist())

# Summary of the winning similarity score over all keywords.
avg_similarity = results['similarity_score'].mean()
min_similarity = results['similarity_score'].min()
max_similarity = results['similarity_score'].max()

print("\nClassification Statistics:")
print(f"Average similarity score: {avg_similarity:.3f}")
print(f"Minimum similarity score: {min_similarity:.3f}")
print(f"Maximum similarity score: {max_similarity:.3f}")

# List assignments whose best similarity is below the threshold, weakest first.
low_confidence = results[results['similarity_score'] < threshold]
if not low_confidence.empty:
    print(f"\nLow confidence classifications (similarity < {threshold}):")
    print(low_confidence.sort_values('similarity_score'))


Top 10 keywords per category:

Mathematics and Logic:
['discrete mathematics', 'number theory', 'set theory', 'differential equations', 'turing machines', 'logical positivism', 'ohms law', 'bells theorem', 'celestial mechanics', 'phylogenetic trees']

Systems science:
['systems biology', 'computational social science', 'transportation science', 'complex systems', 'complexity science', 'forensic science', 'cognitive science', 'cognitive robotics', 'computational biology', 'cognitive computing']

Data science:
['network science', 'convergence science', 'material science', 'statistical process control', 'surface science', 'rocket science', 'cheminformatics', 'astroinformatics', 'semantic web', 'sleep science']

Physics:
['physics', 'atomic physics', 'chemical physics', 'biology', 'plasma physics', 'solar physics', 'physical biology', 'nuclear energy', 'cosmology', 'statistical physics']

Chemistry:
['chemistry', 'synthetic chemistry', 'theoretical chemistry', 'spectroscopy', 'computation

In [6]:
# Display the classification table: keyword, assigned category, similarity score.
results

Unnamed: 0,keyword,category,similarity_score
0,meteorology,Earth science,0.850049
1,periodic table,Chemistry,0.739016
2,weather forecasting,Earth science,0.828568
3,earthquakes,Astronomy,0.812728
4,stars,Astronomy,0.801230
...,...,...,...
1175,seafloor spreading,Earth science,0.688241
1176,quantum tunneling,Physics,0.809199
1177,industrialorganizational psychology,Psychology,0.797232
1178,patient confidentiality,Pedagogy,0.713582


In [10]:
# Distinct categories that received at least one keyword, followed by how many there are.
u = results['category'].unique()
print(u, len(u), sep="\n")

['Earth science' 'Chemistry' 'Astronomy' 'Anthropology' 'Physics'
 'Psychology' 'Medicine' 'Systems engineering' 'Information engineering'
 'Political science' 'Pedagogy' 'Economics' 'Mathematics and Logic'
 'Engineering' 'Agricultural science' 'Systems science' 'Data science'
 'Data engineering']
18


In [13]:
# Assigned category names in sorted order, for easier scanning.
sorted(u)

['Agricultural science',
 'Anthropology',
 'Astronomy',
 'Chemistry',
 'Data engineering',
 'Data science',
 'Earth science',
 'Economics',
 'Engineering',
 'Information engineering',
 'Mathematics and Logic',
 'Medicine',
 'Pedagogy',
 'Physics',
 'Political science',
 'Psychology',
 'Systems engineering',
 'Systems science']

In [14]:
# Display the classification table: keyword, assigned category, similarity score.
results

Unnamed: 0,keyword,category,similarity_score
0,meteorology,Earth science,0.850049
1,periodic table,Chemistry,0.739016
2,weather forecasting,Earth science,0.828568
3,earthquakes,Astronomy,0.812728
4,stars,Astronomy,0.801230
...,...,...,...
1175,seafloor spreading,Earth science,0.688241
1176,quantum tunneling,Physics,0.809199
1177,industrialorganizational psychology,Psychology,0.797232
1178,patient confidentiality,Pedagogy,0.713582
