In [6]:
import os
import pandas as pd
from rdflib import Graph, URIRef, Literal, BNode, Namespace
from rdflib.namespace import RDF, RDFS, OWL, XSD, DC
from Levenshtein import distance as edit_distance
from tqdm import tqdm
from datasketch import MinHash, MinHashLSH
from collections import defaultdict
import tomllib

In [3]:
import sys
from pathlib import Path

# Make modules under ./src (relative to the kernel's working directory) importable.
src_dir = Path().resolve() / "src"
sys.path.append(str(src_dir))

## Parsing

In [10]:
from parser import parse_and_extract_articles_langs_from_dirs

# The project root is whatever directory the kernel is currently running in.
ROOT = Path().resolve()
config_path = "config.toml"

# tomllib.load only accepts a binary file object, hence mode "rb".
with open(config_path, "rb") as config_file:
    config = tomllib.load(config_file)

# Anchor every configured RDF directory at ROOT.
rdf_dirs = []
for rel_dir in config["paths"]["rdf_dirs"]:
    rdf_dirs.append(ROOT / Path(rel_dir))

In [26]:
# Parse the RDF dumps listed in `rdf_dirs` with the project's `parser` helper.
# The call returns two collections (English first, Japanese second); the helper's
# own progress and summary lines appear in the cell output.
en_articles, jp_articles = parse_and_extract_articles_langs_from_dirs(rdf_dirs)

Found 50000 RDF files in /Users/vlermanda/Main/Internship/data/articles/resourcedump_000130/rdf


Parsing:   0%|          | 0/50000 [00:00<?, ?it/s]

Parsed 15898 English and 18265 Japanese articles; skipped 0 empty files.
Found 50000 RDF files in /Users/vlermanda/Main/Internship/data/articles/resourcedump_000700/rdf


Parsing:   0%|          | 0/50000 [00:00<?, ?it/s]

https://cir.nii.ac.jp/all?q=GABA_{A}%20receptor does not look like a valid URI, trying to serialize this will break.


Parsed 11929 English and 48252 Japanese articles; skipped 0 empty files.
Found 50000 RDF files in /Users/vlermanda/Main/Internship/data/articles/resourcedump_001367/rdf


Parsing:   0%|          | 0/50000 [00:00<?, ?it/s]

https://cir.nii.ac.jp/all?q=%3CSUP%3E13%3C/SUP%3EC%E2%80%93{<SUP>1</SUP>H}%20Noise%20Decoupling does not look like a valid URI, trying to serialize this will break.
https://cir.nii.ac.jp/all?q=%3CSUP%3E13%3C/SUP%3EC%E2%80%93{<SUP>1</SUP>H}%20NMR does not look like a valid URI, trying to serialize this will break.
https://cir.nii.ac.jp/all?q=Poly{3-[3-(4-hydroxyphenyl)phthalidyl]-4-hydroxystyrene-<I>co</I>-4-hydroxystyrene} does not look like a valid URI, trying to serialize this will break.
https://cir.nii.ac.jp/all?q=%3CSUP%3E13%3C/SUP%3EC%E2%80%93{<SUP>1</SUP>H}%20NMR does not look like a valid URI, trying to serialize this will break.


Parsed 11757 English and 10062 Japanese articles; skipped 37296 empty files.


In [27]:
# Turn each language's article mapping into a DataFrame: one row per mapping
# entry, with the original keys replaced by a fresh 0..n-1 index.
df_en, df_jp = (
    pd.DataFrame.from_dict(articles, orient='index').reset_index(drop=True)
    for articles in (en_articles, jp_articles)
)

In [28]:
# Persist the parsed frames as pickles under <ROOT>/data/cache.
cache_dir = ROOT / "data" / "cache"

# to_pickle does not create missing parent directories, so make sure the
# cache folder exists (no-op when it is already there).
cache_dir.mkdir(parents=True, exist_ok=True)

df_en.to_pickle(cache_dir / "df_en.pkl")
df_jp.to_pickle(cache_dir / "df_jp.pkl")

In [13]:
import re

# Character class for Japanese script: Hiragana/Katakana (U+3040-U+30FF) and
# Kanji (U+3400-U+4DBF, U+4E00-U+9FFF).
jp_pattern = re.compile(r'[\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff]')


def _has_japanese(value):
    """Return True when the string form of `value` contains a character matched by `jp_pattern`."""
    return jp_pattern.search(str(value)) is not None


# Flag English-side rows whose title contains Japanese characters.
jp_mask = df_en['title'].apply(_has_japanese)
num_removed = jp_mask.sum()

# Keep the unflagged rows only and renumber the index.
df_en = df_en.loc[~jp_mask].reset_index(drop=True)

print(f"Number of removed entries: {num_removed}")

Number of removed entries: 549


In [None]:
# Heuristic Spanish / French detectors: whole-word stopword lists (case-insensitive)
# plus a class of accented Latin characters.
# NOTE(review): a single hit anywhere in the text flags the row, and the lists
# contain tokens that also occur in ordinary English text (e.g. "plus", "son",
# "ton") — confirm the false-positive rate is acceptable.
spanish_pattern = re.compile(r'\b(el|la|los|las|un|una|unos|unas|de|y|en|que|por|para|con|sin|del|al|se|su|sus|es|son|como|pero|más|o|le|lo|mi|tu|su|nos|vos|ellos|ellas|este|esta|estos|estas|ese|esa|esos|esas|aquel|aquella|aquellos|aquellas)\b', re.IGNORECASE)
french_pattern = re.compile(r'\b(le|la|les|un|une|des|du|de|et|en|que|pour|avec|sans|dans|sur|par|au|aux|ce|cette|ces|son|sa|ses|est|sont|comme|mais|plus|ou|mon|ton|notre|votre|leur|leurs|il|elle|ils|elles|cet|cette|ceux|celles)\b', re.IGNORECASE)

accented_pattern = re.compile(r'[áéíóúüñçàèìòùâêîôûëïüœæ]', re.IGNORECASE)

# Checked in this order; the first detector that matches decides.
_ES_FR_DETECTORS = (spanish_pattern, french_pattern, accented_pattern)


def _looks_spanish_or_french(value):
    """Return True when any detector matches the string form of `value`."""
    text = str(value)
    return any(detector.search(text) is not None for detector in _ES_FR_DETECTORS)


# A row is dropped when either its title or its abstract is flagged.
title_flagged = df_en['title'].apply(_looks_spanish_or_french)
abstract_flagged = df_en['abstract'].apply(_looks_spanish_or_french)
es_fr_mask = title_flagged | abstract_flagged

num_es_fr_removed = es_fr_mask.sum()
df_en = df_en.loc[~es_fr_mask].reset_index(drop=True)

print(f"Number of removed Spanish/French entries (including accented): {num_es_fr_removed}")

Number of removed Spanish/French entries (including accented): 49


In [149]:
# Average abstract length in characters and in whitespace-separated words.
# The .str accessor is used instead of apply(len) / x.split() so that a missing
# abstract (NaN/None) yields NaN — which mean() skips — rather than raising.
abstracts = df_en['abstract']
avg_char_length = abstracts.str.len().mean()
avg_word_length = abstracts.str.split().str.len().mean()

print(f"Average number of characters: {avg_char_length:.2f}")
print(f"Average number of words: {avg_word_length:.2f}")

Average number of characters: 334.53
Average number of words: 50.47


In [155]:
# Cluster sizes: number of documents held under each key of `clusters`.
# NOTE(review): `clusters` is not defined in any cell visible here — confirm the
# cell that builds it runs before this one on a fresh kernel.
cluster_sizes = [len(docs) for docs in clusters.values()]
print(f"Number of clusters: {len(cluster_sizes)}")

if cluster_sizes:
    # Summaries are computed with pandas (imported in the first cell); the
    # notebook's import cell does not import numpy, so `np.*` would raise
    # NameError after a kernel restart.
    sizes = pd.Series(cluster_sizes)
    print(f"Mean cluster size: {sizes.mean():.2f}")
    print(f"Median cluster size: {sizes.median()}")
    print(f"Max cluster size: {sizes.max()}")
    print(f"Min cluster size: {sizes.min()}")
    print(f"Number of clusters with size > 10: {sum(size > 10 for size in cluster_sizes)}")
else:
    print("No clusters found.")

Number of clusters: 36484
Mean cluster size: 1.02
Median cluster size: 1.0
Max cluster size: 35
Min cluster size: 1
Number of clusters with size > 10: 5


In [None]:
from preprocess import extract_keywords_df

# Keyword extraction over the English frame. The recorded run below took
# roughly 43 minutes for 39035 rows, so expect this cell to be slow.
keyword_options = dict(
    text_cols=("title", "abstract"),
    top_k=8,
    ngram_range=(1, 3),
    stop_words="english",
    keywords_col="keywords",
)
df_en_kw, topics = extract_keywords_df(df_en, **keyword_options)

Extracting keywords: 100%|██████████| 39035/39035 [42:56<00:00, 15.15it/s]   


In [17]:
# Tabulate the (phrase, frequency) pairs from keyword extraction and write
# them to a CSV file in the current working directory.
topic_columns = ["phrase", "frequency"]
topics_df = pd.DataFrame(topics, columns=topic_columns)
topics_df.to_csv("extracted_topics.csv", index=False)

In [18]:
# Show the first ten rows of the topic table, without the index column.
top_topics = topics_df.head(10)
print("Top 10 topics:")
print(top_topics.to_string(index=False))

Top 10 topics:
                phrase  frequency
             education        174
               history        173
antibacterial activity        156
                   law        154
                 japan        142
              japanese        123
         antibacterial        120
     cephem antibiotic        115
                   art        108
                  book        108


In [19]:
# Vocabulary = every extracted phrase whose frequency reaches the threshold.
freq_treshold = 5
vocab = set()
for phrase, freq in topics:
    if freq >= freq_treshold:
        vocab.add(phrase)

In [20]:
from openai import OpenAI
import csv, time

# The closed set of labels the model is allowed to answer with.
top_level_topics = [
    "Natural Sciences",
    "Engineering and Technology",
    "Medical and Health Sciences",
    "Agricultural Sciences",
    "Social Sciences",
    "Arts and Humanities",
]

# System prompt, assembled line by line; the leading empty entry keeps the
# newline that the prompt text starts with.
_PROMPT_LINES = (
    "",
    "For each keyword decide which TOP-LEVEL RESEARCH FIELD it belongs to.",
    "Return a CSV: keyword,field.",
    "Use exactly one of:",
    " | ".join(top_level_topics),
)
SYSTEM_PROMPT = "\n".join(_PROMPT_LINES)

def map_keywords(keywords, model="gpt-4o-mini", batch=100):
    """Assign each keyword to a top-level research field via an OpenAI chat model.

    Keywords are sent in slices of `batch`, one keyword per line, with
    `SYSTEM_PROMPT` as the system message. Every reply is parsed as CSV rows of
    the form ``keyword,field``.

    Args:
        keywords: Sliceable sequence of keyword strings (e.g. a list).
        model: Chat model name forwarded to the API.
        batch: Number of keywords sent per request.

    Returns:
        dict mapping the keyword text echoed by the model (stripped) to its
        field. Rows with fewer than two columns, or whose field is not one of
        `top_level_topics`, are dropped, so some input keywords may be absent.
    """
    client = OpenAI()
    mapping = {}

    for start in tqdm(range(0, len(keywords), batch), desc="LLM-mapping"):
        chunk = keywords[start:start + batch]
        user = "\n".join(chunk)

        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user",   "content": user},
            ],
            temperature=0.2,
        )

        # The API declares `content` as nullable; treat a missing reply as an
        # empty one so a single bad batch does not abort the whole run.
        reply = response.choices[0].message.content or ""

        for row in csv.reader(reply.splitlines()):
            if len(row) < 2:
                continue
            kw, field = row[0].strip(), row[1].strip()
            # Also discards a header row or any label outside the allowed set.
            if field in top_level_topics:
                mapping[kw] = field

        # Short pause between requests.
        time.sleep(0.2)

    return mapping

In [21]:
import json, pathlib

# `vocab` is a set of strings, whose iteration order can differ between kernel
# sessions; sorting gives map_keywords the same batches on every run.
keyword_field = map_keywords(sorted(vocab))

LLM-mapping: 100%|██████████| 31/31 [07:42<00:00, 14.91s/it]


In [22]:
from collections import Counter

# Count how many keywords landed in each field and list the fields from most
# to least populated.
field_counts = Counter(keyword_field.values())
field_counts_df = pd.DataFrame(
    list(field_counts.items()),
    columns=["field", "count"],
)
field_counts_df = field_counts_df.sort_values(by="count", ascending=False)

print(field_counts_df)

                         field  count
1  Medical and Health Sciences   1003
3              Social Sciences    992
2          Arts and Humanities    455
4   Engineering and Technology    249
0             Natural Sciences    238
5        Agricultural Sciences     87


In [170]:
# Add 'field' column to df_en_kw by mapping each paper's main topic to its field
def get_main_field(keywords, keyword_field):
    """Return the field of the first keyword that has an entry in `keyword_field`.

    Keywords are examined in iteration order. When none of them is a key of
    `keyword_field`, the placeholder string "Error Name" is returned.
    """
    matched_fields = (keyword_field[kw] for kw in keywords if kw in keyword_field)
    return next(matched_fields, "Error Name")

# Label every paper with the field of its first mapped keyword.
df_en_kw['field'] = df_en_kw['keywords'].apply(get_main_field, args=(keyword_field,))