In [115]:
import os
import lxml
from pathlib import Path
import pandas as pd
from rdflib import Graph, URIRef, Literal, BNode, Namespace
from rdflib.namespace import RDF, RDFS, OWL, XSD, DC
from Levenshtein import distance as edit_distance
from tqdm import tqdm
from datasketch import MinHash, MinHashLSH
from collections import defaultdict

## Parsing

In [53]:
# remove all xml and html tags from a string using beautifulsoup
from bs4 import BeautifulSoup
def remove_html_tags(text):
    """Return the text content of ``text`` with HTML/XML markup stripped.

    The input is parsed with BeautifulSoup's ``"html.parser"`` backend and the
    text of the resulting tree is returned.

    NOTE: a later cell defines another ``remove_html_tags`` (regex based);
    once that cell has run, it replaces this definition.
    """
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

In [54]:
def parse_and_extract_articles(rdf_dir):
    """Parse every ``*.rdf`` file in ``rdf_dir`` into one row per article.

    Parameters
    ----------
    rdf_dir : str or Path
        Directory that is globbed (non-recursively) for ``*.rdf`` files.

    Returns
    -------
    (pandas.DataFrame, dict)
        The frame has the columns ``uri``, ``title``, ``doi`` and ``abstract``
        (``doi`` / ``abstract`` are ``""`` when not found). The dict maps each
        article URI string to the ``rdflib.Graph`` it was parsed from.

    Zero-byte files are counted and skipped; files that fail to parse are
    reported with ``print`` and skipped; files without any ``dc:title`` triple
    are skipped silently.
    """
    CIR     = "https://cir.nii.ac.jp/schema/1.0/"
    DC_NS   = "http://purl.org/dc/elements/1.1/"

    # Build the RDF terms once instead of once per file.
    title_pred       = URIRef(DC_NS + "title")
    product_id_pred  = URIRef(CIR + "productIdentifier")
    identifier_pred  = URIRef(CIR + "identifier")
    description_pred = URIRef(CIR + "description")
    notation_pred    = URIRef(CIR + "notation")
    doi_datatype     = URIRef(CIR + "DOI")

    rdf_files = list(Path(rdf_dir).glob("*.rdf"))
    print(f"Found {len(rdf_files)} RDF files in {rdf_dir}")

    rows, graphs = [], {}
    empty = 0

    for f in tqdm(rdf_files, desc="Parsing"):
        if os.path.getsize(f) == 0:
            empty += 1
            continue
        g = Graph()
        try:
            g.parse(f, format="application/rdf+xml")
        except Exception as e:
            # Best effort: one malformed file must not abort the whole dump.
            print("Parse error:", e)
            continue

        # find article subject via dc:title
        # NOTE(review): if several subjects carry a dc:title, the first one
        # yielded by the graph is used -- confirm each file describes one article.
        subs = list(g.subjects(title_pred, None))
        if not subs:
            continue
        art = subs[0]

        # title: prefer a literal that is not tagged "ja", else fall back to the first one
        titles = list(g.objects(art, title_pred))
        title = next((t for t in titles if isinstance(t, Literal) and t.language != "ja"), titles[0])

        # DOI: scan *all* product identifiers. Looking only at the single node
        # returned by g.value() drops the DOI whenever another identifier
        # happens to be returned first.
        doi = next(
            (
                str(ident)
                for pid in g.objects(art, product_id_pred)
                for ident in g.objects(pid, identifier_pred)
                if isinstance(ident, Literal) and ident.datatype == doi_datatype
            ),
            "",
        )

        # abstract: notation of one description node (whichever g.value() returns)
        desc = g.value(art, description_pred)
        note = g.value(desc, notation_pred) if desc else None
        abstract = remove_html_tags(str(note)) if isinstance(note, Literal) else ""

        rows.append({
            "uri": str(art), "title": str(title),
            "doi": doi, "abstract": abstract
        })
        graphs[str(art)] = g

    print(f"Parsed {len(rows)} articles; skipped {empty} empty files.")
    return pd.DataFrame(rows), graphs

In [116]:
from pathlib import Path
from rdflib import Graph, URIRef, Literal
import os
from tqdm.auto import tqdm

def remove_html_tags(text):
    """Delete every ``<...>`` span from ``text`` and return the remainder.

    Matching is non-greedy and ``.`` does not cross newlines, so a tag that is
    split over several lines is left in place. Entities are not decoded.
    """
    import re  # local import keeps this cell runnable without a module-level `re`

    tag_pattern = re.compile(r'<.*?>')
    return tag_pattern.sub('', text)

def parse_and_extract_articles_langs(rdf_dir):
    """Parse the ``*.rdf`` files in ``rdf_dir`` and split articles by language.

    Parameters
    ----------
    rdf_dir : str or Path
        Directory that is globbed (non-recursively) for ``*.rdf`` files.

    Returns
    -------
    (dict, dict)
        ``(eng_articles, jpn_articles)``. Both map the article URI string to a
        record ``{"uri", "title", "doi", "abstract"}``. An article appears in
        the English dict when it has an ``en``-tagged title or abstract, and in
        the Japanese dict when it has a title or abstract tagged ``ja`` or
        carrying no language tag; it may appear in both. Missing fields are
        stored as ``""``.

    Zero-byte files are counted and skipped; files that fail to parse are
    reported with ``print`` and skipped; files without any ``dc:title`` triple
    are skipped silently.
    """
    CIR     = "https://cir.nii.ac.jp/schema/1.0/"
    DC_NS   = "http://purl.org/dc/elements/1.1/"

    # Build the RDF terms once instead of once per file / per node.
    title_pred      = URIRef(DC_NS + "title")
    CIR_desc        = URIRef(CIR + "description")
    notation_pred   = URIRef(CIR + "notation")
    product_id_pred = URIRef(CIR + "productIdentifier")
    identifier_pred = URIRef(CIR + "identifier")
    doi_datatype    = URIRef(CIR + "DOI")

    rdf_files = list(Path(rdf_dir).glob("*.rdf"))
    print(f"Found {len(rdf_files)} RDF files in {rdf_dir}")

    eng_articles = {}  # English
    jpn_articles = {}  # Japanese (or default)
    empty = 0

    for f in tqdm(rdf_files, desc="Parsing"):
        if os.path.getsize(f) == 0:
            empty += 1
            continue

        g = Graph()
        try:
            g.parse(f, format="application/rdf+xml")
        except Exception as e:
            # Best effort: one malformed file must not abort the whole dump.
            print("Parse error:", e)
            continue

        # find article subject via dc:title
        # NOTE(review): if several subjects carry a dc:title, the first one
        # yielded by the graph is used -- confirm each file describes one article.
        subs = list(g.subjects(title_pred, None))
        if not subs:
            continue
        art = subs[0]
        uri = str(art)

        # ----- Titles -----
        titles = list(g.objects(art, title_pred))
        title_en = next((t for t in titles if isinstance(t, Literal) and t.language == "en"), None)
        title_jp = next((t for t in titles if isinstance(t, Literal) and (t.language is None or t.language == "ja")), None)

        # ----- Abstracts -----
        # When several notations share a language, the last one visited wins.
        abs_en, abs_jp = None, None
        for desc in g.objects(art, CIR_desc):
            for note in g.objects(desc, notation_pred):
                if not isinstance(note, Literal):
                    continue
                if note.language == "en":
                    abs_en = remove_html_tags(str(note))
                elif note.language is None or note.language == "ja":
                    abs_jp = remove_html_tags(str(note))

        # ----- DOI -----
        # Scan *all* product identifiers. Looking only at the single node
        # returned by g.value() drops the DOI whenever another identifier
        # happens to be returned first.
        doi = next(
            (
                str(ident)
                for pid in g.objects(art, product_id_pred)
                for ident in g.objects(pid, identifier_pred)
                if isinstance(ident, Literal) and ident.datatype == doi_datatype
            ),
            "",
        )

        # ----- Store per language -----
        if title_en or abs_en:
            eng_articles[uri] = {
                "uri": uri,
                "title": str(title_en) if title_en else "",
                "doi": doi,
                "abstract": abs_en if abs_en else ""
            }
        if title_jp or abs_jp:
            jpn_articles[uri] = {
                "uri": uri,
                "title": str(title_jp) if title_jp else "",
                "doi": doi,
                "abstract": abs_jp if abs_jp else ""
            }

    print(f"Parsed {len(eng_articles)} English and {len(jpn_articles)} Japanese articles; skipped {empty} empty files.")
    return eng_articles, jpn_articles

In [117]:
# Root directory that holds the resource dumps. Set the CIR_DATA_ROOT
# environment variable to run the notebook on another machine; when it is not
# set, the original location is used, so the resulting paths are unchanged.
DATA_ROOT = os.environ.get("CIR_DATA_ROOT", "/Users/vlermanda/Main/Internship")

rdf_dir_130 = os.path.join(DATA_ROOT, "resourcedump_000130", "rdf")
rdf_dir_700 = os.path.join(DATA_ROOT, "resourcedump_000700", "rdf")
rdf_dir_1367 = os.path.join(DATA_ROOT, "resourcedump_001367", "rdf")

# Order matters for merging: later directories overwrite earlier ones on URI clashes.
rdf_dirs = [rdf_dir_130, rdf_dir_700, rdf_dir_1367]

# Parse several dump directories and merge the results per language.
def parse_and_merge_articles(rdf_dirs):
    """Run ``parse_and_extract_articles_langs`` on each directory and merge.

    Returns ``(eng_articles, jpn_articles)``; both are dicts keyed by article
    URI. Directories are processed in the given order and merged with
    ``dict.update``, so a URI seen again in a later directory overwrites the
    earlier record.
    """
    merged_en = {}
    merged_jp = {}
    for directory in rdf_dirs:
        dir_en, dir_jp = parse_and_extract_articles_langs(directory)
        merged_en.update(dir_en)
        merged_jp.update(dir_jp)
    return merged_en, merged_jp

In [118]:
# Parse all configured dump directories and merge the per-language article
# dicts (each keyed by article URI).
en_articles, jp_articles = parse_and_merge_articles(rdf_dirs)

Found 50000 RDF files in /Users/vlermanda/Main/Internship/resourcedump_000130/rdf


Parsing:   0%|          | 0/50000 [00:00<?, ?it/s]

Parsed 15898 English and 18265 Japanese articles; skipped 0 empty files.
Found 50000 RDF files in /Users/vlermanda/Main/Internship/resourcedump_000700/rdf


Parsing:   0%|          | 0/50000 [00:00<?, ?it/s]

https://cir.nii.ac.jp/all?q=GABA_{A}%20receptor does not look like a valid URI, trying to serialize this will break.


Parsed 11929 English and 48252 Japanese articles; skipped 0 empty files.
Found 50000 RDF files in /Users/vlermanda/Main/Internship/resourcedump_001367/rdf


Parsing:   0%|          | 0/50000 [00:00<?, ?it/s]

https://cir.nii.ac.jp/all?q=%3CSUP%3E13%3C/SUP%3EC%E2%80%93{<SUP>1</SUP>H}%20Noise%20Decoupling does not look like a valid URI, trying to serialize this will break.
https://cir.nii.ac.jp/all?q=%3CSUP%3E13%3C/SUP%3EC%E2%80%93{<SUP>1</SUP>H}%20NMR does not look like a valid URI, trying to serialize this will break.
https://cir.nii.ac.jp/all?q=Poly{3-[3-(4-hydroxyphenyl)phthalidyl]-4-hydroxystyrene-<I>co</I>-4-hydroxystyrene} does not look like a valid URI, trying to serialize this will break.
https://cir.nii.ac.jp/all?q=%3CSUP%3E13%3C/SUP%3EC%E2%80%93{<SUP>1</SUP>H}%20NMR does not look like a valid URI, trying to serialize this will break.


Parsed 11757 English and 10062 Japanese articles; skipped 37296 empty files.


In [126]:
# Turn the URI-keyed record dicts into DataFrames: one row per article, with
# the URI index replaced by a plain 0..n-1 index (the URI stays in the "uri" column).
df_en = pd.DataFrame.from_dict(en_articles, orient='index')
df_en = df_en.reset_index(drop=True)

df_jp = pd.DataFrame.from_dict(jp_articles, orient='index')
df_jp = df_jp.reset_index(drop=True)

In [127]:
# Display the English-article frame (columns: uri, title, doi, abstract).
df_en

Unnamed: 0,uri,title,doi,abstract
0,https://cir.nii.ac.jp/crid/1130000794844585472...,Differential equations : an introduction to ba...,,
1,https://cir.nii.ac.jp/crid/1130000794842855424...,Beyond fragmentation : integrating Mission and...,,
2,https://cir.nii.ac.jp/crid/1130000794842090112...,Sisters,,
3,https://cir.nii.ac.jp/crid/1130000794832703744...,Contrast enhancement of the inner ear in magne...,,
4,https://cir.nii.ac.jp/crid/1130000794849657216...,More than they bargain for : Toba Batak wood c...,,
...,...,...,...,...
39579,https://cir.nii.ac.jp/crid/1390282681266829696...,Synthesis of New Organic Crosslinking Reagents...,10.1295/polymj.25.379,New organic crosslinking reagents 1 containing...
39580,https://cir.nii.ac.jp/crid/1390282681255552640...,Effect of aztreonam on platelet aggregation an...,10.11250/chemotherapy1953.39.247,"In recent years, there have been reports of he..."
39581,https://cir.nii.ac.jp/crid/1390282681258421120...,PHARMACOKINETICS OF MT-141,10.11250/chemotherapy1953.32.supplement5_76,MT-141 was intravenously infused to rabbits at...
39582,https://cir.nii.ac.jp/crid/1390282681262299264...,Clinical evaluation of pazufloxacin mesilate f...,10.11250/chemotherapy1995.48.673,"The clinical efficacy, safety and usefulness o..."


In [135]:
import re

# Character class covering Hiragana/Katakana (U+3040-U+30FF) and the CJK
# ideograph ranges U+3400-U+4DBF and U+4E00-U+9FFF.
jp_pattern = re.compile(r'[\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff]')


def _title_has_japanese(value):
    """True when the stringified value contains at least one character matched by jp_pattern."""
    return jp_pattern.search(str(value)) is not None


# Flag the rows whose title contains Japanese characters and count them
jp_mask = df_en['title'].apply(_title_has_japanese)
num_removed = jp_mask.sum()

# Keep only the unflagged rows and renumber the index
df_en = df_en.loc[~jp_mask].reset_index(drop=True)

print(f"Number of removed entries: {num_removed}")

Number of removed entries: 549


In [None]:
# Heuristic Spanish / French detectors: whole-word, case-insensitive matches
# against short function-word lists, plus a character class of accented letters.
# NOTE(review): both word lists contain tokens that also occur in English text
# (e.g. "son", "plus", "sin"), and a single hit flags a row, so English
# entries can be removed as well.
spanish_pattern = re.compile(r'\b(el|la|los|las|un|una|unos|unas|de|y|en|que|por|para|con|sin|del|al|se|su|sus|es|son|como|pero|más|o|le|lo|mi|tu|su|nos|vos|ellos|ellas|este|esta|estos|estas|ese|esa|esos|esas|aquel|aquella|aquellos|aquellas)\b', re.IGNORECASE)
french_pattern = re.compile(r'\b(le|la|les|un|une|des|du|de|et|en|que|pour|avec|sans|dans|sur|par|au|aux|ce|cette|ces|son|sa|ses|est|sont|comme|mais|plus|ou|mon|ton|notre|votre|leur|leurs|il|elle|ils|elles|cet|cette|ceux|celles)\b', re.IGNORECASE)

accented_pattern = re.compile(r'[áéíóúüñçàèìòùâêîôûëïüœæ]', re.IGNORECASE)


def _looks_spanish_or_french(value):
    """True when the stringified value matches any of the three patterns above."""
    text = str(value)
    detectors = (spanish_pattern, french_pattern, accented_pattern)
    return any(detector.search(text) for detector in detectors)


# A row is flagged when either its title or its abstract triggers a detector
title_hits = df_en['title'].apply(_looks_spanish_or_french)
abstract_hits = df_en['abstract'].apply(_looks_spanish_or_french)
es_fr_mask = title_hits | abstract_hits

num_es_fr_removed = es_fr_mask.sum()
df_en = df_en.loc[~es_fr_mask].reset_index(drop=True)

print(f"Number of removed Spanish/French entries (including accented): {num_es_fr_removed}")

Number of removed Spanish/French entries (including accented): 49


In [149]:
# Mean abstract length in characters and in whitespace-separated words.
# Rows with an empty abstract are included in both averages.
abstract_texts = df_en['abstract']
avg_char_length = abstract_texts.map(len).mean()
avg_word_length = abstract_texts.map(lambda text: len(text.split())).mean()

print(f"Average number of characters: {avg_char_length:.2f}")
print(f"Average number of words: {avg_word_length:.2f}")

Average number of characters: 334.53
Average number of words: 50.47


In [130]:
import re
import numpy as np
import matplotlib.pyplot as plt
from datasketch import MinHash, MinHashLSH
import re, nltk

def k_shingles(text, k=5):
    """Return the set of all length-``k`` character windows of the normalised text.

    Normalisation: the value is stringified (so NaN/None do not raise) and
    lower-cased, every run of characters other than ASCII letters, digits and
    spaces becomes a single space, and each space is then replaced by ``_``.
    A normalised text shorter than ``k`` yields an empty set.
    """
    lowered = str(text).lower()
    normalised = re.sub(r"[^A-Za-z0-9 ]+", " ", lowered).replace(" ", "_")
    window_count = len(normalised) - k + 1
    return {normalised[start:start + k] for start in range(window_count)}

# Fetch the NLTK stop-word corpus (if needed) and keep the English list
# as a set for fast membership tests in tokens().
nltk.download("stopwords")
from nltk.corpus import stopwords

STOP = set(stopwords.words("english"))
def tokens(text):
    """Split ``text`` into lower-case word tokens without English stop words.

    Steps: lower-case; replace every run of characters other than ASCII
    letters, digits and spaces with a space; drop single-character words
    (this includes single digits, longer numbers are kept); collapse
    whitespace; finally discard the words found in ``STOP``.
    ``text`` must be a ``str`` (``.lower()`` is called on it directly).
    """
    cleaned = re.sub(r"[^A-Za-z0-9 ]+", " ", text.lower())
    # drop stand-alone one-character words
    cleaned = re.sub(r"\b\w{1}\b", "", cleaned)
    # squeeze the gaps left behind by the removals
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    words = cleaned.split()
    return [word for word in words if word not in STOP]

def make_mh(elements, num_perm=128):
    """Build a ``MinHash`` signature (``num_perm`` permutations) from string elements.

    Each element is UTF-8 encoded and fed to ``MinHash.update``; an empty
    iterable returns the freshly constructed, never-updated signature.
    """
    signature = MinHash(num_perm=num_perm)
    for element in elements:
        encoded = element.encode("utf8")
        signature.update(encoded)
    return signature

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vlermanda/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [154]:
# Parameters
num_perm = 128
lsh = MinHashLSH(threshold=0.2, num_perm=num_perm)
k = 3
split_type = "tokens"  # "k-shingles" or "tokens"
minhashes = []

df_articles = df_en

# Choose the feature extractor once. An unrecognised split_type used to skip
# both indexing loops silently, leaving an empty index and zero clusters.
if split_type == "k-shingles":
    def featurize(text):
        return k_shingles(text, k=k)
elif split_type == "tokens":
    featurize = tokens
else:
    raise ValueError(
        f"Unknown split_type {split_type!r}; expected 'k-shingles' or 'tokens'"
    )

# Kept for backward compatibility: holds the tokens of the last processed
# document when split_type == "tokens", otherwise stays empty.
tokens_list = []

# Create MinHash signatures (title + abstract) and index them in the LSH.
# minhashes[i] belongs to row i of df_articles, so positions can be used with .iloc.
# Columns are iterated directly, which avoids building a Series per row.
for uri, title, abstract in tqdm(
    zip(df_articles['uri'], df_articles['title'], df_articles['abstract']),
    total=len(df_articles),
):
    combined = f"{title} {abstract}"
    features = featurize(combined)
    if split_type == "tokens":
        tokens_list = features
    mh = make_mh(features, num_perm=num_perm)
    minhashes.append(mh)
    lsh.insert(uri, mh)

# Form Clusters
# Documents are grouped by the exact set of neighbours the LSH returns for
# them: two rows share a cluster only if their neighbour sets are identical.
# Values are positional row indices into df_articles.
clusters = dict()
for i, mh in enumerate(minhashes):
    cluster_key = frozenset(lsh.query(mh))
    clusters.setdefault(cluster_key, []).append(i)

  0%|          | 0/37090 [00:00<?, ?it/s]

In [155]:
# Cluster Sizes: number of member rows per cluster, plus summary statistics
cluster_sizes = [len(members) for members in clusters.values()]
print(f"Number of clusters: {len(cluster_sizes)}")

if not cluster_sizes:
    print("No clusters found.")
else:
    large_cluster_count = sum(1 for size in cluster_sizes if size > 10)
    print(f"Mean cluster size: {np.mean(cluster_sizes):.2f}")
    print(f"Median cluster size: {np.median(cluster_sizes)}")
    print(f"Max cluster size: {np.max(cluster_sizes)}")
    print(f"Min cluster size: {np.min(cluster_sizes)}")
    print(f"Number of clusters with size > 10: {large_cluster_count}")

Number of clusters: 36484
Mean cluster size: 1.02
Median cluster size: 1.0
Max cluster size: 35
Min cluster size: 1
Number of clusters with size > 10: 5


In [None]:
# Extract uris from the largest cluster
largest_cluster = max(clusters.values(), key=len)
print(f"Largest cluster size: {len(largest_cluster)}")

# `clusters` stores positional row indices, so the member rows can be taken
# with a single .iloc lookup. The previous version re-filtered the whole frame
# by URI twice per member.
largest_cluster_rows = df_articles.iloc[largest_cluster]
largest_cluster_uris = largest_cluster_rows['uri'].tolist()

# Print the URI, title and abstract of every article in the largest cluster
for uri, title, abstract in zip(
    largest_cluster_rows['uri'],
    largest_cluster_rows['title'],
    largest_cluster_rows['abstract'],
):
    print(f"URI:{uri}\n Title: {title}\nAbstract: {abstract}\n")

Largest cluster size: 35
URI:https://cir.nii.ac.jp/crid/1050571007507393408.rdf
 Title: Title page, Content, etc.
Abstract: 

URI:https://cir.nii.ac.jp/crid/1050571007507388160.rdf
 Title: Title page, Content, etc.
Abstract: 

URI:https://cir.nii.ac.jp/crid/1050571007525760384.rdf
 Title: Title page, Content, etc.
Abstract: 

URI:https://cir.nii.ac.jp/crid/1050571007525764096.rdf
 Title: Title page, Content, etc.
Abstract: 

URI:https://cir.nii.ac.jp/crid/1050571007507390464.rdf
 Title: Title page, Content, etc.
Abstract: 

URI:https://cir.nii.ac.jp/crid/1050571007519860480.rdf
 Title: Title page, Content, etc.
Abstract: 

URI:https://cir.nii.ac.jp/crid/1050571007525759104.rdf
 Title: Title page, Content, etc.
Abstract: 

URI:https://cir.nii.ac.jp/crid/1050571007507389696.rdf
 Title: Title page, Content, etc.
Abstract: 

URI:https://cir.nii.ac.jp/crid/1050571007513810560.rdf
 Title: Title page, Content, etc.
Abstract: 

URI:https://cir.nii.ac.jp/crid/1050571007525758336.rdf
 Title: Tit

In [157]:
# Ten largest clusters, biggest first (each cluster is a list of row positions)
clusters_by_size = sorted(clusters.values(), key=len, reverse=True)
top_clusters = clusters_by_size[:10]

for rank, members in enumerate(top_clusters, start=1):
    print(f"\nCluster #{rank} (size: {len(members)})")
    for position in members:
        article = df_articles.iloc[position]
        print(f"URI: {article['uri']}\nTitle: {article['title']}\nAbstract: {article['abstract']}\n")


Cluster #1 (size: 35)
URI: https://cir.nii.ac.jp/crid/1050571007507393408.rdf
Title: Title page, Content, etc.
Abstract: 

URI: https://cir.nii.ac.jp/crid/1050571007507388160.rdf
Title: Title page, Content, etc.
Abstract: 

URI: https://cir.nii.ac.jp/crid/1050571007525760384.rdf
Title: Title page, Content, etc.
Abstract: 

URI: https://cir.nii.ac.jp/crid/1050571007525764096.rdf
Title: Title page, Content, etc.
Abstract: 

URI: https://cir.nii.ac.jp/crid/1050571007507390464.rdf
Title: Title page, Content, etc.
Abstract: 

URI: https://cir.nii.ac.jp/crid/1050571007519860480.rdf
Title: Title page, Content, etc.
Abstract: 

URI: https://cir.nii.ac.jp/crid/1050571007525759104.rdf
Title: Title page, Content, etc.
Abstract: 

URI: https://cir.nii.ac.jp/crid/1050571007507389696.rdf
Title: Title page, Content, etc.
Abstract: 

URI: https://cir.nii.ac.jp/crid/1050571007513810560.rdf
Title: Title page, Content, etc.
Abstract: 

URI: https://cir.nii.ac.jp/crid/1050571007525758336.rdf
Title: Title

In [160]:
from keybert import KeyBERT
from collections import Counter

def extract_keywords_df(
    df: pd.DataFrame,
    text_cols: tuple[str, ...] = ("title", "abstract"),
    kw_model: KeyBERT | None = None,
    top_k: int = 8,
    ngram_range: tuple[int, int] = (1, 3),
    stop_words: str = "english",
    keywords_col: str = "keywords",
) -> tuple[pd.DataFrame, list[tuple[str, int]]]:
    """
    Extract key phrases for every row of `df` and summarise them corpus-wide.

    Parameters
    ----------
    df           : DataFrame holding the text columns
    text_cols    : columns joined with a single space to form each document
                   (missing values are treated as empty strings)
    kw_model     : KeyBERT instance to reuse; when None a
                   "paraphrase-MiniLM-L6-v2" model is created
    top_k        : number of phrases requested per document (`top_n`)
    ngram_range  : (min_n, max_n) passed as `keyphrase_ngram_range`
    stop_words   : passed through to KeyBERT
    keywords_col : name of the list column added to the output frame

    Returns
    -------
    df_out       : copy of `df` with the extra `keywords_col` column
                   (the input frame is not modified)
    topics       : list of (phrase, document_frequency) tuples, sorted by
                   descending frequency and then alphabetically
    """
    model = kw_model if kw_model is not None else KeyBERT("paraphrase-MiniLM-L6-v2")

    # One document string per row: selected columns, NaNs blanked, space-joined
    text_frame = df[list(text_cols)].fillna("")
    docs = text_frame.apply(lambda row: " ".join(map(str, row)), axis=1).tolist()

    all_keywords: list[list[str]] = []
    for doc in tqdm(docs, desc="Extracting keywords"):
        scored_phrases = model.extract_keywords(
            doc,
            keyphrase_ngram_range=ngram_range,
            stop_words=stop_words,
            top_n=top_k,
        )
        # keep the phrase, drop its score
        all_keywords.append([phrase for phrase, _score in scored_phrases])

    df_out = df.copy()
    df_out[keywords_col] = all_keywords

    # Document frequency: a phrase counts once per document it appears in
    doc_frequency = Counter(
        phrase
        for phrases in all_keywords
        for phrase in set(phrases)
    )

    topics = sorted(doc_frequency.items(), key=lambda item: (-item[1], item[0]))
    return df_out, topics

In [161]:
# Extract keywords from the English-article DataFrame.
# df_en_kw is a copy of df_en with an added "keywords" list column;
# topics is the corpus-level list of (phrase, document_frequency) tuples.
df_en_kw, topics = extract_keywords_df(
    df_en,
    text_cols=("title", "abstract"),
    top_k=8,
    ngram_range=(1, 3),
    stop_words="english",
    keywords_col="keywords"
)

Extracting keywords:   0%|          | 0/37090 [00:00<?, ?it/s]

In [162]:
# Tabulate the (phrase, document frequency) pairs and persist them as CSV
topic_columns = ["phrase", "frequency"]
topics_df = pd.DataFrame(topics, columns=topic_columns)
topics_df.to_csv("extracted_topics.csv", index=False)

In [163]:
# Show the ten most frequent phrases as plain text, without the index column
print("Top 10 topics:")
top_ten_topics = topics_df.head(10)
print(top_ten_topics.to_string(index=False))

Top 10 topics:
                phrase  frequency
             education        174
               history        171
                   law        153
antibacterial activity        147
                 japan        141
              japanese        123
         antibacterial        112
     cephem antibiotic        109
                  book        108
                   art        104


In [164]:
# Keep only the phrases that occur in at least `freq_treshold` documents
freq_treshold = 5
vocab = set(phrase for phrase, doc_freq in topics if doc_freq >= freq_treshold)

In [165]:
from openai import OpenAI
import csv, time

# Labels the LLM may assign to a keyword. "Error Name" is also the label
# get_main_field falls back to when none of a paper's keywords is mapped.
top_level_topics = [
    "Natural Sciences",
    "Engineering and Technology",
    "Medical and Health Sciences",
    "Agricultural Sciences",
    "Social Sciences",
    "Arts and Humanities",
    "Error Name",
]

# System prompt: task description followed by the allowed labels joined with " | ".
SYSTEM_PROMPT = (
    "\n"
    "For each keyword decide which TOP-LEVEL RESEARCH FIELD it belongs to.\n"
    "Return a CSV: keyword,field.\n"
    "Use exactly one of:\n"
    + " | ".join(top_level_topics)
)

def map_keywords(keywords, model="gpt-4o-mini", batch=100):
    """Ask an OpenAI chat model to assign each keyword a top-level field.

    Parameters
    ----------
    keywords : sequence of str
        Keywords to classify; they are sent in slices of ``batch`` items,
        one keyword per line.
    model : str
        Chat model name passed to the API.
    batch : int
        Number of keywords per request.

    Returns
    -------
    dict
        ``{keyword: field}`` built from the model's CSV reply. Rows with fewer
        than two columns, or whose field is not in ``top_level_topics``, are
        ignored. Keys are taken from the reply as returned by the model.

    An empty ``keywords`` sequence returns ``{}`` without creating a client.
    """
    mapping  = {}
    if len(keywords) == 0:
        # Nothing to classify: skip client construction (and its API-key check).
        return mapping

    client   = OpenAI()

    for start in tqdm(range(0, len(keywords), batch), desc="LLM-mapping"):
        chunk = keywords[start:start+batch]
        user  = "\n".join(chunk)

        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user",   "content": user},
            ],
            temperature=0.2,
        )

        # `content` can be None; treat that as an empty reply so one bad batch
        # does not raise and discard the mappings collected so far.
        content = response.choices[0].message.content or ""

        for row in csv.reader(content.splitlines()):
            if len(row) < 2:
                continue
            kw, field = row[0].strip(), row[1].strip()
            if field in top_level_topics:
                mapping[kw] = field

        # brief pause between requests
        time.sleep(0.2)

    return mapping

In [168]:
import json, pathlib

# Cache of the keyword -> field mapping: reuse the JSON file when it exists,
# otherwise query the LLM once and store the result.
MAP_FILE = pathlib.Path("keyword2field.json")
if MAP_FILE.exists():
    with MAP_FILE.open() as fh:
        keyword_field = json.load(fh)
else:
    keyword_field = map_keywords(list(vocab))
    # Write through a context manager so the handle is flushed and closed;
    # the previous inline MAP_FILE.open("w") was never closed explicitly.
    with MAP_FILE.open("w") as fh:
        json.dump(keyword_field, fh, indent=2)

LLM-mapping:   0%|          | 0/29 [00:00<?, ?it/s]

In [169]:
from collections import Counter

# Count how many keywords were mapped to each field, largest count first
field_counts = Counter(keyword_field.values())
field_counts_df = pd.DataFrame(field_counts.items(), columns=["field", "count"])
field_counts_df = field_counts_df.sort_values("count", ascending=False)

print(field_counts_df)

                         field  count
0  Medical and Health Sciences    925
1              Social Sciences    814
3          Arts and Humanities    381
5                   Error Name    255
2   Engineering and Technology    227
4             Natural Sciences    181
6        Agricultural Sciences     80


In [170]:
# Helper used to derive each paper's 'field' from its keyword list
def get_main_field(keywords, keyword_field):
    """Return the field of the first keyword that has an entry in ``keyword_field``.

    ``keywords`` is scanned in order; when none of them is a key of
    ``keyword_field`` (or the list is empty) the label ``"Error Name"`` is returned.
    """
    mapped_fields = (keyword_field[kw] for kw in keywords if kw in keyword_field)
    return next(mapped_fields, "Error Name")

# Label every paper with the field of its first mapped keyword
df_en_kw['field'] = df_en_kw['keywords'].map(
    lambda paper_keywords: get_main_field(paper_keywords, keyword_field)
)

In [172]:
# Show the papers labelled "Error Name": either their first mapped keyword
# carries that label, or none of their keywords is in keyword_field
# (the get_main_field fallback).
df_en_kw[df_en_kw['field'] == "Error Name"]

Unnamed: 0,uri,title,doi,abstract,keywords,field
1,https://cir.nii.ac.jp/crid/1130000794842855424...,Beyond fragmentation : integrating Mission and...,,,"[evangelical theological education, theologica...",Error Name
2,https://cir.nii.ac.jp/crid/1130000794842090112...,Sisters,,,[sisters],Error Name
3,https://cir.nii.ac.jp/crid/1130000794832703744...,Contrast enhancement of the inner ear in magne...,,,"[intravenous gadolinium injection, hours intra...",Error Name
4,https://cir.nii.ac.jp/crid/1130000794849657216...,More than they bargain for : Toba Batak wood c...,,,"[carvers western travellers, travellers utopic...",Error Name
5,https://cir.nii.ac.jp/crid/1130000794848301824...,The lost children,,,"[lost children, lost, children]",Error Name
...,...,...,...,...,...,...
37085,https://cir.nii.ac.jp/crid/1390282681257558272...,CLINICAL EXPERIENCE WITH SISOMICIN IN COMPLICA...,10.11250/chemotherapy1953.26.supplement3_285,1. Sisomicin was applied to 21 patients with c...,"[infections sisomicin applied, clinical experi...",Error Name
37086,https://cir.nii.ac.jp/crid/1390282681266829696...,Synthesis of New Organic Crosslinking Reagents...,10.1295/polymj.25.379,New organic crosslinking reagents 1 containing...,"[organic crosslinking reagents, new organic cr...",Error Name
37087,https://cir.nii.ac.jp/crid/1390282681255552640...,Effect of aztreonam on platelet aggregation an...,10.11250/chemotherapy1953.39.247,"In recent years, there have been reports of he...","[elderly patients azt, patients azt administer...",Error Name
37088,https://cir.nii.ac.jp/crid/1390282681258421120...,PHARMACOKINETICS OF MT-141,10.11250/chemotherapy1953.32.supplement5_76,MT-141 was intravenously infused to rabbits at...,"[pharmacokinetics mt 141, mt 141 kidney, kidne...",Error Name
