In [1]:
# September 2025
# Script to preprocess data from wos and ce database
# Violeta Berdejo-Espinola

In [2]:
#%pip install polars _FastText numpy fastexcel

import polars as pl

In [3]:
# function to normalize text

import unicodedata

def normalize_text(text):
    text = text.strip().replace("\n", "")
    text = text.replace("\n", "")
    return unicodedata.normalize('NFKC', text)
# @params: (form unistr) --> convert strings to one of the four Unicode normalization forms NFC, NFD, NFKC, NFKD)

# read repo data

In [4]:
# read repo data

df_repo = pl.read_csv('../data/from_majom/majom_japanese_040725.csv')
print(f'articles in repo raw: {len(df_repo)}')

# rename columns journal name

df_repo = df_repo.rename({"name": "journal",
                "pub_year": "year"
                })

# normalize journal name, article title, and abastract

df_repo = df_repo.with_columns(
    pl.col('title_en')
    .map_elements(normalize_text, return_dtype=pl.Utf8).alias("title_en"),
    pl.col('title_ja')
    .map_elements(normalize_text, return_dtype=pl.Utf8).alias("title_ja"),
    pl.col('abstract_en')
    .map_elements(normalize_text, return_dtype=pl.Utf8).alias('abstract_en'),
    pl.col('abstract_ja')
    .map_elements(normalize_text, return_dtype=pl.Utf8).alias('abstract_ja'),
    pl.col("journal")
    .map_elements(normalize_text, return_dtype=pl.Utf8).alias("journal")
)

articles in repo raw: 7713


# detect language of titles in repo 


In [5]:
from fasttext.FastText import _FastText

model_path = 'lid.176.ftz'
model = _FastText(model_path=model_path)

# function to detect languages

def get_lang(text: str) -> str:
    lang, _ = model.predict(text)
    lang = lang[0].removeprefix('__label__')
    # conf = conf[0]

    return lang

# detect languages

df_repo = df_repo.with_columns(
    pl.col("title_ja")
    .map_elements(get_lang, return_dtype=pl.Utf8).alias("language"),
    pl.col("abstract_ja")
    .map_elements(get_lang, return_dtype=pl.Utf8).alias("language_abs")
)
print(f'articles in repo: {len(df_repo)}')

languages = df_repo["language"].value_counts()

# filter japanese only

df_other_lang_title = df_repo.filter(
    pl.col('language') != 'ja')

df_other_lang_abstract = df_repo.filter(
    pl.col('language_abs') != 'ja')

df_repo = df_repo.filter(
    pl.col('language') == 'ja')

df_repo = df_repo.filter(
    pl.col('language_abs') == 'ja')

print(f'articles in repo japanese only: {len(df_repo)}')
print(f'non-japanese articles removed from repo: {len(df_other_lang_title)}')
print(f'non-japanese articles removed from repo: {len(df_other_lang_abstract)}')

articles in repo: 7713
articles in repo japanese only: 5321
non-japanese articles removed from repo: 219
non-japanese articles removed from repo: 2246


# remove unwanted articles 

In [6]:
df_repo = df_repo.filter(
    ~pl.col("title_en").str.contains("In Memoriam|Editorial|Correction|Notes|Untitled|Retracted"))

print(f'articles in repo: {len(df_repo)} -> editorials, in memoriam removed')

# drop duplicates and nas

df_repo_dupes_en = df_repo.filter(
    pl.col("title_en").is_duplicated())
df_repo_dupes_ja = df_repo.filter(
    pl.col("title_ja").is_duplicated())

df_repo = df_repo.unique(subset=['title_ja'])
df_repo = df_repo.unique(subset=['title_en'])

# print(f'NAs in repo:\n{df_repo.isna().sum()}')
print(f'articles in repo with english titles dupes: {len(df_repo_dupes_en)}')
print(f'articles in repo with japanese titles dupes: {len(df_repo_dupes_ja)}')
print(f'articles in repo no dupes: {len(df_repo)}')

articles in repo: 5305 -> editorials, in memoriam removed
articles in repo with english titles dupes: 37
articles in repo with japanese titles dupes: 40
articles in repo no dupes: 5278


# read pos data

In [7]:
# translate data 

df_pos = pl.read_excel('../data/from_translate/Master_Japanese copy.xlsx')
print(f'articles in positives raw: {len(df_pos)}')

# rename columns

df_pos = df_pos.rename({
    'Title - non-English language':'title_ja',
    'Title - English': 'title_en',
    'Abstract - non-English': 'abstract_ja',
    'Journal':'journal',
    'Language': 'language',
    'Year': 'year'
})

# normalize journal name, article title, abstract
 
df_pos = df_pos.with_columns(
    pl.col('title_ja')
    .map_elements(normalize_text, return_dtype=pl.Utf8),
    pl.col('title_en')
    .map_elements(normalize_text, return_dtype=pl.Utf8),
    pl.col('abstract_ja')
    .map_elements(normalize_text, return_dtype=pl.Utf8),
    pl.col('journal')
    .map_elements(normalize_text, return_dtype=pl.Utf8)
)

# subset metadata

df_pos = df_pos.select(
    pl.col(['title_ja', 'title_en', 'abstract_ja', 'journal', 'year']
           )
)

# drop duplicates and nas/empty strings

df_pos_dupes = df_pos.filter(
    pl.col("title_en").is_duplicated()
)

df_pos = df_pos.unique(subset=['title_ja'])
df_pos = df_pos.unique(subset=['title_en'])

df_empty_strings = df_pos.filter(
    pl.col("abstract_ja") == "")

# df_pos = df_pos.filter(
#     pl.col("abstract_ja") != "")

print(f"articles in positive dupes: {len(df_pos_dupes)}")
print(f'articles in positives no dupes: {len(df_pos)}')
print(f"articles with no abstract: {len(df_empty_strings)}")

articles in positives raw: 326
articles in positive dupes: 11
articles in positives no dupes: 316
articles with no abstract: 130


# detect language of titles in pos and repo



In [8]:
# remove or replace newlines with spaces -> lang detector returns error otherwise

df_pos = df_pos.with_columns(
    pl.col('title_ja')
    .str.replace_all(r"\n", " ")  
)

# detect languages

df_pos = df_pos.with_columns(
    pl.col('title_ja')
    .map_elements(get_lang, return_dtype=pl.Utf8)
    .alias('language')
)

# filter japanese only

df_pos_other_lang = df_pos.filter(
    pl.col('language') != 'ja')

df_pos = df_pos.filter(
    pl.col('language') == 'ja')

df_pos = df_pos.filter(
    pl.col('language') == 'ja')

print(f'japanese pos articles: {len(df_pos)}')
print(f'non-japanese articles removed from pos: {len(df_pos_other_lang)}')

japanese pos articles: 307
non-japanese articles removed from pos: 9


In [9]:
df_pos["abstract_ja"].null_count()

4

# restrict year range
documents in repo outside year range

In [10]:
df_pos_journal = df_pos["journal"].unique()
print(f"journals in pos: {len(df_pos_journal)}")

journals in pos: 19


In [11]:
# journal years in repo 

df_repo_journals = df_repo.group_by(
    pl.col("journal")).agg(by=pl.col("year")
                           .unique()
                           .sort()
                           .cast(pl.Int64)
                           )
    
df_repo_journals = df_repo_journals.explode("by")

# apply the filter dynamically

df_pos_filtered = df_pos.join(df_repo_journals , left_on=["year","journal"], right_on=["by", "journal"], how='semi')

df_repo_filtered = df_repo.join(df_repo_journals , left_on=["year","journal"], right_on=["by", "journal"], how='semi')

print(len(df_repo_filtered), len(df_pos_filtered), len(df_pos))

5278 108 307


# titles in pos that are in repo

In [12]:
df_pos = df_pos.with_columns(
    pl.col('title_en')
    .is_in(df_repo_filtered['title_en'])
    .alias("exists_in_repo")
    )

df_pos["exists_in_repo"].value_counts()

exists_in_repo,count
bool,u32
True,76
False,231


In [13]:
# save to disk 
df_pos.write_csv('../data/outputs_pre-processing/pos_jap_pre-processed.csv')
df_pos_dupes.write_csv('../data/outputs_pre-processing/pos_jap_duplicates_removed.csv')
df_pos_other_lang.write_csv('../data/outputs_pre-processing/pos_jap_nonjap_removed.csv')
df_empty_strings.write_csv('../data/outputs_pre-processing/pos_jap_empty_abstract.csv')

df_repo.write_csv('../data/outputs_pre-processing/repo_jap_pre-processed.csv')
df_repo_filtered.write_csv('../data/outputs_pre-processing/repo_jap_pre-processed_year_range_ok.csv') 
df_repo_dupes_en.write_csv('../data/outputs_pre-processing/repo_jap_en_duplicates_removed.csv')
df_repo_dupes_ja.write_csv('../data/outputs_pre-processing/repo_jap_ja_duplicates_removed.csv') 
df_other_lang_abstract.write_csv("../data/outputs_pre-processing/repo_jap_nonjap_abs_removed.csv")
df_other_lang_title.write_csv("../data/outputs_pre-processing/repo_jap_nonjap_title_removed.csv")