In [7]:
import pandas as pd
import spacy
import os
import ast
import gc
import time

# Libraries for language detection. Avoid warnings
from langdetect import detect, DetectorFactory
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='langdetect')
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

# Ensure consistent language detection results across runs
DetectorFactory.seed = 0

In [8]:
# Load spaCy models

nlp_models = {}
nlp_models['en'] = spacy.load("en_core_web_lg")
nlp_models['es'] = spacy.load("es_core_news_lg")
nlp_models['fr'] = spacy.load("fr_core_news_lg")
nlp_models['de'] = spacy.load("de_core_news_lg")
nlp_models['it'] = spacy.load("it_core_news_lg")
nlp_models['ja'] = spacy.load("fr_core_news_lg")
nlp_models['pl'] = spacy.load("pl_core_news_lg")
nlp_models['pt'] = spacy.load("pt_core_news_lg")
nlp_models['ru'] = spacy.load("ru_core_news_lg")
nlp_models['sv'] = spacy.load("sv_core_news_lg")
nlp_models['undetermined'] = spacy.load("xx_ent_wiki_sm")

In [9]:
# Input directory (dir. with csvs containing parsed articles)
DF_input = input().strip()

 C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\parsedXMLs_update_2025-11\


In [10]:
list_csvs = []

for file in os.listdir(DF_input):
    if file[-4:] == ".csv":
        list_csvs.append(file)

n_csvs = len(list_csvs)

In [16]:
# Extract the affiliation of the last author (or, if that is not available, of any author) of each article
# Then, do NER on the affiliation, add result to dataframe and save it
no_affiliation = []
parsed_csvs = []
if os.path.exists(DF_input+"csv files with NER.txt"):
    with open(DF_input+"csv files with NER.txt", "r") as f:
        for line in f:
            parsed_csvs.append(line[:-1])
    
for csv in list_csvs:
    if csv not in parsed_csvs: 
        start = time.time()
        NER_input = {} # PMIDs used as keys, affiliation for this PMID as values
        NER_lastAuthor = {} # PMIDs used as keys, NER of affiliation as values
        # Part 1: Extract affiliation of last (or, if not available, any other) author of each article
        print(f"Extracting affiliation of articles in csv: {csv} ({str(list_csvs.index(csv)+1)}/{str(n_csvs)})")
        df= pd.read_csv(DF_input + csv)
        for i in range(df.index[-1] + 1): # Iterate through all rows, including the last one
            try:
                authors_data = ast.literal_eval(df.at[i, "Authors"])
                last_author = authors_data[-1]
        
                affiliation_found = False
                if "Affiliation" in last_author and last_author["Affiliation"]:
                    # If the last author has an "Affiliation" key and it's not empty,
                    # take the first one. You can modify this to take any specific one
                    # or iterate if you have a preference.
                    NER_input[df.at[i,"PMID"]] = last_author["Affiliation"][0]
                    affiliation_found = True
                else:
                    # If the last author doesn't have an "Affiliation" key or it's empty,
                    # try to find any affiliation within their entry.
                    for el in authors_data:
                        if el["Affiliation"] is not None and len(el["Affiliation"]) != 0:
                            NER_input[df.at[i,"PMID"]] = el["Affiliation"][0]
                            affiliation_found = True
                            break # Stop after finding the affiliation for any author                    
        
                if not affiliation_found:
                    NER_input[df.at[i,"PMID"]] = None
                    no_affiliation.append(df.iat[i, 1])
        
            except (SyntaxError, ValueError, IndexError, KeyError):
                # Handle cases where ast.literal_eval fails, or index is out of bounds,
                # or 'Affiliation' key is not found in the expected structure.
                NER_input[df.at[i,"PMID"]] = None
                no_affiliation.append(df.iat[i, 1])
    
        # Part 2: Do NER on the affiliation to extract structured info
        print(f"Doing NER of articles in csv: {csv} ({str(list_csvs.index(csv)+1)}/{str(n_csvs)})")
        for article in list(NER_input.keys()):
            # Default language in affiliation text
            detected_lang = "undetermined"
    
            # Attempt to detect affiliation text language
            try:
                detected_lang = detect(NER_input[article])
            except Exception as e:
                detected_lang = "undetermined"
    
            # Use the model for the detected language (or a multilingual model if language not detected or no model available for the detected language)
            # Do NER with this model on the affiliation
            try:
                # Load model
                nlp = nlp_models.get(detected_lang)

                if nlp == None:
                    nlp = nlp_models.get("undetermined")
    
                # Load affiliation text, do NER
                doc = nlp(NER_input[article])
                sentence_entities = []
    
                # Extract entities of interest
                for ent in doc.ents:
                    if ent.label_ in ["ORG", "LOC", "GPE"]:
                        sentence_entities.append({"text": ent.text, "label": ent.label_})
    
                # Save the entities associated to the article DOI
                NER_lastAuthor[article] = ({"entities": sentence_entities})
    
            # If NER not possible, store an empty dict
            except ValueError:
                NER_lastAuthor[article] = {}
                
        # Create df from dictionary
        df_NER_lastAuthor = pd.DataFrame.from_dict(NER_lastAuthor, orient="index")
        df_NER_lastAuthor = df_NER_lastAuthor.rename(columns={"entities": "NER_lastAuthor"})
        df_NER_lastAuthor["PMID_NER"] = df_NER_lastAuthor.index
    
        # Merge with original df, save
        df_save = pd.merge(df, df_NER_lastAuthor, left_on= "PMID", right_on="PMID_NER", how="left")
        df_save = df_save.drop(columns=["PMID_NER"])
        df_save.to_csv(DF_input+csv, index=False)
        with open(DF_input+"csv files with NER.txt", "a") as f:
            f.write(csv+"\n")
        parsed_csvs.append(csv)
        del df, df_save, df_NER_lastAuthor
        gc.collect()
        print(f"--Processing time: {str(round(time.time() - start, 2))} s")

Extracting affiliation of articles in csv: parsedXMLs_update_2025_66400.csv (1/1)


  df= pd.read_csv(DF_input + csv)


Doing NER of articles in csv: parsedXMLs_update_2025_66400.csv (1/1)
--Processing time: 632.99 s
