In [34]:
import pandas as pd
import spacy
import os
import ast

# Libraries for language detection. Avoid warnings
from langdetect import detect, DetectorFactory
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='langdetect')

# Ensure consistent language detection results across runs
DetectorFactory.seed = 0

In [113]:
# LoadspaCy models

nlp_models = {}
nlp_models['en'] = spacy.load("en_core_web_lg")
nlp_models['es'] = spacy.load("es_core_news_lg")
nlp_models['fr'] = spacy.load("fr_core_news_lg")
nlp_models['de'] = spacy.load("de_core_news_lg")
nlp_models['it'] = spacy.load("it_core_news_lg")
nlp_models['ja'] = spacy.load("fr_core_news_lg")
nlp_models['pl'] = spacy.load("pl_core_news_lg")
nlp_models['pt'] = spacy.load("pt_core_news_lg")
nlp_models['ru'] = spacy.load("ru_core_news_lg")
nlp_models['sv'] = spacy.load("sv_core_news_lg")
nlp_models['undetermined'] = spacy.load("xx_ent_wiki_sm")

In [116]:
# Trial sentences paper
df = pd.read_csv(os.path.join("C:/", "users", "svalb", "OneDrive", "Escritorio", "Data_40_years_cancer_studies", "parsedXMLs", "parsedX_4454000.csv"))

In [150]:
df.head(3)

Unnamed: 0,Type,PMID,DOI,Journal,Title,Abstract,Authors,MeshHeadings,Chemicals,PublicationTypes,PublicationDate,Language,Keywords,Book Accession,Publisher,BookTitle
0,Article,38728618,10.1200/JCO.24.00374,Journal of clinical oncology : official journa...,Clinical Trials-Real-World Data to Build a Fut...,,"[{'Name': 'Catherine C Fahey', 'Affiliation': ...","Humans, Clinical Trials as Topic, Neoplasms",,['Editorial'],2024.0,eng,,,,
1,Article,38728647,10.3855/jidc.17870,Journal of infection in developing countries,Serum β-klotho is a potential biomarker for th...,Hepatitis B virus (HBV) infection is a global ...,"[{'Name': 'Xin Miao', 'Affiliation': ['Departm...","Humans, Male, Female, Biomarkers, Middle Aged,...","['Biomarkers', 'Klotho Proteins', 'Glucuronida...","['Journal Article', ""Research Support, Non-U.S...",2024.0,eng,"['HBV', 'KLB', 'cirrhosis', 'hepatitis B virus...",,,
2,Article,38728649,10.3855/jidc.18374,Journal of infection in developing countries,Trends in typhoid fever during the COVID-19 pa...,Pakistan has been experiencing an extensively ...,"[{'Name': 'Carly Ching', 'Affiliation': ['Depa...","Typhoid Fever, Pakistan, Humans, COVID-19, Sal...",['Anti-Bacterial Agents'],['Journal Article'],2024.0,eng,"['COVID-19', 'drug resistance', 'typhoid']",,,


In [154]:
# Extract the affiliation of the last author of each article
NER_input = {}
no_affiliation = []
for i in range(df.index[-1] + 1): # Iterate through all rows, including the last one
    try:
        authors_data = ast.literal_eval(df.iat[i, 6])
        last_author = authors_data[-1]

        affiliation_found = False
        if "Affiliation" in last_author and last_author["Affiliation"]:
            # If the last author has an "Affiliation" key and it's not empty,
            # take the first one. You can modify this to take any specific one
            # or iterate if you have a preference.
            NER_input[df.iat[i,1].item()] = last_author["Affiliation"][0]
            affiliation_found = True
        else:
            # If the last author doesn't have an "Affiliation" key or it's empty,
            # try to find any affiliation within their entry.
            # This part assumes "Affiliation" is always a list of strings if present.
            for key, value in last_author.items():
                if key == "Affiliation" and isinstance(value, list) and value:
                    NER_input[df.iat[i,1].item()] = value[0] # Taking the first one found
                    affiliation_found = True
                    break # Stop after finding the first affiliation for this author

        if not affiliation_found:
            NER_input[df.iat[i,1].item()] = None
            no_affiliation.append(df.iat[i, 1])

    except (SyntaxError, ValueError, IndexError, KeyError):
        # Handle cases where ast.literal_eval fails, or index is out of bounds,
        # or 'Affiliation' key is not found in the expected structure.
        no_affiliation.append(df.iat[i, 1])

In [155]:
NER_input

{38728618: 'National Cancer Institute, National Institutes of Health, Bethesda, MD.',
 38728647: 'Department of Hepatology, Chongqing University Three Gorges Hospital, Chongqing, China.',
 38728649: 'Shaukat Khanum Memorial Cancer Hospital and Research Centre, Lahore, Pakistan.',
 38728653: 'Department of Biosciences, School of Natural Sciences and Munich Institute of Biomedical Engineering, Technical University of Munich, Boltzmannstraße 11, 85748 Garching, Germany.',
 38728659: 'Biomedical Research Imaging Center, Department of Radiology, and UNC Lineberger Comprehensive Cancer Center, University of North Carolina-Chapel Hill, 125 Mason Farm Road, Marsico Hall, Chapel Hill, North Carolina 27599, United States.',
 38728674: 'Department of General Medicine, IQRAA International Hospital, Kozhikode, Kerala, India.',
 38728749: 'Department of Physiology and Pathophysiology, Rady College of Medicine, Max Rady Faculty of Health Sciences, The Institute of Cardiovascular Sciences, St. Bonifac

In [157]:
NER_lastAuthor = {}
for article in list(NER_input.keys()):
    detected_lang = "undetermined"
    try:
        detected_lang = detect(sentence)
    except Exception as e:
        continue

    try:
        
        nlp = nlp_models.get(detected_lang, nlp_models["undetermined"])

        doc = nlp(NER_input[article])
        sentence_entities = []

        for ent in doc.ents:
            if ent.label_ in ["ORG", "LOC", "GPE"]:
                sentence_entities.append({"text": ent.text, "label": ent.label_})
    
        NER_lastAuthor[article] = ({"entities": sentence_entities})

    except ValueError:
        NER_lastAuthor[article] = None

In [158]:
NER_input[38728653]

'Department of Biosciences, School of Natural Sciences and Munich Institute of Biomedical Engineering, Technical University of Munich, Boltzmannstraße 11, 85748 Garching, Germany.'

In [159]:
NER_lastAuthor[38728653]

{'entities': [{'text': 'Department of Biosciences', 'label': 'ORG'},
  {'text': 'School of Natural Sciences and Munich Institute of Biomedical Engineering',
   'label': 'ORG'},
  {'text': 'Technical University of Munich', 'label': 'ORG'},
  {'text': 'Boltzmannstraße', 'label': 'LOC'},
  {'text': 'Garching', 'label': 'LOC'},
  {'text': 'Germany', 'label': 'LOC'}]}