In [2]:
import ast
import gc
import os
import time
import warnings

import pandas as pd
import spacy

# Libraries for language detection. Avoid warnings
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
# Silence UserWarning messages whose originating module matches 'langdetect'
warnings.filterwarnings("ignore", category=UserWarning, module='langdetect')

# Ensure consistent language detection results across runs
# (a fixed seed is set on the detector factory before any detect() call)
DetectorFactory.seed = 0

In [3]:
# Load spaCy models
#
# `nlp_models` maps a language code (as returned by langdetect) to the loaded
# spaCy pipeline used for NER on text in that language. The special key
# 'undetermined' holds the multilingual model, used as a fallback when the
# language is unknown or has no dedicated model.
SPACY_MODEL_NAMES = {
    'en': "en_core_web_lg",
    'es': "es_core_news_lg",
    'fr': "fr_core_news_lg",
    'de': "de_core_news_lg",
    'it': "it_core_news_lg",
    # Fixed: 'ja' previously loaded the French model ("fr_core_news_lg"), so
    # Japanese text would have been tagged by a French pipeline.
    'ja': "ja_core_news_lg",
    'pl': "pl_core_news_lg",
    'pt': "pt_core_news_lg",
    'ru': "ru_core_news_lg",
    'sv': "sv_core_news_lg",
    'undetermined': "xx_ent_wiki_sm",
}

nlp_models = {}
for lang_code, model_name in SPACY_MODEL_NAMES.items():
    nlp_models[lang_code] = spacy.load(model_name)

In [4]:
# Input directory (dir. with csvs containing parsed articles)
# Later cells build file paths by plain concatenation (DF_input + filename),
# so make sure the directory ends with exactly one path separator even when
# the user types it without one. os.path.join(path, "") appends the separator
# only if it is missing.
DF_input = input().strip()
DF_input = os.path.join(DF_input, "")

   C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\parsedXMLs\


In [5]:
# Names of all entries in the input directory whose name ends in ".csv",
# in the order os.listdir returns them, plus their count for progress messages.
list_csvs = [entry for entry in os.listdir(DF_input) if entry.endswith(".csv")]

n_csvs = len(list_csvs)

In [10]:
# Extract the affiliation of the last author (or, if that is not available, of any author) of each article
# Then, do NER on the affiliation, add result to dataframe and save it
#
# Each csv is processed once: its name is appended to a progress log after it
# has been saved, and csvs already listed in the log are skipped on re-runs.

# Position of the column holding the authors structure (a Python-literal string).
# NOTE(review): positional access is kept from the original code; presumably a
# list of dicts with an "Affiliation" list per author -- confirm against the parser.
AUTHORS_COLUMN_POSITION = 6
PMID_COLUMN = "PMID"
NER_COLUMN = "NER_lastAuthor"
ENTITY_LABELS = ("ORG", "LOC", "GPE")
NER_LOG_PATH = os.path.join(DF_input, "csv files with NER.txt")


def _first_affiliation(authors_cell):
    """Return the affiliation to run NER on for one article, or None.

    The first affiliation of the last author is preferred. If the last author
    has none, the first affiliation of the first author that has one is used.
    Returns None when the cell cannot be parsed, has an unexpected structure,
    or no author carries an affiliation.
    """
    try:
        authors_data = ast.literal_eval(authors_cell)
        last_author = authors_data[-1]
        if last_author.get("Affiliation"):
            return last_author["Affiliation"][0]
        for author in authors_data:
            # .get() so that one author without the key does not abort the
            # search over the remaining authors.
            if author.get("Affiliation"):
                return author["Affiliation"][0]
    except (SyntaxError, ValueError, IndexError, KeyError, TypeError, AttributeError):
        # literal_eval failed (e.g. missing value) or the parsed object is not
        # a non-empty sequence of dict-like author entries.
        return None
    return None


def _detect_language(text):
    """Return the langdetect language code for `text`, or "undetermined" if detection fails."""
    try:
        return detect(text)
    except LangDetectException:
        return "undetermined"


def _extract_entities(text):
    """Run NER on `text` and return the ORG/LOC/GPE entities as a list of dicts.

    The model for the detected language is used; the multilingual model is the
    fallback when the language is undetermined or has no dedicated model.
    Exceptions raised by the spaCy pipeline propagate to the caller.
    """
    nlp = nlp_models.get(_detect_language(text), nlp_models["undetermined"])
    return [
        {"text": ent.text, "label": ent.label_}
        for ent in nlp(text).ents
        if ent.label_ in ENTITY_LABELS
    ]


no_affiliation = []  # PMIDs for which no affiliation could be extracted (all csvs)
parsed_csvs = []     # csv file names already processed in a previous run

# The log does not exist before the first csv has been processed.
if os.path.exists(NER_LOG_PATH):
    with open(NER_LOG_PATH, "r") as f:
        # rstrip instead of line[:-1]: a last line without a trailing newline
        # must not lose its final character.
        parsed_csvs = [line.rstrip("\n") for line in f if line.strip()]

for csv_number, csv in enumerate(list_csvs, start=1):
    if csv in parsed_csvs:
        continue

    start = time.time()
    progress = " (" + str(csv_number) + "/" + str(n_csvs) + ")"
    csv_path = os.path.join(DF_input, csv)

    # Part 1: Extract affiliation of last (or, if not available, any other) author of each article
    print("Extracting affiliation of articles in csv: " + csv + progress)
    # NOTE(review): the recorded run echoes this read_csv call as a warning for
    # many files -- presumably pandas' mixed-dtype warning; consider passing
    # explicit dtypes once the column schema is confirmed.
    df = pd.read_csv(csv_path)

    NER_input = {}  # PMIDs used as keys, affiliation for this PMID as values
    for pmid, authors_cell in zip(df[PMID_COLUMN], df.iloc[:, AUTHORS_COLUMN_POSITION]):
        affiliation = _first_affiliation(authors_cell)
        NER_input[pmid] = affiliation
        if affiliation is None:
            no_affiliation.append(pmid)

    # Part 2: Do NER on the affiliation to extract structured info
    print("Doing NER of articles in csv: " + csv + progress)
    NER_lastAuthor = {}  # PMIDs used as keys, NER of affiliation as values
    for article, affiliation in NER_input.items():
        # Nothing to analyse for missing, non-text or blank affiliations
        if not isinstance(affiliation, str) or not affiliation.strip():
            continue
        try:
            # Save the entities associated to the article PMID
            NER_lastAuthor[article] = {"entities": _extract_entities(affiliation)}
        except ValueError:
            # If NER not possible, store an empty dict
            NER_lastAuthor[article] = {}

    # Attach the NER result to the original rows by PMID. Rows without a result
    # get a missing value. Any NER column left by an interrupted earlier run is
    # replaced rather than duplicated.
    ner_by_pmid = {pmid: result.get("entities") for pmid, result in NER_lastAuthor.items()}
    df_save = df.drop(columns=[NER_COLUMN], errors="ignore")
    df_save[NER_COLUMN] = df_save[PMID_COLUMN].map(ner_by_pmid)

    # Write to a temporary file first and then swap it in, so that a crash
    # while writing cannot leave a truncated input csv behind.
    tmp_path = csv_path + ".tmp"
    df_save.to_csv(tmp_path, index=False)
    os.replace(tmp_path, csv_path)

    # Record the csv as done only after it has been saved successfully
    with open(NER_LOG_PATH, "a") as f:
        f.write(csv + "\n")
    parsed_csvs.append(csv)

    del df, df_save
    gc.collect()
    print("--Processing time: " + str(round(time.time() - start, 2)) + " s")

Extracting affiliation of articles in csv: parsedX_1100000.csv (3/45)
Doing NER of articles in csv: parsedX_1100000.csv (3/45)
--Processing time: 825.37 s
Extracting affiliation of articles in csv: parsedX_1200000.csv (4/45)
Doing NER of articles in csv: parsedX_1200000.csv (4/45)
--Processing time: 835.34 s
Extracting affiliation of articles in csv: parsedX_1300000.csv (5/45)
Doing NER of articles in csv: parsedX_1300000.csv (5/45)
--Processing time: 845.69 s
Extracting affiliation of articles in csv: parsedX_1400000.csv (6/45)
Doing NER of articles in csv: parsedX_1400000.csv (6/45)
--Processing time: 839.83 s
Extracting affiliation of articles in csv: parsedX_1500000.csv (7/45)
Doing NER of articles in csv: parsedX_1500000.csv (7/45)
--Processing time: 845.18 s
Extracting affiliation of articles in csv: parsedX_1600000.csv (8/45)
Doing NER of articles in csv: parsedX_1600000.csv (8/45)
--Processing time: 840.93 s
Extracting affiliation of articles in csv: parsedX_1700000.csv (9/45)


  df= pd.read_csv(DF_input + csv)


Doing NER of articles in csv: parsedX_1800000.csv (10/45)
--Processing time: 842.72 s
Extracting affiliation of articles in csv: parsedX_1900000.csv (11/45)


  df= pd.read_csv(DF_input + csv)


Doing NER of articles in csv: parsedX_1900000.csv (11/45)
--Processing time: 845.77 s
Extracting affiliation of articles in csv: parsedX_200000.csv (12/45)
Doing NER of articles in csv: parsedX_200000.csv (12/45)
--Processing time: 636.2 s
Extracting affiliation of articles in csv: parsedX_2000000.csv (13/45)
Doing NER of articles in csv: parsedX_2000000.csv (13/45)
--Processing time: 852.09 s
Extracting affiliation of articles in csv: parsedX_2100000.csv (14/45)
Doing NER of articles in csv: parsedX_2100000.csv (14/45)
--Processing time: 846.56 s
Extracting affiliation of articles in csv: parsedX_2200000.csv (15/45)


  df= pd.read_csv(DF_input + csv)


Doing NER of articles in csv: parsedX_2200000.csv (15/45)
--Processing time: 863.21 s
Extracting affiliation of articles in csv: parsedX_2300000.csv (16/45)
Doing NER of articles in csv: parsedX_2300000.csv (16/45)
--Processing time: 968.37 s
Extracting affiliation of articles in csv: parsedX_2400000.csv (17/45)


  df= pd.read_csv(DF_input + csv)


Doing NER of articles in csv: parsedX_2400000.csv (17/45)
--Processing time: 993.24 s
Extracting affiliation of articles in csv: parsedX_2500000.csv (18/45)


  df= pd.read_csv(DF_input + csv)


Doing NER of articles in csv: parsedX_2500000.csv (18/45)
--Processing time: 986.72 s
Extracting affiliation of articles in csv: parsedX_2600000.csv (19/45)


  df= pd.read_csv(DF_input + csv)


Doing NER of articles in csv: parsedX_2600000.csv (19/45)
--Processing time: 928.26 s
Extracting affiliation of articles in csv: parsedX_2700000.csv (20/45)


  df= pd.read_csv(DF_input + csv)


Doing NER of articles in csv: parsedX_2700000.csv (20/45)
--Processing time: 929.62 s
Extracting affiliation of articles in csv: parsedX_2800000.csv (21/45)


  df= pd.read_csv(DF_input + csv)


Doing NER of articles in csv: parsedX_2800000.csv (21/45)
--Processing time: 940.96 s
Extracting affiliation of articles in csv: parsedX_2900000.csv (22/45)
Doing NER of articles in csv: parsedX_2900000.csv (22/45)
--Processing time: 944.47 s
Extracting affiliation of articles in csv: parsedX_300000.csv (23/45)
Doing NER of articles in csv: parsedX_300000.csv (23/45)
--Processing time: 449.74 s
Extracting affiliation of articles in csv: parsedX_3000000.csv (24/45)


  df= pd.read_csv(DF_input + csv)


Doing NER of articles in csv: parsedX_3000000.csv (24/45)
--Processing time: 928.53 s
Extracting affiliation of articles in csv: parsedX_3100000.csv (25/45)


  df= pd.read_csv(DF_input + csv)


Doing NER of articles in csv: parsedX_3100000.csv (25/45)
--Processing time: 930.7 s
Extracting affiliation of articles in csv: parsedX_3200000.csv (26/45)


  df= pd.read_csv(DF_input + csv)


Doing NER of articles in csv: parsedX_3200000.csv (26/45)
--Processing time: 948.77 s
Extracting affiliation of articles in csv: parsedX_3300000.csv (27/45)


  df= pd.read_csv(DF_input + csv)


Doing NER of articles in csv: parsedX_3300000.csv (27/45)
--Processing time: 988.94 s
Extracting affiliation of articles in csv: parsedX_3400000.csv (28/45)


  df= pd.read_csv(DF_input + csv)


Doing NER of articles in csv: parsedX_3400000.csv (28/45)
--Processing time: 997.5 s
Extracting affiliation of articles in csv: parsedX_3500000.csv (29/45)
Doing NER of articles in csv: parsedX_3500000.csv (29/45)
--Processing time: 1038.01 s
Extracting affiliation of articles in csv: parsedX_3600000.csv (30/45)


  df= pd.read_csv(DF_input + csv)


Doing NER of articles in csv: parsedX_3600000.csv (30/45)
--Processing time: 952.86 s
Extracting affiliation of articles in csv: parsedX_3700000.csv (31/45)
Doing NER of articles in csv: parsedX_3700000.csv (31/45)
--Processing time: 948.55 s
Extracting affiliation of articles in csv: parsedX_3800000.csv (32/45)


  df= pd.read_csv(DF_input + csv)


Doing NER of articles in csv: parsedX_3800000.csv (32/45)
--Processing time: 943.98 s
Extracting affiliation of articles in csv: parsedX_3900000.csv (33/45)


  df= pd.read_csv(DF_input + csv)


Doing NER of articles in csv: parsedX_3900000.csv (33/45)
--Processing time: 1010.38 s
Extracting affiliation of articles in csv: parsedX_400000.csv (34/45)
Doing NER of articles in csv: parsedX_400000.csv (34/45)
--Processing time: 87.3 s
Extracting affiliation of articles in csv: parsedX_4000000.csv (35/45)
Doing NER of articles in csv: parsedX_4000000.csv (35/45)
--Processing time: 1015.67 s
Extracting affiliation of articles in csv: parsedX_4100000.csv (36/45)


  df= pd.read_csv(DF_input + csv)


Doing NER of articles in csv: parsedX_4100000.csv (36/45)
--Processing time: 1019.86 s
Extracting affiliation of articles in csv: parsedX_4200000.csv (37/45)


  df= pd.read_csv(DF_input + csv)


Doing NER of articles in csv: parsedX_4200000.csv (37/45)
--Processing time: 1017.14 s
Extracting affiliation of articles in csv: parsedX_4300000.csv (38/45)


  df= pd.read_csv(DF_input + csv)


Doing NER of articles in csv: parsedX_4300000.csv (38/45)
--Processing time: 1011.17 s
Extracting affiliation of articles in csv: parsedX_4400000.csv (39/45)
Doing NER of articles in csv: parsedX_4400000.csv (39/45)
--Processing time: 1016.62 s
Extracting affiliation of articles in csv: parsedX_4454000.csv (40/45)
Doing NER of articles in csv: parsedX_4454000.csv (40/45)
--Processing time: 549.12 s
Extracting affiliation of articles in csv: parsedX_500000.csv (41/45)
Doing NER of articles in csv: parsedX_500000.csv (41/45)
--Processing time: 804.04 s
Extracting affiliation of articles in csv: parsedX_600000.csv (42/45)
Doing NER of articles in csv: parsedX_600000.csv (42/45)
--Processing time: 826.38 s
Extracting affiliation of articles in csv: parsedX_700000.csv (43/45)
Doing NER of articles in csv: parsedX_700000.csv (43/45)
--Processing time: 843.87 s
Extracting affiliation of articles in csv: parsedX_800000.csv (44/45)
Doing NER of articles in csv: parsedX_800000.csv (44/45)
--Proc