In [27]:
import os
import time
import ast
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import chain

# DType warning when importing .csv files with parsed PMIDs. Avoid showing the warning
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.DtypeWarning)
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [3]:
# Directory with the articles' dataset
DF_input = input().strip()

 C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\parsedXMLs_first_update_match_GLOBOCAN\


In [4]:
# CSV with the finished Globocan dataset
GLOBOCAN_input = input().strip()

 C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\resources\Globocan_dataset_ready.csv


In [5]:
# Import file names of csvs containint articles' data 
list_dfs_papers = []

for file in os.listdir(DF_input):
    if file[-4:] == ".csv":
        list_dfs_papers.append(file)

list_dfs_papers

['parsedXMLs_first_upd_100000.csv',
 'parsedXMLs_first_upd_200000.csv',
 'parsedXMLs_first_upd_300000.csv',
 'parsedXMLs_first_upd_340800.csv']

In [6]:
# Show cancer types in Globocan dataset
df_globocan = pd.read_csv(GLOBOCAN_input)

set(df_globocan["Cancer"].to_list())

{'Anal cancer',
 'Bladder cancer',
 'Brain cancer',
 'Breast cancer',
 'Cervical cancer',
 'Colon cancer',
 'Colorectal cancer',
 'Esophageal cancer',
 'Gallbladder cancer',
 'Hodgkin lymphoma',
 'Kaposi sarcoma',
 'Kidney cancer',
 'Laryngeal cancer',
 'Leukemia',
 'Liver cancer',
 'Lung cancer',
 'Mesothelioma',
 'Mouth cancer',
 'Multiple myeloma',
 'Non-Hodgkin lymphoma',
 'Ovarian cancer',
 'Pancreatic cancer',
 'Penile cancer',
 'Prostate cancer',
 'Rectal cancer',
 'Salivary gland cancer',
 'Skin cancer',
 'Stomach cancer',
 'Testicular cancer',
 'Throat cancer',
 'Thyroid cancer',
 'Uterine cancer',
 'Vaginal cancer',
 'Vulvar cancer'}

In [7]:
# Show country names in Globocan dataset
set(df_globocan["Country"].to_list())

{'Afghanistan',
 'Albania',
 'Algeria',
 'Angola',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei Darussalam',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Congo Republic',
 'Costa Rica',
 "Cote d'Ivoire",
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czechia',
 'DR Congo',
 'Denmark',
 'Djibouti',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Eritrea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'French Guiana',
 'French Polynesia',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Guadeloupe',
 'Guam',
 'Guatemala',
 'Guinea',
 'Guinea-Bissau',
 'Guyana',
 'Haiti',
 'Honduras',
 'Hungary',
 'Iceland',
 'Ind

In [8]:
## Extract country names in the articles' dataset
articles_dataset_countries_list = []

# Import each csv with articles' data and extract country names
for csv in tqdm(list_dfs_papers):
    print(f"Parsing csv: {csv}, {list_dfs_papers.index(csv)+1} / {len(list_dfs_papers)}")
    df = pd.read_csv(DF_input + csv)
    list_countries = list(set(df["Country"]))
    articles_dataset_countries_list.append(list_countries)
    del df, list_countries

# Flatten list, remove nan, convert to set
articles_dataset_countries = set([el for el in (chain(*articles_dataset_countries_list)) if str(el) != "nan"])

  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Parsing csv: parsedXMLs_first_upd_100000.csv, 1 / 4


 25%|█████████████████████                                                               | 1/4 [00:05<00:16,  5.47s/it]

Parsing csv: parsedXMLs_first_upd_200000.csv, 2 / 4


 50%|██████████████████████████████████████████                                          | 2/4 [00:10<00:10,  5.44s/it]

Parsing csv: parsedXMLs_first_upd_300000.csv, 3 / 4


 75%|███████████████████████████████████████████████████████████████                     | 3/4 [00:16<00:05,  5.45s/it]

Parsing csv: parsedXMLs_first_upd_340800.csv, 4 / 4


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:18<00:00,  4.63s/it]


In [9]:
# Unique entity names in the Globocan dataset
globocan_dataset_countries = set(df_globocan["Country"])

# Countries appearing in Globocan dataset and missing in articles' dataset
globocan_dataset_countries.difference(articles_dataset_countries)

{'Belize',
 'Cabo Verde',
 'Djibouti',
 'Guinea',
 'Guyana',
 'Laos',
 'Lesotho',
 'Mauritania',
 'Micronesia, Fed. Sts.',
 'New Caledonia',
 'Solomon Islands',
 'South Sudan',
 'Tajikistan',
 'Turkmenistan',
 'Vanuatu'}

No country in the Globocan dataset is absent from the articles' dataset

In [10]:
# Entities appearing in articles' dataset and missing in Globocan dataset
articles_dataset_countries.difference(globocan_dataset_countries)

{'Aruba'}

# Matching country names in Globocan and articles' datasets

There are some naming discrepancies in the articles's dataset with respect to the Globocan dataset (e.g. there is no specification of **Taiwan** or **Hong Kong** in the Globocan dataset, there is only **China**). Here, these discrepancies are solved. In the case just mentioned, for instance, country of articles with coutry of publishing correspoding to **Taiwan** or **Hong Kong** are renamed to **China**. Other renamings are performed so that the country names and corresponding political entities are as similar as possible in the Globocan and the articles' datasets, so that comparisons performed downstream are as accurate as possible

In [11]:
# Remove or rename the coountries appearing in articles' dataset and missing in Globocan dataset

for csv in tqdm(list_dfs_papers):
    print(f"Parsing csv: {csv}, {list_dfs_papers.index(csv)+1} / {len(list_dfs_papers)}")
    df = pd.read_csv(DF_input + csv)

    ## REMOVE ARTICLES FROM NON-MATCHING COUNTRIES
    # Remove articles with country name "Andorra" (no matching data in Globocan dataset)
    df.drop(df.loc[df["Country"] == "Andorra"].index, inplace = True)

    # Remove articles with country name "Tonga" (no matching data in Globocan dataset)
    df.drop(df.loc[df["Country"] == "Tonga"].index, inplace = True)

    # Remove articles with country name "St. Kitts and Nevis" (no matching data in Globocan dataset)
    df.drop(df.loc[df["Country"] == "St. Kitts and Nevis"].index, inplace = True)

    # Remove articles with country name "Liechtenstein" (no matching data in Globocan dataset)
    df.drop(df.loc[df["Country"] == "Liechtenstein"].index, inplace = True)

    # Remove articles with country name "St. Vincent and the Grenadines" (no matching data in Globocan dataset)
    df.drop(df.loc[df["Country"] == "St. Vincent and the Grenadines"].index, inplace = True)

    # Remove articles with country name "Antigua and Barbuda" (no matching data in Globocan dataset)
    df.drop(df.loc[df["Country"] == "Antigua and Barbuda"].index, inplace = True)

    # Remove articles with country name "Monaco" (no matching data in Globocan dataset)
    df.drop(df.loc[df["Country"] == "Monaco"].index, inplace = True)

    # Remove articles with country name "Tuvalu" (no matching data in Globocan dataset)
    df.drop(df.loc[df["Country"] == "Tuvalu"].index, inplace = True)

    # Remove articles with country name "Dominica" (no matching data in Globocan dataset)
    df.drop(df.loc[df["Country"] == "Dominica"].index, inplace = True)

    # Remove articles with country name "Palau" (no matching data in Globocan dataset)
    df.drop(df.loc[df["Country"] == "Palau"].index, inplace = True)

    # Remove articles with country name "Marshall Islands" (no matching data in Globocan dataset)
    df.drop(df.loc[df["Country"] == "Marshall Islands"].index, inplace = True)

    # Remove articles with country name "Grenada" (no matching data in Globocan dataset)
    df.drop(df.loc[df["Country"] == "Grenada"].index, inplace = True)

    # Remove articles with country name "Vatican" (no matching data in Globocan dataset)
    df.drop(df.loc[df["Country"] == "Vatican"].index, inplace = True)

    # Remove articles with country name "Vatican" (no matching data in Globocan dataset)
    df.drop(df.loc[df["Country"] == "Kosovo"].index, inplace = True)

    # Remove articles with country name "Seychelles" (no matching data in Globocan dataset)
    df.drop(df.loc[df["Country"] == "Seychelles"].index, inplace = True)

    # Remove articles with country name "San Marino" (no matching data in Globocan dataset)
    df.drop(df.loc[df["Country"] == "San Marino"].index, inplace = True)

    # Remove articles with country name "Andorra" (no matching data in Globocan dataset)
    df.drop(df.loc[df["Country"] == "Andorra"].index, inplace = True)

    ## RENAME COUNTRIES
    # Rename Country in articles with Country name "Curacao" to Netherlands
    df.loc[df["Country"] == "Curacao", "Country"] = "Netherlands"

    # Rename Country in articles with Country name "Isle of Man" to United Kingdom
    df.loc[df["Country"] == "Isle of Man", "Country"] = "United Kingdom"

    # Rename Country in articles with Country name "Aruba" to Netherlands
    df.loc[df["Country"] == "Aruba", "Country"] = "Netherlands"

    # Rename Country in articles with Country name "Bonaire, Saint Eustatius and Saba" to Netherlands
    df.loc[df["Country"] == "Bonaire, Saint Eustatius and Saba", "Country"] = "Netherlands"

    # Rename Country in articles with Country name "St. Pierre and Miquelon" to France
    df.loc[df["Country"] == "St. Pierre and Miquelon", "Country"] = "France"

    # Rename Country in articles with Country name "Jersey" to United Kingdom
    df.loc[df["Country"] == "Jersey", "Country"] = "United Kingdom"

    # Rename Country in articles with Country name "Hong Kong" to China
    df.loc[df["Country"] == "Hong Kong", "Country"] = "China"

    # Rename Country in articles with Country name "Bermuda" to United Kingdom
    df.loc[df["Country"] == "Bermuda", "Country"] = "United Kingdom"

    # Rename Country in articles with Country name "Faroe Islands" to Denmark
    df.loc[df["Country"] == "Faroe Islands", "Country"] = "Denmark"

    # Rename Country in articles with Country name "Guernsey" to United Kingdom
    df.loc[df["Country"] == "Guernsey", "Country"] = "United Kingdom"

    # Rename Country in articles with Country name "Anguilla" to United Kingdom
    df.loc[df["Country"] == "Anguilla", "Country"] = "United Kingdom"

    # Rename Country in articles with Country name "Greenland" to Denmark
    df.loc[df["Country"] == "Greenland", "Country"] = "Denmark"

    # Rename Country in articles with Country name "Northern Mariana Islands" to United States
    df.loc[df["Country"] == "Northern Mariana Islands", "Country"] = "United States"

    # Rename Country in articles with Country name "Cayman Islands" to United Kingdom
    df.loc[df["Country"] == "Cayman Islands", "Country"] = "United Kingdom"

    # Rename Country in articles with Country name "Macau" to China
    df.loc[df["Country"] == "Macau", "Country"] = "China"

    # Rename Country in articles with Country name "Taiwan" to China
    df.loc[df["Country"] == "Taiwan", "Country"] = "China"

    # Rename Country in articles with Country name "British Virgin Islands" to United Kingdom
    df.loc[df["Country"] == "British Virgin Islands", "Country"] = "United Kingdom"

    # Rename Country in articles with Country name "American Samoa" to United States
    df.loc[df["Country"] == "American Samoa", "Country"] = "United States"

    # Rename Country in articles with Country name "United States Virgin Islands" to United States
    df.loc[df["Country"] == "United States Virgin Islands", "Country"] = "United States"

    # Rename Country in articles with Country name "Sint Maarten" to Netherlands
    df.loc[df["Country"] == "Sint Maarten", "Country"] = "Netherlands"

    # Rename Country in articles with Country name "Mayotte" to France
    df.loc[df["Country"] == "Mayotte", "Country"] = "France"

    df.to_csv(DF_input + csv, index = False)
    del df

  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Parsing csv: parsedXMLs_first_upd_100000.csv, 1 / 4


 25%|█████████████████████                                                               | 1/4 [00:14<00:42, 14.14s/it]

Parsing csv: parsedXMLs_first_upd_200000.csv, 2 / 4


 50%|██████████████████████████████████████████                                          | 2/4 [00:27<00:27, 13.90s/it]

Parsing csv: parsedXMLs_first_upd_300000.csv, 3 / 4


 75%|███████████████████████████████████████████████████████████████                     | 3/4 [00:42<00:14, 14.42s/it]

Parsing csv: parsedXMLs_first_upd_340800.csv, 4 / 4


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:48<00:00, 12.23s/it]


In [12]:
# Remaining non-matching countries
## Extract country names in the articles' dataset
articles_dataset_countries_list = []

# Import each csv with articles' data and extract country names
for csv in tqdm(list_dfs_papers):
    print(f"Parsing csv: {csv}, {list_dfs_papers.index(csv)+1} / {len(list_dfs_papers)}")
    df = pd.read_csv(DF_input + csv)
    list_countries = list(set(df["Country"]))
    articles_dataset_countries_list.append(list_countries)
    del df, list_countries

# Flatten list, remove nan, convert to set
articles_dataset_countries = set([el for el in (chain(*articles_dataset_countries_list)) if str(el) != "nan"])

  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Parsing csv: parsedXMLs_first_upd_100000.csv, 1 / 4


 25%|█████████████████████                                                               | 1/4 [00:06<00:18,  6.07s/it]

Parsing csv: parsedXMLs_first_upd_200000.csv, 2 / 4


 50%|██████████████████████████████████████████                                          | 2/4 [00:11<00:11,  5.98s/it]

Parsing csv: parsedXMLs_first_upd_300000.csv, 3 / 4


 75%|███████████████████████████████████████████████████████████████                     | 3/4 [00:17<00:05,  5.94s/it]

Parsing csv: parsedXMLs_first_upd_340800.csv, 4 / 4


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:20<00:00,  5.03s/it]


In [13]:
# Countries appearing in articles' dataset and missing in Globocan dataset
articles_dataset_countries.difference(globocan_dataset_countries)

set()

No more non-matching countries between the Globocan and the articles' dataset

# Matching cancer names in Globocan and articles' datasets

There are also some naming discrepancies in the cancer names articles's dataset with respect to the Globocan dataset (e.g. the articles' dataset contains a category **Breast cancer** as well as a category **Ductal carcinoma**, which is a type of Breast cancer, whereas the Globocan dataset contains only **Breast cancer**). Here, cancer names/subtypes are homogeneized between the Globocan and the articles' dataset.

In [14]:
# Show cancer types available in the Globocan dataset
globocan_dataset_cancers = set(df_globocan["Cancer"].to_list())
globocan_dataset_cancers

{'Anal cancer',
 'Bladder cancer',
 'Brain cancer',
 'Breast cancer',
 'Cervical cancer',
 'Colon cancer',
 'Colorectal cancer',
 'Esophageal cancer',
 'Gallbladder cancer',
 'Hodgkin lymphoma',
 'Kaposi sarcoma',
 'Kidney cancer',
 'Laryngeal cancer',
 'Leukemia',
 'Liver cancer',
 'Lung cancer',
 'Mesothelioma',
 'Mouth cancer',
 'Multiple myeloma',
 'Non-Hodgkin lymphoma',
 'Ovarian cancer',
 'Pancreatic cancer',
 'Penile cancer',
 'Prostate cancer',
 'Rectal cancer',
 'Salivary gland cancer',
 'Skin cancer',
 'Stomach cancer',
 'Testicular cancer',
 'Throat cancer',
 'Thyroid cancer',
 'Uterine cancer',
 'Vaginal cancer',
 'Vulvar cancer'}

In [15]:
# Show cancer types available in the articles' dataset
## Extract cancer names in the articles' dataset
articles_dataset_cancers_list = []

# Import each csv with articles' data and extract cancer names
for csv in tqdm(list_dfs_papers):
    print(f"Parsing csv: {csv}, {list_dfs_papers.index(csv)+1} / {len(list_dfs_papers)}")
    df = pd.read_csv(DF_input + csv)
    list_cancers = [ast.literal_eval(el) for el in df["Cancer"].to_list() if len(ast.literal_eval(el)) > 0]
    articles_dataset_cancers_list.append(list_cancers)
    del df, list_cancers

# Flatten the lists and get a set of available cancer types
def recursive_chain(iterable):
    for item in iterable:
        if isinstance(item, list):
            yield from recursive_chain(item)
        else:
            yield item

articles_dataset_cancers = set(list(recursive_chain(articles_dataset_cancers_list)))

  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Parsing csv: parsedXMLs_first_upd_100000.csv, 1 / 4


 25%|█████████████████████                                                               | 1/4 [00:07<00:21,  7.29s/it]

Parsing csv: parsedXMLs_first_upd_200000.csv, 2 / 4


 50%|██████████████████████████████████████████                                          | 2/4 [00:14<00:14,  7.41s/it]

Parsing csv: parsedXMLs_first_upd_300000.csv, 3 / 4


 75%|███████████████████████████████████████████████████████████████                     | 3/4 [00:22<00:07,  7.52s/it]

Parsing csv: parsedXMLs_first_upd_340800.csv, 4 / 4


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:25<00:00,  6.35s/it]


In [16]:
articles_dataset_cancers

{'Acute lymphoblastic leukemia',
 'Acute myeloid leukemia',
 'Adenocarcinoma',
 'Adenocarcinoma of the colon',
 'Adenocarcinoma of the pancreas',
 'Adenoid cystic carcinoma',
 'Adrenocortical carcinoma',
 'Aggressive systemic mastocytosis',
 'Alveolar soft part sarcoma',
 'Ampullary cancer',
 'Ampullary carcinoma',
 'Anal cancer',
 'Anaplastic thyroid cancer',
 'Angiosarcoma',
 'Appendix cancer',
 'Astrocytoma',
 'Atypical chronic myeloid leukemia',
 'Basal cell carcinoma',
 'Bladder cancer',
 'Bone cancer',
 'Brain cancer',
 'Breast cancer',
 'Cervical cancer',
 'Cholangiocarcinoma',
 'Chondrosarcoma',
 'Chordoma',
 'Chronic eosinophilic leukemia',
 'Chronic lymphocytic leukemia',
 'Chronic myeloid leukemia',
 'Chronic neutrophilic leukemia',
 'Clear cell carcinoma',
 'Colon cancer',
 'Colorectal cancer',
 'Cutaneous mastocytosis',
 'Desmoplastic small round cell tumor',
 'Diffuse large B-cell lymphoma',
 'Ductal carcinoma',
 'Duodenal cancer',
 'Ear cancer',
 'Endometrial cancer',
 '

Cancer types in the present in the Globocan dataset but not in the articles' dataset

In [17]:
globocan_dataset_cancers.difference(articles_dataset_cancers)

set()

Cancer types in the present in the articles' dataset but not in the Globocan dataset

In [18]:
articles_dataset_cancers.difference(globocan_dataset_cancers)

{'Acute lymphoblastic leukemia',
 'Acute myeloid leukemia',
 'Adenocarcinoma',
 'Adenocarcinoma of the colon',
 'Adenocarcinoma of the pancreas',
 'Adenoid cystic carcinoma',
 'Adrenocortical carcinoma',
 'Aggressive systemic mastocytosis',
 'Alveolar soft part sarcoma',
 'Ampullary cancer',
 'Ampullary carcinoma',
 'Anaplastic thyroid cancer',
 'Angiosarcoma',
 'Appendix cancer',
 'Astrocytoma',
 'Atypical chronic myeloid leukemia',
 'Basal cell carcinoma',
 'Bone cancer',
 'Cholangiocarcinoma',
 'Chondrosarcoma',
 'Chordoma',
 'Chronic eosinophilic leukemia',
 'Chronic lymphocytic leukemia',
 'Chronic myeloid leukemia',
 'Chronic neutrophilic leukemia',
 'Clear cell carcinoma',
 'Cutaneous mastocytosis',
 'Desmoplastic small round cell tumor',
 'Diffuse large B-cell lymphoma',
 'Ductal carcinoma',
 'Duodenal cancer',
 'Ear cancer',
 'Endometrial cancer',
 'Endometrial stromal sarcoma',
 'Ependymoma',
 'Epithelioid sarcoma',
 'Essential thrombocythemia',
 'Ewing sarcoma',
 'Extraskele

for cancer in articles_dataset_cancers.difference(temp_df.columns):
    temp_df[cancer] = 0

In [20]:
# Here, cancers which can be considered synonyms or subtypes of main cancers present in the 
# Globocan dataset are combined with these main cancers. 
# Subsequently, these synonyms and subtypes are removed
cancers_remove_matched = []

for csv in tqdm(list_dfs_papers):
    df = pd.read_csv(DF_input + csv)

    # Some dfs don't contain all cancer types of the whole dataset. For these cases, add each missing cancer with value 0 for all papers
    for cancer in articles_dataset_cancers.difference(df.columns):
        df[cancer] = 0

    ## Synonyms and subtypes of brain cancer
    # Schwannoma is left out since it affects most frequently the peripheral nervous system
    df["Brain cancer"] = df["Brain cancer"] + df["Astrocytoma"] + df["Ependymoma"] + df["Glioblastoma"] + df["Oligodendroglioma"] + df["Meningioma"] + df["Medulloblastoma"] + df["Primitive neuroectodermal tumor"]
    df["Brain cancer"] = np.where(df["Brain cancer"] > 0, 1, df["Brain cancer"])
    cancers_remove_matched += ["Astrocytoma", "Ependymoma", "Glioblastoma", "Oligodendroglioma", "Meningioma", "Medulloblastoma", "Primitive neuroectodermal tumor"]
    
    ## Synonyms and subtypes of breast cancer
    df["Breast cancer"] = df["Breast cancer"] + df["Ductal carcinoma"] + df["Lobular carcinoma"]
    df["Breast cancer"] = np.where(df["Breast cancer"] > 0, 1, df["Breast cancer"])
    cancers_remove_matched += ["Ductal carcinoma", "Lobular carcinoma"]

    ## Synonyms and subtypes of colon cancer
    df["Colon cancer"] = df["Colon cancer"] + df["Adenocarcinoma of the colon"]
    df["Colon cancer"] = np.where(df["Colon cancer"] > 0, 1, df["Colon cancer"])
    cancers_remove_matched += ["Adenocarcinoma of the colon"]

    ## Synonyms and subtypes of kidney cancer
    df["Kidney cancer"] = df["Kidney cancer"] + df["Renal cell carcinoma"] + df["Wilms tumor"]
    df["Kidney cancer"] = np.where(df["Kidney cancer"] > 0, 1, df["Kidney cancer"])
    cancers_remove_matched += ["Renal cell carcinoma", "Wilms tumor"]

    ## Synonyms and subtypes of leukemia
    df["Leukemia"] = df["Leukemia"] + df["Acute lymphoblastic leukemia"] + df["Acute myeloid leukemia"] + df["Atypical chronic myeloid leukemia"] + df["Chronic eosinophilic leukemia"] + df["Chronic lymphocytic leukemia"] + df["Chronic myeloid leukemia"] + df["Chronic neutrophilic leukemia"] + df["Hairy cell leukemia"] + df["Juvenile myelomonocytic leukemia"] + df["Mast cell leukemia"]
    df["Leukemia"] = np.where(df["Leukemia"] > 0, 1, df["Leukemia"])
    cancers_remove_matched += ["Acute lymphoblastic leukemia", "Acute myeloid leukemia", "Atypical chronic myeloid leukemia", "Chronic eosinophilic leukemia", "Chronic lymphocytic leukemia", "Chronic myeloid leukemia", "Chronic neutrophilic leukemia", "Hairy cell leukemia", "Juvenile myelomonocytic leukemia", "Mast cell leukemia"]

    ## Synonyms and subtypes of liver cancer
    # Cholangiocarcinoma is left out since it is not always associated to the liver
    df["Liver cancer"] = df["Liver cancer"] + df["Hepatocellular carcinoma"] + df["Fibrolamellar carcinoma"]
    df["Liver cancer"] = np.where(df["Liver cancer"] > 0, 1, df["Liver cancer"])
    cancers_remove_matched += ["Hepatocellular carcinoma", "Fibrolamellar carcinoma"]

    ## Synonyms and subtypes of lung cancer
    df["Lung cancer"] = df["Liver cancer"] + df["Non-small cell lung cancer"] + df["Small cell lung cancer"]
    df["Lung cancer"] = np.where(df["Lung cancer"] > 0, 1, df["Lung cancer"])
    cancers_remove_matched += ["Non-small cell lung cancer", "Small cell lung cancer"]
    
    ## Synonyms and subtypes of non-Hodgkin lymphoma
    df["Non-Hodgkin lymphoma"] = df["Non-Hodgkin lymphoma"] + df["Diffuse large B-cell lymphoma"] + df["Follicular lymphoma"] + df["Marginal zone lymphoma"] + df["Mucosa-associated lymphoid tissue lymphoma"]  + df["Peripheral T-cell lymphoma"] 
    df["Non-Hodgkin lymphoma"] = np.where(df["Non-Hodgkin lymphoma"] > 0, 1, df["Non-Hodgkin lymphoma"])
    cancers_remove_matched += ["Diffuse large B-cell lymphoma", "Follicular lymphoma", "Marginal zone lymphoma", "Mucosa-associated lymphoid tissue lymphoma", "Peripheral T-cell lymphoma"]
    
    ## Synonyms and subtypes of skin cancer
    # This category combines melanoma and non-melanoma skin cancers, as in Globocan
    # Squamous cell carcinoma is left out since it can affect other areas beyond the skin
    df["Skin cancer"] = df["Skin cancer"] + df["Melanoma"] + df["Squamous cell carcinoma of the skin"] + df["Basal cell carcinoma"] + df["Merkel cell carcinoma"] + df["Sebaceous carcinoma"] + df["Sebaceous gland carcinoma"]
    df["Skin cancer"] = np.where(df["Skin cancer"] > 0, 1, df["Skin cancer"])
    cancers_remove_matched += ["Melanoma", "Squamous cell carcinoma of the skin", "Basal cell carcinoma", "Merkel cell carcinoma", "Sebaceous carcinoma", "Sebaceous gland carcinoma"]

    ## Synonyms and subtypes of ovarian cancer
    df["Ovarian cancer"] = df["Ovarian cancer"] + df["Granulosa cell tumor"]
    df["Ovarian cancer"] = np.where(df["Ovarian cancer"] > 0, 1, df["Ovarian cancer"])
    cancers_remove_matched += ["Granulosa cell tumor"]

    ## Synonyms and subtypes of pancreatic cancer
    df["Pancreatic cancer"] = df["Pancreatic cancer"] + df["Adenocarcinoma of the pancreas"]
    df["Pancreatic cancer"] = np.where(df["Pancreatic cancer"] > 0, 1, df["Pancreatic cancer"])
    cancers_remove_matched += ["Adenocarcinoma of the pancreas"]

    ## Synonyms and subtypes of stomach cancer
    df["Stomach cancer"] = df["Stomach cancer"] + df["Gastric adenocarcinoma"]
    df["Stomach cancer"] = np.where(df["Stomach cancer"] > 0, 1, df["Stomach cancer"])
    cancers_remove_matched += ["Gastric adenocarcinoma"]

    ## Synonyms and subtypes of thyroid cancer
    df["Thyroid cancer"] = df["Thyroid cancer"] + df["Papillary thyroid cancer"]  + df["Follicular thyroid cancer"]  + df["Medullary thyroid cancer"]  + df["Anaplastic thyroid cancer"]
    df["Thyroid cancer"] = np.where(df["Thyroid cancer"] > 0, 1, df["Thyroid cancer"])
    cancers_remove_matched += ["Papillary thyroid cancer", "Follicular thyroid cancer", "Medullary thyroid cancer", "Anaplastic thyroid cancer"]

    ## Synonyms and subtypes of uterine cancer
    df["Uterine cancer"] = df["Uterine cancer"] + df["Endometrial cancer"] + df["Endometrial stromal sarcoma"]
    df["Uterine cancer"] = np.where(df["Uterine cancer"] > 0, 1, df["Uterine cancer"])
    cancers_remove_matched += ["Endometrial cancer", "Endometrial stromal sarcoma"]

    ## Synonyms and subtypes of vulvar cancer
    df["Vulvar cancer"] = df["Vulvar cancer"] + df["Vulval cancer"]
    df["Vulvar cancer"] = np.where(df["Vulvar cancer"] > 0, 1, df["Vulvar cancer"])
    cancers_remove_matched += ["Vulval cancer"]

    # Remove columns corresponding to synonyms and subtypes of main cancer types
    all_df_columns = set(df.columns.to_list())
    columns_df_keep = list(all_df_columns.difference(set(cancers_remove_matched)))
    df = df[columns_df_keep]
    
    df.to_csv(DF_input + csv, index = False)
    del df

  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] = 0
  df[cancer] =

In [21]:
# All columns in the articles' dataset
all_columns_articles_dataset = []

for csv in tqdm(list_dfs_papers):
    df = pd.read_csv(DF_input + csv)
    for column in df.columns.to_list():
        if column not in all_columns_articles_dataset:
            all_columns_articles_dataset.append(column)

    del df

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:21<00:00,  5.41s/it]


In [28]:
# Cancer types which are not matched to Globocan cancers
columns_articles_data = ["PMID", "DOI", "Journal", "Title", "BookTitle", "Book Accession", "Publisher", "Abstract", "PublicationDate", "Authors", "NER_lastAuthor",
                        "Country", "Country_source", "Keywords", "MeshHeadings", "Language", "Type", "PublicationTypes", "Chemicals"] 

columns_species = ["Species", "Undetermined_Species", "human", "patient", "primate", "macaque", "monkey", "Chimpanzee",
                   "rat", "mouse", "mice", "rabbit", "Lagomorph", "guinea pig", "zebrafish", "medaka", "fruit fly",
                   "Syrian hamster", "Homo sapiens", "H.sapiens", "Rhesus macaque", "Rhesus monkey", "Macaca mulatta",
                   "M. mulatta", "Macaca fascicularis", "M. fasticularis", "Cynomolgus", "Pan troglodytes", "P. troglodytes",
                   "Rattus norvegicus", "R. norvegicus" "Mus musculus", "M. musculus", "Oryctolagus cuniculus", "O. cuniculus",
                   "Cavia porcellus", "C. porcellus", "Danio rerio", "D. rerio", "Drosophila", "D. melanogaster", 
                   "Mesocricetus auratus", "M. auratus", "Oryzias latipes", "O. latipes", "Caenorhabditis elegans" "C. elegans"]

cancer_columns_keep = list(globocan_dataset_cancers) + [ "Cancer", "Undetermined_Cancer"]

columns_keep = columns_articles_data + cancer_columns_keep + columns_species

other_cancers = set(all_columns_articles_dataset).difference(set(columns_keep))
other_cancers

{'Adenocarcinoma',
 'Adenoid cystic carcinoma',
 'Adrenocortical carcinoma',
 'Aggressive systemic mastocytosis',
 'Alveolar soft part sarcoma',
 'Ampullary cancer',
 'Ampullary carcinoma',
 'Angiosarcoma',
 'Appendix cancer',
 'Bone cancer',
 'Cholangiocarcinoma',
 'Chondrosarcoma',
 'Chordoma',
 'Clear cell carcinoma',
 'Cutaneous mastocytosis',
 'Desmoplastic small round cell tumor',
 'Duodenal cancer',
 'Ear cancer',
 'Epithelioid sarcoma',
 'Essential thrombocythemia',
 'Ewing sarcoma',
 'Extraskeletal chondrosarcoma',
 'Extraskeletal myxoid chondrosarcoma',
 'Extraskeletal osteosarcoma',
 'Eye cancer',
 'Fallopian tube cancer',
 'Fibrosarcoma',
 'Gastrointestinal stromal tumor',
 'Giant cell tumor of bone',
 'Head and neck cancer',
 'Ileal cancer',
 'Jejunal cancer',
 'Leiomyoma',
 'Leiomyosarcoma',
 'Liposarcoma',
 'Lymphoma',
 'Malignant fibrous histiocytoma',
 'Malignant mesenchymoma',
 'Malignant peripheral nerve sheath tumor',
 'Mantle cell lymphoma',
 'Mast cell sarcoma',
 

In [23]:
# Deal with articles with cancer types not in the main cancer types and not being synonyms or subtypes of these
# Some cancer types (e.g. squamous cell carcinoma) can affect several tissues/organs and therefore were not  
# matched to any main cancer type in the previous step. These cancers and any other cancer
# which was not matched will be combined into an "Other cancers" category

for csv in tqdm(list_dfs_papers):
    df = pd.read_csv(DF_input + csv)

    ## Dealing with other cancer types beyond Globocan cancers and synonyms/subtypes
    # Create a 
    df["Other cancer"] = df[list(other_cancers)].sum(axis = 1)

    # Remove columns corresponding to other cancer types
    all_df_columns = set(df.columns.to_list())
    columns_df_keep = list(all_df_columns.difference(set(other_cancers)))
    df = df[columns_df_keep]
    
    df.to_csv(DF_input + csv, index = False)
    del df

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:48<00:00, 12.10s/it]


In [24]:
# Remaining columns in the dataset
set(pd.read_csv(DF_input + list_dfs_papers[1]).columns.to_list())

{'Abstract',
 'Anal cancer',
 'Authors',
 'Bladder cancer',
 'Book Accession',
 'BookTitle',
 'Brain cancer',
 'Breast cancer',
 'Cancer',
 'Cavia porcellus',
 'Cervical cancer',
 'Chemicals',
 'Chimpanzee',
 'Colon cancer',
 'Colorectal cancer',
 'Country',
 'Country_source',
 'D. melanogaster',
 'DOI',
 'Danio rerio',
 'Drosophila',
 'Esophageal cancer',
 'Gallbladder cancer',
 'Hodgkin lymphoma',
 'Homo sapiens',
 'Journal',
 'Kaposi sarcoma',
 'Keywords',
 'Kidney cancer',
 'Language',
 'Laryngeal cancer',
 'Leukemia',
 'Liver cancer',
 'Lung cancer',
 'Macaca fascicularis',
 'Macaca mulatta',
 'MeshHeadings',
 'Mesocricetus auratus',
 'Mesothelioma',
 'Mouth cancer',
 'Multiple myeloma',
 'NER_lastAuthor',
 'Non-Hodgkin lymphoma',
 'Oryctolagus cuniculus',
 'Ovarian cancer',
 'PMID',
 'Pan troglodytes',
 'Pancreatic cancer',
 'Penile cancer',
 'Prostate cancer',
 'PublicationDate',
 'PublicationTypes',
 'Publisher',
 'Rattus norvegicus',
 'Rectal cancer',
 'Rhesus macaque',
 'Rhes

In [25]:
# Rename "Undetermined_Cancer" to "Undetermined cancer"
for csv in tqdm(list_dfs_papers):
    df = pd.read_csv(DF_input + csv)
    df.rename(columns = {"Undetermined_Cancer" : "Undetermined cancer"}, inplace=True)
    df.to_csv(DF_input + csv, index = False)
    del df

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:47<00:00, 11.75s/it]


In [26]:
temp_df = pd.read_csv(DF_input + list_dfs_papers[0])
temp_df.columns.to_list()

['Chimpanzee',
 'PublicationTypes',
 'Species',
 'Oryzias latipes',
 'Cavia porcellus',
 'Laryngeal cancer',
 'Syrian hamster',
 'Undetermined_Species',
 'Abstract',
 'PublicationDate',
 'M. musculus',
 'monkey',
 'BookTitle',
 'Anal cancer',
 'Publisher',
 'Kidney cancer',
 'macaque',
 'Hodgkin lymphoma',
 'human',
 'Rhesus monkey',
 'Bladder cancer',
 'D. melanogaster',
 'Title',
 'Thyroid cancer',
 'Book Accession',
 'Kaposi sarcoma',
 'Lung cancer',
 'Penile cancer',
 'Breast cancer',
 'Leukemia',
 'Brain cancer',
 'Liver cancer',
 'Homo sapiens',
 'fruit fly',
 'Prostate cancer',
 'Esophageal cancer',
 'DOI',
 'Keywords',
 'Stomach cancer',
 'Non-Hodgkin lymphoma',
 'Country_source',
 'Ovarian cancer',
 'Macaca fascicularis',
 'medaka',
 'mice',
 'rat',
 'PMID',
 'Country',
 'Mesothelioma',
 'Chemicals',
 'Macaca mulatta',
 'Colon cancer',
 'NER_lastAuthor',
 'Pancreatic cancer',
 'Gallbladder cancer',
 'Uterine cancer',
 'Journal',
 'Type',
 'Multiple myeloma',
 'Lagomorph',
 'Me