In [1]:
import pandas as pd
import os
import re
from itertools import chain
from tqdm import tqdm
import time

# DType warning when importing .csv files with parsed PMIDs. Avoid showing the warning
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.DtypeWarning)

In [2]:
# Directory with Globocal data separated by cancer
CSV_input = input().strip()

 C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\Globocan_data\


In [78]:
# Directory to save Globocan curated dataset
Globocan_output = input().strip()

 C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\resources\


In [3]:
# Directory with articles' data (to extract country names)
DF_input = input().strip()

 C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\parsedXMLs\


In [41]:
# Import file names of csvs containint Globocan data 
list_csvs = []

for file in os.listdir(CSV_input):
    if file[-4:] == ".csv":
        list_csvs.append(file)

list_csvs

['dataset-inc-both-sexes-in-2022-anus.csv',
 'dataset-inc-both-sexes-in-2022-bladder.csv',
 'dataset-inc-both-sexes-in-2022-brain-central-nervous-system.csv',
 'dataset-inc-both-sexes-in-2022-breast.csv',
 'dataset-inc-both-sexes-in-2022-cervix-uteri.csv',
 'dataset-inc-both-sexes-in-2022-colon.csv',
 'dataset-inc-both-sexes-in-2022-colorectum.csv',
 'dataset-inc-both-sexes-in-2022-corpus-uteri.csv',
 'dataset-inc-both-sexes-in-2022-gallbladder.csv',
 'dataset-inc-both-sexes-in-2022-hodgkin-lymphoma.csv',
 'dataset-inc-both-sexes-in-2022-hypopharynx.csv',
 'dataset-inc-both-sexes-in-2022-kaposi-sarcoma.csv',
 'dataset-inc-both-sexes-in-2022-kidney.csv',
 'dataset-inc-both-sexes-in-2022-larynx.csv',
 'dataset-inc-both-sexes-in-2022-leukaemia.csv',
 'dataset-inc-both-sexes-in-2022-lip-oral-cavity.csv',
 'dataset-inc-both-sexes-in-2022-liver-and-intrahepatic-bile-ducts.csv',
 'dataset-inc-both-sexes-in-2022-melanoma-of-skin.csv',
 'dataset-inc-both-sexes-in-2022-mesothelioma.csv',
 'datas

In [5]:
# Import file names of csvs containint articles' data 

list_dfs_papers = []

for file in os.listdir(DF_input):
    if file[-4:] == ".csv":
        list_dfs_papers.append(file)

list_dfs_papers

['parsedX_100000.csv',
 'parsedX_1000000.csv',
 'parsedX_1100000.csv',
 'parsedX_1200000.csv',
 'parsedX_1300000.csv',
 'parsedX_1400000.csv',
 'parsedX_1500000.csv',
 'parsedX_1600000.csv',
 'parsedX_1700000.csv',
 'parsedX_1800000.csv',
 'parsedX_1900000.csv',
 'parsedX_200000.csv',
 'parsedX_2000000.csv',
 'parsedX_2100000.csv',
 'parsedX_2200000.csv',
 'parsedX_2300000.csv',
 'parsedX_2400000.csv',
 'parsedX_2500000.csv',
 'parsedX_2600000.csv',
 'parsedX_2700000.csv',
 'parsedX_2800000.csv',
 'parsedX_2900000.csv',
 'parsedX_300000.csv',
 'parsedX_3000000.csv',
 'parsedX_3100000.csv',
 'parsedX_3200000.csv',
 'parsedX_3300000.csv',
 'parsedX_3400000.csv',
 'parsedX_3500000.csv',
 'parsedX_3600000.csv',
 'parsedX_3700000.csv',
 'parsedX_3800000.csv',
 'parsedX_3900000.csv',
 'parsedX_400000.csv',
 'parsedX_4000000.csv',
 'parsedX_4100000.csv',
 'parsedX_4200000.csv',
 'parsedX_4300000.csv',
 'parsedX_4400000.csv',
 'parsedX_4454000.csv',
 'parsedX_500000.csv',
 'parsedX_600000.csv'

In [56]:
# Import dataframes
list_dfs = []
for csv in list_csvs:
    df = pd.read_csv(CSV_input + csv)
    df["Cancer"] = csv.split("in-2022-")[1].split(".csv")[0]
    list_dfs.append(df)
    del df

# Bind into one dataframe
df_globocan = pd.concat(list_dfs, axis = 0)

# Drop variable "Country" (contains country numeric code, useless)
df_globocan = df_globocan.drop(columns=["Country"])

## Note

For each cancer, there is information of independent countries as well as aggregated information for relevant supranational entities (e.g. WHO East Mediterranean Region (EMRO)). Information concerning these supranational entities is removed

In [57]:
df_globocan.head()

Unnamed: 0,Alpha-3 code,Cancer code,Population code (ISO/UN),Label,Sex,Number,95% UI low,95% UI high,Number.1,ASR (World),Crude rate,Cumulative risk,Cancer
0,AFG,10.0,4.0,Afghanistan,0.0,110,15.0,793.0,110,0.47,0.27,0.06,anus
1,ALB,10.0,8.0,Albania,0.0,11,4.0,28.0,11,0.24,0.38,0.03,anus
2,DZA,10.0,12.0,Algeria,0.0,158,48.0,517.0,158,0.34,0.35,0.04,anus
3,AGO,10.0,24.0,Angola,0.0,38,35.0,41.0,38,0.24,0.11,0.04,anus
4,AZE,10.0,31.0,Azerbaijan,0.0,106,8.0,1471.0,106,0.87,1.0,0.08,anus


## Note

Country naming will follow the same pattern as the country names in the articles' dataset. Country names in the Globocan dataset will be replaced by the name in the articles' dataset when different.

In [9]:
## Extract country names in the articles' dataset
articles_dataset_countries_list = []

# Import each csv with articles' data and extract country names
for csv in tqdm(list_dfs_papers):
    start = time.time()
    print(f"Parsing csv: {csv}, {list_dfs_papers.index(csv)+1} / {len(list_dfs_papers)}")
    df = pd.read_csv(DF_input + csv)
    list_countries = list(set(df["Country"]))
    articles_dataset_countries_list.append(list_countries)
    del df, list_countries
    print(f"--Parsing time: {round(time.time()-start, 2)}")

# Flatten list, remove nan, convert to set
articles_dataset_countries = set([el for el in (chain(*articles_dataset_countries_list)) if str(el) != "nan"])

  0%|                                                                                                                                                                                      | 0/45 [00:00<?, ?it/s]

Parsing csv: parsedX_100000.csv, 1 / 45


  2%|███▊                                                                                                                                                                          | 1/45 [00:02<02:01,  2.77s/it]

--Parsing time: 2.77
Parsing csv: parsedX_1000000.csv, 2 / 45


  4%|███████▋                                                                                                                                                                      | 2/45 [00:05<02:09,  3.02s/it]

--Parsing time: 3.2
Parsing csv: parsedX_1100000.csv, 3 / 45


  7%|███████████▌                                                                                                                                                                  | 3/45 [00:09<02:11,  3.13s/it]

--Parsing time: 3.25
Parsing csv: parsedX_1200000.csv, 4 / 45


  9%|███████████████▍                                                                                                                                                              | 4/45 [00:12<02:11,  3.20s/it]

--Parsing time: 3.3
Parsing csv: parsedX_1300000.csv, 5 / 45


 11%|███████████████████▎                                                                                                                                                          | 5/45 [00:15<02:09,  3.23s/it]

--Parsing time: 3.3
Parsing csv: parsedX_1400000.csv, 6 / 45


 13%|███████████████████████▏                                                                                                                                                      | 6/45 [00:19<02:07,  3.26s/it]

--Parsing time: 3.31
Parsing csv: parsedX_1500000.csv, 7 / 45


 16%|███████████████████████████                                                                                                                                                   | 7/45 [00:22<02:04,  3.28s/it]

--Parsing time: 3.3
Parsing csv: parsedX_1600000.csv, 8 / 45


 18%|██████████████████████████████▉                                                                                                                                               | 8/45 [00:25<02:01,  3.29s/it]

--Parsing time: 3.32
Parsing csv: parsedX_1700000.csv, 9 / 45


 20%|██████████████████████████████████▊                                                                                                                                           | 9/45 [00:29<01:59,  3.31s/it]

--Parsing time: 3.36
Parsing csv: parsedX_1800000.csv, 10 / 45


 22%|██████████████████████████████████████▍                                                                                                                                      | 10/45 [00:32<01:58,  3.38s/it]

--Parsing time: 3.54
Parsing csv: parsedX_1900000.csv, 11 / 45


 24%|██████████████████████████████████████████▎                                                                                                                                  | 11/45 [00:36<01:56,  3.44s/it]

--Parsing time: 3.56
Parsing csv: parsedX_200000.csv, 12 / 45


 27%|██████████████████████████████████████████████▏                                                                                                                              | 12/45 [00:38<01:44,  3.18s/it]

--Parsing time: 2.59
Parsing csv: parsedX_2000000.csv, 13 / 45


 29%|█████████████████████████████████████████████████▉                                                                                                                           | 13/45 [00:42<01:45,  3.29s/it]

--Parsing time: 3.56
Parsing csv: parsedX_2100000.csv, 14 / 45


 31%|█████████████████████████████████████████████████████▊                                                                                                                       | 14/45 [00:45<01:45,  3.39s/it]

--Parsing time: 3.62
Parsing csv: parsedX_2200000.csv, 15 / 45


 33%|█████████████████████████████████████████████████████████▋                                                                                                                   | 15/45 [00:49<01:44,  3.48s/it]

--Parsing time: 3.68
Parsing csv: parsedX_2300000.csv, 16 / 45


 36%|█████████████████████████████████████████████████████████████▌                                                                                                               | 16/45 [00:53<01:45,  3.63s/it]

--Parsing time: 3.98
Parsing csv: parsedX_2400000.csv, 17 / 45


 38%|█████████████████████████████████████████████████████████████████▎                                                                                                           | 17/45 [00:57<01:46,  3.82s/it]

--Parsing time: 4.25
Parsing csv: parsedX_2500000.csv, 18 / 45


 40%|█████████████████████████████████████████████████████████████████████▏                                                                                                       | 18/45 [01:02<01:48,  4.01s/it]

--Parsing time: 4.46
Parsing csv: parsedX_2600000.csv, 19 / 45


 42%|█████████████████████████████████████████████████████████████████████████                                                                                                    | 19/45 [01:06<01:48,  4.18s/it]

--Parsing time: 4.57
Parsing csv: parsedX_2700000.csv, 20 / 45


 44%|████████████████████████████████████████████████████████████████████████████▉                                                                                                | 20/45 [01:11<01:47,  4.31s/it]

--Parsing time: 4.62
Parsing csv: parsedX_2800000.csv, 21 / 45


 47%|████████████████████████████████████████████████████████████████████████████████▋                                                                                            | 21/45 [01:16<01:45,  4.38s/it]

--Parsing time: 4.53
Parsing csv: parsedX_2900000.csv, 22 / 45


 49%|████████████████████████████████████████████████████████████████████████████████████▌                                                                                        | 22/45 [01:20<01:41,  4.43s/it]

--Parsing time: 4.54
Parsing csv: parsedX_300000.csv, 23 / 45


 51%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                                    | 23/45 [01:23<01:24,  3.83s/it]

--Parsing time: 2.43
Parsing csv: parsedX_3000000.csv, 24 / 45


 53%|████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                | 24/45 [01:27<01:24,  4.04s/it]

--Parsing time: 4.53
Parsing csv: parsedX_3100000.csv, 25 / 45


 56%|████████████████████████████████████████████████████████████████████████████████████████████████                                                                             | 25/45 [01:32<01:23,  4.19s/it]

--Parsing time: 4.56
Parsing csv: parsedX_3200000.csv, 26 / 45


 58%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                         | 26/45 [01:36<01:21,  4.31s/it]

--Parsing time: 4.57
Parsing csv: parsedX_3300000.csv, 27 / 45


 60%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                     | 27/45 [01:41<01:19,  4.39s/it]

--Parsing time: 4.59
Parsing csv: parsedX_3400000.csv, 28 / 45


 62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                 | 28/45 [01:45<01:15,  4.47s/it]

--Parsing time: 4.63
Parsing csv: parsedX_3500000.csv, 29 / 45


 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                             | 29/45 [01:50<01:12,  4.50s/it]

--Parsing time: 4.59
Parsing csv: parsedX_3600000.csv, 30 / 45


 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                         | 30/45 [01:55<01:08,  4.56s/it]

--Parsing time: 4.69
Parsing csv: parsedX_3700000.csv, 31 / 45


 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                     | 31/45 [01:59<01:04,  4.62s/it]

--Parsing time: 4.76
Parsing csv: parsedX_3800000.csv, 32 / 45


 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                  | 32/45 [02:04<01:00,  4.65s/it]

--Parsing time: 4.73
Parsing csv: parsedX_3900000.csv, 33 / 45


 73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                              | 33/45 [02:09<00:56,  4.67s/it]

--Parsing time: 4.72
Parsing csv: parsedX_400000.csv, 34 / 45


 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                          | 34/45 [02:11<00:43,  3.92s/it]

--Parsing time: 2.17
Parsing csv: parsedX_4000000.csv, 35 / 45


 78%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                      | 35/45 [02:16<00:41,  4.16s/it]

--Parsing time: 4.69
Parsing csv: parsedX_4100000.csv, 36 / 45


 80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                  | 36/45 [02:21<00:39,  4.35s/it]

--Parsing time: 4.82
Parsing csv: parsedX_4200000.csv, 37 / 45


 82%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                              | 37/45 [02:25<00:35,  4.48s/it]

--Parsing time: 4.78
Parsing csv: parsedX_4300000.csv, 38 / 45


 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                           | 38/45 [02:30<00:32,  4.61s/it]

--Parsing time: 4.91
Parsing csv: parsedX_4400000.csv, 39 / 45


 87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                       | 39/45 [02:35<00:28,  4.70s/it]

--Parsing time: 4.91
Parsing csv: parsedX_4454000.csv, 40 / 45


 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                   | 40/45 [02:38<00:20,  4.06s/it]

--Parsing time: 2.56
Parsing csv: parsedX_500000.csv, 41 / 45


 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 41/45 [02:41<00:14,  3.69s/it]

--Parsing time: 2.82
Parsing csv: parsedX_600000.csv, 42 / 45


 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍           | 42/45 [02:43<00:10,  3.44s/it]

--Parsing time: 2.86
Parsing csv: parsedX_700000.csv, 43 / 45


 96%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 43/45 [02:47<00:06,  3.34s/it]

--Parsing time: 3.12
Parsing csv: parsedX_800000.csv, 44 / 45


 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏   | 44/45 [02:50<00:03,  3.26s/it]

--Parsing time: 3.05
Parsing csv: parsedX_900000.csv, 45 / 45


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [02:53<00:00,  3.85s/it]

--Parsing time: 3.14





In [58]:
# Rename "Label" as "Country"
df_globocan.rename(columns={"Label": "Country"}, inplace = True)

In [59]:
# Unique entity names in the Globocan dataset
globocan_dataset_countries = set(df_globocan["Country"])
globocan_dataset_countries

{'Afghanistan',
 'Africa',
 'Albania',
 'Algeria',
 'Angola',
 'Argentina',
 'Armenia',
 'Asia',
 'Australia',
 'Australia-New Zealand',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia (Plurinational State of)',
 'Bosnia Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei Darussalam',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cape Verde',
 'Caribbean',
 'Caribbean hub',
 'Central African Republic',
 'Central America',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Congo, Democratic Republic of',
 'Congo, Republic of',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czechia',
 "Côte d'Ivoire",
 'Denmark',
 'Djibouti',
 'Dominican Republic',
 'Eastern Africa',
 'Eastern Asia',
 'Eastern Europe',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Europe',
 'European Union (27)',
 'Fiji',
 '

In [60]:
## Remove from Globocan dataset any supranational entity

# Remove supranational entities
df_globocan = df_globocan.loc[df_globocan["Country"]!="Latin America and the Caribbean"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Sub-Saharan Africa Hub"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="High HDI country"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Latin America Hub"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Caribbean"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="South Central Asia"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Australia-New Zealand"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="WHO Europe region (EURO)"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="WHO South-East Asia region (SEARO)"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Upper middle income"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="WHO East Mediterranean region (EMRO)"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="WHO Africa region (AFRO)"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Medium HDI country (but India)"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Western Africa"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Western Asia"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="European Union (27)"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Northern Africa"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="South-Eastern Asia"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Low income"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Medium HDI country"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="WHO Western Pacific region (WPRO)"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Lower middle income"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Northern Europe"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Pacific Islands Hub"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Asia"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Low HDI country"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="WHO Americas region (PAHO)"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Very HDI country"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Caribbean hub"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Eastern Europe"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Africa"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Southern Europe"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Central America"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Western Europe"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Europe"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Eastern Asia"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="South America"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Sub-Saharan Africa"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Southern Africa"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Middle Africa"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Oceania"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="High HDI country (but China)"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Northern Africa, Central and Western Asia Hub"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Eastern Africa"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="South, East and South-Eastern Asia Hub"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="High income"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Northern America"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Micronesia/Polynesia"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Polynesia"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="Melanesia"]

# Totals will be calculated as aggregates
df_globocan = df_globocan.loc[df_globocan["Country"]!="Total"]
df_globocan = df_globocan.loc[df_globocan["Country"]!="World"]

In [61]:
## Rename Globocan country names to match format of articles' dataset
df_globocan['Country'] = df_globocan['Country'].replace({'Republic of Moldova': 'Moldova'})
df_globocan['Country'] = df_globocan['Country'].replace({'Viet Nam': 'Vietnam'})
df_globocan['Country'] = df_globocan['Country'].replace({'Bosnia Herzegovina': 'Bosnia and Herzegovina'})
df_globocan['Country'] = df_globocan['Country'].replace({'Gaza Strip and West Bank': 'Palestine'})
df_globocan['Country'] = df_globocan['Country'].replace({'Korea, Republic of': 'South Korea'})
df_globocan['Country'] = df_globocan['Country'].replace({"Côte d'Ivoire": "Cote d'Ivoire"})
df_globocan['Country'] = df_globocan['Country'].replace({"The Republic of the Gambia": "Gambia"})
df_globocan['Country'] = df_globocan['Country'].replace({"Iran, Islamic Republic of": "Iran"})
df_globocan['Country'] = df_globocan['Country'].replace({"Micronesia": "Micronesia, Fed. Sts."})
df_globocan['Country'] = df_globocan['Country'].replace({"Congo, Republic of": "Congo Republic"})
df_globocan['Country'] = df_globocan['Country'].replace({"Kyrgyzstan": "Kyrgyz Republic"})
df_globocan['Country'] = df_globocan['Country'].replace({"Congo, Democratic Republic of": "DR Congo"})
df_globocan['Country'] = df_globocan['Country'].replace({"French Guyana": "French Guiana"})
df_globocan['Country'] = df_globocan['Country'].replace({"Saint Lucia": "St. Lucia"})
df_globocan['Country'] = df_globocan['Country'].replace({"The Netherlands": "Netherlands"})
df_globocan['Country'] = df_globocan['Country'].replace({"Tanzania, United Republic of": "Tanzania"})
df_globocan['Country'] = df_globocan['Country'].replace({"Russian Federation": "Russia"})
df_globocan['Country'] = df_globocan['Country'].replace({"Lao People's Democratic Republic": "Laos"})
df_globocan['Country'] = df_globocan['Country'].replace({"Cape Verde": "Cabo Verde"})
df_globocan['Country'] = df_globocan['Country'].replace({"Korea, Democratic People Republic of": "North Korea"})
df_globocan['Country'] = df_globocan['Country'].replace({"Syrian Arab Republic": "Syria"})
df_globocan['Country'] = df_globocan['Country'].replace({"Bolivia (Plurinational State of)": "Bolivia"})
df_globocan['Country'] = df_globocan['Country'].replace({"France, La Réunion": "Reunion"})
df_globocan['Country'] = df_globocan['Country'].replace({"United States of America": "United States"})
df_globocan['Country'] = df_globocan['Country'].replace({"France, Martinique": "Martinique"})
df_globocan['Country'] = df_globocan['Country'].replace({"France (metropolitan)": "France"})
df_globocan['Country'] = df_globocan['Country'].replace({"France, Guadeloupe": "Guadeloupe"})

In [62]:
# Unique entity names in the Globocan dataset
globocan_dataset_countries = set(df_globocan["Country"])

# Entities appearing in Globocan dataset and missing in articles' dataset
for entity in globocan_dataset_countries:
    if entity not in articles_dataset_countries:
        print(entity)

Equatorial Guinea


Only country present in Globocan and not present in articles is Equatorial Guinea. The reason is that no cancer article published yet has someone from
this country as last author

In [63]:
# Entities appearing in articles' dataset and missing in Globocan dataset
for entity in articles_dataset_countries:
    if entity not in globocan_dataset_countries:
        print(entity)

Northern Mariana Islands
Bonaire, Saint Eustatius and Saba
Falkland Islands
Tonga
Andorra
Wallis and Futuna Islands
Taiwan
San Marino
Guernsey
Curacao
St. Kitts and Nevis
Palau
St. Barths
Cocos (Keeling) Islands
Svalbard and Jan Mayen Islands
Antigua and Barbuda
Hong Kong
French Southern Territories
Aruba
Christmas Island
Tuvalu
Aland Islands
Vatican
Antarctica
St. Pierre and Miquelon
British Virgin Islands
Faroe Islands
St. Vincent and the Grenadines
Pitcairn
Greenland
Mayotte
South Georgia and South Sandwich Is.
Bermuda
Saint-Martin
Anguilla
Grenada
Sint Maarten
Dominica
Niue
Liechtenstein
American Samoa
Cook Islands
Kosovo
British Indian Ocean Territory
Macau
United States Minor Outlying Islands
Montserrat
Nauru
Seychelles
Jersey
United States Virgin Islands
Gibraltar
Marshall Islands
Heard and McDonald Islands
Cayman Islands
St. Helena
Monaco
Isle of Man
Bouvet Island


A number of small entities are listed separately in the articles's dataset but not in the Globocan dataset. These countries stay as they are in the articles' datasets, meaning that they are not properly matched with Globocan numbers. These are small countries contributing marginally (with the probable exceptions of Hong Kong and Taiwan) to cancer incidence data.

In [64]:
# All countries present in cancer articles
articles_dataset_countries

{'Afghanistan',
 'Aland Islands',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antarctica',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Bonaire, Saint Eustatius and Saba',
 'Bosnia and Herzegovina',
 'Botswana',
 'Bouvet Island',
 'Brazil',
 'British Indian Ocean Territory',
 'British Virgin Islands',
 'Brunei Darussalam',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cayman Islands',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Christmas Island',
 'Cocos (Keeling) Islands',
 'Colombia',
 'Comoros',
 'Congo Republic',
 'Cook Islands',
 'Costa Rica',
 "Cote d'Ivoire",
 'Croatia',
 'Cuba',
 'Curacao',
 'Cyprus',
 'Czechia',
 'DR Congo',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ec

## Preparation Cancer type names
To show incidence data in parallel to articles' data, common cancer types names are required. In this section, cancer types names in the Globocan dataset are replaced by those used in the articles' dataset

In [65]:
cancers_globocan = set(df_globocan["Cancer"].to_list())
cancers_globocan

{'anus',
 'bladder',
 'brain-central-nervous-system',
 'breast',
 'cervix-uteri',
 'colon',
 'colorectum',
 'corpus-uteri',
 'gallbladder',
 'hodgkin-lymphoma',
 'hypopharynx',
 'kaposi-sarcoma',
 'kidney',
 'larynx',
 'leukaemia',
 'lip-oral-cavity',
 'liver-and-intrahepatic-bile-ducts',
 'melanoma-of-skin',
 'mesothelioma',
 'multiple-myeloma',
 'nasopharynx',
 'non-hodgkin-lymphoma',
 'non-melanoma-skin-cancer',
 'oesophagus',
 'oropharynx',
 'ovary',
 'pancreas',
 'penis',
 'prostate',
 'rectum',
 'salivary-glands',
 'stomach',
 'testis',
 'thyroid',
 'trachea-bronchus-and-lung',
 'vagina',
 'vulva'}

Some cancers will be combined together to make more sense with regard to the naming in the articles' dataset. Combining several cancer types implies adding their numeric values and assigning a single name for all of them (e.g. "nasopharynx" + "hypopharynx" + "oropharynx" = "Throat cancer")

In [71]:
## Bind nasopharynx and hypopharynx as "Throat cancer"

# Aggregate data
bind_throat = df_globocan.loc[df_globocan["Cancer"].isin(["nasopharynx", "hypopharynx", "oropharynx"])].groupby(["Alpha-3 code", "Country"], as_index=False).sum()

# Rename cancer
bind_throat["Cancer"] = "Throat cancer"
bind_throat


# Concat, remove the rows having as Cancer type nasopharynx, hypopharynx or oropharynx
df_globocan = pd.concat([df_globocan, bind_throat])
df_globocan = df_globocan.loc[~df_globocan["Cancer"].isin(["nasopharynx", "hypopharynx", "oropharynx"])]

In [72]:
set(df_globocan["Cancer"].to_list())

{'Throat cancer',
 'anus',
 'bladder',
 'brain-central-nervous-system',
 'breast',
 'cervix-uteri',
 'colon',
 'colorectum',
 'corpus-uteri',
 'gallbladder',
 'hodgkin-lymphoma',
 'kaposi-sarcoma',
 'kidney',
 'larynx',
 'leukaemia',
 'lip-oral-cavity',
 'liver-and-intrahepatic-bile-ducts',
 'melanoma-of-skin',
 'mesothelioma',
 'multiple-myeloma',
 'non-hodgkin-lymphoma',
 'non-melanoma-skin-cancer',
 'oesophagus',
 'ovary',
 'pancreas',
 'penis',
 'prostate',
 'rectum',
 'salivary-glands',
 'stomach',
 'testis',
 'thyroid',
 'trachea-bronchus-and-lung',
 'vagina',
 'vulva'}

In [74]:
## Replace cancer names in the Globocan dataset to match those of the articles' dataset

df_globocan['Cancer'] = df_globocan['Cancer'].replace('anus', 'Anal cancer')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('bladder', 'Bladder cancer')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('brain-central-nervous system', 'Brain cancer')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('breast', 'Breast cancer')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('cervix-uteri', 'Cervical cancer')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('colon', 'Colon cancer')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('colorectum', 'Colorectal cancer')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('corpus-uteri', 'Uterine cancer')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('gallbladder', 'Gallbladder cancer')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('hodgkin-lymphoma', 'Hodgkin lymphoma')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('kaposi-sarcoma', 'Kaposi sarcoma')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('kidney', 'Kidney cancer')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('larynx', 'Laryngeal cancer')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('leukaemia', 'Leukemia')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('lip-oral-cavity', 'Mouth cancer')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('liver-and-intrahepatic-bile-ducts', 'Liver cancer')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('melanoma-of-skin', 'Skin cancer')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('non-melanoma-skin-cancer', 'Non-melanoma skin cancer')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('mesothelioma', 'Mesothelioma')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('multiple-myeloma', 'Multiple myeloma')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('non-hodgkin-lymphoma', 'Non-Hodgkin lymphoma')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('oesophagus', 'Esophageal cancer')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('ovary', 'Ovarian cancer')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('pancreas', 'Pancreatic cancer')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('penis', 'Penile cancer')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('prostate', 'Prostate cancer')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('rectum', 'Rectal cancer')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('salivary-glands', 'Salivary gland cancer')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('stomach', 'Stomach cancer')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('testis', 'Testicular cancer')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('thyroid', 'Thyroid cancer')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('trachea-bronchus-and-lung', 'Lung cancer')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('vagina', 'Vaginal cancer')
df_globocan['Cancer'] = df_globocan['Cancer'].replace('vulva', 'Vulvar cancer')

In [77]:
# Final Globocan dataset
df_globocan

Unnamed: 0,Alpha-3 code,Cancer code,Population code (ISO/UN),Country,Sex,Number,95% UI low,95% UI high,Number.1,ASR (World),Crude rate,Cumulative risk,Cancer
0,AFG,10.0,4.0,Afghanistan,0.0,110,15.0,793.0,110,0.47,0.27,0.06,Anal cancer
1,ALB,10.0,8.0,Albania,0.0,11,4.0,28.0,11,0.24,0.38,0.03,Anal cancer
2,DZA,10.0,12.0,Algeria,0.0,158,48.0,517.0,158,0.34,0.35,0.04,Anal cancer
3,AGO,10.0,24.0,Angola,0.0,38,35.0,41.0,38,0.24,0.11,0.04,Anal cancer
4,AZE,10.0,31.0,Azerbaijan,0.0,106,8.0,1471.0,106,0.87,1.00,0.08,Anal cancer
...,...,...,...,...,...,...,...,...,...,...,...,...,...
181,WSM,12.0,2646.0,Samoa,0.0,4,2.0,8.0,4,2.20,2.00,0.17,Throat cancer
182,YEM,12.0,2661.0,Yemen,0.0,447,273.0,846.0,447,1.82,1.39,0.17,Throat cancer
183,ZAF,12.0,2130.0,South Africa,0.0,806,508.0,1388.0,806,1.49,1.32,0.17,Throat cancer
184,ZMB,12.0,2682.0,Zambia,0.0,79,12.0,543.0,79,0.84,0.41,0.10,Throat cancer


In [79]:
# Save curated dataset
df_globocan.to_csv(Globocan_output + "Globocan_dataset_ready.csv", index = False)