In [56]:
import os
import pandas as pd
from tqdm import tqdm

import warnings
warnings.simplefilter(action='ignore', category=pd.errors.DtypeWarning)
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

In [57]:
# CSV with the finished Globocan dataset
GLOBOCAN_input = input().strip()

 C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\resources\


In [58]:
# Directory to save Globocan dataset with required columns only
GLOBOCAN_prepared_output = input().strip()

 C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\resources\


In [60]:
# Directory with the csvs of the articles' dataset with country and cancer names matching those of GLOBOCAN dataset
DF_input = input().strip()

 C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\parsedXMLs_combined_until_Oct2025_match_GLOBOCAN_clinical_trials\


In [59]:
# Directory to save aggregated articles' datasets
DF_prepared_output = input().strip()

 C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\resources_clinical_trials\


## Part 1 - Preparing Globocan dataset
Right now, the Globocan dataset has a number of columns which are not required for preparing the visualizations:

In [61]:
df_globocan = pd.read_csv(GLOBOCAN_input + "Globocan_dataset_ready.csv")
df_globocan.columns.to_list()

['Alpha-3 code',
 'Cancer code',
 'Population code (ISO/UN)',
 'Country',
 'Sex',
 'Number',
 '95% UI low',
 '95% UI high',
 'Number.1',
 'ASR (World)',
 'Crude rate',
 'Cumulative risk',
 'Cancer']

### 1.1.- Remove unnecessary columns  
Only the columns **Country**, **Cancer**, **ASR (World)** and **Crude rate** are important in this project. ASR (World) refers to the age-standardized rate, whereas Crude rate refers to the raw (non age-standardized) rate.  Here, a new csv is saved with only these columns.

In [62]:
df_globocan_lite = df_globocan[["Country",  "Cancer", "ASR (World)", "Crude rate"]]
df_globocan_lite.rename(columns = {"ASR (World)":"ASR"}, inplace = True)
df_globocan_lite

Unnamed: 0,Country,Cancer,ASR,Crude rate
0,Afghanistan,Anal cancer,0.47,0.27
1,Albania,Anal cancer,0.24,0.38
2,Algeria,Anal cancer,0.34,0.35
3,Angola,Anal cancer,0.24,0.11
4,Azerbaijan,Anal cancer,0.87,1.00
...,...,...,...,...
6285,Samoa,Skin cancer,0.00,0.00
6286,Yemen,Skin cancer,1.78,0.96
6287,South Africa,Skin cancer,23.90,20.90
6288,Zambia,Skin cancer,3.27,1.59


In [63]:
df_globocan_lite.to_csv(GLOBOCAN_prepared_output + "Globocan_dataset_ready_lite.csv", index = False)

### 1.2.- Cummulative cancer incidence (all cancers) per country
Even though the dataset contains both the ASR and the crude incidence values (see above), in principle only the ASR values will be displayed. Here, a new dataset is created containing the sum of the cancer incidences (ASR) of all cancers per country. Note that **Colorectal cancer** is removed from the dataset before grouping to avoid duplication with the disaggregated data **Anal cancer**, **Rectal cancer**, **Colon cancer**.

In [64]:
df_globocan_cumm_ASR_country = df_globocan_lite.loc[df_globocan_lite["Cancer"] != "Colorectal cancer", ["Country", "ASR"]].groupby(by=["Country"]).sum()

In [65]:
df_globocan_cumm_ASR_country.reset_index(inplace=True)
df_globocan_cumm_ASR_country

Unnamed: 0,Country,ASR
0,Afghanistan,120.49
1,Albania,202.50
2,Algeria,178.86
3,Angola,174.78
4,Argentina,280.81
...,...,...
180,Venezuela,246.03
181,Vietnam,174.88
182,Yemen,90.97
183,Zambia,231.30


In [66]:
df_globocan_cumm_ASR_country.to_csv(GLOBOCAN_prepared_output + "Globocan_dataset_cummulative_ASR_country.csv", index = False)

### 1.3.- Cancer with highest incidence per country
A new dataset will be saved containing the name of the cancer with highest incidence (ASR) per country

In [67]:
highest_incidence_idx = df_globocan_lite.groupby(by="Country")["ASR"].idxmax()
df_globocan_max_ASR_country = df_globocan_lite.iloc[highest_incidence_idx]
df_globocan_max_ASR_country.rename(columns={"Crude rate": "Crude_rate"}, inplace = True)
df_globocan_max_ASR_country

Unnamed: 0,Country,Cancer,ASR,Crude_rate
555,Afghanistan,Breast cancer,29.4,17.9
556,Albania,Breast cancer,51.1,72.6
557,Algeria,Breast cancer,61.9,65.1
4258,Angola,Prostate cancer,47.9,15.1
560,Argentina,Breast cancer,71.3,91.8
...,...,...,...,...
4435,Venezuela,Prostate cancer,52.3,57.1
704,Vietnam,Breast cancer,38.0,49.6
737,Yemen,Breast cancer,25.4,18.6
923,Zambia,Cervical cancer,71.5,37.0


In [68]:
df_globocan_max_ASR_country.to_csv(GLOBOCAN_prepared_output + "Globocan_dataset_max_ASR_country.csv", index = False)

## Part 2 - Calculating aggregated data for articles' dataset
To prepare data in the articles' dataset, aggregates of studies will be calculated grouping by country, cancer type and year.  
Future iterations of the project will deal with animal species as well.

### 2.1.- Aggregate number of articles per year irrespective of country and cancer type 
Here, a csv is prepared aggregating papers by year (i.e. the final csv will have a row per year)

In [69]:
# Import list of csvs to parse
list_csvs_match_GLOBOCAN = []

for file in os.listdir(DF_input):
    if file[-4:] == ".csv":
        list_csvs_match_GLOBOCAN.append(file)

n_csvs_match_GLOBOCAN = len(list_csvs_match_GLOBOCAN)

In [71]:
year_aggregates = {}
for csv in tqdm(list_csvs_match_GLOBOCAN):
    df = pd.read_csv(DF_input + csv)
    dict_agg = df["PublicationDate"].value_counts().to_dict()
    for year in dict_agg.keys():
        if int(year) in year_aggregates.keys():
            year_aggregates[int(year)] += dict_agg[int(year)]
        else:
            year_aggregates[int(year)] = dict_agg[int(year)]
    del df, dict_agg

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:09<00:00,  9.86s/it]


In [72]:
year_aggregates

{2021: 6545,
 2019: 6355,
 2020: 6326,
 2015: 6170,
 2013: 6068,
 2014: 6019,
 2022: 6014,
 2018: 5959,
 2024: 5915,
 2012: 5677,
 2016: 5674,
 2023: 5645,
 2017: 5616,
 2025: 5158,
 2011: 4828,
 2005: 4585,
 2010: 4528,
 2009: 4381,
 2006: 4345,
 2007: 4335,
 2004: 4299,
 2008: 4167,
 2003: 3900,
 1999: 3758,
 2000: 3563,
 1995: 3480,
 1994: 3464,
 1997: 3416,
 2002: 3416,
 1998: 3410,
 1996: 3311,
 2001: 3262,
 1993: 2260,
 1992: 2147,
 1991: 1940,
 1990: 1823,
 1989: 1424,
 1988: 1153,
 1986: 1020,
 1987: 1012,
 1985: 933,
 1984: 854}

In [73]:
# Convert to pd dataframe
df_year_agg = pd.DataFrame.from_dict(year_aggregates, orient="index", columns = ["Articles"])
df_year_agg.reset_index(inplace=True)
df_year_agg.rename(columns={"index": "Year"}, inplace=True)
df_year_agg.sort_values(by=["Year"], ascending= True, inplace=True)

In [74]:
df_year_agg

Unnamed: 0,Year,Articles
41,1984,854
40,1985,933
38,1986,1020
39,1987,1012
37,1988,1153
36,1989,1424
35,1990,1823
34,1991,1940
33,1992,2147
32,1993,2260


In [75]:
# Save
df_year_agg.to_csv(DF_prepared_output + "articles_year.csv", index=False)

### 2.2.- Aggregate number of articles per country and year irrespective of cancer type
Here, a csv is prepared aggregating papers by year and country (i.e. the final csv will have a row per year and country)

In [76]:
year_country_aggregates = {}

for csv in tqdm(list_csvs_match_GLOBOCAN):
    df = pd.read_csv(DF_input + csv)
    dict_agg = df[["Country", "PublicationDate"]].value_counts().to_dict()
    for key in dict_agg.keys():
        if key[0] in year_country_aggregates.keys() and int(key[1]) in year_country_aggregates[key[0]].keys():
            year_country_aggregates[key[0]][int(key[1])] += dict_agg[(key[0], key[1])]
        elif key[0] in year_country_aggregates.keys():
            year_country_aggregates[key[0]][int(key[1])] = dict_agg[(key[0], key[1])]
        else:
            year_country_aggregates[key[0]] = {}
            year_country_aggregates[key[0]][int(key[1])] = dict_agg[(key[0], key[1])]

year_country_aggregates

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.52s/it]


{'United States': {2021: 1998,
  2019: 1994,
  2018: 1986,
  2015: 1938,
  2020: 1893,
  2013: 1871,
  2017: 1867,
  2024: 1805,
  2022: 1799,
  2014: 1769,
  2016: 1754,
  2012: 1740,
  2023: 1684,
  2011: 1551,
  2025: 1550,
  2009: 1432,
  2010: 1410,
  2008: 1407,
  2007: 1371,
  2005: 1325,
  2006: 1292,
  2004: 1285,
  2003: 1238,
  1999: 1110,
  2000: 1092,
  2002: 1077,
  1995: 1069,
  1997: 1044,
  2001: 1038,
  1998: 1012,
  1994: 998,
  1996: 954,
  1993: 812,
  1992: 699,
  1991: 627,
  1990: 403,
  1989: 287,
  1988: 276,
  1987: 100,
  1985: 3},
 'China': {2024: 1218,
  2022: 1145,
  2023: 1121,
  2025: 1073,
  2021: 928,
  2020: 862,
  2019: 651,
  2015: 601,
  2018: 571,
  2014: 566,
  2016: 546,
  2013: 528,
  2012: 515,
  2017: 501,
  2011: 394,
  2010: 337,
  2009: 310,
  2005: 254,
  2008: 249,
  2006: 246,
  2007: 239,
  2004: 211,
  2003: 203,
  2002: 145,
  1994: 112,
  2001: 103,
  2000: 100,
  1997: 92,
  1998: 92,
  1995: 89,
  1999: 88,
  1996: 59,
  1993: 32

In [77]:
# Create df from dict
dfs_year_country = []
for country in year_country_aggregates.keys():
    df = pd.DataFrame.from_dict(year_country_aggregates[country], orient="index", columns = ["Articles"])
    df.reset_index(inplace=True)
    df.rename(columns={"index": "Year"}, inplace=True)
    df.sort_values(by=["Year"], ascending= True, inplace=True)
    df["Country"] = country
    dfs_year_country.append(df)

df_year_country_agg = pd.concat(dfs_year_country)

In [78]:
df_year_country_agg

Unnamed: 0,Year,Articles,Country
39,1985,3,United States
38,1987,100,United States
37,1988,276,United States
36,1989,287,United States
35,1990,403,United States
...,...,...,...
4,2005,1,Zimbabwe
3,2016,1,Zimbabwe
2,2020,1,Zimbabwe
1,2022,1,Zimbabwe


In [79]:
# Save
df_year_country_agg.to_csv(DF_prepared_output + "articles_year_country.csv", index=False)

### 2.3.- Aggregate number of articles per cancer type and year irrespective of country
Here, a csv is prepared aggregating papers by year and cancer type (i.e. the final csv will have a row per year and cancer type). Cancers names are hardcoded from a previous notebook in this project.

In [80]:
cancer_names = [
 'Anal cancer',
 'Bladder cancer',
 'Brain cancer',
 'Breast cancer',
 'Cervical cancer',
 'Colon cancer',
 'Colorectal cancer',
 'Esophageal cancer',
 'Gallbladder cancer',
 'Hodgkin lymphoma',
 'Kaposi sarcoma',
 'Kidney cancer',
 'Laryngeal cancer',
 'Leukemia',
 'Liver cancer',
 'Lung cancer',
 'Mesothelioma',
 'Mouth cancer',
 'Multiple myeloma',
 'Non-Hodgkin lymphoma',
 'Other cancer',
 'Ovarian cancer',
 'Pancreatic cancer',
 'Penile cancer',
 'Prostate cancer',
 'Rectal cancer',
 'Salivary gland cancer',
 'Skin cancer',
 'Stomach cancer',
 'Testicular cancer',
 'Throat cancer',
 'Thyroid cancer',
 'Undetermined cancer',
 'Uterine cancer',
 'Vaginal cancer',
 'Vulvar cancer'
]

In [81]:
# Create and structure dict to hold data about publications on each cancer type
year_cancer_aggregates = {}

for cancer in cancer_names:
    year_cancer_aggregates[cancer] = {}

# Parse all articles' csv and count the number of studies per cancer and country
for csv in tqdm(list_csvs_match_GLOBOCAN):
    df = pd.read_csv(DF_input + csv)
    for row in df.index:
        for cancer in cancer_names:
            if cancer in df.columns.to_list():
                year = df.at[row, "PublicationDate"]
                if pd.notna(year) and df.at[row, cancer] > 0:
                    if int(year) in year_cancer_aggregates[cancer].keys():
                        year_cancer_aggregates[cancer][int(year)] += 1
                    else:
                        year_cancer_aggregates[cancer][int(year)] = 1

    del df, cancer, year, csv

year_cancer_aggregates

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [01:11<00:00, 71.22s/it]


{'Anal cancer': {2025: 7,
  2001: 2,
  2002: 1,
  2003: 2,
  2004: 1,
  2005: 4,
  2006: 2,
  2007: 4,
  2008: 4,
  2009: 3,
  2010: 3,
  2011: 5,
  2012: 2,
  2013: 12,
  2014: 8,
  2015: 7,
  2016: 8,
  2017: 5,
  2018: 5,
  2019: 6,
  2020: 3,
  2021: 6,
  2022: 11,
  2023: 5,
  2024: 7,
  1994: 1,
  1993: 1,
  1996: 3,
  1997: 2,
  1998: 2},
 'Bladder cancer': {1984: 19,
  2024: 78,
  2025: 69,
  1992: 44,
  1991: 32,
  2001: 41,
  2000: 60,
  2002: 38,
  1999: 75,
  2003: 38,
  2004: 34,
  2005: 44,
  2006: 37,
  2007: 28,
  2008: 34,
  2009: 35,
  2010: 54,
  2011: 37,
  2012: 42,
  2013: 64,
  1990: 39,
  1985: 23,
  1987: 21,
  1988: 28,
  1989: 32,
  2014: 56,
  2015: 50,
  2016: 57,
  2017: 45,
  2018: 63,
  1986: 15,
  2019: 41,
  2020: 56,
  2021: 63,
  2022: 76,
  2023: 83,
  1995: 55,
  1993: 28,
  1994: 70,
  1996: 67,
  1997: 48,
  1998: 41},
 'Brain cancer': {1984: 8,
  2024: 73,
  2025: 51,
  1992: 37,
  1991: 30,
  1990: 22,
  2001: 61,
  2000: 79,
  1999: 91,
  2002

In [82]:
# Create df from dict
dfs_year_cancer = []
for cancer in year_cancer_aggregates.keys():
    df = pd.DataFrame.from_dict(year_cancer_aggregates[cancer], orient="index", columns = ["Articles"])
    df.reset_index(inplace=True)
    df.rename(columns={"index": "Year"}, inplace=True)
    df.sort_values(by=["Year"], ascending= True, inplace=True)
    df["Cancer"] = cancer
    dfs_year_cancer.append(df)

df_year_cancer_agg = pd.concat(dfs_year_cancer)

In [83]:
df_year_cancer_agg

Unnamed: 0,Year,Articles,Cancer
26,1993,1,Anal cancer
25,1994,1,Anal cancer
27,1996,3,Anal cancer
28,1997,2,Anal cancer
29,1998,2,Anal cancer
...,...,...,...
21,2021,4,Vulvar cancer
22,2022,2,Vulvar cancer
24,2023,1,Vulvar cancer
0,2024,3,Vulvar cancer


In [84]:
# Save
df_year_cancer_agg.to_csv(DF_prepared_output + "articles_year_cancer.csv", index=False)

### 2.4.- Aggregate number of articles per cancer per country

Here, a csv is generated in which, for each country, the number of studies per cancer, in all years combined, is calculated. 

In [85]:
# Create and structure dict to hold data about publications on each cancer type
cancer_country_aggregates = {}

for country in set(df_globocan_lite["Country"]):
    cancer_country_aggregates[country] = {}

    for cancer in cancer_names:
        cancer_country_aggregates[country][cancer] = 0

# Parse all articles' csv and count the number of studies per cancer and country
for csv in tqdm(list_csvs_match_GLOBOCAN):
    df = pd.read_csv(DF_input + csv)
    for row in df.index:
        for cancer in cancer_names:
            if cancer in df.columns.to_list():
                country = df.at[row, "Country"]
                if pd.notna(country) and df.at[row, cancer] > 0:
                    cancer_country_aggregates[country][cancer] += 1

    del df, cancer, country, csv

cancer_country_aggregates

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [01:09<00:00, 69.09s/it]


{'Sudan': {'Anal cancer': 0,
  'Bladder cancer': 0,
  'Brain cancer': 0,
  'Breast cancer': 1,
  'Cervical cancer': 0,
  'Colon cancer': 0,
  'Colorectal cancer': 0,
  'Esophageal cancer': 0,
  'Gallbladder cancer': 0,
  'Hodgkin lymphoma': 0,
  'Kaposi sarcoma': 0,
  'Kidney cancer': 0,
  'Laryngeal cancer': 0,
  'Leukemia': 0,
  'Liver cancer': 0,
  'Lung cancer': 0,
  'Mesothelioma': 0,
  'Mouth cancer': 0,
  'Multiple myeloma': 0,
  'Non-Hodgkin lymphoma': 0,
  'Other cancer': 1,
  'Ovarian cancer': 0,
  'Pancreatic cancer': 0,
  'Penile cancer': 0,
  'Prostate cancer': 0,
  'Rectal cancer': 0,
  'Salivary gland cancer': 0,
  'Skin cancer': 0,
  'Stomach cancer': 0,
  'Testicular cancer': 0,
  'Throat cancer': 0,
  'Thyroid cancer': 0,
  'Undetermined cancer': 2,
  'Uterine cancer': 0,
  'Vaginal cancer': 0,
  'Vulvar cancer': 0},
 'Samoa': {'Anal cancer': 0,
  'Bladder cancer': 0,
  'Brain cancer': 0,
  'Breast cancer': 0,
  'Cervical cancer': 0,
  'Colon cancer': 0,
  'Colorectal

In [86]:
# Create df from dict
dfs_cancer_country = []
for country in cancer_country_aggregates.keys():
    df = pd.DataFrame.from_dict(cancer_country_aggregates[country], orient="index", columns = ["Articles"])
    df.reset_index(inplace=True)
    df.rename(columns={"index": "Cancer"}, inplace=True)
    df["Country"] = country
    dfs_cancer_country.append(df)

df_cancer_country_agg = pd.concat(dfs_cancer_country)

In [87]:
df_cancer_country_agg

Unnamed: 0,Cancer,Articles,Country
0,Anal cancer,0,Sudan
1,Bladder cancer,0,Sudan
2,Brain cancer,0,Sudan
3,Breast cancer,1,Sudan
4,Cervical cancer,0,Sudan
...,...,...,...
31,Thyroid cancer,0,Iceland
32,Undetermined cancer,4,Iceland
33,Uterine cancer,0,Iceland
34,Vaginal cancer,0,Iceland


In [88]:
# Save
df_cancer_country_agg.to_csv(DF_prepared_output + "articles_cancer_country.csv", index=False)

### 2.5.- Most studied cancer per country

By using the aggregated data from the last dataframe, a new dataframe is obtained containing a column with the most studied cancer in each country. Here the categories **Other cancer** and **Undetermined cancer** are not taken into consideration

In [89]:
df_articles_filtered = df_cancer_country_agg.loc[~df_cancer_country_agg["Cancer"].isin(["Other cancer", "Undetermined cancer"])]

In [90]:
list_most_studied_cancer = []

for country in set(df_globocan_lite["Country"]):
    df = df_articles_filtered.loc[df_articles_filtered["Country"] == country]
    list_most_studied_cancer.append(df.sort_values(by="Articles", ascending=False)[:1])

df_max_studied_cancer = pd.concat(list_most_studied_cancer).sort_values(by="Country")
df_max_studied_cancer

Unnamed: 0,Cancer,Articles,Country
0,Anal cancer,0,Afghanistan
0,Anal cancer,0,Albania
6,Colorectal cancer,1,Algeria
18,Multiple myeloma,1,Angola
13,Leukemia,26,Argentina
...,...,...,...
3,Breast cancer,4,Venezuela
13,Leukemia,2,Vietnam
0,Anal cancer,0,Yemen
4,Cervical cancer,2,Zambia


In [91]:
# Save
df_max_studied_cancer.to_csv(DF_prepared_output + "articles_cancer_most_studied_country.csv", index = False)

### 2.6.- Number of cancer studies per country and cancer type

Here, a csv is generated in which, for each country and cancer type, the number of studies per year is calculated.

In [92]:
# Create and structure dict to hold data about publications on each cancer type
country_year_cancer_aggregates = {}

for country in set(df_globocan_lite["Country"]):
    country_year_cancer_aggregates[country] = {}
    for cancer in cancer_names:
        country_year_cancer_aggregates[country][cancer] = {}

for csv in tqdm(list_csvs_match_GLOBOCAN):
    df = pd.read_csv(DF_input + csv)
    for country in set(df["Country"]):
        if pd.notna(country):
            df_country = df.loc[df["Country"] == country]
            for cancer in cancer_names:
                if cancer in df.columns.to_list():
                    df_country_cancer = df_country.loc[df_country[cancer] > 0]
                    for year in set(df_country_cancer["PublicationDate"]):
                        if pd.notna(year):
                            if int(year) in country_year_cancer_aggregates[country][cancer].keys():
                                country_year_cancer_aggregates[country][cancer][int(year)] += len(df_country_cancer.loc[df_country_cancer["PublicationDate"] == year])
                            else:
                                country_year_cancer_aggregates[country][cancer][int(year)] = len(df_country_cancer.loc[df_country_cancer["PublicationDate"] == year])
                        
country_year_cancer_aggregates  

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:16<00:00, 16.01s/it]


{'Sudan': {'Anal cancer': {},
  'Bladder cancer': {},
  'Brain cancer': {},
  'Breast cancer': {2018: 1},
  'Cervical cancer': {},
  'Colon cancer': {},
  'Colorectal cancer': {},
  'Esophageal cancer': {},
  'Gallbladder cancer': {},
  'Hodgkin lymphoma': {},
  'Kaposi sarcoma': {},
  'Kidney cancer': {},
  'Laryngeal cancer': {},
  'Leukemia': {},
  'Liver cancer': {},
  'Lung cancer': {},
  'Mesothelioma': {},
  'Mouth cancer': {},
  'Multiple myeloma': {},
  'Non-Hodgkin lymphoma': {},
  'Other cancer': {2013: 1},
  'Ovarian cancer': {},
  'Pancreatic cancer': {},
  'Penile cancer': {},
  'Prostate cancer': {},
  'Rectal cancer': {},
  'Salivary gland cancer': {},
  'Skin cancer': {},
  'Stomach cancer': {},
  'Testicular cancer': {},
  'Throat cancer': {},
  'Thyroid cancer': {},
  'Undetermined cancer': {2017: 1, 2019: 1},
  'Uterine cancer': {},
  'Vaginal cancer': {},
  'Vulvar cancer': {}},
 'Samoa': {'Anal cancer': {},
  'Bladder cancer': {},
  'Brain cancer': {},
  'Breast c

In [93]:
# Assign value 0 to any country, cancer, year combination without a value in the aggregates (e.g. without any published article)
years_articles_dataset = []

for country in country_year_cancer_aggregates.keys():
    for cancer in cancer_names:
        years_articles_dataset += list(country_year_cancer_aggregates[country][cancer].keys())

years_articles_dataset = set(years_articles_dataset)

for year in years_articles_dataset:
    for country in country_year_cancer_aggregates.keys():
        for cancer in country_year_cancer_aggregates[country].keys():
            if year not in country_year_cancer_aggregates[country][cancer].keys():
                country_year_cancer_aggregates[country][cancer][year] = 0

In [94]:
# Create df from dict
dfs_contry_year_cancer = []
for country in country_year_cancer_aggregates.keys():
    for cancer in country_year_cancer_aggregates[country].keys():
        df = pd.DataFrame.from_dict(country_year_cancer_aggregates[country][cancer], orient="index", columns = ["Articles"])
        df.reset_index(inplace=True)
        df.rename(columns={"index": "Year"}, inplace=True)
        df["Country"] = country
        df["Cancer"] = cancer
        dfs_contry_year_cancer.append(df)

        del df

df_country_year_cancer_agg = pd.concat(dfs_contry_year_cancer)
df_country_year_cancer_agg["Articles"] = df_country_year_cancer_agg["Articles"].astype(int)

In [95]:
df_country_year_cancer_agg

Unnamed: 0,Year,Articles,Country,Cancer
0,1985,0,Sudan,Anal cancer
1,1986,0,Sudan,Anal cancer
2,1987,0,Sudan,Anal cancer
3,1988,0,Sudan,Anal cancer
4,1989,0,Sudan,Anal cancer
...,...,...,...,...
36,2021,0,Iceland,Vulvar cancer
37,2022,0,Iceland,Vulvar cancer
38,2023,0,Iceland,Vulvar cancer
39,2024,0,Iceland,Vulvar cancer


In [96]:
df_country_year_cancer_agg.to_csv(DF_prepared_output + "articles_country_year_cancer.csv", index = False)

### 2.7.- Top 5 countries with largest number of studies per cancer

In [97]:
dfs_top5_countries_studies_per_cancer = []
for cancer in set(df_cancer_country_agg["Cancer"]):
    df = df_cancer_country_agg.loc[df_cancer_country_agg["Cancer"] == cancer].sort_values(by=["Articles"],ascending=False).head(5)
    dfs_top5_countries_studies_per_cancer.append(df)
    del df, cancer

df_top5_countries_studies_per_cancer = pd.concat(dfs_top5_countries_studies_per_cancer)
df_top5_countries_studies_per_cancer

Unnamed: 0,Cancer,Articles,Country
17,Mouth cancer,2,Iran
17,Mouth cancer,2,United Kingdom
17,Mouth cancer,1,Russia
17,Mouth cancer,1,Czechia
17,Mouth cancer,1,United States
...,...,...,...
21,Ovarian cancer,1357,United States
21,Ovarian cancer,284,United Kingdom
21,Ovarian cancer,278,Italy
21,Ovarian cancer,221,Germany


In [98]:
df_top5_countries_studies_per_cancer.to_csv(DF_prepared_output + "articles_top5_countries_studies_per_cancer.csv", index = False)

### 2.8- Number of cancer studies per country and cancer type, light version (only countries with many cancer studies)

Some plots show the top 5 countries by number of articles for a given cancer. In the previous step, the top5 countries per cancer have been calculated. Therefore, to make calculations in these plots faster, a light version of the previous df is obtained keeping only these countries with most number of cancer articles for each cancer

In [99]:
# Filter the dataset to keep only these countries
countries_top_articles = set(df_top5_countries_studies_per_cancer["Country"])
df_country_year_cancer_agg_lite = df_country_year_cancer_agg.loc[df_country_year_cancer_agg["Country"].isin(countries_top_articles)]
df_country_year_cancer_agg_lite

Unnamed: 0,Year,Articles,Country,Cancer
0,1985,0,Samoa,Anal cancer
1,1986,0,Samoa,Anal cancer
2,1987,0,Samoa,Anal cancer
3,1988,0,Samoa,Anal cancer
4,1989,0,Samoa,Anal cancer
...,...,...,...,...
36,2011,0,United States,Vulvar cancer
37,2013,0,United States,Vulvar cancer
38,2015,0,United States,Vulvar cancer
39,2019,0,United States,Vulvar cancer


In [100]:
df_country_year_cancer_agg_lite.to_csv(DF_prepared_output + "articles_country_year_cancer_lite.csv", index = False)

## Part 3 - Calculating aggregated data for articles' and Globocan datasets

### 3.1 Incidence and number of studies for each cancer per country

Here, the number of studies and ASR (incidence) per cancer and country are combined

In [101]:
df_cancer_country_agg

Unnamed: 0,Cancer,Articles,Country
0,Anal cancer,0,Sudan
1,Bladder cancer,0,Sudan
2,Brain cancer,0,Sudan
3,Breast cancer,1,Sudan
4,Cervical cancer,0,Sudan
...,...,...,...
31,Thyroid cancer,0,Iceland
32,Undetermined cancer,4,Iceland
33,Uterine cancer,0,Iceland
34,Vaginal cancer,0,Iceland


In [102]:
df_cancer_country_studies_ASR_agg = df_cancer_country_agg.loc[~df_cancer_country_agg["Cancer"].isin(["Undetermined cancer", "Other cancer"])]
df_cancer_country_studies_ASR_agg["ASR"] = None

for country in set(df_globocan_lite["Country"]):
    for cancer in set(df_globocan_lite["Cancer"]):
        df_cancer_country_studies_ASR_agg.loc[(df_cancer_country_studies_ASR_agg["Country"] == country) & (df_cancer_country_studies_ASR_agg["Cancer"] == cancer), "ASR"] = df_globocan_lite.loc[(df_globocan_lite["Country"] == country) & (df_globocan_lite["Cancer"] == cancer), "ASR"].values[0]

In [103]:
df_cancer_country_studies_ASR_agg

Unnamed: 0,Cancer,Articles,Country,ASR
0,Anal cancer,0,Sudan,0.12
1,Bladder cancer,0,Sudan,2.2
2,Brain cancer,0,Sudan,1.4
3,Breast cancer,1,Sudan,39.9
4,Cervical cancer,0,Sudan,8.6
...,...,...,...,...
30,Throat cancer,0,Iceland,0.57
31,Thyroid cancer,0,Iceland,6.9
33,Uterine cancer,0,Iceland,17.9
34,Vaginal cancer,0,Iceland,0.27


In [104]:
# Add a column with number of articles normalized by population
# Data about population per country obtained (for almost all countries) from: https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population
country_populations = {
    "Peru": 34350244,
    "Philippines": 114123600,
    "Jamaica": 2825544,
    "Moldova": 2381300,
    "Luxembourg": 681973,
    "Bolivia": 11312620,
    "Zimbabwe": 17073087,
    "Mozambique": 34090466,
    "Lithuania": 2894177,
    "DR Congo": 112832000,
    "Ethiopia": 111652998,
    "Vietnam": 101343800,
    "Iceland": 391810,
    "North Macedonia": 1826247,
    "Netherlands": 18080943,
    "Norway": 5606944,
    "South Korea": 51159889,
    "Gambia": 2422712,
    "Solomon Islands": 750325,
    "Nicaragua": 6803886,
    "Austria": 9200931,
    "Reunion": 896175,
    "Nigeria": 223800000,
    "Thailand": 65859640,
    "Tanzania": 68153004,
    "Congo Republic": 6142180,
    "New Caledonia": 264596,
    "Vanuatu": 321409,
    "Cambodia": 17577760,
    "Russia": 146028325,
    "French Polynesia": 279500,
    "United Kingdom": 68265209,
    "Somalia": 19655000,
    "Central African Republic": 6470307,
    "Dominican Republic": 10771504,
    "Ecuador": 18103660,
    "Timor-Leste": 1391221,
    "Tajikistan": 10499000,
    "India": 1417492000,
    "New Zealand": 5324700,
    "Türkiye": 85664944,
    "Spain": 49315949,
    "Israel": 10119400,
    "Sri Lanka": 21763170,
    "Georgia": 3704500,
    "Bahamas": 398165,
    "Rwanda": 14104969,
    "Argentina": 47067641,
    "Gabon": 2469296,
    "Chad": 19340757,
    "St. Lucia": 184100,
    "Indonesia": 284438782,
    "Botswana": 2359609,
    "Senegal": 18593258,
    "Sweden": 10592686,
    "Liberia": 5248621,
    "Haiti": 11867032,
    "Poland": 37401000,
    "Martinique": 349925,
    "Benin": 13224860,
    "Trinidad and Tobago": 1368333,
    "Micronesia, Fed. Sts.": 105564,
    "Iraq": 46118793,
    "Eritrea": 3607000,
    "Azerbaijan": 10241722,
    "Libya": 7459000,
    "Slovakia": 5413813,
    "Namibia": 3022401,
    "Uruguay": 3499451,
    "Afghanistan": 43844000,
    "Chile": 20206953,
    "North Korea": 25950000,
    "Lebanon": 5490000,
    "Bulgaria": 6437360,
    "Burkina Faso": 24070553,
    "Mali": 22395489,
    "Sudan": 51662000,
    "Serbia": 6567783,
    "Ireland": 5458600,
    "Bosnia and Herzegovina": 3422000,
    "Cuba": 9748007,
    "Guinea-Bissau": 1781308,
    "Angola": 36170961,
    "Japan": 123300000,
    "Hungary": 9539502,
    "Syria": 25620000,
    "China": 1408280000,
    "Switzerland": 9067144,
    "Belize": 417634,
    "Nepal": 29911840,
    "Paraguay": 6109644,
    "Italy": 58919837,
    "Portugal": 10749635,
    "Cameroon": 29442327,
    "Colombia": 52695952,
    "Algeria": 47400000,
    "Denmark": 6002420,
    "Samoa": 205557,
    "Romania": 19036031,
    "United States": 340110988,
    "Djibouti": 1066809,
    "Egypt": 107271260,
    "Venezuela": 28517000,
    "Eswatini": 1235549,
    "Mauritius": 1243741,
    "Laos": 7647000,
    "Mauritania": 4927532,
    "United Arab Emirates": 10678556,
    "Niger": 26312034,
    "Uzbekistan": 37859698,
    "South Sudan": 15786898,
    "Mongolia": 3544835,
    "Cote d'Ivoire": 29389150,
    "Estonia": 1369995,
    "Ghana": 33742380,
    "Fiji": 900869,
    "Malaysia": 34231700,
    "Togo": 8095498,
    "Cabo Verde": 491233,
    "Uganda": 45905417,
    "Yemen": 32684503,
    "Guadeloupe": 378561,
    "Australia": 27400013,
    "Montenegro": 623327,
    "Myanmar": 51316756,
    "Singapore": 6036900,
    "Pakistan": 241499431,
    "Belgium": 11825551,
    "Oman": 5306976,
    "Jordan": 11734000,
    "Costa Rica": 5309625,
    "Greece": 10400720,
    "Comoros": 870038,
    "Kazakhstan": 20387811,
    "Bangladesh": 169828911,
    "Czechia": 10876875,
    "Lesotho": 2306000,
    "Guyana": 772975,
    "Brunei Darussalam": 455500,
    "Guam": 153836,
    "Palestine": 5483450,
    "Morocco": 37712153,
    "Bhutan": 784043,
    "Malawi": 20734262,
    "Guatemala": 18079810,
    "Tunisia": 11972169,
    "Mexico": 130575786,
    "Suriname": 616500,
    "Bahrain": 1594654,
    "South Africa": 63100945,
    "Latvia": 1830400,
    "Kenya": 53330978,
    "Turkmenistan": 7057841,
    "Cyprus": 966400,
    "Puerto Rico": 3203295,
    "Slovenia": 2130638,
    "Belarus": 9109280,
    "Sierra Leone": 9077691,
    "Finland": 5645651,
    "Qatar": 3173024,
    "Kyrgyz Republic": 7281800,
    "Albania": 2363314,
    "Germany": 83577140,
    "Malta": 574250,
    "Maldives": 515132,
    "Guinea": 14363931,
    "Saudi Arabia": 35300280,
    "Panama": 4064780,
    "Ukraine": 32862000,
    "Burundi": 12332788,
    "France": 68668000,
    "Brazil": 212583750,
    "Barbados": 267800,
    "Honduras": 9892632,
    "Zambia": 19693423,
    "Canada": 41548787,
    "French Guiana": 292354,
    "Armenia": 3081100,
    "Kuwait": 4881254,
    "Croatia": 3859686,
    "Madagascar": 31727042,
    "Sao Tome and Principe": 235536,
    "El Salvador": 6029976,
    "Papua New Guinea": 11781559,
    "Iran": 85961000,
}

for country in set(df_cancer_country_studies_ASR_agg["Country"]):
    df_cancer_country_studies_ASR_agg.loc[df_cancer_country_studies_ASR_agg["Country"] == country, "Norm_articles"] = df_cancer_country_studies_ASR_agg.loc[df_cancer_country_studies_ASR_agg["Country"] == country, "Articles"] * 1000000 / country_populations[country]

In [105]:
df_cancer_country_studies_ASR_agg

Unnamed: 0,Cancer,Articles,Country,ASR,Norm_articles
0,Anal cancer,0,Sudan,0.12,0.000000
1,Bladder cancer,0,Sudan,2.2,0.000000
2,Brain cancer,0,Sudan,1.4,0.000000
3,Breast cancer,1,Sudan,39.9,0.019357
4,Cervical cancer,0,Sudan,8.6,0.000000
...,...,...,...,...,...
30,Throat cancer,0,Iceland,0.57,0.000000
31,Thyroid cancer,0,Iceland,6.9,0.000000
33,Uterine cancer,0,Iceland,17.9,0.000000
34,Vaginal cancer,0,Iceland,0.27,0.000000


In [106]:
# Save
df_cancer_country_studies_ASR_agg.to_csv(DF_prepared_output + "articles_ASR_country_cancer.csv", index = False)

### 3.2 Incidence and number of studies for each cancer per country, percentage values

In [107]:
df_cancer_country_studies_ASR_agg

Unnamed: 0,Cancer,Articles,Country,ASR,Norm_articles
0,Anal cancer,0,Sudan,0.12,0.000000
1,Bladder cancer,0,Sudan,2.2,0.000000
2,Brain cancer,0,Sudan,1.4,0.000000
3,Breast cancer,1,Sudan,39.9,0.019357
4,Cervical cancer,0,Sudan,8.6,0.000000
...,...,...,...,...,...
30,Throat cancer,0,Iceland,0.57,0.000000
31,Thyroid cancer,0,Iceland,6.9,0.000000
33,Uterine cancer,0,Iceland,17.9,0.000000
34,Vaginal cancer,0,Iceland,0.27,0.000000


In [108]:
dfs_percentage_values = []

for country in set(df_cancer_country_studies_ASR_agg["Country"]):
    df = df_cancer_country_studies_ASR_agg.loc[df_cancer_country_studies_ASR_agg["Country"] == country]
    df = df.astype({"Articles":float, "ASR":float})
    total_articles = df["Articles"].sum()
    total_incidence = df["ASR"].sum()
    for row in df.index:
        if total_articles > 0:
            df.at[row, "Articles"] = df.at[row, "Articles"]*100/total_articles
        df.at[row, "ASR"] = df.at[row, "ASR"]*100/total_incidence
    dfs_percentage_values.append(df)
    del df, total_articles, total_incidence

df_cancer_country_studies_ASR_agg_percentage = pd.concat(dfs_percentage_values)
df_cancer_country_studies_ASR_agg_percentage.drop(columns=["Norm_articles"], inplace = True)
df_cancer_country_studies_ASR_agg_percentage

Unnamed: 0,Cancer,Articles,Country,ASR
0,Anal cancer,0.0,Sudan,0.096587
1,Bladder cancer,0.0,Sudan,1.770766
2,Brain cancer,0.0,Sudan,1.126851
3,Breast cancer,100.0,Sudan,32.115261
4,Cervical cancer,0.0,Sudan,6.922086
...,...,...,...,...
30,Throat cancer,0.0,Iceland,0.151382
31,Thyroid cancer,0.0,Iceland,1.832523
33,Uterine cancer,0.0,Iceland,4.753937
34,Vaginal cancer,0.0,Iceland,0.071707


In [109]:
df_cancer_country_studies_ASR_agg_percentage.to_csv(DF_prepared_output + "articles_ASR_country_cancer_percentage.csv", index = False)

### 3.3 Incidence and number of studies for each cancer per country, percentage values, restrict to countries with more than 1M people

In [110]:
countries_1M = []
for country in country_populations:
    if country_populations[country] >= 1000000:
        countries_1M.append(country)

df_cancer_country_studies_ASR_agg_percentage_1M = df_cancer_country_studies_ASR_agg.loc[df_cancer_country_studies_ASR_agg["Country"].isin(countries_1M)]

In [111]:
df_cancer_country_studies_ASR_agg_percentage_1M.to_csv(DF_prepared_output + "articles_ASR_country_cancer_1M.csv", index = False)