In [57]:
import os
import pandas as pd
from tqdm import tqdm

import warnings
warnings.simplefilter(action='ignore', category=pd.errors.DtypeWarning)
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

In [58]:
# CSV with the finished Globocan dataset
GLOBOCAN_input = input().strip()

 C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\resources\


In [59]:
# Directory to save Globocan dataset with required columns only
GLOBOCAN_prepared_output = input().strip()

 C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\resources\


In [60]:
# Directory with the csvs of the articles' dataset with country and cancer names matching those of GLOBOCAN dataset
DF_input = input().strip()

 C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\parsedXMLs_combined_until_Sept2025_match_GLOBOCAN\


In [61]:
# Directory to save aggregated articles' datasets
DF_prepared_output = input().strip()

 C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\resources\


## Part 1 - Preparing Globocan dataset
Right now, the Globocan dataset has a number of columns which are not required for preparing the visualizations:

In [62]:
df_globocan = pd.read_csv(GLOBOCAN_input + "Globocan_dataset_ready.csv")
df_globocan.columns.to_list()

['Alpha-3 code',
 'Cancer code',
 'Population code (ISO/UN)',
 'Country',
 'Sex',
 'Number',
 '95% UI low',
 '95% UI high',
 'Number.1',
 'ASR (World)',
 'Crude rate',
 'Cumulative risk',
 'Cancer']

### 1.1.- Remove unnecessary columns  
Only the columns **Country**, **Cancer**, **ASR (World)** and **Crude rate** are important in this project. ASR (World) refers to the age-standardized rate, whereas Crude rate refers to the raw (non age-standardized) rate.  Here, a new csv is saved with only these columns.

In [63]:
df_globocan_lite = df_globocan[["Country",  "Cancer", "ASR (World)", "Crude rate"]]
df_globocan_lite.rename(columns = {"ASR (World)":"ASR"}, inplace = True)
df_globocan_lite

Unnamed: 0,Country,Cancer,ASR,Crude rate
0,Afghanistan,Anal cancer,0.47,0.27
1,Albania,Anal cancer,0.24,0.38
2,Algeria,Anal cancer,0.34,0.35
3,Angola,Anal cancer,0.24,0.11
4,Azerbaijan,Anal cancer,0.87,1.00
...,...,...,...,...
6285,Samoa,Skin cancer,0.00,0.00
6286,Yemen,Skin cancer,1.78,0.96
6287,South Africa,Skin cancer,23.90,20.90
6288,Zambia,Skin cancer,3.27,1.59


In [64]:
df_globocan_lite.to_csv(GLOBOCAN_prepared_output + "Globocan_dataset_ready_lite.csv", index = False)

### 1.2.- Cummulative cancer incidence (all cancers) per country
Even though the dataset contains both the ASR and the crude incidence values (see above), in principle only the ASR values will be displayed. Here, a new dataset is created containing the sum of the cancer incidences (ASR) of all cancers per country. Note that **Colorectal cancer** is removed from the dataset before grouping to avoid duplication with the disaggregated data **Anal cancer**, **Rectal cancer**, **Colon cancer**.

In [65]:
df_globocan_cumm_ASR_country = df_globocan_lite.loc[df_globocan_lite["Cancer"] != "Colorectal cancer", ["Country", "ASR"]].groupby(by=["Country"]).sum()

In [66]:
df_globocan_cumm_ASR_country.reset_index(inplace=True)
df_globocan_cumm_ASR_country

Unnamed: 0,Country,ASR
0,Afghanistan,120.49
1,Albania,202.50
2,Algeria,178.86
3,Angola,174.78
4,Argentina,280.81
...,...,...
180,Venezuela,246.03
181,Vietnam,174.88
182,Yemen,90.97
183,Zambia,231.30


In [67]:
df_globocan_cumm_ASR_country.to_csv(GLOBOCAN_prepared_output + "Globocan_dataset_cummulative_ASR_country.csv", index = False)

### 1.3.- Cancer with highest incidence per country
A new dataset will be saved containing the name of the cancer with highest incidence (ASR) per country

In [68]:
highest_incidence_idx = df_globocan_lite.groupby(by="Country")["ASR"].idxmax()
df_globocan_max_ASR_country = df_globocan_lite.iloc[highest_incidence_idx]
df_globocan_max_ASR_country.rename(columns={"Crude rate": "Crude_rate"}, inplace = True)
df_globocan_max_ASR_country

Unnamed: 0,Country,Cancer,ASR,Crude_rate
555,Afghanistan,Breast cancer,29.4,17.9
556,Albania,Breast cancer,51.1,72.6
557,Algeria,Breast cancer,61.9,65.1
4258,Angola,Prostate cancer,47.9,15.1
560,Argentina,Breast cancer,71.3,91.8
...,...,...,...,...
4435,Venezuela,Prostate cancer,52.3,57.1
704,Vietnam,Breast cancer,38.0,49.6
737,Yemen,Breast cancer,25.4,18.6
923,Zambia,Cervical cancer,71.5,37.0


In [69]:
df_globocan_max_ASR_country.to_csv(GLOBOCAN_prepared_output + "Globocan_dataset_max_ASR_country.csv", index = False)

## Part 2 - Calculating aggregated data for articles' dataset
To prepare data in the articles' dataset, aggregates of studies will be calculated grouping by country, cancer type and year.  
Future iterations of the project will deal with animal species as well.

### 2.1.- Aggregate number of articles per year irrespective of country and cancer type 
Here, a csv is prepared aggregating papers by year (i.e. the final csv will have a row per year)

In [70]:
# Import list of csvs to parse
list_csvs_match_GLOBOCAN = []

for file in os.listdir(DF_input):
    if file[-4:] == ".csv":
        list_csvs_match_GLOBOCAN.append(file)

n_csvs_match_GLOBOCAN = len(list_csvs_match_GLOBOCAN)

In [71]:
year_aggregates = {}
for csv in tqdm(list_csvs_match_GLOBOCAN):
    df = pd.read_csv(DF_input + csv)
    dict_agg = df["PublicationDate"].value_counts().to_dict()
    for year in dict_agg.keys():
        if int(year) in year_aggregates.keys():
            year_aggregates[int(year)] += dict_agg[int(year)]
        else:
            year_aggregates[int(year)] = dict_agg[int(year)]
    del df, dict_agg

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [03:06<00:00,  3.74s/it]


In [72]:
year_aggregates

{1984: 40075,
 2024: 244443,
 2025: 198601,
 2002: 76353,
 2000: 71170,
 1993: 55102,
 2023: 237372,
 2012: 138544,
 2006: 94267,
 2022: 245236,
 2021: 249021,
 2018: 187680,
 2026: 197,
 2005: 91004,
 2004: 85434,
 1992: 53303,
 1991: 51473,
 1990: 53232,
 2001: 72691,
 1999: 66895,
 1998: 64720,
 1995: 58541,
 1997: 62860,
 1996: 61090,
 1994: 58000,
 1989: 51838,
 1988: 47953,
 1986: 42957,
 1987: 44938,
 1985: 41699,
 2003: 81219,
 2007: 99610,
 2013: 147002,
 2008: 104591,
 2016: 175374,
 2009: 109345,
 2010: 117318,
 2011: 125587,
 2015: 170983,
 2019: 201285,
 2014: 161508,
 2017: 180199,
 2020: 226078}

In [73]:
# Convert to pd dataframe
df_year_agg = pd.DataFrame.from_dict(year_aggregates, orient="index", columns = ["Articles"])
df_year_agg.reset_index(inplace=True)
df_year_agg.rename(columns={"index": "Year"}, inplace=True)
df_year_agg.sort_values(by=["Year"], ascending= True, inplace=True)

In [74]:
df_year_agg

Unnamed: 0,Year,Articles
0,1984,40075
29,1985,41699
27,1986,42957
28,1987,44938
26,1988,47953
25,1989,51838
17,1990,53232
16,1991,51473
15,1992,53303
5,1993,55102


In [75]:
# Save
df_year_agg.to_csv(DF_prepared_output + "articles_year.csv", index=False)

### 2.2.- Aggregate number of articles per country and year irrespective of cancer type
Here, a csv is prepared aggregating papers by year and country (i.e. the final csv will have a row per year and country)

In [76]:
year_country_aggregates = {}

for csv in tqdm(list_csvs_match_GLOBOCAN):
    df = pd.read_csv(DF_input + csv)
    dict_agg = df[["Country", "PublicationDate"]].value_counts().to_dict()
    for key in dict_agg.keys():
        if key[0] in year_country_aggregates.keys() and int(key[1]) in year_country_aggregates[key[0]].keys():
            year_country_aggregates[key[0]][int(key[1])] += dict_agg[(key[0], key[1])]
        elif key[0] in year_country_aggregates.keys():
            year_country_aggregates[key[0]][int(key[1])] = dict_agg[(key[0], key[1])]
        else:
            year_country_aggregates[key[0]] = {}
            year_country_aggregates[key[0]][int(key[1])] = dict_agg[(key[0], key[1])]

year_country_aggregates

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [03:10<00:00,  3.81s/it]


{'United States': {1984: 72,
  2024: 52251,
  2025: 42292,
  2000: 21900,
  1993: 17166,
  2023: 52585,
  2012: 36647,
  2005: 27441,
  2026: 10,
  2004: 26533,
  1992: 16637,
  1991: 15154,
  1990: 12177,
  2001: 22443,
  2002: 23286,
  1999: 20440,
  1998: 19568,
  1996: 19356,
  1994: 17439,
  1995: 18769,
  1997: 19673,
  1986: 213,
  1989: 10323,
  2003: 24875,
  1988: 9775,
  2006: 27859,
  2007: 29350,
  1985: 114,
  2008: 30607,
  2016: 45419,
  1987: 4219,
  2009: 31200,
  2010: 32871,
  2011: 34484,
  2013: 38100,
  2014: 41371,
  2015: 43961,
  2019: 49778,
  2018: 48416,
  2017: 46792,
  2021: 56577,
  2022: 52981,
  2020: 53682},
 'United Kingdom': {1984: 22,
  2024: 8088,
  2025: 6605,
  2000: 4211,
  2023: 8427,
  2021: 10299,
  1993: 3592,
  2026: 4,
  1992: 3579,
  1991: 3158,
  1990: 2370,
  2001: 4156,
  2002: 4387,
  1999: 3889,
  1995: 3716,
  1994: 3643,
  1998: 3683,
  1996: 3750,
  1997: 3778,
  2003: 4590,
  2004: 4768,
  1989: 1892,
  2005: 4726,
  1986: 55,
 

In [77]:
# Create df from dict
dfs_year_country = []
for country in year_country_aggregates.keys():
    df = pd.DataFrame.from_dict(year_country_aggregates[country], orient="index", columns = ["Articles"])
    df.reset_index(inplace=True)
    df.rename(columns={"index": "Year"}, inplace=True)
    df.sort_values(by=["Year"], ascending= True, inplace=True)
    df["Country"] = country
    dfs_year_country.append(df)

df_year_country_agg = pd.concat(dfs_year_country)

In [78]:
df_year_country_agg

Unnamed: 0,Year,Articles,Country
0,1984,72,United States
27,1985,114,United States
21,1986,213,United States
30,1987,4219,United States
24,1988,9775,United States
...,...,...,...
0,2015,1,Belize
1,2021,1,Belize
2,2023,2,Belize
3,2024,1,Belize


In [79]:
# Save
df_year_country_agg.to_csv(DF_prepared_output + "articles_year_country.csv", index=False)

### 2.3.- Aggregate number of articles per cancer type and year irrespective of country
Here, a csv is prepared aggregating papers by year and cancer type (i.e. the final csv will have a row per year and cancer type). Cancers names are hardcoded from a previous notebook in this project.

In [80]:
cancer_names = [
 'Anal cancer',
 'Bladder cancer',
 'Brain cancer',
 'Breast cancer',
 'Cervical cancer',
 'Colon cancer',
 'Colorectal cancer',
 'Esophageal cancer',
 'Gallbladder cancer',
 'Hodgkin lymphoma',
 'Kaposi sarcoma',
 'Kidney cancer',
 'Laryngeal cancer',
 'Leukemia',
 'Liver cancer',
 'Lung cancer',
 'Mesothelioma',
 'Mouth cancer',
 'Multiple myeloma',
 'Non-Hodgkin lymphoma',
 'Other cancer',
 'Ovarian cancer',
 'Pancreatic cancer',
 'Penile cancer',
 'Prostate cancer',
 'Rectal cancer',
 'Salivary gland cancer',
 'Skin cancer',
 'Stomach cancer',
 'Testicular cancer',
 'Throat cancer',
 'Thyroid cancer',
 'Undetermined cancer',
 'Uterine cancer',
 'Vaginal cancer',
 'Vulvar cancer'
]

In [83]:
# Create and structure dict to hold data about publications on each cancer type
year_cancer_aggregates = {}

for cancer in cancer_names:
    year_cancer_aggregates[cancer] = {}

# Parse all articles' csv and count the number of studies per cancer and country
for csv in tqdm(list_csvs_match_GLOBOCAN):
    df = pd.read_csv(DF_input + csv)
    for row in df.index:
        for cancer in cancer_names:
            if cancer in df.columns.to_list():
                year = df.at[row, "PublicationDate"]
                if pd.notna(year) and df.at[row, cancer] > 0:
                    if int(year) in year_cancer_aggregates[cancer].keys():
                        year_cancer_aggregates[cancer][int(year)] += 1
                    else:
                        year_cancer_aggregates[cancer][int(year)] = 1

    del df, cancer, year, csv

year_cancer_aggregates

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [38:18<00:00, 45.97s/it]


{'Anal cancer': {1984: 4,
  2002: 25,
  2025: 197,
  2024: 210,
  1992: 12,
  1991: 14,
  1990: 14,
  2001: 25,
  1996: 18,
  1999: 18,
  1998: 20,
  2003: 32,
  2000: 20,
  2004: 40,
  2005: 40,
  2007: 44,
  2006: 38,
  2008: 68,
  2009: 93,
  2010: 101,
  2011: 104,
  2012: 111,
  1989: 18,
  2013: 145,
  2014: 120,
  2015: 163,
  2016: 163,
  2017: 178,
  2018: 186,
  1987: 7,
  1988: 8,
  1986: 3,
  2019: 167,
  2020: 206,
  2021: 196,
  2022: 194,
  2023: 193,
  1995: 13,
  1993: 15,
  1994: 17,
  1997: 19},
 'Bladder cancer': {1984: 284,
  2002: 605,
  2025: 2508,
  2024: 2971,
  2026: 2,
  1992: 472,
  1991: 384,
  1990: 406,
  2001: 588,
  2000: 611,
  1998: 487,
  1999: 518,
  2003: 629,
  1997: 497,
  1996: 531,
  1995: 453,
  2004: 702,
  2005: 731,
  2006: 827,
  1989: 355,
  2007: 881,
  2008: 907,
  1985: 278,
  1993: 427,
  1986: 250,
  1994: 470,
  2009: 947,
  2010: 1054,
  2011: 1136,
  2012: 1324,
  1987: 305,
  2013: 1525,
  2014: 1720,
  1988: 315,
  2015: 1725,
 

In [84]:
# Create df from dict
dfs_year_cancer = []
for cancer in year_cancer_aggregates.keys():
    df = pd.DataFrame.from_dict(year_cancer_aggregates[cancer], orient="index", columns = ["Articles"])
    df.reset_index(inplace=True)
    df.rename(columns={"index": "Year"}, inplace=True)
    df.sort_values(by=["Year"], ascending= True, inplace=True)
    df["Cancer"] = cancer
    dfs_year_cancer.append(df)

df_year_cancer_agg = pd.concat(dfs_year_cancer)

In [85]:
df_year_cancer_agg

Unnamed: 0,Year,Articles,Cancer
0,1984,4,Anal cancer
31,1986,3,Anal cancer
29,1987,7,Anal cancer
30,1988,8,Anal cancer
22,1989,18,Anal cancer
...,...,...,...
36,2021,138,Vulvar cancer
39,2022,149,Vulvar cancer
38,2023,128,Vulvar cancer
3,2024,142,Vulvar cancer


In [86]:
# Save
df_year_cancer_agg.to_csv(DF_prepared_output + "articles_year_cancer.csv", index=False)

### 2.4.- Aggregate number of articles per cancer per country

Here, a csv is generated in which, for each country, the number of studies per cancer, in all years combined, is calculated. 

In [87]:
# Create and structure dict to hold data about publications on each cancer type
cancer_country_aggregates = {}

for country in set(df_globocan_lite["Country"]):
    cancer_country_aggregates[country] = {}

    for cancer in cancer_names:
        cancer_country_aggregates[country][cancer] = 0

# Parse all articles' csv and count the number of studies per cancer and country
for csv in tqdm(list_csvs_match_GLOBOCAN):
    df = pd.read_csv(DF_input + csv)
    for row in df.index:
        for cancer in cancer_names:
            if cancer in df.columns.to_list():
                country = df.at[row, "Country"]
                if pd.notna(country) and df.at[row, cancer] > 0:
                    cancer_country_aggregates[country][cancer] += 1

    del df, cancer, country, csv

cancer_country_aggregates

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [36:09<00:00, 43.39s/it]


{'Comoros': {'Anal cancer': 0,
  'Bladder cancer': 0,
  'Brain cancer': 3,
  'Breast cancer': 12,
  'Cervical cancer': 3,
  'Colon cancer': 1,
  'Colorectal cancer': 0,
  'Esophageal cancer': 0,
  'Gallbladder cancer': 0,
  'Hodgkin lymphoma': 0,
  'Kaposi sarcoma': 0,
  'Kidney cancer': 0,
  'Laryngeal cancer': 0,
  'Leukemia': 8,
  'Liver cancer': 2,
  'Lung cancer': 2,
  'Mesothelioma': 0,
  'Mouth cancer': 0,
  'Multiple myeloma': 0,
  'Non-Hodgkin lymphoma': 0,
  'Other cancer': 14,
  'Ovarian cancer': 0,
  'Pancreatic cancer': 1,
  'Penile cancer': 0,
  'Prostate cancer': 2,
  'Rectal cancer': 0,
  'Salivary gland cancer': 0,
  'Skin cancer': 2,
  'Stomach cancer': 0,
  'Testicular cancer': 0,
  'Throat cancer': 0,
  'Thyroid cancer': 0,
  'Undetermined cancer': 59,
  'Uterine cancer': 0,
  'Vaginal cancer': 0,
  'Vulvar cancer': 0},
 'Palestine': {'Anal cancer': 0,
  'Bladder cancer': 4,
  'Brain cancer': 3,
  'Breast cancer': 63,
  'Cervical cancer': 10,
  'Colon cancer': 6,
  

In [88]:
# Create df from dict
dfs_cancer_country = []
for country in cancer_country_aggregates.keys():
    df = pd.DataFrame.from_dict(cancer_country_aggregates[country], orient="index", columns = ["Articles"])
    df.reset_index(inplace=True)
    df.rename(columns={"index": "Cancer"}, inplace=True)
    df["Country"] = country
    dfs_cancer_country.append(df)

df_cancer_country_agg = pd.concat(dfs_cancer_country)

In [89]:
df_cancer_country_agg

Unnamed: 0,Cancer,Articles,Country
0,Anal cancer,0,Comoros
1,Bladder cancer,0,Comoros
2,Brain cancer,3,Comoros
3,Breast cancer,12,Comoros
4,Cervical cancer,3,Comoros
...,...,...,...
31,Thyroid cancer,624,Canada
32,Undetermined cancer,54178,Canada
33,Uterine cancer,720,Canada
34,Vaginal cancer,15,Canada


In [90]:
# Save
df_cancer_country_agg.to_csv(DF_prepared_output + "articles_cancer_country.csv", index=False)

### 2.5.- Most studied cancer per country

By using the aggregated data from the last dataframe, a new dataframe is obtained containing a column with the most studied cancer in each country. Here the categories **Other cancer** and **Undetermined cancer** are not taken into consideration

In [91]:
df_articles_filtered = df_cancer_country_agg.loc[~df_cancer_country_agg["Cancer"].isin(["Other cancer", "Undetermined cancer"])]

In [92]:
list_most_studied_cancer = []

for country in set(df_globocan_lite["Country"]):
    df = df_articles_filtered.loc[df_articles_filtered["Country"] == country]
    list_most_studied_cancer.append(df.sort_values(by="Articles", ascending=False)[:1])

df_max_studied_cancer = pd.concat(list_most_studied_cancer).sort_values(by="Country")
df_max_studied_cancer

Unnamed: 0,Cancer,Articles,Country
3,Breast cancer,18,Afghanistan
3,Breast cancer,7,Albania
3,Breast cancer,66,Algeria
18,Multiple myeloma,3,Angola
3,Breast cancer,835,Argentina
...,...,...,...
27,Skin cancer,56,Venezuela
3,Breast cancer,183,Vietnam
3,Breast cancer,18,Yemen
4,Cervical cancer,38,Zambia


In [93]:
# Save
df_max_studied_cancer.to_csv(DF_prepared_output + "articles_cancer_most_studied_country.csv", index = False)

### 2.6.- Number of cancer studies per country and cancer type

Here, a csv is generated in which, for each country and cancer type, the number of studies per year is calculated.

In [94]:
# Create and structure dict to hold data about publications on each cancer type
country_year_cancer_aggregates = {}

for country in set(df_globocan_lite["Country"]):
    country_year_cancer_aggregates[country] = {}
    for cancer in cancer_names:
        country_year_cancer_aggregates[country][cancer] = {}

for csv in tqdm(list_csvs_match_GLOBOCAN):
    df = pd.read_csv(DF_input + csv)
    for country in set(df["Country"]):
        if pd.notna(country):
            df_country = df.loc[df["Country"] == country]
            for cancer in cancer_names:
                if cancer in df.columns.to_list():
                    df_country_cancer = df_country.loc[df_country[cancer] > 0]
                    for year in set(df_country_cancer["PublicationDate"]):
                        if pd.notna(year):
                            if int(year) in country_year_cancer_aggregates[country][cancer].keys():
                                country_year_cancer_aggregates[country][cancer][int(year)] += len(df_country_cancer.loc[df_country_cancer["PublicationDate"] == year])
                            else:
                                country_year_cancer_aggregates[country][cancer][int(year)] = len(df_country_cancer.loc[df_country_cancer["PublicationDate"] == year])
                        
country_year_cancer_aggregates  

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [06:07<00:00,  7.35s/it]


{'Comoros': {'Anal cancer': {},
  'Bladder cancer': {},
  'Brain cancer': {2002: 1, 2014: 1, 2015: 1},
  'Breast cancer': {2024: 2,
   2001: 1,
   2016: 1,
   2017: 1,
   2018: 1,
   2021: 1,
   2022: 2,
   2023: 2,
   2000: 1},
  'Cervical cancer': {2006: 1, 2018: 1, 2020: 1},
  'Colon cancer': {2010: 1},
  'Colorectal cancer': {},
  'Esophageal cancer': {},
  'Gallbladder cancer': {},
  'Hodgkin lymphoma': {},
  'Kaposi sarcoma': {},
  'Kidney cancer': {},
  'Laryngeal cancer': {},
  'Leukemia': {2001: 1, 2018: 1, 2021: 1, 2022: 1, 2023: 4},
  'Liver cancer': {2008: 1, 2021: 1},
  'Lung cancer': {2008: 1, 2021: 1},
  'Mesothelioma': {},
  'Mouth cancer': {},
  'Multiple myeloma': {},
  'Non-Hodgkin lymphoma': {},
  'Other cancer': {2001: 2,
   2004: 2,
   2008: 1,
   2010: 1,
   2011: 1,
   2016: 1,
   2015: 1,
   2018: 1,
   2021: 1,
   2023: 3},
  'Ovarian cancer': {},
  'Pancreatic cancer': {2015: 1},
  'Penile cancer': {},
  'Prostate cancer': {2025: 1, 2002: 1},
  'Rectal cancer

In [95]:
# Assign value 0 to any country, cancer, year combination without a value in the aggregates (e.g. without any published article)
years_articles_dataset = []

for country in country_year_cancer_aggregates.keys():
    for cancer in cancer_names:
        years_articles_dataset += list(country_year_cancer_aggregates[country][cancer].keys())

years_articles_dataset = set(years_articles_dataset)

for year in years_articles_dataset:
    for country in country_year_cancer_aggregates.keys():
        for cancer in country_year_cancer_aggregates[country].keys():
            if year not in country_year_cancer_aggregates[country][cancer].keys():
                country_year_cancer_aggregates[country][cancer][year] = 0

In [96]:
# Create df from dict
dfs_contry_year_cancer = []
for country in country_year_cancer_aggregates.keys():
    for cancer in country_year_cancer_aggregates[country].keys():
        df = pd.DataFrame.from_dict(country_year_cancer_aggregates[country][cancer], orient="index", columns = ["Articles"])
        df.reset_index(inplace=True)
        df.rename(columns={"index": "Year"}, inplace=True)
        df["Country"] = country
        df["Cancer"] = cancer
        dfs_contry_year_cancer.append(df)

        del df

df_country_year_cancer_agg = pd.concat(dfs_contry_year_cancer)
df_country_year_cancer_agg["Articles"] = df_country_year_cancer_agg["Articles"].astype(int)

In [97]:
df_country_year_cancer_agg

Unnamed: 0,Year,Articles,Country,Cancer
0,1984,0,Comoros,Anal cancer
1,1985,0,Comoros,Anal cancer
2,1986,0,Comoros,Anal cancer
3,1987,0,Comoros,Anal cancer
4,1988,0,Comoros,Anal cancer
...,...,...,...,...
38,1999,0,Canada,Vulvar cancer
39,2001,0,Canada,Vulvar cancer
40,2002,0,Canada,Vulvar cancer
41,2010,0,Canada,Vulvar cancer


In [98]:
df_country_year_cancer_agg.to_csv(DF_prepared_output + "articles_country_year_cancer.csv", index = False)

### 2.7.- Top 5 countries with largest number of studies per cancer

In [99]:
dfs_top5_countries_studies_per_cancer = []
for cancer in set(df_cancer_country_agg["Cancer"]):
    df = df_cancer_country_agg.loc[df_cancer_country_agg["Cancer"] == cancer].sort_values(by=["Articles"],ascending=False).head(5)
    dfs_top5_countries_studies_per_cancer.append(df)
    del df, cancer

df_top5_countries_studies_per_cancer = pd.concat(dfs_top5_countries_studies_per_cancer)
df_top5_countries_studies_per_cancer

Unnamed: 0,Cancer,Articles,Country
4,Cervical cancer,13699,China
4,Cervical cancer,12661,United States
4,Cervical cancer,3158,Japan
4,Cervical cancer,2763,India
4,Cervical cancer,2037,United Kingdom
...,...,...,...
3,Breast cancer,99024,United States
3,Breast cancer,52585,China
3,Breast cancer,16818,United Kingdom
3,Breast cancer,15437,Japan


In [100]:
df_top5_countries_studies_per_cancer.to_csv(DF_prepared_output + "articles_top5_countries_studies_per_cancer.csv", index = False)

### 2.8- Number of cancer studies per country and cancer type, light version (only countries with many cancer studies)

Some plots show the top 5 countries by number of articles for a given cancer. In the previous step, the top5 countries per cancer have been calculated. Therefore, to make calculations in these plots faster, a light version of the previous df is obtained keeping only these countries with most number of cancer articles for each cancer

In [101]:
# Filter the dataset to keep only these countries
countries_top_articles = set(df_top5_countries_studies_per_cancer["Country"])
df_country_year_cancer_agg_lite = df_country_year_cancer_agg.loc[df_country_year_cancer_agg["Country"].isin(countries_top_articles)]
df_country_year_cancer_agg_lite

Unnamed: 0,Year,Articles,Country,Cancer
0,2024,2,South Korea,Anal cancer
1,2025,1,South Korea,Anal cancer
2,1991,1,South Korea,Anal cancer
3,2003,1,South Korea,Anal cancer
4,2004,1,South Korea,Anal cancer
...,...,...,...,...
38,2010,0,France,Vulvar cancer
39,2011,0,France,Vulvar cancer
40,2015,0,France,Vulvar cancer
41,2016,0,France,Vulvar cancer


In [102]:
df_country_year_cancer_agg_lite.to_csv(DF_prepared_output + "articles_country_year_cancer_lite.csv", index = False)

## Part 3 - Calculating aggregated data for articles' and Globocan datasets

### 3.1 Incidence and number of studies for each cancer per country

Here, the number of studies and ASR (incidence) per cancer and country are combined

In [103]:
df_cancer_country_agg

Unnamed: 0,Cancer,Articles,Country
0,Anal cancer,0,Comoros
1,Bladder cancer,0,Comoros
2,Brain cancer,3,Comoros
3,Breast cancer,12,Comoros
4,Cervical cancer,3,Comoros
...,...,...,...
31,Thyroid cancer,624,Canada
32,Undetermined cancer,54178,Canada
33,Uterine cancer,720,Canada
34,Vaginal cancer,15,Canada


In [104]:
df_cancer_country_studies_ASR_agg = df_cancer_country_agg.loc[~df_cancer_country_agg["Cancer"].isin(["Undetermined cancer", "Other cancer"])]
df_cancer_country_studies_ASR_agg["ASR"] = None

for country in set(df_globocan_lite["Country"]):
    for cancer in set(df_globocan_lite["Cancer"]):
        df_cancer_country_studies_ASR_agg.loc[(df_cancer_country_studies_ASR_agg["Country"] == country) & (df_cancer_country_studies_ASR_agg["Cancer"] == cancer), "ASR"] = df_globocan_lite.loc[(df_globocan_lite["Country"] == country) & (df_globocan_lite["Cancer"] == cancer), "ASR"].values[0]

In [105]:
df_cancer_country_studies_ASR_agg

Unnamed: 0,Cancer,Articles,Country,ASR
0,Anal cancer,0,Comoros,0.0
1,Bladder cancer,0,Comoros,4.8
2,Brain cancer,3,Comoros,0.0
3,Breast cancer,12,Comoros,21.5
4,Cervical cancer,3,Comoros,52.0
...,...,...,...,...
30,Throat cancer,0,Canada,3.11
31,Thyroid cancer,624,Canada,10.7
33,Uterine cancer,720,Canada,21.1
34,Vaginal cancer,15,Canada,0.53


In [106]:
# Add a column with number of articles normalized by population
# Data about population per country obtained (for almost all countries) from: https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population
country_populations = {
    "Peru": 34350244,
    "Philippines": 114123600,
    "Jamaica": 2825544,
    "Moldova": 2381300,
    "Luxembourg": 681973,
    "Bolivia": 11312620,
    "Zimbabwe": 17073087,
    "Mozambique": 34090466,
    "Lithuania": 2894177,
    "DR Congo": 112832000,
    "Ethiopia": 111652998,
    "Vietnam": 101343800,
    "Iceland": 391810,
    "North Macedonia": 1826247,
    "Netherlands": 18080943,
    "Norway": 5606944,
    "South Korea": 51159889,
    "Gambia": 2422712,
    "Solomon Islands": 750325,
    "Nicaragua": 6803886,
    "Austria": 9200931,
    "Reunion": 896175,
    "Nigeria": 223800000,
    "Thailand": 65859640,
    "Tanzania": 68153004,
    "Congo Republic": 6142180,
    "New Caledonia": 264596,
    "Vanuatu": 321409,
    "Cambodia": 17577760,
    "Russia": 146028325,
    "French Polynesia": 279500,
    "United Kingdom": 68265209,
    "Somalia": 19655000,
    "Central African Republic": 6470307,
    "Dominican Republic": 10771504,
    "Ecuador": 18103660,
    "Timor-Leste": 1391221,
    "Tajikistan": 10499000,
    "India": 1417492000,
    "New Zealand": 5324700,
    "Türkiye": 85664944,
    "Spain": 49315949,
    "Israel": 10119400,
    "Sri Lanka": 21763170,
    "Georgia": 3704500,
    "Bahamas": 398165,
    "Rwanda": 14104969,
    "Argentina": 47067641,
    "Gabon": 2469296,
    "Chad": 19340757,
    "St. Lucia": 184100,
    "Indonesia": 284438782,
    "Botswana": 2359609,
    "Senegal": 18593258,
    "Sweden": 10592686,
    "Liberia": 5248621,
    "Haiti": 11867032,
    "Poland": 37401000,
    "Martinique": 349925,
    "Benin": 13224860,
    "Trinidad and Tobago": 1368333,
    "Micronesia, Fed. Sts.": 105564,
    "Iraq": 46118793,
    "Eritrea": 3607000,
    "Azerbaijan": 10241722,
    "Libya": 7459000,
    "Slovakia": 5413813,
    "Namibia": 3022401,
    "Uruguay": 3499451,
    "Afghanistan": 43844000,
    "Chile": 20206953,
    "North Korea": 25950000,
    "Lebanon": 5490000,
    "Bulgaria": 6437360,
    "Burkina Faso": 24070553,
    "Mali": 22395489,
    "Sudan": 51662000,
    "Serbia": 6567783,
    "Ireland": 5458600,
    "Bosnia and Herzegovina": 3422000,
    "Cuba": 9748007,
    "Guinea-Bissau": 1781308,
    "Angola": 36170961,
    "Japan": 123300000,
    "Hungary": 9539502,
    "Syria": 25620000,
    "China": 1408280000,
    "Switzerland": 9067144,
    "Belize": 417634,
    "Nepal": 29911840,
    "Paraguay": 6109644,
    "Italy": 58919837,
    "Portugal": 10749635,
    "Cameroon": 29442327,
    "Colombia": 52695952,
    "Algeria": 47400000,
    "Denmark": 6002420,
    "Samoa": 205557,
    "Romania": 19036031,
    "United States": 340110988,
    "Djibouti": 1066809,
    "Egypt": 107271260,
    "Venezuela": 28517000,
    "Eswatini": 1235549,
    "Mauritius": 1243741,
    "Laos": 7647000,
    "Mauritania": 4927532,
    "United Arab Emirates": 10678556,
    "Niger": 26312034,
    "Uzbekistan": 37859698,
    "South Sudan": 15786898,
    "Mongolia": 3544835,
    "Cote d'Ivoire": 29389150,
    "Estonia": 1369995,
    "Ghana": 33742380,
    "Fiji": 900869,
    "Malaysia": 34231700,
    "Togo": 8095498,
    "Cabo Verde": 491233,
    "Uganda": 45905417,
    "Yemen": 32684503,
    "Guadeloupe": 378561,
    "Australia": 27400013,
    "Montenegro": 623327,
    "Myanmar": 51316756,
    "Singapore": 6036900,
    "Pakistan": 241499431,
    "Belgium": 11825551,
    "Oman": 5306976,
    "Jordan": 11734000,
    "Costa Rica": 5309625,
    "Greece": 10400720,
    "Comoros": 870038,
    "Kazakhstan": 20387811,
    "Bangladesh": 169828911,
    "Czechia": 10876875,
    "Lesotho": 2306000,
    "Guyana": 772975,
    "Brunei Darussalam": 455500,
    "Guam": 153836,
    "Palestine": 5483450,
    "Morocco": 37712153,
    "Bhutan": 784043,
    "Malawi": 20734262,
    "Guatemala": 18079810,
    "Tunisia": 11972169,
    "Mexico": 130575786,
    "Suriname": 616500,
    "Bahrain": 1594654,
    "South Africa": 63100945,
    "Latvia": 1830400,
    "Kenya": 53330978,
    "Turkmenistan": 7057841,
    "Cyprus": 966400,
    "Puerto Rico": 3203295,
    "Slovenia": 2130638,
    "Belarus": 9109280,
    "Sierra Leone": 9077691,
    "Finland": 5645651,
    "Qatar": 3173024,
    "Kyrgyz Republic": 7281800,
    "Albania": 2363314,
    "Germany": 83577140,
    "Malta": 574250,
    "Maldives": 515132,
    "Guinea": 14363931,
    "Saudi Arabia": 35300280,
    "Panama": 4064780,
    "Ukraine": 32862000,
    "Burundi": 12332788,
    "France": 68668000,
    "Brazil": 212583750,
    "Barbados": 267800,
    "Honduras": 9892632,
    "Zambia": 19693423,
    "Canada": 41548787,
    "French Guiana": 292354,
    "Armenia": 3081100,
    "Kuwait": 4881254,
    "Croatia": 3859686,
    "Madagascar": 31727042,
    "Sao Tome and Principe": 235536,
    "El Salvador": 6029976,
    "Papua New Guinea": 11781559,
    "Iran": 85961000,
}

for country in set(df_cancer_country_studies_ASR_agg["Country"]):
    df_cancer_country_studies_ASR_agg.loc[df_cancer_country_studies_ASR_agg["Country"] == country, "Norm_articles"] = df_cancer_country_studies_ASR_agg.loc[df_cancer_country_studies_ASR_agg["Country"] == country, "Articles"] * 1000000 / country_populations[country]

In [107]:
df_cancer_country_studies_ASR_agg

Unnamed: 0,Cancer,Articles,Country,ASR,Norm_articles
0,Anal cancer,0,Comoros,0.0,0.000000
1,Bladder cancer,0,Comoros,4.8,0.000000
2,Brain cancer,3,Comoros,0.0,3.448125
3,Breast cancer,12,Comoros,21.5,13.792501
4,Cervical cancer,3,Comoros,52.0,3.448125
...,...,...,...,...,...
30,Throat cancer,0,Canada,3.11,0.000000
31,Thyroid cancer,624,Canada,10.7,15.018489
33,Uterine cancer,720,Canada,21.1,17.329026
34,Vaginal cancer,15,Canada,0.53,0.361021


In [108]:
# Save
df_cancer_country_studies_ASR_agg.to_csv(DF_prepared_output + "articles_ASR_country_cancer.csv", index = False)

### 3.2 Incidence and number of studies for each cancer per country, percentage values

In [109]:
df_cancer_country_studies_ASR_agg

Unnamed: 0,Cancer,Articles,Country,ASR,Norm_articles
0,Anal cancer,0,Comoros,0.0,0.000000
1,Bladder cancer,0,Comoros,4.8,0.000000
2,Brain cancer,3,Comoros,0.0,3.448125
3,Breast cancer,12,Comoros,21.5,13.792501
4,Cervical cancer,3,Comoros,52.0,3.448125
...,...,...,...,...,...
30,Throat cancer,0,Canada,3.11,0.000000
31,Thyroid cancer,624,Canada,10.7,15.018489
33,Uterine cancer,720,Canada,21.1,17.329026
34,Vaginal cancer,15,Canada,0.53,0.361021


In [110]:
dfs_percentage_values = []

for country in set(df_cancer_country_studies_ASR_agg["Country"]):
    df = df_cancer_country_studies_ASR_agg.loc[df_cancer_country_studies_ASR_agg["Country"] == country]
    df = df.astype({"Articles":float, "ASR":float})
    total_articles = df["Articles"].sum()
    total_incidence = df["ASR"].sum()
    for row in df.index:
        if total_articles > 0:
            df.at[row, "Articles"] = df.at[row, "Articles"]*100/total_articles
        df.at[row, "ASR"] = df.at[row, "ASR"]*100/total_incidence
    dfs_percentage_values.append(df)
    del df, total_articles, total_incidence

df_cancer_country_studies_ASR_agg_percentage = pd.concat(dfs_percentage_values)
df_cancer_country_studies_ASR_agg_percentage.drop(columns=["Norm_articles"], inplace = True)
df_cancer_country_studies_ASR_agg_percentage

Unnamed: 0,Cancer,Articles,Country,ASR
0,Anal cancer,0.000000,Comoros,0.000000
1,Bladder cancer,0.000000,Comoros,3.055768
2,Brain cancer,8.333333,Comoros,0.000000
3,Breast cancer,33.333333,Comoros,13.687293
4,Cervical cancer,8.333333,Comoros,33.104151
...,...,...,...,...
30,Throat cancer,0.000000,Moldova,1.951646
31,Thyroid cancer,0.000000,Moldova,2.395055
33,Uterine cancer,2.272727,Moldova,4.919571
34,Vaginal cancer,0.000000,Moldova,0.022656


In [111]:
df_cancer_country_studies_ASR_agg_percentage.to_csv(DF_prepared_output + "articles_ASR_country_cancer_percentage.csv", index = False)

### 3.3 Incidence and number of studies for each cancer per country, percentage values, restrict to countries with more than 1M people

In [112]:
countries_1M = []
for country in country_populations:
    if country_populations[country] >= 1000000:
        countries_1M.append(country)

df_cancer_country_studies_ASR_agg_percentage_1M = df_cancer_country_studies_ASR_agg.loc[df_cancer_country_studies_ASR_agg["Country"].isin(countries_1M)]

In [113]:
df_cancer_country_studies_ASR_agg_percentage_1M.to_csv(DF_prepared_output + "articles_ASR_country_cancer_1M.csv", index = False)