In [295]:
import os
import pandas as pd
from tqdm import tqdm

import warnings
warnings.simplefilter(action='ignore', category=pd.errors.DtypeWarning)

In [296]:
# CSV with the finished Globocan dataset.
# The prompt states which path is expected; surrounding whitespace and quotes
# (e.g. from Windows' "Copy as path") are removed from the answer.
GLOBOCAN_input = input("Path to the finished Globocan CSV: ").strip().strip("\"'")

  C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\resources\Globocan_dataset_ready.csv


In [297]:
# Directory to save Globocan dataset with required columns only.
# The prompt states which path is expected; surrounding whitespace and quotes
# are removed from the answer.
GLOBOCAN_prepared_output = input("Directory for the prepared Globocan CSVs: ").strip().strip("\"'")

  C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\resources\


In [298]:
# Directory with the csvs of the articles' dataset, whose country and cancer
# names match those of the GLOBOCAN dataset.
# The prompt states which path is expected; surrounding whitespace and quotes
# are removed from the answer.
DF_input = input("Directory with the articles' CSVs: ").strip().strip("\"'")

  C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\parsedXMLs_match_GLOBOCAN\


In [299]:
# Directory to save aggregated articles' datasets.
# The prompt states which path is expected; surrounding whitespace and quotes
# are removed from the answer.
DF_prepared_output = input("Directory for the aggregated articles' CSVs: ").strip().strip("\"'")

   C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\resources\


## Part 1 - Preparing Globocan dataset
Right now, the Globocan dataset has a number of columns which are not required for preparing the visualizations:

In [300]:
# Load the complete Globocan table and list the columns it ships with
df_globocan = pd.read_csv(filepath_or_buffer=GLOBOCAN_input)
list(df_globocan.columns)

['Alpha-3 code',
 'Cancer code',
 'Population code (ISO/UN)',
 'Country',
 'Sex',
 'Number',
 '95% UI low',
 '95% UI high',
 'Number.1',
 'ASR (World)',
 'Crude rate',
 'Cumulative risk',
 'Cancer']

### 1.1.- Remove unnecessary columns  
Only the columns **Country**, **Cancer**, **ASR (World)** and **Crude rate** are important in this project. ASR (World) refers to the age-standardized rate, whereas Crude rate refers to the raw (non age-standardized) rate.  Here, a new csv is saved with only these columns.

In [301]:
# Keep only the four columns used by the project, in this order
df_globocan_lite = df_globocan.loc[
    :, ["Country", "Cancer", "ASR (World)", "Crude rate"]
]
df_globocan_lite

Unnamed: 0,Country,Cancer,ASR (World),Crude rate
0,Afghanistan,Anal cancer,0.47,0.27
1,Albania,Anal cancer,0.24,0.38
2,Algeria,Anal cancer,0.34,0.35
3,Angola,Anal cancer,0.24,0.11
4,Azerbaijan,Anal cancer,0.87,1.0
5,Argentina,Anal cancer,0.5,0.64
6,Australia,Anal cancer,1.4,2.5
7,Austria,Anal cancer,1.1,2.2
8,Bahamas,Anal cancer,1.3,1.5
9,Bahrain,Anal cancer,0.24,0.17


In [302]:
# Save the reduced Globocan table. os.path.join copes with an output directory
# typed with or without a trailing path separator.
df_globocan_lite.to_csv(
    os.path.join(GLOBOCAN_prepared_output, "Globocan_dataset_ready_lite.csv"),
    index=False,
)

### 1.2.- Cumulative cancer incidence (all cancers) per country
Even though the dataset contains both the ASR and the crude incidence values (see above), in principle only the ASR values will be displayed. Here, a new dataset is created containing the sum of the cancer incidences (ASR) of all cancers per country.

In [63]:
# Sum the ASR of every cancer for each country (Country becomes the index)
df_globocan_cumm_ASR_country = (
    df_globocan_lite
    .groupby("Country")[["ASR (World)"]]
    .sum()
)

In [64]:
# Display the summed ASR per country (the groupby leaves Country as the index)
df_globocan_cumm_ASR_country

Unnamed: 0_level_0,ASR (World)
Country,Unnamed: 1_level_1
Afghanistan,126.09
Albania,212.70
Algeria,195.96
Angola,179.38
Argentina,305.01
...,...
Venezuela,264.73
Vietnam,188.78
Yemen,100.07
Zambia,236.40


In [65]:
# Save the cumulative ASR per country.
# The country names live in the index of this frame (result of the groupby), so
# the index must be written out; index=False would leave a CSV containing only
# the summed values with no way to tell which country each one belongs to.
df_globocan_cumm_ASR_country.to_csv(
    os.path.join(GLOBOCAN_prepared_output, "Globocan_dataset_cummulative_ASR_country.csv"),
    index=True,
)

### 1.3.- Cancer with highest incidence per country
A new dataset will be saved containing the name of the cancer with the highest incidence (ASR) in each country.

In [70]:
# For each country, find the row holding the largest ASR value.
# idxmax returns index *labels*, so the rows are fetched with the label-based
# .loc; positional .iloc only gives the same rows while the index happens to be
# the default 0..n-1 range.
highest_incidence_idx = df_globocan_lite.groupby(by="Country")["ASR (World)"].idxmax()
df_globocan_max_ASR_country = df_globocan_lite.loc[highest_incidence_idx]
df_globocan_max_ASR_country

Unnamed: 0,Country,Cancer,ASR (World),Crude rate
555,Afghanistan,Breast cancer,29.4,17.9
556,Albania,Breast cancer,51.1,72.6
557,Algeria,Breast cancer,61.9,65.1
4258,Angola,Prostate cancer,47.9,15.1
560,Argentina,Breast cancer,71.3,91.8
566,Armenia,Breast cancer,39.6,64.4
6111,Australia,Skin cancer,177.1,297.7
562,Austria,Breast cancer,69.5,130.3
559,Azerbaijan,Breast cancer,32.9,42.6
4263,Bahamas,Prostate cancer,89.1,102.2


In [71]:
# Save the highest-incidence cancer per country. os.path.join copes with an
# output directory typed with or without a trailing path separator.
df_globocan_max_ASR_country.to_csv(
    os.path.join(GLOBOCAN_prepared_output, "Globocan_dataset_max_ASR_country.csv"),
    index=False,
)

## Part 2 - Calculating aggregated data for articles' dataset
To prepare data in the articles' dataset, aggregates of studies will be calculated grouping by country, cancer type and year.  
Future iterations of the project will deal with animal species as well.

### 2.1.- Aggregate number of articles per year irrespective of country and cancer type 
Here, a csv is prepared aggregating papers by year (i.e. the final csv will have a row per year)

In [303]:
# Import list of csvs to parse.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# process the files in the same order on every run and platform.
list_csvs_match_GLOBOCAN = sorted(
    file for file in os.listdir(DF_input) if file.endswith(".csv")
)

n_csvs_match_GLOBOCAN = len(list_csvs_match_GLOBOCAN)

In [21]:
# Count articles per publication year across all csv files.
# Keys are years as int, values are the running article totals.
year_aggregates = {}
for csv in tqdm(list_csvs_match_GLOBOCAN):
    df = pd.read_csv(os.path.join(DF_input, csv))
    # value_counts ignores missing years; iterate over (year, count) pairs so the
    # count is read with the key as stored, not with a re-cast copy of it
    for year, n_articles in df["PublicationDate"].value_counts().to_dict().items():
        year_aggregates[int(year)] = year_aggregates.get(int(year), 0) + n_articles
    # release the frame before loading the next file
    del df

100%|█████████████████████████| 45/45 [02:33<00:00,  3.41s/it]


In [22]:
# Display the {year: number of articles} dictionary built above
year_aggregates

{1992: 53303,
 1991: 51473,
 1990: 53232,
 2001: 72691,
 2002: 76353,
 2000: 71170,
 1999: 66895,
 1998: 64720,
 1995: 58541,
 1997: 62860,
 1996: 61090,
 1994: 58000,
 1993: 55099,
 1989: 51838,
 1988: 47953,
 1986: 42957,
 1987: 44938,
 1985: 41699,
 1984: 16274,
 2003: 81219,
 2004: 85433,
 2005: 91004,
 2006: 94266,
 2007: 99610,
 2013: 147002,
 2008: 104591,
 2016: 175374,
 2009: 109345,
 2010: 117318,
 2011: 125587,
 2012: 138543,
 2015: 170983,
 2019: 201285,
 2014: 161508,
 2017: 180199,
 2018: 187679,
 2021: 249023,
 2020: 226078,
 2022: 245243,
 2023: 237455,
 2025: 3119,
 2024: 160906}

In [44]:
# Turn the {year: count} dict into a two-column frame (Year, Articles),
# ordered chronologically
df_year_agg = (
    pd.DataFrame.from_dict(year_aggregates, orient="index", columns=["Articles"])
    .reset_index()
    .rename(columns={"index": "Year"})
    .sort_values(by=["Year"], ascending=True)
)

In [45]:
# Display the articles-per-year table (sorted by Year)
df_year_agg

Unnamed: 0,Year,Articles
18,1984,16274
17,1985,41699
15,1986,42957
16,1987,44938
14,1988,47953
13,1989,51838
2,1990,53232
1,1991,51473
0,1992,53303
12,1993,55099


In [46]:
# Save. os.path.join copes with an output directory typed with or without a
# trailing path separator.
df_year_agg.to_csv(os.path.join(DF_prepared_output, "articles_year.csv"), index=False)

### 2.2.- Aggregate number of articles per country and year irrespective of cancer type
Here, a csv is prepared aggregating papers by year and country (i.e. the final csv will have a row per year and country)

In [87]:
# Count articles per (country, publication year) across all csv files.
# Structure: {country: {year (int): number of articles}}
year_country_aggregates = {}

for csv in tqdm(list_csvs_match_GLOBOCAN):
    df = pd.read_csv(os.path.join(DF_input, csv))
    # value_counts on two columns yields one count per (Country, PublicationDate) pair
    pair_counts = df[["Country", "PublicationDate"]].value_counts().to_dict()
    for (country, year), n_articles in pair_counts.items():
        # setdefault creates the inner dict the first time a country is seen
        per_year = year_country_aggregates.setdefault(country, {})
        per_year[int(year)] = per_year.get(int(year), 0) + n_articles

year_country_aggregates

100%|█████████████████████████| 45/45 [02:36<00:00,  3.49s/it]


{'United States': {1992: 16637,
  1991: 15154,
  1990: 12177,
  2001: 22427,
  2002: 23274,
  2000: 21889,
  1999: 20441,
  1998: 19568,
  1996: 19355,
  1995: 18769,
  1994: 17439,
  1997: 19672,
  1993: 17162,
  1986: 213,
  1989: 10323,
  2003: 24865,
  2004: 26513,
  2005: 27424,
  1988: 9775,
  2006: 27844,
  2007: 29339,
  1985: 114,
  2008: 30589,
  2016: 45401,
  1987: 4219,
  2009: 31174,
  2010: 32845,
  2011: 34473,
  1984: 27,
  2012: 36613,
  2013: 38079,
  2014: 41358,
  2015: 43941,
  2017: 46776,
  2018: 48405,
  2019: 49761,
  2021: 56565,
  2022: 52964,
  2020: 53664,
  2025: 981,
  2023: 52591,
  2024: 34263},
 'Japan': {1992: 5806,
  1991: 5398,
  1990: 4957,
  2001: 8114,
  2002: 7892,
  2000: 8257,
  1999: 7535,
  1998: 7560,
  1993: 6041,
  1994: 6192,
  1995: 6375,
  1997: 6756,
  1996: 6633,
  1989: 3879,
  2003: 7946,
  2004: 7960,
  2005: 7997,
  1987: 1433,
  2006: 7454,
  2007: 7735,
  2008: 7443,
  1986: 48,
  2009: 7604,
  2010: 7911,
  2011: 8599,
  2012

In [94]:
# Build one (Year, Articles, Country) frame per country, each sorted by year,
# then stack them into a single table
dfs_year_country = [
    pd.DataFrame.from_dict(yearly_counts, orient="index", columns=["Articles"])
    .reset_index()
    .rename(columns={"index": "Year"})
    .sort_values(by=["Year"], ascending=True)
    .assign(Country=country_name)
    for country_name, yearly_counts in year_country_aggregates.items()
]

df_year_country_agg = pd.concat(dfs_year_country)

In [95]:
# Display the articles-per-year-and-country table
df_year_country_agg

Unnamed: 0,Year,Articles,Country
28,1984,27,United States
21,1985,114,United States
13,1986,213,United States
24,1987,4219,United States
18,1988,9775,United States
14,1989,10323,United States
2,1990,12177,United States
1,1991,15154,United States
0,1992,16637,United States
12,1993,17162,United States


In [96]:
# Save. os.path.join copes with an output directory typed with or without a
# trailing path separator.
df_year_country_agg.to_csv(os.path.join(DF_prepared_output, "articles_year_country.csv"), index=False)

### 2.3.- Aggregate number of articles per cancer type and year irrespective of country
Here, a csv is prepared aggregating papers by year and cancer type (i.e. the final csv will have a row per year and cancer type). Cancer names are hardcoded from a previous notebook in this project.

In [304]:
# Cancer categories; each name is used below as a column of the articles' csvs
# and as the "Cancer" label in the aggregated outputs.
cancer_names = [
    "Anal cancer",
    "Bladder cancer",
    "Brain cancer",
    "Breast cancer",
    "Cervical cancer",
    "Colon cancer",
    "Colorectal cancer",
    "Esophageal cancer",
    "Gallbladder cancer",
    "Hodgkin lymphoma",
    "Kaposi sarcoma",
    "Kidney cancer",
    "Laryngeal cancer",
    "Leukemia",
    "Liver cancer",
    "Lung cancer",
    "Mesothelioma",
    "Mouth cancer",
    "Multiple myeloma",
    "Non-Hodgkin lymphoma",
    "Other cancer",
    "Ovarian cancer",
    "Pancreatic cancer",
    "Penile cancer",
    "Prostate cancer",
    "Rectal cancer",
    "Salivary gland cancer",
    "Skin cancer",
    "Stomach cancer",
    "Testicular cancer",
    "Throat cancer",
    "Thyroid cancer",
    "Undetermined_Cancer",
    "Uterine cancer",
    "Vaginal cancer",
    "Vulvar cancer",
]

In [286]:
# Create and structure dict to hold data about publications on each cancer type.
# Structure: {cancer: {year (int): number of articles}}
year_cancer_aggregates = {cancer: {} for cancer in cancer_names}

# Parse all articles' csv and count the number of studies per cancer and year.
# An article counts for a cancer when that cancer's column is > 0 and the
# publication year is not missing. Counting is done per column with
# value_counts instead of visiting every row in Python, and no loop variable is
# deleted inside the loop, so a csv without rows no longer raises NameError.
for csv in tqdm(list_csvs_match_GLOBOCAN):
    df = pd.read_csv(os.path.join(DF_input, csv))
    for cancer in cancer_names:
        years = df.loc[df[cancer] > 0, "PublicationDate"].dropna().astype(int)
        per_year = year_cancer_aggregates[cancer]
        for year, n_articles in years.value_counts().items():
            per_year[int(year)] = per_year.get(int(year), 0) + int(n_articles)

year_cancer_aggregates

100%|█████████████████████████| 45/45 [23:56<00:00, 31.92s/it]


{'Anal cancer': {1992: 12,
  1991: 14,
  1990: 14,
  2001: 25,
  1996: 18,
  1999: 18,
  2002: 25,
  1998: 20,
  2003: 32,
  2000: 20,
  2004: 40,
  2005: 40,
  2007: 44,
  2006: 38,
  2008: 68,
  2009: 93,
  2010: 101,
  2011: 104,
  2012: 111,
  1989: 18,
  2013: 145,
  2014: 120,
  2015: 163,
  2016: 163,
  2017: 178,
  2018: 186,
  2025: 4,
  1987: 7,
  1988: 8,
  1986: 3,
  2019: 167,
  2020: 206,
  2021: 196,
  2022: 194,
  2023: 193,
  1984: 1,
  2024: 144,
  1995: 13,
  1993: 15,
  1994: 17,
  1997: 19},
 'Bladder cancer': {1992: 472,
  1991: 384,
  1990: 406,
  2001: 588,
  2000: 611,
  1998: 487,
  1999: 518,
  2002: 605,
  2003: 629,
  1997: 497,
  1996: 531,
  1995: 453,
  2004: 702,
  2005: 731,
  2006: 827,
  1989: 355,
  2007: 881,
  2008: 907,
  1985: 278,
  1993: 427,
  1986: 250,
  1994: 470,
  2009: 947,
  2010: 1054,
  2011: 1136,
  2012: 1324,
  1987: 305,
  2013: 1525,
  2014: 1720,
  1988: 315,
  2015: 1725,
  2016: 1852,
  2017: 1810,
  2018: 2074,
  2020: 2481,

In [288]:
# Build one (Year, Articles, Cancer) frame per cancer type, each sorted by year,
# then stack them into a single table
dfs_year_cancer = [
    pd.DataFrame.from_dict(yearly_counts, orient="index", columns=["Articles"])
    .reset_index()
    .rename(columns={"index": "Year"})
    .sort_values(by=["Year"], ascending=True)
    .assign(Cancer=cancer_name)
    for cancer_name, yearly_counts in year_cancer_aggregates.items()
]

df_year_cancer_agg = pd.concat(dfs_year_cancer)

In [289]:
# Display the articles-per-year-and-cancer table
df_year_cancer_agg

Unnamed: 0,Year,Articles,Cancer
35,1984,1,Anal cancer
29,1986,3,Anal cancer
27,1987,7,Anal cancer
28,1988,8,Anal cancer
19,1989,18,Anal cancer
2,1990,14,Anal cancer
1,1991,14,Anal cancer
0,1992,12,Anal cancer
38,1993,15,Anal cancer
39,1994,17,Anal cancer


In [290]:
# Save. os.path.join copes with an output directory typed with or without a
# trailing path separator.
df_year_cancer_agg.to_csv(os.path.join(DF_prepared_output, "articles_year_cancer.csv"), index=False)

### 2.4.- Aggregate number of articles per cancer per country

Here, a csv is generated in which, for each country, the number of studies per cancer, in all years combined, is calculated. 

In [155]:
# Create and structure dict to hold data about publications on each cancer type.
# Structure: {country: {cancer: number of articles}}, pre-filled with 0 for every
# Globocan country and every cancer name.
cancer_country_aggregates = {
    country: {cancer: 0 for cancer in cancer_names}
    for country in set(df_globocan_lite["Country"])
}

# Parse all articles' csv and count the number of studies per cancer and country.
# An article counts for a cancer when that cancer's column is > 0 and the country
# is not missing (value_counts skips missing values). Counting is done per column
# instead of per row, and no loop variable is deleted inside the loop, so a csv
# without rows no longer raises NameError.
for csv in tqdm(list_csvs_match_GLOBOCAN):
    df = pd.read_csv(os.path.join(DF_input, csv))
    for cancer in cancer_names:
        country_counts = df.loc[df[cancer] > 0, "Country"].value_counts()
        for country, n_articles in country_counts.items():
            # a country absent from the Globocan dataset raises KeyError here
            cancer_country_aggregates[country][cancer] += int(n_articles)

cancer_country_aggregates

100%|█████████████████████████| 45/45 [22:24<00:00, 29.89s/it]


{'Brunei Darussalam': {'Anal cancer': 0,
  'Bladder cancer': 0,
  'Brain cancer': 2,
  'Breast cancer': 11,
  'Cervical cancer': 7,
  'Colon cancer': 2,
  'Colorectal cancer': 16,
  'Esophageal cancer': 1,
  'Gallbladder cancer': 0,
  'Hodgkin lymphoma': 1,
  'Kaposi sarcoma': 0,
  'Kidney cancer': 1,
  'Laryngeal cancer': 0,
  'Leukemia': 3,
  'Liver cancer': 2,
  'Lung cancer': 3,
  'Mesothelioma': 1,
  'Mouth cancer': 0,
  'Multiple myeloma': 2,
  'Non-Hodgkin lymphoma': 1,
  'Other cancer': 19,
  'Ovarian cancer': 1,
  'Pancreatic cancer': 0,
  'Penile cancer': 0,
  'Prostate cancer': 2,
  'Rectal cancer': 2,
  'Salivary gland cancer': 0,
  'Skin cancer': 1,
  'Stomach cancer': 1,
  'Testicular cancer': 0,
  'Throat cancer': 0,
  'Thyroid cancer': 0,
  'Undetermined_Cancer': 63,
  'Uterine cancer': 0,
  'Vaginal cancer': 0,
  'Vulvar cancer': 0},
 'Slovakia': {'Anal cancer': 0,
  'Bladder cancer': 21,
  'Brain cancer': 78,
  'Breast cancer': 309,
  'Cervical cancer': 48,
  'Colon c

In [156]:
# Build one (Cancer, Articles, Country) frame per country and stack them
dfs_cancer_country = [
    pd.DataFrame.from_dict(counts_by_cancer, orient="index", columns=["Articles"])
    .reset_index()
    .rename(columns={"index": "Cancer"})
    .assign(Country=country_name)
    for country_name, counts_by_cancer in cancer_country_aggregates.items()
]

df_cancer_country_agg = pd.concat(dfs_cancer_country)

In [157]:
# Display the articles-per-cancer-and-country table
df_cancer_country_agg

Unnamed: 0,Cancer,Articles,Country
0,Anal cancer,0,Brunei Darussalam
1,Bladder cancer,0,Brunei Darussalam
2,Brain cancer,2,Brunei Darussalam
3,Breast cancer,11,Brunei Darussalam
4,Cervical cancer,7,Brunei Darussalam
5,Colon cancer,2,Brunei Darussalam
6,Colorectal cancer,16,Brunei Darussalam
7,Esophageal cancer,1,Brunei Darussalam
8,Gallbladder cancer,0,Brunei Darussalam
9,Hodgkin lymphoma,1,Brunei Darussalam


In [158]:
# Save. os.path.join copes with an output directory typed with or without a
# trailing path separator.
df_cancer_country_agg.to_csv(os.path.join(DF_prepared_output, "articles_cancer_country.csv"), index=False)

### 2.5.- Most studied cancer per country

By using the aggregated data from the last dataframe, a new dataframe is obtained containing a column with the most studied cancer in each country. Here the categories **Other cancer** and **Undetermined_Cancer** are not taken into consideration.

In [280]:
# Leave out the two catch-all categories before ranking cancers per country
df_articles_filtered = df_cancer_country_agg[
    ~df_cancer_country_agg["Cancer"].isin(["Other cancer", "Undetermined_Cancer"])
]

In [278]:
# For every Globocan country keep the single row with the largest article count.
# NOTE(review): a country whose counts are all 0 still contributes a row, and
# ties are resolved by whatever order sort_values leaves them in - confirm this
# is the intended behaviour.
list_most_studied_cancer = [
    df_articles_filtered[df_articles_filtered["Country"] == country_name]
    .sort_values(by="Articles", ascending=False)
    .head(1)
    for country_name in set(df_globocan_lite["Country"])
]

df_max_studied_cancer = pd.concat(list_most_studied_cancer).sort_values(by="Country")
df_max_studied_cancer

Unnamed: 0,Cancer,Articles,Country
3,Breast cancer,13,Afghanistan
3,Breast cancer,5,Albania
3,Breast cancer,53,Algeria
18,Multiple myeloma,3,Angola
3,Breast cancer,793,Argentina
13,Leukemia,15,Armenia
3,Breast cancer,6444,Australia
3,Breast cancer,1808,Austria
3,Breast cancer,10,Azerbaijan
3,Breast cancer,8,Bahamas


In [282]:
# Save. os.path.join copes with an output directory typed with or without a
# trailing path separator.
df_max_studied_cancer.to_csv(
    os.path.join(DF_prepared_output, "articles_cancer_most_studied_country.csv"),
    index=False,
)

### 2.6.- Number of cancer studies per country, cancer type and year

Here, a csv is generated in which, for each country and cancer type, the number of studies per year is calculated.

In [323]:
# Create and structure dict to hold data about publications on each cancer type.
# Structure: {country: {cancer: {year (int): number of articles}}}, with an empty
# year dict for every Globocan country and every cancer name.
country_year_cancer_aggregates = {
    country: {cancer: {} for cancer in cancer_names}
    for country in set(df_globocan_lite["Country"])
}

# For each csv and cancer, count the articles (cancer column > 0) per
# (country, year) pair in one pass; rows missing the country or the year are
# ignored. This replaces re-filtering the same frame once per year.
for csv in tqdm(list_csvs_match_GLOBOCAN):
    df = pd.read_csv(os.path.join(DF_input, csv))
    for cancer in cancer_names:
        hits = df.loc[df[cancer] > 0, ["Country", "PublicationDate"]].dropna()
        for (country, year), n_articles in hits.value_counts().items():
            # a country absent from the Globocan dataset raises KeyError here
            per_year = country_year_cancer_aggregates[country][cancer]
            per_year[int(year)] = per_year.get(int(year), 0) + int(n_articles)

country_year_cancer_aggregates

100%|█████████████████████████| 45/45 [05:03<00:00,  6.75s/it]


{'Brunei Darussalam': {'Anal cancer': {},
  'Bladder cancer': {},
  'Brain cancer': {2022: 1, 2023: 1},
  'Breast cancer': {2011: 1, 2017: 1, 2020: 1, 2023: 5, 2022: 1, 2024: 2},
  'Cervical cancer': {2012: 1, 2019: 1, 2022: 1, 2023: 2, 1994: 1, 1993: 1},
  'Colon cancer': {2015: 1, 2022: 1},
  'Colorectal cancer': {2009: 1,
   2011: 1,
   2013: 1,
   2015: 3,
   2016: 1,
   2019: 1,
   2020: 3,
   2021: 1,
   2022: 3,
   2023: 1},
  'Esophageal cancer': {2015: 1},
  'Gallbladder cancer': {},
  'Hodgkin lymphoma': {2023: 1},
  'Kaposi sarcoma': {},
  'Kidney cancer': {2022: 1},
  'Laryngeal cancer': {},
  'Leukemia': {2002: 1, 2016: 1, 2020: 1},
  'Liver cancer': {2009: 1, 2013: 1},
  'Lung cancer': {2009: 1, 2013: 1, 2022: 1},
  'Mesothelioma': {2021: 1},
  'Mouth cancer': {},
  'Multiple myeloma': {2021: 2},
  'Non-Hodgkin lymphoma': {2023: 1},
  'Other cancer': {2008: 1,
   2009: 2,
   2013: 1,
   2014: 2,
   2015: 3,
   2016: 2,
   2020: 1,
   2021: 2,
   2023: 3,
   2024: 1,
   19

In [331]:
# Assign value 0 to any country, cancer, year combination without a value in the
# aggregates (e.g. without any published article).
# First gather every year that appears anywhere in the aggregates ...
years_articles_dataset = {
    year
    for country in country_year_cancer_aggregates.keys()
    for cancer in cancer_names
    for year in country_year_cancer_aggregates[country][cancer].keys()
}

# ... then make sure each (country, cancer) dict has an entry for all of them
for per_cancer in country_year_cancer_aggregates.values():
    for per_year in per_cancer.values():
        for year in years_articles_dataset:
            per_year.setdefault(year, 0)

In [337]:
# Create df from dict: one (Year, Articles, Country, Cancer) frame per
# country/cancer pair, stacked into a single table.
# Each frame is sorted by Year, like the per-country and per-cancer tables built
# earlier; without it the row order follows the arbitrary insertion order of the
# year dictionaries.
dfs_contry_year_cancer = []
for country, per_cancer in country_year_cancer_aggregates.items():
    for cancer, per_year in per_cancer.items():
        df = (
            pd.DataFrame.from_dict(per_year, orient="index", columns=["Articles"])
            .reset_index()
            .rename(columns={"index": "Year"})
            .sort_values(by=["Year"], ascending=True)
            .assign(Country=country, Cancer=cancer)
        )
        dfs_contry_year_cancer.append(df)

        del df

df_country_year_cancer_agg = pd.concat(dfs_contry_year_cancer)

In [338]:
# Display the articles-per-country-year-and-cancer table
df_country_year_cancer_agg

Unnamed: 0,Year,Articles,Country,Cancer
0,1984,0,Brunei Darussalam,Anal cancer
1,1985,0,Brunei Darussalam,Anal cancer
2,1986,0,Brunei Darussalam,Anal cancer
3,1987,0,Brunei Darussalam,Anal cancer
4,1988,0,Brunei Darussalam,Anal cancer
...,...,...,...,...
37,2021,0,Lesotho,Vulvar cancer
38,2022,0,Lesotho,Vulvar cancer
39,2023,0,Lesotho,Vulvar cancer
40,2024,0,Lesotho,Vulvar cancer


In [342]:
# Save. os.path.join copes with an output directory typed with or without a
# trailing path separator.
df_country_year_cancer_agg.to_csv(
    os.path.join(DF_prepared_output, "articles_country_year_cancer.csv"),
    index=False,
)