In [19]:
import os
import pandas as pd
from tqdm import tqdm

import warnings
warnings.simplefilter(action='ignore', category=pd.errors.DtypeWarning)

In [47]:
# Path to the CSV file with the finished Globocan dataset.
# The prompt text makes it clear which of the notebook's path inputs is being requested;
# surrounding whitespace from a pasted path is removed.
GLOBOCAN_input = input("Path to the finished Globocan CSV file: ").strip()

 C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\resources\Globocan_dataset_ready.csv


In [48]:
# Directory in which to save the Globocan dataset reduced to the required columns.
# os.path.join(<dir>, "") appends the trailing path separator when it was not typed,
# so the value can safely be combined with a file name by plain string concatenation.
GLOBOCAN_prepared_output = os.path.join(
    input("Directory to save the prepared Globocan csvs: ").strip(), ""
)

 C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\resources\


In [2]:
# Directory with the csvs of the articles' dataset, whose country and cancer names
# already match those of the GLOBOCAN dataset.
# os.path.join(<dir>, "") appends the trailing path separator when it was not typed,
# so the value can safely be combined with a file name by plain string concatenation.
DF_input = os.path.join(
    input("Directory with the articles' csvs matching GLOBOCAN: ").strip(), ""
)

 C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\parsedXMLs_match_GLOBOCAN\


In [3]:
# Directory in which to save the aggregated articles' datasets.
# os.path.join(<dir>, "") appends the trailing path separator when it was not typed,
# so the value can safely be combined with a file name by plain string concatenation.
DF_prepared_output = os.path.join(
    input("Directory to save the aggregated articles' csvs: ").strip(), ""
)

  C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\resources\


## Part 1 - Preparing Globocan dataset
Right now, the Globocan dataset has a number of columns which are not required for preparing the visualizations:

In [51]:
# Load the complete Globocan table and list the names of its columns
df_globocan = pd.read_csv(filepath_or_buffer=GLOBOCAN_input)
list(df_globocan.columns)

['Alpha-3 code',
 'Cancer code',
 'Population code (ISO/UN)',
 'Country',
 'Sex',
 'Number',
 '95% UI low',
 '95% UI high',
 'Number.1',
 'ASR (World)',
 'Crude rate',
 'Cumulative risk',
 'Cancer']

### 1.1.- Remove unnecessary columns  
Only the columns **Country**, **Cancer**, **ASR (World)** and **Crude rate** are important in this project. ASR (World) refers to the age-standardized rate, whereas Crude rate refers to the raw (non age-standardized) rate.  Here, a new csv is saved with only these columns.

In [52]:
# Keep only the four columns used later in the notebook
columns_to_keep = ["Country", "Cancer", "ASR (World)", "Crude rate"]
df_globocan_lite = df_globocan.loc[:, columns_to_keep]
df_globocan_lite

Unnamed: 0,Country,Cancer,ASR (World),Crude rate
0,Afghanistan,Anal cancer,0.47,0.27
1,Albania,Anal cancer,0.24,0.38
2,Algeria,Anal cancer,0.34,0.35
3,Angola,Anal cancer,0.24,0.11
4,Azerbaijan,Anal cancer,0.87,1.00
...,...,...,...,...
6285,Samoa,Skin cancer,0.00,0.00
6286,Yemen,Skin cancer,1.78,0.96
6287,South Africa,Skin cancer,23.90,20.90
6288,Zambia,Skin cancer,3.27,1.59


In [53]:
# Save the reduced Globocan table. os.path.join builds a valid path whether or not
# the output directory was entered with a trailing separator.
df_globocan_lite.to_csv(
    os.path.join(GLOBOCAN_prepared_output, "Globocan_dataset_ready_lite.csv"),
    index=False,
)

### 1.2.- Cumulative cancer incidence (all cancers) per country
Even though the dataset contains both the ASR and the crude incidence values (see above), in principle only the ASR values will be displayed. Here, a new dataset is created containing the sum of the cancer incidences (ASR) of all cancers per country

In [63]:
# Sum the ASR of every cancer type within each country; the result is indexed by "Country"
df_globocan_cumm_ASR_country = df_globocan_lite.groupby("Country")[["ASR (World)"]].sum()

In [64]:
# Show the per-country ASR totals; "Country" is the index of this frame because of the groupby
df_globocan_cumm_ASR_country

Unnamed: 0_level_0,ASR (World)
Country,Unnamed: 1_level_1
Afghanistan,126.09
Albania,212.70
Algeria,195.96
Angola,179.38
Argentina,305.01
...,...
Venezuela,264.73
Vietnam,188.78
Yemen,100.07
Zambia,236.40


In [65]:
# After the groupby, "Country" is the index of this frame rather than a column, so
# writing it with index=False would drop the country names and leave only the ASR sums.
# reset_index() turns the country back into a regular column before saving.
# os.path.join builds a valid path whether or not the directory ends with a separator.
df_globocan_cumm_ASR_country.reset_index().to_csv(
    os.path.join(GLOBOCAN_prepared_output, "Globocan_dataset_cummulative_ASR_country.csv"),
    index=False,
)

### 1.3.- Cancer with highest incidence per country
A new dataset will be saved containing the name of the cancer with the highest incidence (ASR) per country.

In [70]:
# For every country, find the row holding its largest ASR value.
# idxmax() returns index *labels*, so the rows must be selected with the label-based
# .loc; the position-based .iloc only gives the same rows while the index happens to
# be the default 0..n-1 range.
# (The variable name keeps its original spelling in case other cells refer to it.)
higest_incidence_idx = df_globocan_lite.groupby(by="Country")["ASR (World)"].idxmax()
df_globocan_max_ASR_country = df_globocan_lite.loc[higest_incidence_idx]
df_globocan_max_ASR_country

Unnamed: 0,Country,Cancer,ASR (World),Crude rate
555,Afghanistan,Breast cancer,29.4,17.9
556,Albania,Breast cancer,51.1,72.6
557,Algeria,Breast cancer,61.9,65.1
4258,Angola,Prostate cancer,47.9,15.1
560,Argentina,Breast cancer,71.3,91.8
566,Armenia,Breast cancer,39.6,64.4
6111,Australia,Skin cancer,177.1,297.7
562,Austria,Breast cancer,69.5,130.3
559,Azerbaijan,Breast cancer,32.9,42.6
4263,Bahamas,Prostate cancer,89.1,102.2


In [71]:
# Save the highest-incidence cancer per country. os.path.join builds a valid path
# whether or not the output directory was entered with a trailing separator.
df_globocan_max_ASR_country.to_csv(
    os.path.join(GLOBOCAN_prepared_output, "Globocan_dataset_max_ASR_country.csv"),
    index=False,
)

## Part 2 - Calculating aggregated data for articles' dataset
To prepare data in the articles' dataset, aggregates of studies will be calculated grouping by country, cancer type and year.  
Future iterations of the project will deal with animal species as well.

### 2.1.- Aggregate number of papers per year irrespective of country and cancer type 
Here, a csv is prepared aggregating papers by year (i.e. the final csv will have a row per year)

In [4]:
# Collect the names of the csv files found in the input directory
list_csvs_match_GLOBOCAN = [
    entry for entry in os.listdir(DF_input) if entry.endswith(".csv")
]

n_csvs_match_GLOBOCAN = len(list_csvs_match_GLOBOCAN)

In [21]:
# Total number of articles per publication year, accumulated over every csv.
# Result: plain dict mapping year (int) -> number of articles (int).
year_aggregates = {}
for csv_name in tqdm(list_csvs_match_GLOBOCAN):
    # Only the publication year is needed, so the other columns are not parsed.
    publication_years = pd.read_csv(
        os.path.join(DF_input, csv_name), usecols=["PublicationDate"]
    )["PublicationDate"]
    # Iterate over (year, count) pairs directly: looking the count up again through
    # int(year) fails with a KeyError whenever the original key is not an int-equal value.
    for year, n_articles in publication_years.value_counts().items():
        year = int(year)
        year_aggregates[year] = year_aggregates.get(year, 0) + int(n_articles)

100%|█████████████████████████| 45/45 [02:33<00:00,  3.41s/it]


In [22]:
# Inspect the accumulated counts (dict mapping year -> number of articles)
year_aggregates

{1992: 53303,
 1991: 51473,
 1990: 53232,
 2001: 72691,
 2002: 76353,
 2000: 71170,
 1999: 66895,
 1998: 64720,
 1995: 58541,
 1997: 62860,
 1996: 61090,
 1994: 58000,
 1993: 55099,
 1989: 51838,
 1988: 47953,
 1986: 42957,
 1987: 44938,
 1985: 41699,
 1984: 16274,
 2003: 81219,
 2004: 85433,
 2005: 91004,
 2006: 94266,
 2007: 99610,
 2013: 147002,
 2008: 104591,
 2016: 175374,
 2009: 109345,
 2010: 117318,
 2011: 125587,
 2012: 138543,
 2015: 170983,
 2019: 201285,
 2014: 161508,
 2017: 180199,
 2018: 187679,
 2021: 249023,
 2020: 226078,
 2022: 245243,
 2023: 237455,
 2025: 3119,
 2024: 160906}

In [44]:
# Turn the {year: count} dict into a two-column table ordered by year
df_year_agg = (
    pd.DataFrame.from_dict(year_aggregates, orient="index", columns=["Articles"])
    .reset_index()
    .rename(columns={"index": "Year"})
    .sort_values(by="Year", ascending=True)
)

In [45]:
# Display the per-year table, ordered by ascending year
df_year_agg

Unnamed: 0,Year,Articles
18,1984,16274
17,1985,41699
15,1986,42957
16,1987,44938
14,1988,47953
13,1989,51838
2,1990,53232
1,1991,51473
0,1992,53303
12,1993,55099


In [46]:
# Save the per-year article counts. os.path.join builds a valid path whether or not
# the output directory was entered with a trailing separator.
df_year_agg.to_csv(os.path.join(DF_prepared_output, "articles_year.csv"), index=False)

### 2.2.- Aggregate number of articles per country and year irrespective of cancer type
Here, a csv is prepared aggregating papers by year and country (i.e. the final csv will have a row per year and country)

In [87]:
# Number of articles per country and publication year, accumulated over every csv.
# Result: nested dict {country: {year (int): number of articles (int)}}.
year_country_aggregates = {}

for csv_name in tqdm(list_csvs_match_GLOBOCAN):
    # Only these two columns are needed, so the others are not parsed.
    country_year = pd.read_csv(
        os.path.join(DF_input, csv_name), usecols=["Country", "PublicationDate"]
    )
    # Select the columns explicitly so each key is always (country, year), whatever the
    # column order in the file.
    pair_counts = country_year[["Country", "PublicationDate"]].value_counts()
    for (country, year), n_articles in pair_counts.items():
        # setdefault creates the per-country dict the first time a country is seen.
        articles_per_year = year_country_aggregates.setdefault(country, {})
        year = int(year)
        articles_per_year[year] = articles_per_year.get(year, 0) + int(n_articles)

year_country_aggregates

100%|█████████████████████████| 45/45 [02:36<00:00,  3.49s/it]


{'United States': {1992: 16637,
  1991: 15154,
  1990: 12177,
  2001: 22427,
  2002: 23274,
  2000: 21889,
  1999: 20441,
  1998: 19568,
  1996: 19355,
  1995: 18769,
  1994: 17439,
  1997: 19672,
  1993: 17162,
  1986: 213,
  1989: 10323,
  2003: 24865,
  2004: 26513,
  2005: 27424,
  1988: 9775,
  2006: 27844,
  2007: 29339,
  1985: 114,
  2008: 30589,
  2016: 45401,
  1987: 4219,
  2009: 31174,
  2010: 32845,
  2011: 34473,
  1984: 27,
  2012: 36613,
  2013: 38079,
  2014: 41358,
  2015: 43941,
  2017: 46776,
  2018: 48405,
  2019: 49761,
  2021: 56565,
  2022: 52964,
  2020: 53664,
  2025: 981,
  2023: 52591,
  2024: 34263},
 'Japan': {1992: 5806,
  1991: 5398,
  1990: 4957,
  2001: 8114,
  2002: 7892,
  2000: 8257,
  1999: 7535,
  1998: 7560,
  1993: 6041,
  1994: 6192,
  1995: 6375,
  1997: 6756,
  1996: 6633,
  1989: 3879,
  2003: 7946,
  2004: 7960,
  2005: 7997,
  1987: 1433,
  2006: 7454,
  2007: 7735,
  2008: 7443,
  1986: 48,
  2009: 7604,
  2010: 7911,
  2011: 8599,
  2012

In [94]:
# Build one (Year, Articles, Country) table per country, each ordered by year,
# then stack them into a single frame
dfs_year_country = [
    pd.DataFrame.from_dict(articles_per_year, orient="index", columns=["Articles"])
    .reset_index()
    .rename(columns={"index": "Year"})
    .sort_values(by="Year", ascending=True)
    .assign(Country=country_name)
    for country_name, articles_per_year in year_country_aggregates.items()
]

df_year_country_agg = pd.concat(dfs_year_country)

In [95]:
# Display the combined table of article counts per year and country
df_year_country_agg

Unnamed: 0,Year,Articles,Country
28,1984,27,United States
21,1985,114,United States
13,1986,213,United States
24,1987,4219,United States
18,1988,9775,United States
14,1989,10323,United States
2,1990,12177,United States
1,1991,15154,United States
0,1992,16637,United States
12,1993,17162,United States


In [96]:
# Save the per-year, per-country article counts. os.path.join builds a valid path
# whether or not the output directory was entered with a trailing separator.
df_year_country_agg.to_csv(
    os.path.join(DF_prepared_output, "articles_year_country.csv"), index=False
)