In [1]:
import os
import pandas as pd
import time
from tqdm import tqdm
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Ask the user for the folder that holds the csv files.
# os.path.join(path, "") appends a trailing path separator only when one is
# missing, so code that builds paths as `DF_input + filename` works whether or
# not the user typed the final slash. An empty answer stays an empty string.
DF_input = os.path.join(input().strip(), "")

 C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\parsedXMLs_update_2025-09\


In [3]:
# Gather the names of all entries in the input folder that end in ".csv"
# (the comparison is case-sensitive; order is whatever os.listdir returns).
list_csvs = [entry for entry in os.listdir(DF_input) if entry.endswith(".csv")]

# Total number of csv files found; used for the progress message while parsing.
n_csvs = len(list_csvs)

In [8]:
# Reference list of known cancer type names (one string per entry).
# The parsing cell compares these strings to DataFrame column names with an
# exact, case-sensitive membership test and adds a zero-filled column for every
# name that is absent.
# NOTE(review): "hemangiosarcoma" is the only entry that starts with a lowercase
# letter, and a few entries are near-duplicates ("Vulvar cancer" / "Vulval cancer",
# "Ampullary carcinoma" / "Ampullary cancer") -- confirm these spellings match
# the labels produced upstream.
cancer_types = [
    "Lung cancer",
    "Colorectal cancer",
    "Breast cancer",
    "Prostate cancer",
    "Colon cancer",
    "Rectal cancer",
    "Multiple myeloma",
    "Melanoma",
    "Lymphoma",
    "Leukemia",
    "Brain cancer",
    "Pancreatic cancer",
    "Neuroendocrine tumor",
    "Neuroblastoma",
    "Ovarian cancer",
    "Cervical cancer",
    "Bladder cancer",
    "Kidney cancer",
    "Renal cell carcinoma",
    "Urothelial carcinoma",
    "Adrenocortical carcinoma",
    "Sebaceous gland carcinoma",
    "Thymoma",
    "Wilms tumor",
    "Thymic carcinoma",
    "Liver cancer",
    "Mesothelioma",
    "hemangiosarcoma",
    "Salivary gland cancer",
    "Thyroid cancer",
    "Parathyroid carcinoma",
    "Mesonephric carcinoma",
    "Stomach cancer",
    "Gastric adenocarcinoma",
    "Gastrointestinal stromal tumor",
    "Ampullary carcinoma",
    "Esophageal cancer",
    "Testicular cancer",
    "Skin cancer",
    "Sebaceous carcinoma",
    "Merkel cell carcinoma",
    "Bone cancer",
    "Giant cell tumor of bone",
    "Chordoma",
    "Sarcoma",
    "Head and neck cancer",
    "Throat cancer",
    "Mouth cancer",
    "Nasal cancer",
    "Eye cancer",
    "Ear cancer",
    "Gallbladder cancer",
    "Anal cancer",
    "Penile cancer",
    "Vaginal cancer",
    "Vulvar cancer",
    "Endometrial cancer",
    "Granulosa cell tumor",
    "Sertoli-Leydig cell tumor",
    "Uterine cancer",
    "Fallopian tube cancer",
    "Peritoneal cancer",
    "Small cell lung cancer",
    "Non-small cell lung cancer",
    "Squamous cell carcinoma",
    "Basal cell carcinoma",
    "Adenocarcinoma",
    "Mucinous carcinoma",
    "Ductal carcinoma",
    "Lobular carcinoma",
    "Clear cell carcinoma",
    "Transitional cell carcinoma",
    "Adenoid cystic carcinoma",
    "Follicular lymphoma",
    "Hodgkin lymphoma",
    "Non-Hodgkin lymphoma",
    "Acute lymphoblastic leukemia",
    "Chronic lymphocytic leukemia",
    "Acute myeloid leukemia",
    "Chronic myeloid leukemia",
    "Glioblastoma",
    "Astrocytoma",
    "Medulloblastoma",
    "Meningioma",
    "Ependymoma",
    "Adenocarcinoma of the colon",
    "Squamous cell carcinoma of the skin",
    "Adenocarcinoma of the pancreas",
    "Cholangiocarcinoma",
    "Hepatocellular carcinoma",
    "Fibrolamellar carcinoma",
    "Anaplastic thyroid cancer",
    "Papillary thyroid cancer",
    "Follicular thyroid cancer",
    "Medullary thyroid cancer",
    "Oligodendroglioma",
    "Malignant peripheral nerve sheath tumor",
    "Primitive neuroectodermal tumor",
    "Rhabdomyosarcoma",
    "Ewing sarcoma",
    "Chondrosarcoma",
    "Osteosarcoma",
    "Angiosarcoma",
    "Leiomyosarcoma",
    "Malignant fibrous histiocytoma",
    "Liposarcoma",
    "Synovial sarcoma",
    "Fibrosarcoma",
    "Kaposi sarcoma",
    "Pleomorphic sarcoma",
    "Alveolar soft part sarcoma",
    "Extraskeletal myxoid chondrosarcoma",
    "Endometrial stromal sarcoma",
    "Leiomyoma",
    "Neurofibroma",
    "Schwannoma",
    "Desmoplastic small round cell tumor",
    "Epithelioid sarcoma",
    "Extraskeletal osteosarcoma",
    "Extraskeletal chondrosarcoma",
    "Malignant mesenchymoma",
    "Mucinous adenocarcinoma",
    "Osteoblastoma",
    "Osteochondroma",
    "Periosteal osteosarcoma",
    "Solitary fibrous tumor",
    "Small intestine cancer",
    "Appendix cancer",
    "Ileal cancer",
    "Jejunal cancer",
    "Duodenal cancer",
    "Ampullary cancer",
    "Diffuse large B-cell lymphoma",
    "Mantle cell lymphoma",
    "Marginal zone lymphoma",
    "Mucosa-associated lymphoid tissue lymphoma",
    "Peripheral T-cell lymphoma",
    "Primary central nervous system lymphoma",
    "Hairy cell leukemia",
    "Myelodysplastic syndromes",
    "Myeloproliferative neoplasms",
    "Polycythemia vera",
    "Essential thrombocythemia",
    "Primary myelofibrosis",
    "Chronic neutrophilic leukemia",
    "Chronic eosinophilic leukemia",
    "Mast cell leukemia",
    "Atypical chronic myeloid leukemia",
    "Juvenile myelomonocytic leukemia",
    "Systemic mastocytosis",
    "Cutaneous mastocytosis",
    "Aggressive systemic mastocytosis",
    "Smoldering systemic mastocytosis",
    "Mast cell sarcoma",
    "Laryngeal cancer",
    "Vulval cancer",
]

In [10]:
# Manage log of parsed files
# The log is a plain-text file with one csv file name per line. Files listed in
# it are skipped, which matters because each csv is rewritten IN PLACE below:
# binarizing the same file twice would append a second set of columns.
path_parsed_csvs_cancer = os.path.join(DF_input, "csv files with Cancer binarized.txt")
if not os.path.exists(path_parsed_csvs_cancer):
    open(path_parsed_csvs_cancer, 'a').close()

# Load the logged file *names* (newline stripped, blank lines ignored) so the
# membership test in the loop can actually match entries of list_csvs.
with open(path_parsed_csvs_cancer, "r") as f:
    parsed_csvs_cancer = [line.strip() for line in f if line.strip()]


def _parse_cancer_labels(cell):
    """Split one raw 'Cancer' cell into a list of clean label strings.

    The cell is presumably a stringified list such as "['Lung cancer', 'Melanoma']"
    (TODO confirm against the upstream parser). Surrounding brackets are removed,
    the text is split on commas, and whitespace and single quotes are stripped
    from each item. A missing value or an empty list yields [""]; the empty
    label is later exposed as the "Undetermined_Cancer" column.
    """
    if isinstance(cell, str):
        text = cell
    elif pd.isna(cell):
        # Missing values count as "no cancer found" instead of a literal "nan" label
        text = ""
    else:
        text = str(cell)
    return [item.strip().strip("'") for item in text.strip("[]").split(",")]


for position, csv in enumerate(tqdm(list_csvs), start=1):
    if csv in parsed_csvs_cancer:
        continue

    start = time.time()
    print(f"Parsing csv: {csv}, {position} / {n_csvs}")

    # Import csv
    # NOTE(review): the recorded run shows a warning raised at this call (possibly
    # a mixed-dtype warning) -- consider passing dtype= for the affected columns.
    csv_path = os.path.join(DF_input, csv)
    df = pd.read_csv(csv_path)

    # Safety net for files binarized by an earlier run that is missing from the
    # log: record them and skip, instead of appending duplicate columns.
    if "Undetermined_Cancer" in df.columns:
        print(f"--Skipping {csv}: it already contains binarized cancer columns")
        del df
        parsed_csvs_cancer.append(csv)
        with open(path_parsed_csvs_cancer, "a") as f:
            f.write(csv+"\n")
        continue

    ## Binarize Cancer types
    # Use Multilabel binarizer from scikit-learn: one 0/1 column per distinct label
    mlb = MultiLabelBinarizer()
    df_one_hot_cancer = pd.DataFrame(
        mlb.fit_transform(df['Cancer'].map(_parse_cancer_labels)),
        columns=mlb.classes_,
        index=df.index)

    ## Rename the columns
    # The empty label marks articles with no found cancer. Rename it by NAME
    # rather than by position, so a real cancer column is never relabelled when
    # a file happens to contain no empty entries.
    df_one_hot_cancer = df_one_hot_cancer.rename(columns={"": "Undetermined_Cancer"})
    if "Undetermined_Cancer" not in df_one_hot_cancer.columns:
        # Keep the output schema stable: the column exists even when it is all zeros
        df_one_hot_cancer.insert(0, "Undetermined_Cancer", 0)

    # Concatenate the data with the binarized columns
    df_save = pd.concat([df, df_one_hot_cancer], axis=1)

    # Add a 0 to each cancer with no records, all at once, so every output file
    # exposes the same set of cancer columns
    missing_cancers = [cancer for cancer in cancer_types if cancer not in df_save.columns]
    if missing_cancers:
        zero_columns = pd.DataFrame(0, index=df_save.index, columns=missing_cancers)
        df_save = pd.concat([df_save, zero_columns], axis=1)

    # Save df (overwrites the input csv), add to parsed list, clean workspace
    df_save.to_csv(csv_path, index=False)
    del df, df_one_hot_cancer, mlb, df_save
    parsed_csvs_cancer.append(csv)
    with open(path_parsed_csvs_cancer, "a") as f:
        f.write(csv+"\n")
    print(f"--Parsing time: {round(time.time()-start, 2)}")

  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

Parsing csv: parsedXMLs_update_2025_09_33200.csv, 1 / 1


  df = pd.read_csv(DF_input + csv)
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.39s/it]

--Parsing time: 4.39



