# EXPLORE DATA

In [42]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
from scipy.stats import chi2_contingency
import random

In [5]:
# Resolve every input location relative to the project root, taken to be the
# parent of the directory the notebook kernel is running in.
current_dir = os.path.abspath(os.getcwd())
PROJECT_DIR = Path(current_dir).parent

# Joined locations are stored as plain strings; PROJECT_DIR itself stays a Path.
DATA_DIR = str(PROJECT_DIR / "data")
EXPOSURE_PTH = str(Path(DATA_DIR, "LUAD", "exposure.tsv"))
CLINICAL_PTH = str(Path(DATA_DIR, "LUAD", "clinical.tsv"))
MERGED_DATA_PTH = str(Path(DATA_DIR, "merged_data", "processed_dataset.csv"))

# Create DataFrames

In [34]:
# Load the processed, merged dataset (comma-separated, which is the read_csv
# default) and show its column labels.
MERGED_DATA_DF = pd.read_csv(MERGED_DATA_PTH)
columns = MERGED_DATA_DF.columns
columns

Index(['cases.submitter_id', 'pathology_report', 'cases.disease_type',
       'demographic.age_at_index', 'demographic.gender',
       'diagnoses.ajcc_pathologic_m', 'diagnoses.ajcc_pathologic_n',
       'diagnoses.ajcc_pathologic_stage', 'diagnoses.ajcc_pathologic_t',
       'diagnoses.ajcc_staging_system_edition', 'diagnoses.laterality',
       'diagnoses.morphology', 'diagnoses.prior_malignancy',
       'diagnoses.residual_disease', 'diagnoses.tissue_or_organ_of_origin',
       'exposures.pack_years_smoked', 'exposures.tobacco_smoking_status',
       'OS'],
      dtype='object')

In [41]:
# Inspect the single record stored under index label 10 as a field -> value Series.
MERGED_DATA_DF.loc[10]

cases.submitter_id                                                            TCGA-22-5492
pathology_report                         DIAGNOSIS: Lung, left upper lobe, lobectomy In...
cases.disease_type                                                 Squamous Cell Neoplasms
demographic.age_at_index                                                              73.0
demographic.gender                                                                  female
diagnoses.ajcc_pathologic_m                                                             M0
diagnoses.ajcc_pathologic_n                                                             N2
diagnoses.ajcc_pathologic_stage                                                 Stage IIIA
diagnoses.ajcc_pathologic_t                                                            T2a
diagnoses.ajcc_staging_system_edition                                                  NaN
diagnoses.laterality                                                                  Left

In [45]:
# Preview the first five rows of the merged dataset.
MERGED_DATA_DF.head(n=5)

Unnamed: 0,cases.submitter_id,pathology_report,cases.disease_type,demographic.age_at_index,demographic.gender,diagnoses.ajcc_pathologic_m,diagnoses.ajcc_pathologic_n,diagnoses.ajcc_pathologic_stage,diagnoses.ajcc_pathologic_t,diagnoses.ajcc_staging_system_edition,diagnoses.laterality,diagnoses.morphology,diagnoses.prior_malignancy,diagnoses.residual_disease,diagnoses.tissue_or_organ_of_origin,exposures.pack_years_smoked,exposures.tobacco_smoking_status,OS
0,TCGA-44-6777,Date Recd: SPECIMEN. Left upper lobe wedge res...,Adenomas and Adenocarcinomas,85.0,female,MX,NX,Stage IB,T2,6th,Left,8140/3,no,,"Upper lobe, lung",64.0,Current Reformed Smoker for < or = 15 yrs,1
1,TCGA-56-7579,FINAL SURGICAL PATHOLOGY REPORT. Diagnosis: A....,Squamous Cell Neoplasms,61.0,male,M0,N1,Stage IIIA,T3,7th,Left,8070/3,no,R0,"Upper lobe, lung",2.0,Current Reformed Smoker for < or = 15 yrs,1
2,TCGA-34-7107,"FINAL DIAGNUSIS: PART 1: LYMPH NODE, RIGHT LEV...",Squamous Cell Neoplasms,70.0,male,M0,N0,Stage II,T2a,7th,Right,8070/3,no,,"Lower lobe, lung",,Current Reformed Smoker for < or = 15 yrs,0
3,TCGA-78-8648,HISTORY. Histopathology of right upper lobe an...,Adenomas and Adenocarcinomas,58.0,female,M0,N0,Stage IIB,T3,6th,Right,8140/3,no,R0,"Upper lobe, lung",45.0,Current Smoker,1
4,TCGA-39-5040,Service: Thoracic. Account. Clinical Diagnosis...,Squamous Cell Neoplasms,59.0,male,M0,N2,Stage IIIA,T2a,7th,Left,8070/3,no,R0,"Upper lobe, lung",60.0,Current Reformed Smoker for < or = 15 yrs,0


In [48]:
# Build a shuffled control copy of the merged data. The first two columns
# (cases.submitter_id and pathology_report, per the column listing of
# MERGED_DATA_DF) keep their original row order; every remaining column is
# permuted independently, which breaks the link between a report and its
# clinical fields while preserving each column's marginal distribution.
SHUFFLE_SEED = 42  # fixed so a fresh Restart & Run All regenerates the same shuffled data
shuffle_rng = np.random.default_rng(SHUFFLE_SEED)

RANDOM_MERGED_DATA_DF = MERGED_DATA_DF.copy()
for column in MERGED_DATA_DF.columns[2:]:
    # Permute a bare ndarray so the values are assigned by position; a Series
    # would be re-aligned on its index and silently undo the shuffle.
    RANDOM_MERGED_DATA_DF[column] = shuffle_rng.permutation(
        MERGED_DATA_DF[column].to_numpy()
    )
RANDOM_MERGED_DATA_DF.head()

Unnamed: 0,cases.submitter_id,pathology_report,cases.disease_type,demographic.age_at_index,demographic.gender,diagnoses.ajcc_pathologic_m,diagnoses.ajcc_pathologic_n,diagnoses.ajcc_pathologic_stage,diagnoses.ajcc_pathologic_t,diagnoses.ajcc_staging_system_edition,diagnoses.laterality,diagnoses.morphology,diagnoses.prior_malignancy,diagnoses.residual_disease,diagnoses.tissue_or_organ_of_origin,exposures.pack_years_smoked,exposures.tobacco_smoking_status,OS
0,TCGA-44-6777,Date Recd: SPECIMEN. Left upper lobe wedge res...,Adenomas and Adenocarcinomas,70.0,male,M0,N0,Stage IB,T2,,Right,8070/3,no,R0,"Upper lobe, lung",,Current Smoker,1
1,TCGA-56-7579,FINAL SURGICAL PATHOLOGY REPORT. Diagnosis: A....,Adenomas and Adenocarcinomas,60.0,female,M0,N1,Stage IIIA,T1a,6th,Right,8252/3,no,,"Lower lobe, lung",50.0,Current Smoker,1
2,TCGA-34-7107,"FINAL DIAGNUSIS: PART 1: LYMPH NODE, RIGHT LEV...",Squamous Cell Neoplasms,66.0,male,M0,N0,Stage IB,T2,6th,Right,8252/3,no,R0,"Upper lobe, lung",80.0,Current Reformed Smoker for < or = 15 yrs,1
3,TCGA-78-8648,HISTORY. Histopathology of right upper lobe an...,Adenomas and Adenocarcinomas,77.0,male,M0,N0,Stage IIB,T1,6th,Right,8140/3,no,R0,"Lower lobe, lung",,Current Reformed Smoker for < or = 15 yrs,1
4,TCGA-39-5040,Service: Thoracic. Account. Clinical Diagnosis...,Adenomas and Adenocarcinomas,70.0,female,M0,N1,Stage IA,T1a,7th,Left,8070/3,no,,"Upper lobe, lung",40.0,Current Smoker,0


In [49]:
# Persist the shuffled copy. index=False keeps the integer row index out of the
# file, so a plain pd.read_csv of it does not gain a spurious "Unnamed: 0" column.
RANDOM_MERGED_DATA_DF.to_csv("test2_data.csv", index=False)

In [4]:
# Read the tab-separated clinical table (read_table defaults to a tab
# delimiter) and preview its first rows.
CLINICAL_DF = pd.read_table(CLINICAL_PTH)
CLINICAL_DF.head()

Unnamed: 0,project.project_id,cases.case_id,cases.consent_type,cases.days_to_consent,cases.days_to_lost_to_followup,cases.disease_type,cases.index_date,cases.lost_to_followup,cases.primary_site,cases.submitter_id,...,treatments.treatment_duration,treatments.treatment_effect,treatments.treatment_effect_indicator,treatments.treatment_frequency,treatments.treatment_id,treatments.treatment_intent_type,treatments.treatment_or_therapy,treatments.treatment_outcome,treatments.treatment_outcome_duration,treatments.treatment_type
0,TCGA-LUAD,0075437e-ba1a-46be-86d6-9773209a2b5e,Informed Consent,0,'--,Adenomas and Adenocarcinomas,Diagnosis,No,Bronchus and lung,TCGA-62-A471,...,'--,'--,'--,'--,6ed4e391-74f1-4a56-8fbe-b7a10081d85b,'--,yes,Complete Response,'--,Chemotherapy
1,TCGA-LUAD,0075437e-ba1a-46be-86d6-9773209a2b5e,Informed Consent,0,'--,Adenomas and Adenocarcinomas,Diagnosis,No,Bronchus and lung,TCGA-62-A471,...,'--,'--,'--,'--,7f648a1d-7e98-496e-afff-66cf765d49e0,'--,yes,Complete Response,'--,Chemotherapy
2,TCGA-LUAD,0075437e-ba1a-46be-86d6-9773209a2b5e,Informed Consent,0,'--,Adenomas and Adenocarcinomas,Diagnosis,No,Bronchus and lung,TCGA-62-A471,...,'--,'--,'--,'--,a5872c92-954a-5007-a67c-79c357655640,'--,yes,Complete Response,'--,Chemotherapy
3,TCGA-LUAD,0075437e-ba1a-46be-86d6-9773209a2b5e,Informed Consent,0,'--,Adenomas and Adenocarcinomas,Diagnosis,No,Bronchus and lung,TCGA-62-A471,...,'--,'--,'--,'--,d26f5579-57be-4e36-af37-304b01c797ad,Adjuvant,no,'--,'--,"Radiation Therapy, NOS"
4,TCGA-LUAD,009be09b-f9f6-43b7-8f45-4a648f8123ce,Informed Consent,50,'--,Adenomas and Adenocarcinomas,Diagnosis,'--,Bronchus and lung,TCGA-67-3773,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,'--


In [5]:
# Tally how often each treatment outcome value occurs across the clinical rows.
CLINICAL_DF.loc[:, "treatments.treatment_outcome"].value_counts()

treatments.treatment_outcome
'--                    2074
Complete Response       152
Progressive Disease      98
Treatment Ongoing        42
Stable Disease           40
Unknown                  35
Partial Response         25
Name: count, dtype: int64

In [6]:
# List every clinical column name, one per line, so the full set is visible
# rather than an abbreviated Index repr.
print(*CLINICAL_DF.columns, sep="\n")

project.project_id
cases.case_id
cases.consent_type
cases.days_to_consent
cases.days_to_lost_to_followup
cases.disease_type
cases.index_date
cases.lost_to_followup
cases.primary_site
cases.submitter_id
demographic.age_at_index
demographic.age_is_obfuscated
demographic.cause_of_death
demographic.cause_of_death_source
demographic.country_of_birth
demographic.country_of_residence_at_enrollment
demographic.days_to_birth
demographic.days_to_death
demographic.demographic_id
demographic.education_level
demographic.ethnicity
demographic.gender
demographic.marital_status
demographic.occupation_duration_years
demographic.population_group
demographic.premature_at_birth
demographic.race
demographic.submitter_id
demographic.vital_status
demographic.weeks_gestation_at_birth
demographic.year_of_birth
demographic.year_of_death
diagnoses.adrenal_hormone
diagnoses.age_at_diagnosis
diagnoses.ajcc_clinical_m
diagnoses.ajcc_clinical_n
diagnoses.ajcc_clinical_stage
diagnoses.ajcc_clinical_t
diagnoses.ajcc_p

In [7]:
# List every clinical column name, one per line.
# NOTE(review): an earlier cell in this notebook prints the same column
# listing; consider deleting one of the two.
print(*CLINICAL_DF.columns, sep="\n")

project.project_id
cases.case_id
cases.consent_type
cases.days_to_consent
cases.days_to_lost_to_followup
cases.disease_type
cases.index_date
cases.lost_to_followup
cases.primary_site
cases.submitter_id
demographic.age_at_index
demographic.age_is_obfuscated
demographic.cause_of_death
demographic.cause_of_death_source
demographic.country_of_birth
demographic.country_of_residence_at_enrollment
demographic.days_to_birth
demographic.days_to_death
demographic.demographic_id
demographic.education_level
demographic.ethnicity
demographic.gender
demographic.marital_status
demographic.occupation_duration_years
demographic.population_group
demographic.premature_at_birth
demographic.race
demographic.submitter_id
demographic.vital_status
demographic.weeks_gestation_at_birth
demographic.year_of_birth
demographic.year_of_death
diagnoses.adrenal_hormone
diagnoses.age_at_diagnosis
diagnoses.ajcc_clinical_m
diagnoses.ajcc_clinical_n
diagnoses.ajcc_clinical_stage
diagnoses.ajcc_clinical_t
diagnoses.ajcc_p

In [13]:
# Cross-tabulate pathologic stage (rows) against treatment outcome (columns)
# and express the counts as percentages.
# NOTE(review): the "'--" placeholder shows up as its own category in both
# columns — confirm whether it should be treated as missing before tabulating.
COL1 = 'treatments.treatment_outcome'
COL2 = 'diagnoses.ajcc_pathologic_stage'
df = CLINICAL_DF

# Row-normalised: within each stage, the share of every treatment outcome.
contingency_pct = pd.crosstab(
    index=df[COL2], columns=df[COL1], normalize='index'
).mul(100)

# Column-normalised: within each treatment outcome, the share of every stage.
contingency_pct_col = pd.crosstab(
    index=df[COL2], columns=df[COL1], normalize='columns'
).mul(100)


In [16]:
# Display the column-normalised crosstab: built with normalize='columns' and
# scaled by 100, so each value is the percentage of a treatment outcome's rows
# that fall in the given pathologic stage.
contingency_pct_col

treatments.treatment_outcome,'--,Complete Response,Partial Response,Progressive Disease,Stable Disease,Treatment Ongoing,Unknown
diagnoses.ajcc_pathologic_stage,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
'--,53.375121,1.315789,0.0,0.0,0.0,4.761905,0.0
Stage I,0.867888,0.0,0.0,1.020408,0.0,0.0,0.0
Stage IA,12.92189,10.526316,16.0,9.183673,0.0,2.380952,5.714286
Stage IB,12.92189,17.105263,20.0,10.204082,12.5,21.428571,14.285714
Stage II,0.24108,0.0,0.0,0.0,0.0,0.0,0.0
Stage IIA,4.580521,19.078947,8.0,17.346939,0.0,16.666667,20.0
Stage IIB,6.123433,23.026316,4.0,22.44898,17.5,30.952381,17.142857
Stage III,0.096432,0.0,0.0,0.0,0.0,0.0,0.0
Stage IIIA,6.075217,22.368421,48.0,22.44898,42.5,9.52381,17.142857
Stage IIIB,0.675024,2.631579,0.0,1.020408,12.5,0.0,14.285714


# Gemini interactions

## Prompt

## Response

In [None]:
```python
import pandas as pd

# Load the data.
# NOTE(review): the recorded output shows an "Unnamed: 0" column, which suggests
# the CSV was written with its index; `index_col=0` would absorb it. Confirm
# before changing, since later output depends on the current columns.
df = pd.read_csv('test_data.csv')

# Display the first few rows, then the column/dtype summary.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()

# Summarise OS and peek at its first values to understand what it encodes.
print(df['OS'].describe())
print(df['OS'].head(10))



```

```text
   Unnamed: 0 cases.submitter_id                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       pathology_report                     cases.disease_type  demographic.age_at_index demographic.gender diagnoses.ajcc_pathologic_m diagnoses.ajcc_pathologic_n diagnoses.ajcc_pathologic_stage diagnoses.ajcc_pathologic_t diagnoses.ajcc_staging_system_edition diagnoses.laterality diagnoses.morphology diagnoses.prior_malignancy diagnoses.residual_disease diagnoses.tissue_or_organ_of_origin  exposures.pack_years_smoked           exposures.tobacco_smoking_status  OS
0           0       TCGA-44-6777                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 Date Recd: SPECIMEN. Left upper lobe wedge resection. CLINICAL NOTES. PRE-OP DIAGNOSIS: Lung cancer. GROSS DESCRIPTION. Received fresh, subsequently fixed in formalin, labeled. "left upper lobe wedge resection". The specimen consists. of a wedge of lung which is 78 gram. The specimen is 15 X 5.5 X 4. cm. and has a stapled margin of 15 cm. The pleura is violaceous. smooth and glistening. There is a 1 X 1 cm. umbilication in the. pleura. This focus is inked and the specimen is sectioned to show a. 3.5 X 2.5 X 2.4 cm. pink tan tumor which shows fibrous stippling. present. This is contiguous within the pleural umbilication. Representative tissue was submitted for tumor procurement. The tumor. comes within 1.5 cm. 
of the sacral margin (inked black) . The. remainder of the cut surface of the lung is pink tan and spongy. showing no other discrete gross lesions. Representative sections of. the specimen are submitted as follows: BLOCK SUMMARY: 1-4 - Representative section of tumor including. pleural umbilicatio nd surgical stapled margin; 5 - representative. of normal. RS-5. MICROSCOPIC DESCRIPTION. Histologic type: Adenocarcinoma. Histologic grade: Poorly differentiated. Primary tumor (pT) : The tumor measures 3.5 cm. in maximum dimension. and is limited to the lung parenchyma in the wedge specimen and. invades into the pleura, pT2. Margins of resection: Negative. Vascular invasion: Present. Regional lymph nodes (pN) : pNX. Distant metastasis (pM) : pMX. Other findings: A small piece of tissue from the tumor and normal. tissue is submitted for the tumor bank. 4x1. DIAGNOSIS. Lung, left upper lobe wedge resection -. Invasive poorly-differentiated adenocarcinoma (see tumor. characteristics in the lung template in the microscopic. description). End Of Report ---.                Squamous Cell Neoplasms                      67.0             female                          M0                          N2                       Stage IIB                          T2                                   6th                Right               8252/3                         no                         R0                    Upper lobe, lung                         50.0                             Current Smoker   0
1           1       TCGA-56-7579  FINAL SURGICAL PATHOLOGY REPORT. Diagnosis: A. -N.) LUNG, LEFT, WITH SUBCARINAL (N7), RIGHT HILAR (N10R), LEFT HILAR. (N10LX2), RIGHT AND LEFT LOWER PARATRACHEAL (N4R, N4L), SUBAORTIC (N5),. INTRALOBAR (N11L), PULMONARY LIGAMENT (N9LX2), AND PARA-AORTIC (N6). LYMPH NODES, PNEUMONECTOMY WITH REGIONAL LYMPHADENECTOMY: INVASIVE MODERATELY DIFFERENTIATED SQUAMOUS CELL CARCINOMA. - Involves left upper lobe and mainstem bronchus. - 7.5 cm in maximum dimension. - Visceral pleura invasion: Not identified. - MARGIN STATUS: NEGATIVE. - Bronchial and vascular margins uninvolved by invasive carcinoma. = THREE HILAR AND PERIBRONCHIAL) LYMPH NODES, OUT OF THIRTY TOTAL. LYMPH NODES, POSITIVE FOR METASTATIC CARCINOMA (3/30). - SEE TUMOR STAGING SUMMARY BELOW. PATHOLOGIC STAGING SUMMARY: - Type and grade: Squamous cell carcinoma, moderately differentiated. = Primary tumor: pT3 (7.5 cm in maximum dimension). - Regional lymph nodes: pN1 (3 [hilar and peribronchial] out of 30 total lymph nodes,. positive for metastasis; 3/30). - Distant metastasis: pMX. - Pathologic stage: IIIA. - Lymphovascular invasion: Present. - Margin status: R0. COMMENT: The above findings were discussed at. Conference on. FINAL SURGICAL PATHOLOGY REPORT. Lung Tumor Staging Information. (data derived from current specimen, staging in accordance with or modified from AJCC Cancer Staging. Handbook, 7th Ed, and CAP protocol,. Specimen: Lung. Procedure: Pneumonectomy. Specimen integrity: Intact. Specimen laterality: Left. TUMOR FEATURES: Tumor site: Upper lobe and mainstem bronchus. Tumor size: Greatest dimension: 7.5 cm. Additional dimensions: 6 X 6 cm. Tumor focality: Unifocal. Histologic type: Squamous cell carcinoma. Histologic grade: Moderately differentiated (G2). Visceral pleura invasion: Not identified. Tumor extension: Tumor involves main bronchus distal to the carina, but. does not involve the carina. MARGINS: R0 (Negative). Bronchial margin: Uninvolved ...

```

```python
# Inspect the OS label: class balance first, then each case's stage next to
# its OS value, and finally the total number of rows.
os_counts = df['OS'].value_counts()
print(os_counts)

stage_os_columns = ['cases.submitter_id', 'diagnoses.ajcc_pathologic_stage', 'OS']
stage_os_view = df[stage_os_columns]
print(stage_os_view)

n_rows = len(df)
print(n_rows)


```

```text
OS
1    439
0    219
Name: count, dtype: int64
    cases.submitter_id diagnoses.ajcc_pathologic_stage  OS
0         TCGA-44-6777                       Stage IIB   0
1         TCGA-56-7579                      Stage IIIA   1
2         TCGA-34-7107                       Stage IIB   0
3         TCGA-78-8648                        Stage IB   1
4         TCGA-39-5040                        Stage IB   1
5         TCGA-43-6647                        Stage IA   1
6         TCGA-44-3398                      Stage IIIA   0
7         TCGA-34-5240                        Stage IB   1
8         TCGA-38-4627                        Stage IB   1
9         TCGA-55-6642                        Stage IA   0
10        TCGA-22-5492                       Stage IIB   1
11        TCGA-55-7816                        Stage IA   1
12        TCGA-22-4607                        Stage IV   1
13        TCGA-77-7142                        Stage IA   0
14        TCGA-49-4510                        Stage IB   0
15        TCGA-55-7227                        Stage IB   1
16        TCGA-MP-A4TF                        Stage IA   1
17        TCGA-77-A5GA                       Stage IIB   1
18        TCGA-85-A4CN                        Stage IB   1
19        TCGA-77-6844                        Stage IA   1
20        TCGA-77-A5GF                        Stage IA   1
21        TCGA-85-8353                       Stage IIB   1
22        TCGA-77-8156                       Stage IIB   0
23        TCGA-50-6592                       Stage IIB   1
24        TCGA-55-6982                        Stage IV   1
25        TCGA-18-4083                        Stage IB   1
26        TCGA-60-2706                      Stage IIIB   0
27        TCGA-34-2604                      Stage IIIA   0
28        TCGA-86-7953                      Stage IIIA   0
29        TCGA-73-A9RS                       Stage IIA   1
30        TCGA-50-5936                        Stage IV   0
31        TCGA-55-1592                        Stage IB   1
32        TCGA-64-5774                        Stage IA   1
33        TCGA-66-2773                        Stage IB   0
34        TCGA-52-7811                       Stage IIA   0
35        TCGA-77-8007                        Stage IA   1
36        TCGA-21-5782                        Stage IB   0
37        TCGA-MP-A4T2                       Stage IIA   1
38        TCGA-34-2609                      Stage IIIA   1
39        TCGA-50-6590                        Stage IV   0
40        TCGA-52-7809                       Stage IIA   1
41        TCGA-18-3411                        Stage IB   1
42        TCGA-75-6214                      Stage IIIA   0
43        TCGA-44-2664                      Stage IIIA   1
44        TCGA-18-3421                       Stage IIA   1
45        TCGA-34-5236                        Stage IA   0
46        TCGA-62-A46S                       Stage IIB   1
47        TCGA-91-6847                        Stage IA   1
48        TCGA-85-8288                        Stage II   1
49        TCGA-18-3412                        Stage IB   1
50        TCGA-38-4628                       Stage IIB   1
51        TCGA-56-8201                      Stage IIIA   1
52        TCGA-85-8354                      Stage IIIA   1
53        TCGA-78-7156                       Stage IIB   1
54        TCGA-21-1079                        Stage IV   1
55        TCGA-86-8279                        Stage IA   1
56        TCGA-62-8398                        Stage IV   0
57        TCGA-21-A5DI                        Stage IA   1
58        TCGA-O2-A52Q                        Stage IV   1
59        TCGA-85-7696                        Stage IA   1
60        TCGA-33-4533                        Stage IB   1
61        TCGA-60-2722                      Stage IIIA   0
62        TCGA-44-6779                        Stage IA   1
63        TCGA-85-A4PA                       Stage IIB   1
64        TCGA-MP-A4SY                        Stage IB   1
65        TCGA-33-6737                        Stage IB   1
66        TCGA-50-6673                      Stage IIIA   1
67        TCGA-39-5035                      Stage IIIA   1
68        TCGA-L3-A524                      Stage IIIA   0
69        TCGA-77-8009                      Stage IIIA   1
70        TCGA-05-5425                        Stage II   0
71        TCGA-77-8140                        Stage IB   1
72        TCGA-73-4677                      Stage IIIA   1
73        TCGA-62-A46R                        Stage IB   1
74        TCGA-77-6842                        Stage IA   1
75        TCGA-21-1080                       Stage IIB   1
76        TCGA-50-5049                        Stage IB   1
77        TCGA-05-4418                      Stage IIIA   1
78        TCGA-63-A5MP                       Stage IIA   0
79        TCGA-55-6978                       Stage IIA   0
80        TCGA-44-7671                       Stage IIB   0
81        TCGA-78-7540                        Stage IA   1
82        TCGA-49-AAR3                       Stage IIB   1
83        TCGA-85-8580                        Stage IB   1
84        TCGA-18-3408                       Stage IIB   0
85        TCGA-05-4249                        Stage IB   0
86        TCGA-22-5480                      Stage IIIA   1
87        TCGA-50-5944                      Stage IIIA   1
88        TCGA-22-4599                       Stage IIA   1
89        TCGA-34-2605                       Stage IIB   0
90        TCGA-49-4488                        Stage IB   1
91        TCGA-55-6971                      Stage IIIA   1
92        TCGA-64-1677                        Stage IA   1
93        TCGA-21-1075                        Stage IA   1
94        TCGA-56-7222                       Stage IIB   0
95        TCGA-77-6845                        Stage IB   0
96        TCGA-43-2581                      Stage IIIA   0
97        TCGA-49-6744                         Stage I   1
98        TCGA-55-6712                        Stage IB   1
99        TCGA-56-1622                        Stage IB   0
100       TCGA-05-4426                       Stage IIB   0
101       TCGA-O1-A52J                        Stage IA   1
102       TCGA-22-1000                       Stage IIA   1
103       TCGA-96-7545                       Stage IIB   1
104       TCGA-60-2719                      Stage IIIA   0
105       TCGA-66-2757                        Stage IB   1
106       TCGA-85-7698                      Stage IIIA   1
107       TCGA-39-5021                       Stage IIB   1
108       TCGA-55-6968                        Stage IA   0
109       TCGA-78-7153                       Stage IIB   1
110       TCGA-39-5016                        Stage IB   1
111       TCGA-63-6202                        Stage IA   0
112       TCGA-85-6561                       Stage IIA   0
113       TCGA-43-5670                        Stage IV   1
114       TCGA-77-A5G1                      Stage IIIA   0
115       TCGA-94-8491                        Stage IA   0
116       TCGA-55-6981                        Stage IV   1
117       TCGA-69-8453                      Stage IIIA   0
118       TCGA-56-7730                        Stage IB   1
119       TCGA-21-1072                        Stage IA   1
120       TCGA-22-4596                       Stage IIA   0
121       TCGA-50-8460                        Stage IB   1
122       TCGA-44-2657                        Stage IB   0
123       TCGA-MP-A4TC                       Stage IIB   1
124       TCGA-77-6843                        Stage IA   1
125       TCGA-39-5028                       Stage IIA   1
126       TCGA-63-A5MM                             NaN   1
127       TCGA-78-7160                        Stage IB   1
128       TCGA-85-A4QQ                       Stage IIB   1
129       TCGA-49-AAR0                       Stage IIB   1
130       TCGA-18-3407                        Stage IA   0
131       TCGA-62-A471                       Stage IIB   1
132       TCGA-77-7139                      Stage IIIA   1
133       TCGA-J2-8192                       Stage IIB   0
134       TCGA-22-1016                       Stage IIB   1
135       TCGA-63-A5MI                        Stage IA   1
136       TCGA-44-2655                       Stage IIB   1
137       TCGA-18-4086                        Stage IB   1
138       TCGA-33-AASB                       Stage IIB   0
139       TCGA-91-6829                      Stage IIIA   1
140       TCGA-78-8640                       Stage IIA   1
141       TCGA-33-AASL                        Stage IB   0
142       TCGA-60-2707                       Stage IIB   0
143       TCGA-77-8148                        Stage IA   1
144       TCGA-50-5068                        Stage IB   1
145       TCGA-58-A46N                       Stage IIB   1
146       TCGA-62-A46O                       Stage IIB   1
147       TCGA-43-3394                      Stage IIIA   1
148       TCGA-50-5055                       Stage IIA   0
149       TCGA-73-4659                       Stage IIA   1
150       TCGA-O2-A52V                         Stage I   0
151       TCGA-55-7914                        Stage IA   1
152       TCGA-98-A538                        Stage IA   0
153       TCGA-51-4079                       Stage IIB   0
154       TCGA-73-4666                        Stage IB   0
155       TCGA-33-AAS8                       Stage IIB   1
156       TCGA-34-5239                       Stage IIB   1
157       TCGA-44-2666                       Stage IIA   1
158       TCGA-77-8146                        Stage IA   0
159       TCGA-MP-A4TE                        Stage II   1
160       TCGA-49-AARR                        Stage IB   0
161       TCGA-77-A5G8                       Stage IIB   1
162       TCGA-75-6212                        Stage IB   0
163       TCGA-44-4112                        Stage IB   1
164       TCGA-22-5477                        Stage IA   0
165       TCGA-55-1596                       Stage IIA   1
166       TCGA-95-8039                       Stage IIA   1
167       TCGA-55-6969                        Stage IB   1
168       TCGA-22-4613                        Stage IA   1
169       TCGA-86-7713                       Stage IIB   0
170       TCGA-77-A5G3                       Stage IIB   1
171       TCGA-85-A4JC                      Stage IIIA   1
172       TCGA-77-A5FZ                      Stage IIIA   1
173       TCGA-97-7938                        Stage IB   1
174       TCGA-44-3919                       Stage IIB   1
175       TCGA-MP-A4T8                        Stage IB   0
176       TCGA-85-A513                       Stage IIB   1
177       TCGA-05-4402                      Stage IIIB   0
178       TCGA-64-1679                       Stage IIA   0
179       TCGA-43-7658                       Stage IIB   0
180       TCGA-49-4506                        Stage IB   0
181       TCGA-63-A5MY                        Stage IB   1
182       TCGA-J2-A4AE                       Stage IIB   1
183       TCGA-85-8072                        Stage IA   1
184       TCGA-77-A5G6                       Stage IIB   1
185       TCGA-73-7498                       Stage IIB   0
186       TCGA-50-6597                        Stage IB   1
187       TCGA-44-5643                       Stage IIB   1
188       TCGA-63-A5MH                      Stage IIIA   1
189       TCGA-60-2704                      Stage IIIA   1
190       TCGA-85-8048                        Stage IA   1
191       TCGA-33-4587                      Stage IIIA   1
192       TCGA-43-8118                      Stage IIIA   1
193       TCGA-21-1081                       Stage IIA   0
194       TCGA-05-4397                        Stage IB   1
195       TCGA-21-5783                        Stage IA   1
196       TCGA-97-7553                       Stage IIB   1
197       TCGA-64-5775                        Stage IB   1
198       TCGA-90-6837                        Stage IA   1
199       TCGA-51-4081                        Stage IA   0
200       TCGA-60-2708                       Stage IIB   0
201       TCGA-05-4390                      Stage IIIB   0
202       TCGA-63-7022                        Stage IB   1
203       TCGA-63-A5MW                        Stage IB   1
204       TCGA-MP-A4SW                        Stage IB   1
205       TCGA-05-4424                        Stage IB   1
206       TCGA-J2-A4AD                        Stage IB   0
207       TCGA-50-5044                        Stage IA   0
208       TCGA-50-5933                       Stage IIB   1
209       TCGA-85-8052                        Stage IB   0
210       TCGA-56-7223                       Stage IIB   1
211       TCGA-86-7701                        Stage IB   1
212       TCGA-56-8623                        Stage IA   1
213       TCGA-34-A5IX                        Stage IV   1
214       TCGA-77-7138                        Stage IB   1
215       TCGA-21-1070                      Stage IIIA   1
216       TCGA-55-A4DF                        Stage IA   1
217       TCGA-39-5039                        Stage IB   0
218       TCGA-66-2789                       Stage IIB   0
219       TCGA-73-4658                        Stage IB   1
220       TCGA-85-7844                        Stage IV   1
221       TCGA-NK-A5CR                       Stage IIB   1
222       TCGA-98-A53B                       Stage IIA   0
223       TCGA-77-8138                        Stage IA   0
224       TCGA-77-A5GH                        Stage IB   1
225       TCGA-60-2696                        Stage IB   1
226       TCGA-58-8391                        Stage IB   1
227       TCGA-60-2725                        Stage IB   1
228       TCGA-49-AARQ                       Stage IIB   0
229       TCGA-55-6987                        Stage IA   1
230       TCGA-55-7995                      Stage IIIA   0
231       TCGA-21-1082                        Stage IB   1
232       TCGA-78-7633                       Stage IIB   1
233       TCGA-78-7539                       Stage IIA   1
234       TCGA-98-8022                        Stage IB   0
235       TCGA-33-4586                       Stage IIB   1
236       TCGA-44-2665                        Stage IB   0
237       TCGA-78-7148                       Stage IIB   1
238       TCGA-78-7152                        Stage IB   0
239       TCGA-78-7143                        Stage IB   1
240       TCGA-77-7465                       Stage IIA   1
241       TCGA-38-4629                        Stage IB   1
242       TCGA-39-5034                       Stage IIB   1
243       TCGA-22-5489                      Stage IIIA   1
244       TCGA-85-8584                       Stage IIA   0
245       TCGA-39-5031                        Stage IA   1
246       TCGA-55-8620                        Stage IB   1
247       TCGA-37-A5EM                        Stage IA   1
248       TCGA-39-5027                       Stage IIA   1
249       TCGA-60-2713                        Stage IV   1
250       TCGA-46-3768                        Stage IB   0
251       TCGA-55-8096                        Stage IB   1
252       TCGA-78-8660                        Stage IA   1
253       TCGA-78-8662                       Stage IIB   0
254       TCGA-86-7711                      Stage IIIA   1
255       TCGA-85-8277                        Stage IB   0
256       TCGA-22-5483                       Stage IIB   1
257       TCGA-98-A53A                      Stage IIIA   1
258       TCGA-18-3416                        Stage IV   1
259       TCGA-78-8655                        Stage IB   1
260       TCGA-50-5045                      Stage IIIA   1
261       TCGA-99-8033                        Stage II   1
262       TCGA-34-5928                       Stage IIB   1
263       TCGA-73-4676                        Stage IA   1
264       TCGA-77-8128                        Stage IA   1
265       TCGA-38-4626                        Stage IA   1
266       TCGA-55-6983                        Stage IA   1
267       TCGA-MN-A4N1                        Stage IA   0
268       TCGA-63-A5MJ                        Stage IB   1
269       TCGA-78-7537                        Stage IB   0
270       TCGA-95-7562                        Stage IA   1
271       TCGA-49-AARE                       Stage IIA   1
272       TCGA-49-4512                      Stage IIIA   1
273       TCGA-86-8055                      Stage IIIA   0
274       TCGA-18-3419                       Stage IIB   0
275       TCGA-66-2783                        Stage IB   0
276       TCGA-77-8153                       Stage III   0
277       TCGA-62-A46Y                        Stage IA   1
278       TCGA-55-8089                      Stage IIIA   1
279       TCGA-L9-A50W                       Stage IIA   1
280       TCGA-99-8028                        Stage IB   1
281       TCGA-86-8076                        Stage IB   0
282       TCGA-55-8085                        Stage IB   0
283       TCGA-98-A53D                        Stage IA   1
284       TCGA-34-5231                        Stage IB   1
285       TCGA-77-7335                        Stage IB   0
286       TCGA-21-1078                        Stage IA   1
287       TCGA-86-8054                       Stage IIB   1
288       TCGA-58-8390                       Stage IIB   1
289       TCGA-56-7580                      Stage IIIA   0
290       TCGA-55-1595                        Stage IB   1
291       TCGA-43-2576                      Stage IIIA   0
292       TCGA-44-2659                      Stage IIIA   1
293       TCGA-66-2769                       Stage IIA   1
294       TCGA-44-3396                      Stage IIIB   0
295       TCGA-64-1680                        Stage IA   0
296       TCGA-86-8073                        Stage IA   1
297       TCGA-60-2698                        Stage IB   0
298       TCGA-39-5011                        Stage IA   0
299       TCGA-J2-A4AG                       Stage IIA   0
300       TCGA-MP-A4TI                        Stage IB   1
301       TCGA-22-5471                      Stage IIIB   1
302       TCGA-55-6970                       Stage IIB   1
303       TCGA-77-7463                      Stage IIIA   1
304       TCGA-97-7547                        Stage IB   1
305       TCGA-75-6206                       Stage IIB   1
306       TCGA-86-8673                       Stage IIB   0
307       TCGA-55-7910                        Stage IA   0
308       TCGA-22-4595                       Stage IIB   1
309       TCGA-49-AARN                        Stage IA   1
310       TCGA-18-3414                        Stage IA   1
311       TCGA-18-5592                        Stage IA   1
312       TCGA-78-7154                        Stage IA   1
313       TCGA-53-7626                        Stage IA   0
314       TCGA-44-6778                       Stage IIB   1
315       TCGA-05-4395                        Stage IB   1
316       TCGA-44-6147                        Stage IA   1
317       TCGA-33-4538                        Stage IB   1
318       TCGA-50-6594                        Stage IA   1
319       TCGA-49-4486                        Stage IB   1
320       TCGA-18-4721                        Stage IB   0
321       TCGA-78-7147                        Stage IB   1
322       TCGA-60-2697                       Stage IIA   1
323       TCGA-97-7546                        Stage IB   0
324       TCGA-43-5668                      Stage IIIA   0
325       TCGA-50-6595                        Stage IB   1
326       TCGA-50-5941                       Stage IIB   1
327       TCGA-52-7810                      Stage IIIA   0
328       TCGA-66-2793                        Stage IA   1
329       TCGA-63-A5ML                       Stage IIB   0
330       TCGA-49-4514                       Stage IIB   1
331       TCGA-05-4433                       Stage IIA   0
332       TCGA-33-4532                        Stage IA   0
333       TCGA-50-5942                      Stage IIIA   1
334       TCGA-22-1017                       Stage IIA   1
335       TCGA-60-2714                       Stage IIB   1
336       TCGA-33-4547                      Stage IIIA   0
337       TCGA-05-4432                       Stage IIB   0
338       TCGA-86-8674                        Stage IB   1
339       TCGA-56-7731                        Stage IB   0
340       TCGA-85-7697                        Stage IB   1
341       TCGA-33-AASD                        Stage IB   1
342       TCGA-18-5595                        Stage IA   1
343       TCGA-49-6743                        Stage IA   1
344       TCGA-43-3920                      Stage IIIA   0
345       TCGA-05-4430                      Stage IIIA   1
346       TCGA-66-2800                       Stage IIB   0
347       TCGA-37-A5EL                        Stage IA   1
348       TCGA-50-6591                        Stage IV   0
349       TCGA-77-7337                        Stage IA   1
350       TCGA-66-2794                        Stage IB   0
351       TCGA-33-6738                      Stage IIIA   0
352       TCGA-34-5232                       Stage IIA   1
353       TCGA-56-A4BY                       Stage IIB   0
354       TCGA-55-8206                        Stage IB   1
355       TCGA-95-7043                       Stage IIB   1
356       TCGA-MN-A4N4                       Stage IIB   1
357       TCGA-34-8454                        Stage IB   1
358       TCGA-58-8388                        Stage IA   0
359       TCGA-83-5908                      Stage IIIA   1
360       TCGA-77-8139                        Stage IB   1
361       TCGA-85-8070                      Stage IIIA   0
362       TCGA-18-3417                        Stage IV   1
363       TCGA-80-5611                       Stage IIA   0
364       TCGA-62-8394                      Stage IIIA   0
365       TCGA-85-A50M                       Stage IIA   1
366       TCGA-18-3406                        Stage IB   0
367       TCGA-55-7284                        Stage IB   1
368       TCGA-86-8278                      Stage IIIA   1
369       TCGA-78-7146                        Stage IB   0
370       TCGA-44-7669                        Stage IB   1
371       TCGA-77-8136                      Stage IIIA   1
372       TCGA-86-A4P8                      Stage IIIB   1
373       TCGA-05-4415                        Stage IB   1
374       TCGA-63-7023                      Stage IIIA   1
375       TCGA-64-5779                        Stage IA   0
376       TCGA-78-7220                        Stage IA   0
377       TCGA-60-2721                       Stage IIB   1
378       TCGA-50-5072                        Stage IB   1
379       TCGA-78-7159                        Stage IB   1
380       TCGA-99-7458                        Stage IB   0
381       TCGA-58-8387                        Stage IB   1
382       TCGA-98-8021                       Stage IIB   1
383       TCGA-77-8150                        Stage IB   0
384       TCGA-64-5778                      Stage IIIB   1
385       TCGA-58-A46J                       Stage IIB   1
386       TCGA-21-5787                        Stage IA   0
387       TCGA-34-8455                        Stage IB   0
388       TCGA-33-4583                        Stage IB   1
389       TCGA-33-4582                        Stage IB   0
390       TCGA-22-5481                        Stage IB   0
391       TCGA-43-6771                        Stage IB   1
392       TCGA-66-2782                        Stage IB   1
393       TCGA-55-1594                        Stage IA   0
394       TCGA-50-5051                        Stage IA   0
395       TCGA-MP-A4T6                       Stage IIA   1
396       TCGA-66-2791                             NaN   0
397       TCGA-73-7499                        Stage IB   1
398       TCGA-77-8008                        Stage IB   1
399       TCGA-38-4630                       Stage IIB   1
400       TCGA-05-4396                        Stage IB   0
401       TCGA-91-8497                        Stage IB   1
402       TCGA-85-8071                        Stage IA   1
403       TCGA-MP-A4T9                        Stage IA   0
404       TCGA-44-A4SU                        Stage IB   1
405       TCGA-86-6562                        Stage IA   1
406       TCGA-78-7536                       Stage IIB   0
407       TCGA-77-8145                        Stage IA   0
408       TCGA-58-A46K                       Stage IIA   1
409       TCGA-50-5939                       Stage IIA   1
410       TCGA-33-AASJ                        Stage IB   1
411       TCGA-77-8154                        Stage IA   1
412       TCGA-55-8092                         Stage I   0
413       TCGA-55-8512                        Stage IB   1
414       TCGA-78-7149                        Stage IB   1
415       TCGA-63-A5MR                       Stage IIB   1
416       TCGA-MP-A4TA                        Stage IB   1
417       TCGA-75-5146                        Stage IA   1
418       TCGA-55-8207                        Stage IB   1
419       TCGA-75-7027                      Stage IIIA   0
420       TCGA-60-2703                       Stage IIB   1
421       TCGA-50-8459                        Stage IB ...

```

```python
# Columns consumed by the per-case estimate below: case identifier, AJCC
# pathologic stage, report text, residual disease and age at index.
cols = [
    'cases.submitter_id',
    'diagnoses.ajcc_pathologic_stage',
    'pathology_report',
    'diagnoses.residual_disease',
    'demographic.age_at_index',
]

# Restrict df to those columns and keep its first 10 rows.
first_10 = df.loc[:, cols].head(10)

# Function to parse report for grade and invasion
def parse_report(report):
    """Derive grade and invasion adjustments from a pathology report.

    The input is passed through ``str()`` and lower-cased before keyword
    matching, so a missing report (NaN) becomes ``'nan'`` and yields no
    adjustment.

    Args:
        report: Pathology report text; any object is accepted.

    Returns:
        tuple: ``(grade, invasion)``.
            ``grade`` is ``-0.08`` when the text mentions
            'poorly differentiated', 'grade 3' or the token 'g3'; otherwise
            ``0.05`` when it mentions 'well differentiated', 'grade 1' or the
            token 'g1'; otherwise ``0``.
            ``invasion`` is ``-0.10`` when the text contains
            'invasion: present', otherwise ``0``.
    """
    import re  # local import so this cell does not rely on another cell's imports

    text = str(report).lower()

    def has_token(token):
        # Whole-token match: a bare substring test would fire on unrelated
        # strings such as 'G12C', 'IgG1' or 'fig3'.
        return re.search(r'\b' + re.escape(token) + r'\b', text) is not None

    grade = 0  # default: no grade information found
    if 'poorly differentiated' in text or 'grade 3' in text or has_token('g3'):
        grade = -0.08
    elif 'well differentiated' in text or 'grade 1' in text or has_token('g1'):
        grade = 0.05

    # 'invasion: present' is a substring of both 'vascular invasion: present'
    # and 'lymphovascular invasion: present', so one check covers all three.
    invasion = -0.10 if 'invasion: present' in text else 0

    return grade, invasion

# Baseline value per AJCC pathologic stage label; labels not listed here
# (including missing values) fall back to 0.50 in the loop below.
stage_map = {
    'Stage IA': 0.91,
    'Stage IB': 0.84,
    'Stage IIA': 0.73,
    'Stage IIB': 0.66,
    'Stage IIIA': 0.46,
    'Stage IIIB': 0.28,
    'Stage IV': 0.16,
    'Stage I': 0.87,
    'Stage II': 0.70,
    'Stage III': 0.35,
}

# Walk the five input columns in lockstep, one tuple per case.
case_fields = zip(
    first_10['cases.submitter_id'],
    first_10['diagnoses.ajcc_pathologic_stage'],
    first_10['pathology_report'],
    first_10['diagnoses.residual_disease'],
    first_10['demographic.age_at_index'],
)

results = []
for case_id, stage, report_text, residual, age in case_fields:
    base = stage_map.get(stage, 0.50)
    grade_adj, inv_adj = parse_report(report_text)

    res_adj = 0
    if residual in ['R1', 'R2']:
        res_adj = -0.20

    age_adj = 0
    if age > 75:  # a missing age compares False, so no adjustment is applied
        age_adj = -0.05

    # Sum in a fixed left-to-right order, then clamp to [0.05, 0.95].
    total = base + grade_adj + inv_adj + res_adj + age_adj
    prob = max(0.05, min(0.95, total))
    results.append(f"{case_id}: {prob:.2f}")

print(*results, sep="\n")


```

```text
TCGA-44-6777: 0.48
TCGA-56-7579: 0.41
TCGA-34-7107: 0.66
TCGA-78-8648: 0.76
TCGA-39-5040: 0.84
TCGA-43-6647: 0.91
TCGA-44-3398: 0.33
TCGA-34-5240: 0.84
TCGA-38-4627: 0.84
TCGA-55-6642: 0.91


```

```python
# Keep the scoring columns for the first 20 rows of df.
first_20 = df.loc[:, cols].head(20)

# Walk the five input columns in lockstep, one tuple per case.
case_fields_20 = zip(
    first_20['cases.submitter_id'],
    first_20['diagnoses.ajcc_pathologic_stage'],
    first_20['pathology_report'],
    first_20['diagnoses.residual_disease'],
    first_20['demographic.age_at_index'],
)

results_20 = []
for case_id, stage, report_text, residual, age in case_fields_20:
    base = stage_map.get(stage, 0.50)  # 0.50 when the stage label is not in stage_map
    grade_adj, inv_adj = parse_report(report_text)

    res_adj = 0
    if residual in ['R1', 'R2']:
        res_adj = -0.20

    age_adj = 0
    if age > 75:  # a missing age compares False, so no adjustment is applied
        age_adj = -0.05

    # Sum in a fixed left-to-right order, then clamp to [0.05, 0.95].
    total = base + grade_adj + inv_adj + res_adj + age_adj
    prob = max(0.05, min(0.95, total))
    results_20.append(f"{case_id}: {prob:.2f}")

print(*results_20, sep="\n")


```

```text
TCGA-44-6777: 0.48
TCGA-56-7579: 0.41
TCGA-34-7107: 0.66
TCGA-78-8648: 0.76
TCGA-39-5040: 0.84
TCGA-43-6647: 0.91
TCGA-44-3398: 0.33
TCGA-34-5240: 0.84
TCGA-38-4627: 0.84
TCGA-55-6642: 0.91
TCGA-22-5492: 0.58
TCGA-55-7816: 0.83
TCGA-22-4607: 0.05
TCGA-77-7142: 0.91
TCGA-49-4510: 0.79
TCGA-55-7227: 0.74
TCGA-MP-A4TF: 0.78
TCGA-77-A5GA: 0.58
TCGA-85-A4CN: 0.84
TCGA-77-6844: 0.83


```

TCGA-44-6777: 0.48
TCGA-56-7579: 0.41
TCGA-34-7107: 0.66
TCGA-78-8648: 0.76
TCGA-39-5040: 0.84
TCGA-43-6647: 0.91
TCGA-44-3398: 0.33
TCGA-34-5240: 0.84
TCGA-38-4627: 0.84
TCGA-55-6642: 0.91
TCGA-22-5492: 0.58
TCGA-55-7816: 0.83
TCGA-22-4607: 0.05
TCGA-77-7142: 0.91
TCGA-49-4510: 0.79
TCGA-55-7227: 0.74
TCGA-MP-A4TF: 0.78
TCGA-77-A5GA: 0.58
TCGA-85-A4CN: 0.84
TCGA-77-6844: 0.83