### Extract diagnosis from JSON

In [2]:
import json
import pandas as pd

# Path to clinical JSON file
clinical_json_path = "../raw_data/clinical.cart.2024-10-27.json"
# Destination CSV; written at the end of this cell for merging later
diagnosis_csv_path = "../processed_data/diagnosis_data.csv"

# Fixed column order so the CSV always has a header, even with zero rows
DIAGNOSIS_COLUMNS = ["case_id", "primary_diagnosis", "stage", "diagnosis"]


def extract_diagnosis_records(clinical_data):
    """Build one flat diagnosis record per case from the clinical JSON.

    Args:
        clinical_data: Iterable of case dicts. Each may carry a "case_id"
            and a "diagnoses" list of dicts holding "primary_diagnosis"
            and "ajcc_pathologic_stage".

    Returns:
        A list of dicts with the keys listed in DIAGNOSIS_COLUMNS. Cases
        whose "diagnoses" entry is missing, null, or empty are skipped.
        Diagnosis fields that are missing, null, or empty are reported
        as "Unknown".
    """
    records = []
    for record in clinical_data:
        # `or []` also covers an explicit JSON null, which would otherwise
        # crash on len(None)
        diagnoses = record.get("diagnoses") or []
        if not diagnoses:
            continue

        # Only the first entry is used -- assumes one diagnosis per patient
        diagnosis_info = diagnoses[0]

        # `.get(key, default)` only applies the default when the key is
        # absent; a key present with a null value would yield None and
        # render as "None (Stage: None)". `or` handles both cases.
        primary_diagnosis = diagnosis_info.get("primary_diagnosis") or "Unknown"
        stage = diagnosis_info.get("ajcc_pathologic_stage") or "Unknown"

        records.append({
            "case_id": record.get("case_id"),  # Unique patient ID
            "primary_diagnosis": primary_diagnosis,
            "stage": stage,
            # Combined, human-readable diagnosis field
            "diagnosis": f"{primary_diagnosis} (Stage: {stage})",
        })
    return records


# True when executed as a notebook cell or as a script, so the cell behaves
# as before; the guard only keeps the file I/O from running on import.
if __name__ == "__main__":
    # Load JSON file (explicit encoding: do not depend on the OS default)
    with open(clinical_json_path, "r", encoding="utf-8") as f:
        clinical_data = json.load(f)

    # Extract relevant diagnosis information
    diagnosis_records = extract_diagnosis_records(clinical_data)

    # Convert to DataFrame
    diagnosis_df = pd.DataFrame(diagnosis_records, columns=DIAGNOSIS_COLUMNS)

    # Display extracted diagnosis information
    print("\n Extracted Diagnosis Data:")
    print(diagnosis_df.head())

    # Save as CSV for merging later
    diagnosis_df.to_csv(diagnosis_csv_path, index=False)
    print(f"\n Diagnosis data saved as '{diagnosis_csv_path}'")



 Extracted Diagnosis Data:
                                case_id             primary_diagnosis  \
0  005669e5-1a31-45fb-ae97-9d450e74e7cb  Squamous cell carcinoma, NOS   
1  0075437e-ba1a-46be-86d6-9773209a2b5e           Adenocarcinoma, NOS   
2  009be09b-f9f6-43b7-8f45-4a648f8123ce           Adenocarcinoma, NOS   
3  00fd9306-4a68-49ab-a768-e5fed126a765  Squamous cell carcinoma, NOS   
4  01417822-b608-4934-8fe0-594315212be5  Squamous cell carcinoma, NOS   

       stage                                        diagnosis  
0  Stage IIA  Squamous cell carcinoma, NOS (Stage: Stage IIA)  
1  Stage IIB           Adenocarcinoma, NOS (Stage: Stage IIB)  
2   Stage IB            Adenocarcinoma, NOS (Stage: Stage IB)  
3  Stage IIB  Squamous cell carcinoma, NOS (Stage: Stage IIB)  
4  Stage IIA  Squamous cell carcinoma, NOS (Stage: Stage IIA)  

 Diagnosis data saved as '../processed_data/diagnosis_data.csv'
