### Import Libraries

In [5]:
import json
import os
import pandas as pd
import numpy as np
from pprint import pprint

### Load Metadata and Clinical JSON Files

In [7]:
# Load JSON data for metadata and clinical information
# Ensure these files are located in the same directory or provide the full path if they are located elsewhere
with open('metadata.cart.2024-10-27.json') as metadata_file:
    metadata = json.load(metadata_file)

with open('clinical.cart.2024-10-27.json') as clinical_file:
    clinical = json.load(clinical_file)

# Display a sample entry from metadata and clinical to verify the structure
pprint(metadata[0])  # Display the first entry of metadata
pprint(clinical[0])  # Display the first entry of clinical data

{'access': 'open',
 'analysis': {'analysis_id': '760850fa-8b86-49fb-be5f-8f8f5c6dbea4',
              'created_datetime': '2018-03-20T15:27:35.798083-05:00',
              'input_files': [{'access': 'controlled',
                               'created_datetime': '2018-03-20T15:27:35.798083-05:00',
                               'data_category': 'Sequencing Reads',
                               'data_format': 'BAM',
                               'data_type': 'Aligned Reads',
                               'experimental_strategy': 'miRNA-Seq',
                               'file_id': 'cc5deb2c-cad3-4bad-869c-5e37f5a4ca6b',
                               'file_name': 'TCGA-21-A5DI-01A-31H-A26V-13_mirna_gdc_realn.bam',
                               'file_size': 157942346,
                               'md5sum': '9f120e02415d1c617d8c23f68e0424fd',
                               'platform': 'Illumina',
                               'state': 'released',
                               '

### Extract file_id and case_id from Metadata

In [9]:
# Initialize an empty list to store file_id and case_id pairs
labels = []

# Extract 'file_id' and 'case_id' from each entry in metadata
for entry in metadata:
    file_id = entry['file_id']
    case_id = entry['associated_entities'][0]['case_id']  # Assumes 'associated_entities' is a list with one element
    labels.append({'file_id': file_id, 'case_id': case_id})

# Display the extracted labels to verify correctness
pprint(labels[:5])  # Display the first 5 entries of labels

[{'case_id': 'a7cd7b0f-ab76-4f3f-a2d5-5ec04fe6c4f3',
  'file_id': '4a8f4158-92ab-42a6-ac2f-d688bdd6b39b'},
 {'case_id': 'dbece124-c042-4adc-8136-90e7940ee6ad',
  'file_id': 'b688460b-5c4a-4eaa-84fc-302f29c47def'},
 {'case_id': 'dbece124-c042-4adc-8136-90e7940ee6ad',
  'file_id': '3de2e5b4-fdde-4b37-91e7-21a72ccc97f9'},
 {'case_id': '7b982d5e-3a7d-40ac-bd25-6044c62879b6',
  'file_id': 'fd4e10cd-b09d-4797-abb0-c039248c9def'},
 {'case_id': '3351b902-9b7e-4b90-bf6b-bcf74be00bc1',
  'file_id': 'a7d5bf83-d019-4379-93b2-d15be80fe95d'}]


### Match case_id with Clinical Data to Retrieve ajcc_pathologic_stag

In [10]:
# Counter for matched entries (similar to 'm' in MATLAB)
match_count = 0

# Loop through each clinical entry and find corresponding case_id in labels
for clinical_entry in clinical:
    # Check if 'diagnoses' and 'ajcc_pathologic_stage' fields are present
    if 'diagnoses' in clinical_entry and 'ajcc_pathologic_stage' in clinical_entry['diagnoses']:
        case_id = clinical_entry['case_id']
        
        # Search for matching case_id in labels
        for label in labels:
            if label['case_id'] == case_id:
                # If case_id matches, add the 'ajcc_pathologic_stage' as 'label'
                label['label'] = clinical_entry['diagnoses']['ajcc_pathologic_stage']
                match_count += 1

# Display the updated labels list with added 'label' fields to verify
pprint(labels[:5])  # Display the first 5 entries to verify stages have been added
print("Total matches found:", match_count)

[{'case_id': 'a7cd7b0f-ab76-4f3f-a2d5-5ec04fe6c4f3',
  'file_id': '4a8f4158-92ab-42a6-ac2f-d688bdd6b39b'},
 {'case_id': 'dbece124-c042-4adc-8136-90e7940ee6ad',
  'file_id': 'b688460b-5c4a-4eaa-84fc-302f29c47def'},
 {'case_id': 'dbece124-c042-4adc-8136-90e7940ee6ad',
  'file_id': '3de2e5b4-fdde-4b37-91e7-21a72ccc97f9'},
 {'case_id': '7b982d5e-3a7d-40ac-bd25-6044c62879b6',
  'file_id': 'fd4e10cd-b09d-4797-abb0-c039248c9def'},
 {'case_id': '3351b902-9b7e-4b90-bf6b-bcf74be00bc1',
  'file_id': 'a7d5bf83-d019-4379-93b2-d15be80fe95d'}]
Total matches found: 0


### Map ajcc_pathologic_stage to Numerical Labels

In [40]:
# Initialize an empty list for the numerical labels
numerical_labels = []

# Check the unique values of the label field in labels to understand its contents
unique_stages = set(entry.get('label', 'No label') for entry in labels)
print("Unique stages found in labels:", unique_stages)

# Map each clinical stage to a number and add to the numerical_labels list
for entry in labels:
    # Retrieve the stage label, strip whitespace, and convert to uppercase for uniformity
    stage = entry.get('label', '').strip().upper()
    print(f"Mapping stage: {stage}")  # Diagnostic output for each stage

    # Map stage to numerical labels
    if stage in ['STAGE I', 'STAGE IA', 'STAGE IB']:
        numerical_labels.append(1)  # Cancer Stage I
        print("Mapped to: 1")
    elif stage in ['STAGE II', 'STAGE IIA', 'STAGE IIB']:
        numerical_labels.append(2)  # Cancer Stage II
        print("Mapped to: 2")
    elif stage in ['STAGE III', 'STAGE IIIA', 'STAGE IIIB']:
        numerical_labels.append(3)  # Cancer Stage III
        print("Mapped to: 3")
    elif stage in ['STAGE IV', 'STAGE IVA', 'STAGE IVB']:
        numerical_labels.append(4)  # Cancer Stage IV
        print("Mapped to: 4")
    else:
        numerical_labels.append(0)  # Healthy patient (non-cancerous)
        print("Mapped to: 0 (healthy patient)")

# Display the first 10 entries in numerical_labels to verify the mapping
print("First 10 entries in numerical_labels:", numerical_labels[:10])

Unique stages found in labels: {'', 'Stage IIB', 'Stage III', 'Stage IV', 'Stage IIIA', 'Stage II', 'Stage I', 'Stage IA', 'Stage IIA', 'Stage IB', 'Stage IIIB', 'No label'}
Mapping stage: STAGE IA
Mapped to: 1
Mapping stage: STAGE IA
Mapped to: 1
Mapping stage: 
Mapped to: 0 (healthy patient)
Mapping stage: STAGE IA
Mapped to: 1
Mapping stage: STAGE IB
Mapped to: 1
Mapping stage: STAGE IB
Mapped to: 1
Mapping stage: STAGE IA
Mapped to: 1
Mapping stage: STAGE IIIA
Mapped to: 3
Mapping stage: STAGE IB
Mapped to: 1
Mapping stage: STAGE IA
Mapped to: 1
Mapping stage: STAGE IA
Mapped to: 1
Mapping stage: STAGE IIA
Mapped to: 2
Mapping stage: STAGE IIA
Mapped to: 2
Mapping stage: STAGE IB
Mapped to: 1
Mapping stage: STAGE IIIA
Mapped to: 3
Mapping stage: STAGE IB
Mapped to: 1
Mapping stage: STAGE IIIA
Mapped to: 3
Mapping stage: STAGE IB
Mapped to: 1
Mapping stage: STAGE IIB
Mapped to: 2
Mapping stage: STAGE IB
Mapped to: 1
Mapping stage: STAGE IA
Mapped to: 1
Mapping stage: STAGE IIIA
Mapp

### Read miRNA Quantification Files

In [41]:
# Define the directory containing the training files
TrainDir = './Train'  # Update this path if necessary
data_list = []  # List to store the extracted data
file_ids = []   # List to store file IDs

# Traverse each folder within the Train directory
for folder_name in os.listdir(TrainDir):
    folder_path = os.path.join(TrainDir, folder_name)
    
    # Check if it's a directory
    if os.path.isdir(folder_path):
        # Find .mirnas.quantific files in the current directory
        for file_name in os.listdir(folder_path):
            if file_name.endswith('.mirnas.quantific'):
                file_path = os.path.join(folder_path, file_name)
                
                # Read the file as a DataFrame
                df = pd.read_csv(file_path, sep='\t')
                
                # Check if the data has the expected 1881 rows
                if df.shape[0] == 1881:
                    # Extract the third column and convert it to a list, then append it
                    data_list.append(df.iloc[:, 2].values)  # Third column assumed to be index 2
                    file_ids.append(folder_name)  # Use the folder name as the file ID
                else:
                    print(f"Warning: File {file_path} does not have 1881 rows")


 ### Create the Feature Matrix and Append Labels

In [42]:
# Convert data_list directly into a DataFrame (no transpose needed)
data_matrix = pd.DataFrame(data_list)

# Add the numerical labels as the last column in data_matrix
data_matrix['Label'] = numerical_labels

# Display the data matrix to verify structure
data_matrix.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1872,1873,1874,1875,1876,1877,1878,1879,1880,Label
0,7314.747386,7391.483138,7334.393081,10994.201497,471.496698,318.193106,1156.241547,3272.099771,3363.611772,442.783758,...,3.022415,0.0,0.0,1.847031,0.0,40.298863,35.429417,148.602058,12118.707689,1
1,9518.042994,9460.443528,9574.874468,17578.281899,785.810318,358.652676,771.986446,3871.452122,3917.224498,487.829079,...,6.45114,0.0,128.562009,4.607957,0.0,8.60152,38.86044,111.512567,7471.802757,1
2,4479.97634,4387.407628,4447.955716,12394.31011,404.624244,855.241747,246.267705,1353.016896,1415.311564,416.8503,...,2.910966,0.0,161.267504,1.746579,0.0,33.767203,31.43843,168.253822,16026.613214,0
3,21277.962603,21166.590502,21255.800397,15161.474118,6684.570363,503.278464,2185.922959,15012.229891,14987.262342,1107.549261,...,6.452288,0.0,1.683206,10.660302,0.0,5.049617,95.101114,1416.978551,12750.562682,1
4,8002.355461,8013.396682,8033.638922,19358.942067,1276.411235,765.754731,593.005616,2630.801098,2649.43316,367.580673,...,9.201018,0.0,97.990843,3.450382,0.0,22.77252,46.235116,455.450396,14401.203493,1


### Add Column Names and Save to CSV

In [43]:
# Use the miRNA IDs from the first file as column names, with 'Label' as the final column name
miRNA_ids = list(pd.read_csv(os.path.join(TrainDir, os.listdir(TrainDir)[0], os.listdir(os.path.join(TrainDir, os.listdir(TrainDir)[0]))[0]), sep='\t').iloc[:, 0])
column_names = miRNA_ids + ['Label']

# Set the column names for the data matrix
data_matrix.columns = column_names

# Save the feature matrix to 'Lung.csv'
data_matrix.to_csv('Lung.csv', index=False)

print("Lung.csv created successfully.")


Lung.csv created successfully.
