In [None]:
import requests
import json
import os

# Create folder for downloads
os.makedirs("gbm_rnaseq", exist_ok=True)

# filters to find RNA-Seq files for TCGA-GBM
filters = {
    "op": "and",
    "content": [
        {"op": "=", "content": {"field": "cases.project.project_id", "value": "TCGA-GBM"}},
        {"op": "=", "content": {"field": "data_category", "value": "Transcriptome Profiling"}},
        {"op": "=", "content": {"field": "data_type", "value": "Gene Expression Quantification"}}
    ]
}

params = {
    "filters": json.dumps(filters),
    "fields": "file_id,file_name",
    "format": "JSON",
    "size": "10000"
}

response = requests.get("https://api.gdc.cancer.gov/files", params=params)
files = response.json()["data"]["hits"]

print(f"Found {len(files)} files")

# Download each file
for f in files:
    file_id = f["file_id"]
    file_name = f["file_name"]
    url = f"https://api.gdc.cancer.gov/data/{file_id}"

    print("Downloading:", file_name)
    r = requests.get(url, stream=True)
    with open(os.path.join("gbm_rnaseq", file_name), "wb") as out:
       for chunk in r.iter_content(chunk_size=8192):
         if chunk:
            out.write(chunk)




Found 391 files
Downloading: 1ccc2b22-7f6c-4973-a6fa-f2c4d6be6dba.rna_seq.augmented_star_gene_counts.tsv
Downloading: 44aa45ef-4bcd-47eb-a2ef-68b614d00a51.rna_seq.augmented_star_gene_counts.tsv
Downloading: 967e6f4b-8c6b-4498-87c2-330b2c174061.rna_seq.augmented_star_gene_counts.tsv
Downloading: 2426bdbe-9a09-4ea1-b5eb-1ecb2c492898.rna_seq.augmented_star_gene_counts.tsv
Downloading: d2b6a1a6-ed16-4374-8b28-912237ed87c2.rna_seq.augmented_star_gene_counts.tsv
Downloading: 520c0e78-f0a3-4e46-8c8e-00cf654e100d.rna_seq.augmented_star_gene_counts.tsv
Downloading: 229e6793-e11e-46c8-9307-1847243a311c.rna_seq.augmented_star_gene_counts.tsv
Downloading: 1f027bee-5316-4d9f-a566-41890da18493.rna_seq.augmented_star_gene_counts.tsv
Downloading: e3ca6a99-2907-4509-a416-eb7362b0b606.rna_seq.augmented_star_gene_counts.tsv
Downloading: 2c3f2491-4d3b-41da-9866-94644fc8d102.rna_seq.augmented_star_gene_counts.tsv
Downloading: 69804564-72f3-443e-a45e-add43d917b2c.rna_seq.augmented_star_gene_counts.tsv
Downl

In [7]:
import os

#print(os.listdir("gbm_rnaseq")[:10])  # show first 10 files
#print(len(os.listdir("gbm_rnaseq")))  # should be 391

print(os.getcwd())


c:\Users\joann\AppData\Local\Programs\Microsoft VS Code


In [None]:
import pandas as pd
import glob
import os


# 1. Find all the TSV files in the gbm_rnaseq folder
files = glob.glob("gbm_rnaseq/*.tsv")
print(f"Found {len(files)} files")

# 2. This list will store one DataFrame per sample
data_frames = []

# 3. Loop over each file (one file ≈ one sample)
for file in files:
    # --- A. Read the STAR gene-count file ---
    # comment="#" tells pandas: "skip lines that start with #"
    df = pd.read_csv(file, sep="\t", comment="#")
    
    # --- B. Decide which column contains the counts ---
    # STAR 'augmented_star_gene_counts' files typically have columns:
    # gene_id, gene_name, unstranded, stranded_first, stranded_second, input
    # We want the 'unstranded' counts 
    if "unstranded" in df.columns:
        count_col = "unstranded"
    else:
        # Fallback: use the third column if for some reason the name is different
        count_col = df.columns[2]
    
    # --- C. Create a sample ID based on the filename ---
    # Example filename:
    # "1ccc2b22-7f6c-4973-a6fa-f2c4d6be6dba.rna_seq.augmented_star_gene_counts.tsv"
    # We take the part before the first dot (the UUID):
    sample_id = os.path.basename(file).split(".")[0]
    
    # --- D. Keep only gene_id and this sample's counts ---
    df = df[["gene_id", count_col]]
    
    # --- E. Remove non-gene summary rows ---
    # STAR adds rows like "__no_feature", "__ambiguous", etc.
    # These are not real genes and must be removed.
    df = df[~df["gene_id"].str.startswith("__")]
    
    # --- F. Set gene_id as the index (rows labeled by gene) ---
    df = df.set_index("gene_id")
    
    # --- G. Rename the count column to the sample_id ---
    # So instead of column name 'unstranded', it becomes that sample's ID
    df = df.rename(columns={count_col: sample_id})
    
    # --- H. Store this per-sample DataFrame for later concatenation ---
    data_frames.append(df)

# 4. Concatenate all sample DataFrames side-by-side by gene_id
# Because all dfs have the same index (gene_id), concat aligns by genes.
merged_df = pd.concat(data_frames, axis=1)

# 5. Print shape: (number_of_genes, number_of_samples)
print("Final expression matrix shape:", merged_df.shape)

# 6. Save to CSV
output_path = "gbm_rnaseq/merged_gbm_rnaseq.csv"
merged_df.to_csv(output_path)
print("Saved merged matrix to:", output_path)



Found 391 files
Final expression matrix shape: (60664, 391)
Saved merged matrix to: gbm_rnaseq/merged_gbm_rnaseq.csv


In [2]:
import requests
import json
import pandas as pd
# Step 1: Request all clinical records for TCGA-GBM
endpoint = "https://api.gdc.cancer.gov/cases"

filters = {
    "op": "in",
    "content": {
        "field": "project.project_id",
        "value": ["TCGA-GBM"]
    }
}

params = {
    "filters": json.dumps(filters),
    "fields": ",".join([
        "case_id",
        "submitter_id",
        "demographic.gender",
        "demographic.race",
        "demographic.ethnicity",
        "diagnoses.age_at_diagnosis",
        "diagnoses.days_to_death",
        "diagnoses.days_to_last_follow_up",
        "diagnoses.vital_status",
        "diagnoses.tumor_grade",
        "diagnoses.morphology",
        "diagnoses.primary_diagnosis"
    ]),
    "format": "JSON",
    "size": "10000"
}

response = requests.get(endpoint, params=params)
data = response.json()["data"]["hits"]

clinical_df = pd.json_normalize(data)
clinical_df.to_csv("clinical_gbm.csv", index=False)

print("Saved clinical metadata with shape:", clinical_df.shape)
clinical_df.head()


Saved clinical metadata with shape: (617, 7)


Unnamed: 0,id,case_id,submitter_id,diagnoses,demographic.race,demographic.gender,demographic.ethnicity
0,025a7401-a65d-4ea0-8b4e-0ba775b0322a,025a7401-a65d-4ea0-8b4e-0ba775b0322a,TCGA-12-0819,"[{'age_at_diagnosis': 18588, 'primary_diagnosi...",black or african american,female,not hispanic or latino
1,e3711a9b-6d4c-44df-bbab-0a675046a5df,e3711a9b-6d4c-44df-bbab-0a675046a5df,TCGA-06-0208,"[{'age_at_diagnosis': 19257, 'primary_diagnosi...",white,female,not hispanic or latino
2,cc1459be-de8f-482e-9efe-65937db9dc45,cc1459be-de8f-482e-9efe-65937db9dc45,TCGA-12-1601,"[{'days_to_last_follow_up': None, 'age_at_diag...",not reported,not reported,not reported
3,d75996d6-9f02-4478-a4a1-dfa7ab41de77,d75996d6-9f02-4478-a4a1-dfa7ab41de77,TCGA-06-0131,"[{'days_to_last_follow_up': None, 'age_at_diag...",not reported,not reported,not reported
4,883dc176-925a-44f7-9ec8-e0cce33c2a54,883dc176-925a-44f7-9ec8-e0cce33c2a54,TCGA-02-0099,"[{'days_to_last_follow_up': 106.0, 'age_at_dia...",white,male,not hispanic or latino


In [9]:
#visualize rna-seq file 
import pandas as pd
import glob
import os
os.chdir(r"C:/Users/joann/AppData/Local/Programs/Microsoft VS Code")


files = glob.glob("gbm_rnaseq/*.tsv")
sample_file = files[0]   # take the first file
print(sample_file)

df = pd.read_csv(sample_file, sep="\t", comment="#")
df.head(10)


gbm_rnaseq\006b091b-3a7a-4a26-9eb6-d7797874ad9b.rna_seq.augmented_star_gene_counts.tsv


Unnamed: 0,gene_id,gene_name,gene_type,unstranded,stranded_first,stranded_second,tpm_unstranded,fpkm_unstranded,fpkm_uq_unstranded
0,N_unmapped,,,4044353,4044353,4044353,,,
1,N_multimapping,,,14331323,14331323,14331323,,,
2,N_noFeature,,,26589531,93361615,27503197,,,
3,N_ambiguous,,,13803893,113272,10907055,,,
4,ENSG00000000003.15,TSPAN6,protein_coding,4281,1,4282,43.1334,18.2247,17.032
5,ENSG00000000005.6,TNMD,protein_coding,21,12,9,0.6502,0.2747,0.2568
6,ENSG00000000419.13,DPM1,protein_coding,923,15,908,34.9491,14.7666,13.8003
7,ENSG00000000457.14,SCYL3,protein_coding,1056,324,1157,7.0118,2.9626,2.7687
8,ENSG00000000460.17,C1orf112,protein_coding,577,261,780,4.4172,1.8663,1.7442
9,ENSG00000000938.13,FGR,protein_coding,458,3,455,6.1892,2.615,2.4439


In [11]:
#visualize expression matrix

expr = pd.read_csv("gbm_rnaseq/merged_gbm_rnaseq.csv", index_col=0)
expr.shape
expr.head()
expr.iloc[:10, :10]   # first 5 genes × first 5 samples


Unnamed: 0_level_0,006b091b-3a7a-4a26-9eb6-d7797874ad9b,0257f378-cf6a-42f2-9cb1-0d10b4ca1b9a,026ed595-4218-4379-8e2d-54b390cb2544,034c42f2-fee6-4196-9c02-24589ce01071,03ddea6f-d377-4e2a-a59d-91d5c5b6e4ed,045a8628-9c12-4816-ae31-bad5ba0d784f,04b6d829-b031-4d9b-99cd-7f9007227edd,051dc36f-47e2-4d77-b9b9-c5f2d315f5f3,05d2774d-cb3e-4485-9825-c04d4ca44bb6,0601cab0-163f-48e4-bc9f-87b068d8b591
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
N_unmapped,4044353,5339128,6380180,4124177,6961328,6493356,6575531,7431873,6489128,6831224
N_multimapping,14331323,16653665,19153916,15701391,4419194,11385005,35093405,4539648,26634100,14771366
N_noFeature,26589531,15661372,15602661,23206565,1891287,34334638,9511456,1450084,15495803,42653744
N_ambiguous,13803893,15990187,11210790,12030266,6484715,11647660,18753690,4828004,20607621,12461464
ENSG00000000003.15,4281,2030,4150,1380,361,1615,633,5935,3159,3340
ENSG00000000005.6,21,20,2,9,26,25386,8,10,9,13
ENSG00000000419.13,923,1020,1107,994,1893,1041,468,1447,742,1119
ENSG00000000457.14,1056,1431,1006,677,647,1002,515,478,821,1246
ENSG00000000460.17,577,1111,673,415,352,513,324,213,436,1056
ENSG00000000938.13,458,530,532,571,314,1041,692,1085,228,490


In [13]:
#visualize clinical metadata 
clinical = pd.read_csv("C:/Users/joann/Desktop/M2/Deep_Learning/clinical_gbm.csv")
clinical.shape
clinical.head()


Unnamed: 0,id,case_id,submitter_id,diagnoses,demographic.race,demographic.gender,demographic.ethnicity
0,025a7401-a65d-4ea0-8b4e-0ba775b0322a,025a7401-a65d-4ea0-8b4e-0ba775b0322a,TCGA-12-0819,"[{'age_at_diagnosis': 18588, 'primary_diagnosi...",black or african american,female,not hispanic or latino
1,e3711a9b-6d4c-44df-bbab-0a675046a5df,e3711a9b-6d4c-44df-bbab-0a675046a5df,TCGA-06-0208,"[{'age_at_diagnosis': 19257, 'primary_diagnosi...",white,female,not hispanic or latino
2,cc1459be-de8f-482e-9efe-65937db9dc45,cc1459be-de8f-482e-9efe-65937db9dc45,TCGA-12-1601,"[{'days_to_last_follow_up': None, 'age_at_diag...",not reported,not reported,not reported
3,d75996d6-9f02-4478-a4a1-dfa7ab41de77,d75996d6-9f02-4478-a4a1-dfa7ab41de77,TCGA-06-0131,"[{'days_to_last_follow_up': None, 'age_at_diag...",not reported,not reported,not reported
4,883dc176-925a-44f7-9ec8-e0cce33c2a54,883dc176-925a-44f7-9ec8-e0cce33c2a54,TCGA-02-0099,"[{'days_to_last_follow_up': 106.0, 'age_at_dia...",white,male,not hispanic or latino
