In [1]:
import pandas as pd
import re

GTEx v10 data (gene-level TPM matrix, sample attributes, and subject phenotypes) was downloaded from the [GTEx Portal](https://gtexportal.org/home/downloads/adult-gtex/) on 12/10/2025.

In [21]:
# Single place to change if the project resources are moved; every input path
# below is built from it, so the three files always point at the same directory.
RESOURCES_DIR = '/private10/Projects/Nave_Oded_Project/resources'

# Gene-level TPM matrix (gzipped GCT file).
expression_data_path = f'{RESOURCES_DIR}/GTEx_Analysis_v10_RNASeQCv2.4.2_gene_tpm.gct.gz'
# Per-sample annotation table.
sample_attributes_path = f'{RESOURCES_DIR}/GTEx_Analysis_v10_Annotations_SampleAttributesDS.txt'
# Per-subject phenotype table.
subject_attributes_path = f'{RESOURCES_DIR}/GTEx_Analysis_v10_Annotations_SubjectPhenotypesDS.txt'

In [3]:
# Load the tab-separated gene TPM matrix into memory in one pass.
# The first two lines of the file are skipped so parsing starts at the header row
# (presumably the GCT version and dimension lines -- TODO confirm against the file).
# comment=None keeps '#' from being treated as a comment marker inside the data.
expression_data_df = pd.read_csv(
    expression_data_path,
    sep='\t',
    skiprows=2,
    comment=None,
    low_memory=False,
)

In [4]:
# Keep only the part of each gene ID that precedes the first '.', i.e. drop the
# version suffix from the ENSG IDs.
# NOTE(review): any IDs that differ only after the first '.' collapse to the same
# value here, which would leave duplicate labels in the index -- TODO confirm.
unversioned_gene_ids = expression_data_df['Name'].str.split('.').str.get(0)
expression_data_df['Name'] = unversioned_gene_ids

# Move the stripped gene ID from a regular column into the row index (in place).
expression_data_df.set_index("Name", inplace=True)

In [31]:
# Load the tab-separated sample annotation table in a single pass.
sample_attributes_df = pd.read_csv(
    sample_attributes_path,
    low_memory=False,
    sep='\t',
)

In [33]:
# Index the annotation table by sample ID (modifies the frame in place).
sample_attributes_df.set_index("SAMPID", inplace=True)

# Flag rows whose SMTSD value contains "brain", ignoring case; rows with a
# missing SMTSD are treated as non-matches.
is_brain_tissue = sample_attributes_df["SMTSD"].str.contains(r"brain", flags=re.IGNORECASE, na=False)
brain_samples = sample_attributes_df.loc[is_brain_tissue]

# Report how many samples were kept and which SMTSD values they cover.
print(brain_samples.shape)
print(brain_samples["SMTSD"].unique())


(8035, 118)
['Brain - Frontal Cortex (BA9)' 'Brain - Cerebellar Hemisphere'
 'Brain - Substantia nigra' 'Brain - Anterior cingulate cortex (BA24)'
 'Brain - Amygdala' 'Brain - Caudate (basal ganglia)'
 'Brain - Nucleus accumbens (basal ganglia)'
 'Brain - Putamen (basal ganglia)' 'Brain - Cortex' 'Brain - Hypothalamus'
 'Brain - Cerebellum' 'Brain - Spinal cord (cervical c-1)'
 'Brain - Hippocampus']


In [35]:
# Preview the SMTS and SMTSD columns for the first few brain samples.
brain_samples.loc[:, ['SMTS', 'SMTSD']].head()

Unnamed: 0_level_0,SMTS,SMTSD
SAMPID,Unnamed: 1_level_1,Unnamed: 2_level_1
GTEX-1117F-0011-R10a-SM-AHZ7F,Brain,Brain - Frontal Cortex (BA9)
GTEX-1117F-0011-R10b-SM-CYKQ8,Brain,Brain - Frontal Cortex (BA9)
GTEX-1117F-0011-R10b-SM-GI4VE,Brain,Brain - Frontal Cortex (BA9)
GTEX-1117F-0011-R10b-SM-LLLJO,Brain,Brain - Frontal Cortex (BA9)
GTEX-1117F-0011-R11b-SM-GIN8R,Brain,Brain - Cerebellar Hemisphere


In [40]:
# 1. Get the sample IDs that correspond to brain tissues.
brain_sample_ids = brain_samples.index.tolist()

# 2. Select only those columns (samples) from the expression data.
#    The expression dataframe has genes as rows and sample IDs as columns.
#    Membership is tested against a set rather than the list: checking every
#    expression column against a list scans the list each time, which is
#    quadratic in practice; a set lookup is constant time. Column order is the
#    order of expression_data_df.columns, exactly as before.
brain_sample_id_set = set(brain_sample_ids)
brain_sample_columns = [
    column for column in expression_data_df.columns if column in brain_sample_id_set
]
# "Description" is kept as the first column; .copy() detaches the result from
# expression_data_df so later column assignments do not touch the full matrix.
brain_expression_df = expression_data_df[["Description"] + brain_sample_columns].copy()

# 3. Check what you got.
print("Brain expression shape:", brain_expression_df.shape)
print("Example columns:", brain_expression_df.columns[:5])


Brain expression shape: (59033, 3235)
Example columns: Index(['Description', 'GTEX-1117F-0011-R10b-SM-GI4VE',
       'GTEX-1117F-0011-R11b-SM-GIN8R', 'GTEX-1117F-0011-R2b-SM-GI4VL',
       'GTEX-1117F-0011-R3a-SM-GJ3PJ'],
      dtype='object')


In [41]:
# Persist the brain-only expression table as CSV; the row index is written as
# the first column of the file.
brain_expression_path = "/private10/Projects/Nave_Oded_Project/resources/GTEx_v10_brain_samples_only.csv"
brain_expression_df.to_csv(path_or_buf=brain_expression_path)


---

In [4]:
# Reload the brain-only expression table from the CSV written earlier, so the
# rest of the notebook can run without re-reading the full GTEx matrix.
brain_expression_path = "/private10/Projects/Nave_Oded_Project/resources/GTEx_v10_brain_samples_only.csv"

# The table was saved with the gene ID ("Name") as its row index, which to_csv
# writes as the first column. Restore it as the index on load: without
# index_col, "Name" comes back as an ordinary string column, the row-wise
# mean/median below would include it, and the later merge on the index
# (right_index=True) would no longer be keyed by gene ID.
brain_expression_df = pd.read_csv(brain_expression_path, index_col="Name")

In [43]:
# Show the top-left corner of the table: first 5 rows, first 5 columns.
print(brain_expression_df.iloc[:5, :5])

                 Description  GTEX-1117F-0011-R10b-SM-GI4VE  \
Name                                                          
ENSG00000223972      DDX11L1                       0.000000   
ENSG00000227232       WASH7P                       3.579280   
ENSG00000278267    MIR6859-1                       0.000000   
ENSG00000243485  MIR1302-2HG                       0.093825   
ENSG00000237613      FAM138A                       0.000000   

                 GTEX-1117F-0011-R11b-SM-GIN8R  GTEX-1117F-0011-R2b-SM-GI4VL  \
Name                                                                           
ENSG00000223972                       0.000000                       0.00000   
ENSG00000227232                      10.189300                       2.96650   
ENSG00000278267                       0.000000                       0.00000   
ENSG00000243485                       0.034191                       0.00000   
ENSG00000237613                       0.000000                       0.01766  

In [44]:
# Rich display of the first five rows of the brain expression table.
brain_expression_df.head(5)

Unnamed: 0_level_0,Description,GTEX-1117F-0011-R10b-SM-GI4VE,GTEX-1117F-0011-R11b-SM-GIN8R,GTEX-1117F-0011-R2b-SM-GI4VL,GTEX-1117F-0011-R3a-SM-GJ3PJ,GTEX-1117F-0011-R4b-SM-GI4VM,GTEX-1117F-0011-R5a-SM-GI4VW,GTEX-1117F-0011-R6a-SM-GI4VX,GTEX-1117F-0011-R7a-SM-H65ZK,GTEX-1117F-3226-SM-5N9CT,...,GTEX-ZZPT-0011-R2a-SM-GNTB1,GTEX-ZZPT-0011-R3a-SM-GOQYT,GTEX-ZZPT-0011-R4a-SM-GPRX4,GTEX-ZZPT-0011-R5a-SM-GPRX5,GTEX-ZZPT-0011-R6a-SM-GPRX6,GTEX-ZZPT-0011-R7a-SM-H7OH8,GTEX-ZZPT-0011-R8a-SM-GQ1CW,GTEX-ZZPT-0011-R9a-SM-GOQYS,GTEX-ZZPT-2926-SM-5EQ5S,GTEX-ZZPT-3026-SM-5GZXH
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000223972,DDX11L1,0.0,0.0,0.0,0.0,0.012397,0.0,0.029395,0.0,0.018039,...,0.034923,0.0,0.0,0.025464,0.028247,0.0,0.0,0.043188,0.031779,0.01909
ENSG00000227232,WASH7P,3.57928,10.1893,2.9665,3.6828,3.59921,3.51443,4.28638,2.60558,6.62309,...,2.31891,1.03295,2.82166,1.62451,2.29855,2.86997,4.30291,3.37371,3.641,2.08779
ENSG00000278267,MIR6859-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000243485,MIR1302-2HG,0.093825,0.034191,0.0,0.0,0.02475,0.0,0.0,0.026287,0.0,...,0.0,0.03168,0.0,0.02542,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000237613,FAM138A,0.0,0.0,0.01766,0.0,0.0,0.0,0.0,0.0,0.0,...,0.024767,0.0,0.0,0.0,0.020033,0.0,0.0,0.0,0.0,0.0


🧬 Compute overall (all brain samples) mean and median expression per gene

In [45]:
# Names of the summary columns this cell adds to brain_expression_df.
overall_stat_columns = ["avg_tpm_brain_overall", "median_tpm_brain_overall"]

# Drop the Description column to focus only on numeric expression values.
# The summary columns are dropped as well when they already exist, so re-running
# this cell does not feed a previous run's mean/median back into the statistics.
gene_expression_only = (
    brain_expression_df
    .drop(columns=["Description"])
    .drop(columns=overall_stat_columns, errors="ignore")
)

# Compute across all brain sample columns (one value per gene / row).
brain_expression_df["avg_tpm_brain_overall"] = gene_expression_only.mean(axis=1)
brain_expression_df["median_tpm_brain_overall"] = gene_expression_only.median(axis=1)

In [46]:
# Quick look at the two overall summary columns for the first few genes.
overall_summary = brain_expression_df.loc[:, ["avg_tpm_brain_overall", "median_tpm_brain_overall"]]
print(overall_summary.head())

                 avg_tpm_brain_overall  median_tpm_brain_overall
Name                                                            
ENSG00000223972               0.009852                  0.000000
ENSG00000227232               2.816716                  2.126625
ENSG00000278267               0.005011                  0.000000
ENSG00000243485               0.031329                  0.000000
ENSG00000237613               0.009133                  0.000000


Calculate per-region mean and median expression per gene

In [52]:
# Map sample IDs to tissue names
sample_to_tissue = brain_samples["SMTSD"].to_dict()

# Create empty dataframes to hold per-tissue medians/means
region_medians = pd.DataFrame(index=brain_expression_df.index)
region_means = pd.DataFrame(index=brain_expression_df.index)

# Loop through brain tissues
for tissue in sorted(brain_samples["SMTSD"].unique()):
    tissue_samples = [s for s in sample_to_tissue.keys() if sample_to_tissue[s] == tissue and s in brain_expression_df.columns]
    if not tissue_samples:
        continue
    region_medians[f"{tissue}_median_tpm"] = brain_expression_df[tissue_samples].median(axis=1)
    region_means[f"{tissue}_avg_tpm"] = brain_expression_df[tissue_samples].mean(axis=1)

# Combine all results
final_df = pd.concat(
    [brain_expression_df[["Description", "avg_tpm_brain_overall", "median_tpm_brain_overall"]],
     region_medians, region_means],
    axis=1
)
final_df.rename(columns={"Description": "gene_symbol"}, inplace=True)

# Save to file
final_df.to_csv("/private10/Projects/Nave_Oded_Project/resources/GTEx_v10_brain_summary_with_tissue_stats.csv")


In [53]:
# Preview genes whose overall brain median TPM is strictly positive.
has_positive_median = final_df['median_tpm_brain_overall'].gt(0)
final_df.loc[has_positive_median].head()

Unnamed: 0_level_0,gene_symbol,avg_tpm_brain_overall,median_tpm_brain_overall,Brain - Amygdala_median_tpm,Brain - Anterior cingulate cortex (BA24)_median_tpm,Brain - Caudate (basal ganglia)_median_tpm,Brain - Cerebellar Hemisphere_median_tpm,Brain - Cerebellum_median_tpm,Brain - Cortex_median_tpm,Brain - Frontal Cortex (BA9)_median_tpm,...,Brain - Cerebellar Hemisphere_avg_tpm,Brain - Cerebellum_avg_tpm,Brain - Cortex_avg_tpm,Brain - Frontal Cortex (BA9)_avg_tpm,Brain - Hippocampus_avg_tpm,Brain - Hypothalamus_avg_tpm,Brain - Nucleus accumbens (basal ganglia)_avg_tpm,Brain - Putamen (basal ganglia)_avg_tpm,Brain - Spinal cord (cervical c-1)_avg_tpm,Brain - Substantia nigra_avg_tpm
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000227232,WASH7P,2.816716,2.126625,1.45902,1.72721,1.603355,4.89435,5.829275,2.462825,2.23392,...,5.882047,6.464196,2.817945,2.553002,2.074703,2.092068,1.771118,1.730482,3.038936,2.119569
ENSG00000268020,OR4G4P,0.063524,0.041579,0.044141,0.043285,0.044695,0.024959,0.03757,0.043004,0.039431,...,0.050194,0.055158,0.075366,0.060246,0.089534,0.082415,0.066923,0.063284,0.043207,0.057017
ENSG00000240361,OR4G11P,0.073866,0.050731,0.046082,0.051531,0.052891,0.038577,0.043402,0.058734,0.058738,...,0.054099,0.066238,0.094192,0.07469,0.107545,0.092589,0.075897,0.069169,0.051481,0.058933
ENSG00000186092,OR4F5,0.105407,0.061758,0.065919,0.059381,0.062508,0.05165,0.050438,0.074833,0.058579,...,0.081856,0.098967,0.125918,0.095711,0.145249,0.121289,0.104596,0.113074,0.074847,0.092685
ENSG00000238009,ENSG00000238009,0.054287,0.033257,0.029489,0.030933,0.026621,0.043516,0.055766,0.045757,0.039638,...,0.063887,0.079234,0.065417,0.059346,0.049212,0.062677,0.051246,0.039018,0.048691,0.051661


In [54]:
# Variants table that the GTEx brain summary is joined onto.
variants_table_path = "/private10/Projects/Nave_Oded_Project/extended_variants_table/Results/processed_varicarta_table_2nd_run_with_sfari_with_neighbors.csv"
variants_df = pd.read_csv(
    variants_table_path,
    low_memory=False,
)

In [57]:
# Left-join the brain expression summary onto the variants table: the variants'
# 'Gene' column is matched against final_df's index ('Name'). Every variant row
# is kept; variants with no matching gene get NaN in the expression columns.
# NOTE(review): presumably 'Gene' holds unversioned Ensembl gene IDs -- confirm.
# If final_df's index contains duplicate gene IDs, the matching variant rows are
# repeated once per duplicate -- TODO verify the index is unique.
merged_df = pd.merge(
    variants_df,
    final_df,
    how='left',
    left_on='Gene',
    right_index=True,
)

In [58]:
# Rich display of the first five merged rows.
merged_df.head(5)

Unnamed: 0,chr,pos,ref,alt,AF,AFR_AF,AMR_AF,Allele,Amino_acids,BIOTYPE,...,Brain - Cerebellar Hemisphere_avg_tpm,Brain - Cerebellum_avg_tpm,Brain - Cortex_avg_tpm,Brain - Frontal Cortex (BA9)_avg_tpm,Brain - Hippocampus_avg_tpm,Brain - Hypothalamus_avg_tpm,Brain - Nucleus accumbens (basal ganglia)_avg_tpm,Brain - Putamen (basal ganglia)_avg_tpm,Brain - Spinal cord (cervical c-1)_avg_tpm,Brain - Substantia nigra_avg_tpm
0,1,169824927,A,G,,,,G,-,protein_coding,...,6.20185,5.881541,2.460176,2.748795,1.887815,2.295096,1.976621,1.590595,2.834376,1.976708
1,5,133635407,T,C,,,,C,-,protein_coding,...,3.510633,3.11512,2.46017,2.978207,2.296509,2.952625,2.43347,1.907634,3.419125,2.635538
2,10,55583161,T,G,,,,G,-,protein_coding,...,0.554923,0.506453,1.261631,1.232025,2.024333,2.551597,1.287628,0.841595,1.166898,1.964081
3,15,28953439,G,A,,,,A,-,protein_coding,...,0.327159,0.466331,0.885908,1.281368,0.442546,0.536607,0.728726,0.436245,0.734239,0.4795
4,3,115350078,T,TTGTTT,,,,TGTTT,-,protein_coding,...,309.750739,246.401112,418.888019,561.639055,144.903615,353.694479,222.989646,74.781943,41.129144,163.266962


In [59]:
# Write the variants table with the GTEx brain columns appended; the row index
# is not written to the file.
merged_output_path = "/private10/Projects/Nave_Oded_Project/extended_variants_table/Results/processed_varicarta_table_2nd_run_with_sfari_with_neighbors_with_GTEx.csv"
merged_df.to_csv(merged_output_path, index=False)