In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np

df = pd.read_csv('../../data/GSE218462_raw_counts_GRCh38.p13_NCBI.tsv', sep='\t')
df = df.T
df.columns = df.iloc[0]
df = df[1:]

scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)
scaled_df = pd.DataFrame(scaled_data)

scaled_df.columns = df.columns
scaled_df.index = df.index
unedited = ['GSM6745632', 'GSM6745633', 'GSM6745634', 'GSM6745635', 'GSM6745636', 'GSM6745637']
scaled_df['Edited (1) or Unedited (0)'] = scaled_df.index.map(lambda gene: 0 if gene in unedited else 1)
mechanisms = {
    "BE4": ["GSM6745599", "GSM6745600", "GSM6745601", "GSM6745611", "GSM6745612", "GSM6745613"],
    "ABE8": ["GSM6745602", "GSM6745603", "GSM6745604", "GSM6745614", "GSM6745615", "GSM6745616"],
    "Cas9": ["GSM6745605", "GSM6745606", "GSM6745607", "GSM6745617", "GSM6745618", "GSM6745619"],
    "Utelectro": ["GSM6745608", "GSM6745609", "GSM6745610", "GSM6745620", "GSM6745621", "GSM6745622"],
    "dCas9": ["GSM6745623", "GSM6745624", "GSM6745625"],
    "BE4alone": ["GSM6745626", "GSM6745627", "GSM6745628"],
    "ABE8alone": ["GSM6745629", "GSM6745630", "GSM6745631"],
    "UT": ["GSM6745632", "GSM6745633", "GSM6745634", "GSM6745635", "GSM6745636", "GSM6745637"]
}

# Inverting the dictionary to map gene code to its corresponding key
mechanism_map = {gene: mechanism for mechanism, genes in mechanisms.items() for gene in genes}

scaled_df['editing mechanism'] = scaled_df.index.map(mechanism_map)

# print(scaled_df)

txt_file_path = '../EDA_sj/relevant_genes_1.6_250.txt'
with open(txt_file_path, 'r') as file:
    txt_data = file.read()

soham_gene_ids = txt_data.split(',')

column_names = scaled_df.columns.tolist()

column_names = [str(col) for col in scaled_df.columns.tolist()]
filtered_column_names = [col for col in scaled_df.columns if str(col) in soham_gene_ids]
scaled_df = scaled_df[filtered_column_names + ['Edited (1) or Unedited (0)', 'editing mechanism']]

# Output the filtered DataFrame
print(scaled_df)

GeneID      113219467     57801      9636  100288175  102465434  109623456  \
GSM6745599  -0.158632  1.070933  1.152150   0.499194  -0.342997  -0.697982   
GSM6745600   1.379276  0.767235  0.761592  -0.765431  -0.342997  -0.697982   
GSM6745601   5.355088  3.457126  2.062135   1.706335  -0.342997  -0.697982   
GSM6745602   2.002351  0.897391  0.336385  -0.075635  -0.342997   0.697982   
GSM6745603   0.679553  1.070933  0.673147   0.614160  -0.342997   0.697982   
GSM6745604  -0.213027  1.548171  0.828458  -0.018153  -0.342997  -0.697982   
GSM6745605   0.009500  0.810621 -0.304008   1.016540  -0.342997  -0.697982   
GSM6745606  -0.393521 -0.274013 -0.724960  -0.650465  -0.342997  -0.697982   
GSM6745607  -0.274840  0.289997 -0.605817   0.384228  -0.342997  -0.697982   
GSM6745609  -0.334181  1.070933 -0.549893   1.591370  -0.342997  -0.697982   
GSM6745610  -0.339126  2.329107 -0.600042   0.211779  -0.342997   0.697982   
GSM6745611  -0.299565 -0.577710  0.256450  -0.305567  -0.342997 

In [3]:
tsv_file_path = '../../data/Human.GRCh38.p13.annot.tsv'
tsv_df = pd.read_csv(tsv_file_path, sep='\t')

tsv_df

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,GeneID,Symbol,Description,Synonyms,GeneType,EnsemblGeneID,Status,ChrAcc,ChrStart,ChrStop,Orientation,Length,GOFunctionID,GOProcessID,GOComponentID,GOFunction,GOProcess,GOComponent
0,100287102,DDX11L1,DEAD/H-box helicase 11 like 1 (pseudogene),,pseudo,ENSG00000290825,active,NC_000001.11,11874,14409,positive,1652,,,,,,
1,653635,WASH7P,"WASP family homolog 7, pseudogene",FAM39F|WASH5P,pseudo,,active,NC_000001.11,14362,29370,negative,1769,,,,,,
2,102466751,MIR6859-1,microRNA 6859-1,hsa-mir-6859-1,ncRNA,ENSG00000278267,active,NC_000001.11,17369,17436,negative,68,,,,,,
3,107985730,MIR1302-2HG,MIR1302-2 host gene,,ncRNA,,active,NC_000001.11,29926,31295,positive,538,,,,,,
4,100302278,MIR1302-2,microRNA 1302-2,MIRN1302-2|hsa-mir-1302-2,ncRNA,ENSG00000284332,active,NC_000001.11,30366,30503,positive,138,,GO:0035195,,,miRNA-mediated gene silencing,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39371,4541,ND6,NADH dehydrogenase subunit 6,MTND6,protein-coding,,active,NC_012920.1,14149,14673,negative,525,GO:0008137,GO:0006120///GO:0009060///GO:0032981///GO:0035...,GO:0005739///GO:0005743///GO:0005747,NADH dehydrogenase (ubiquinone) activity,"mitochondrial electron transport, NADH to ubiq...",mitochondrion///mitochondrial inner membrane//...
39372,4556,TRNE,tRNA-Glu,MTTE,tRNA,,active,NC_012920.1,14674,14742,negative,69,,,,,,
39373,4519,CYTB,cytochrome b,MTCYB,protein-coding,,active,NC_012920.1,14747,15887,positive,1141,GO:0008121///GO:0046872,GO:0006122///GO:0045333///GO:1902600,GO:0005739///GO:0005743///GO:0005750///GO:0016020,ubiquinol-cytochrome-c reductase activity///me...,"mitochondrial electron transport, ubiquinol to...",mitochondrion///mitochondrial inner membrane//...
39374,4576,TRNT,tRNA-Thr,MTTT,tRNA,,active,NC_012920.1,15888,15953,positive,66,,,,,,


In [4]:
metadata_file_path = "../../data/Human.GRCh38.p13.annot.tsv"
metadata = pd.read_csv(metadata_file_path, sep='\t')

transposed_data = scaled_df.T
transposed_data = transposed_data.reset_index()
transposed_data.columns.values[0] = 'GeneID'


merged_data = transposed_data.merge(metadata, on='GeneID', how='left').set_index('GeneID')
merged_data = merged_data.iloc[:39378]

print("Merged Data (first few rows):")

# Optionally, save to a new file
output_file_path = "merged_gene_expression_with_metadata.csv"
merged_data.to_csv(output_file_path)
print(merged_data.columns)
# print(f"Merged data saved to {output_file_path}")

  exec(code_obj, self.user_global_ns, self.user_ns)


Merged Data (first few rows):
Index(['GSM6745599', 'GSM6745600', 'GSM6745601', 'GSM6745602', 'GSM6745603',
       'GSM6745604', 'GSM6745605', 'GSM6745606', 'GSM6745607', 'GSM6745609',
       'GSM6745610', 'GSM6745611', 'GSM6745612', 'GSM6745613', 'GSM6745614',
       'GSM6745615', 'GSM6745616', 'GSM6745617', 'GSM6745618', 'GSM6745619',
       'GSM6745620', 'GSM6745621', 'GSM6745622', 'GSM6745623', 'GSM6745624',
       'GSM6745625', 'GSM6745626', 'GSM6745627', 'GSM6745628', 'GSM6745629',
       'GSM6745630', 'GSM6745631', 'GSM6745632', 'GSM6745633', 'GSM6745634',
       'GSM6745635', 'GSM6745636', 'GSM6745637', 'Symbol', 'Description',
       'Synonyms', 'GeneType', 'EnsemblGeneID', 'Status', 'ChrAcc', 'ChrStart',
       'ChrStop', 'Orientation', 'Length', 'GOFunctionID', 'GOProcessID',
       'GOComponentID', 'GOFunction', 'GOProcess', 'GOComponent'],
      dtype='object')


In [5]:
merged_data = merged_data.iloc[:, :-6]
merged_data = merged_data.drop(columns=['EnsemblGeneID'])
merged_data
output_file_path = "testing.csv"
merged_data.to_csv(output_file_path)

In [6]:
merged_data_nulls = merged_data['Description'].isnull().sum()
merged_data_nulls

145

In [7]:
d = merged_data.copy()

In [8]:
# Delete any genes with missing descriptions
merged_data = merged_data.dropna(subset=['Description'])
print(f"Number of rows: {merged_data.shape[0]}, Number of columns: {merged_data.shape[1]}")
merged_data.reset_index(inplace=True) # Apparently, I had GeneID as the index, so I reset it to a column cuz it's easier to work with
print(merged_data)

Number of rows: 4656, Number of columns: 48
         GeneID GSM6745599 GSM6745600 GSM6745601 GSM6745602 GSM6745603  \
0     113219467  -0.158632   1.379276   5.355088   2.002351   0.679553   
1         57801   1.070933   0.767235   3.457126   0.897391   1.070933   
2          9636    1.15215   0.761592   2.062135   0.336385   0.673147   
3     100288175   0.499194  -0.765431   1.706335  -0.075635    0.61416   
4     102465434  -0.342997  -0.342997  -0.342997  -0.342997  -0.342997   
...         ...        ...        ...        ...        ...        ...   
4651  105377243  -0.222988  -0.222988  -0.222988  -0.222988  -0.222988   
4652  107987359  -0.164399  -0.164399  -0.164399  -0.164399  -0.164399   
4653       4572  -0.135784   0.567824   3.460437    1.42779   2.131398   
4654       4512  -0.237304   1.302546   5.055768   2.327118   0.856884   
4655       4574  -0.162382   1.353526    5.39109   1.941426   0.658735   

     GSM6745604 GSM6745605 GSM6745606 GSM6745607  ...        Symbol

In [9]:
merged_data

Unnamed: 0,GeneID,GSM6745599,GSM6745600,GSM6745601,GSM6745602,GSM6745603,GSM6745604,GSM6745605,GSM6745606,GSM6745607,...,Symbol,Description,Synonyms,GeneType,Status,ChrAcc,ChrStart,ChrStop,Orientation,Length
0,113219467,-0.158632,1.379276,5.355088,2.002351,0.679553,-0.213027,0.0095,-0.393521,-0.27484,...,MIR12136,microRNA 12136,,ncRNA,active,NC_000001.11,632615,632685,negative,71.0
1,57801,1.070933,0.767235,3.457126,0.897391,1.070933,1.548171,0.810621,-0.274013,0.289997,...,HES4,hes family bHLH transcription factor 4,bHLHb42,protein-coding,active,NC_000001.11,998962,1001052,negative,1920.0
2,9636,1.15215,0.761592,2.062135,0.336385,0.673147,0.828458,-0.304008,-0.72496,-0.605817,...,ISG15,ISG15 ubiquitin like modifier,G1P2|IFI15|IMD38|IP17|UCRP|hUCRP,protein-coding,active,NC_000001.11,1013497,1014540,positive,637.0
3,100288175,0.499194,-0.765431,1.706335,-0.075635,0.61416,-0.018153,1.01654,-0.650465,0.384228,...,LOC100288175,uncharacterized LOC100288175,,ncRNA,active,NC_000001.11,1059734,1066453,positive,974.0
4,102465434,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,...,MIR6726,microRNA 6726,hsa-mir-6726,ncRNA,active,NC_000001.11,1296110,1296170,negative,61.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4651,105377243,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,...,REREP2Y,arginine-glutamic acid dipeptide repeats pseud...,,pseudo,active,NC_000024.10,26196209,26235374,negative,2725.0
4652,107987359,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,...,LOC107987359,uncharacterized LOC107987359,,ncRNA,active,NT_167208.1,35630,43681,negative,745.0
4653,4572,-0.135784,0.567824,3.460437,1.42779,2.131398,1.975041,1.349611,0.098752,0.724182,...,TRNQ,tRNA-Gln,MTTQ,tRNA,active,NC_012920.1,4329,4400,negative,72.0
4654,4512,-0.237304,1.302546,5.055768,2.327118,0.856884,-0.052044,-0.139682,-0.558824,-0.258147,...,COX1,cytochrome c oxidase subunit I,COI|MTCO1,protein-coding,active,NC_012920.1,5904,7445,positive,1542.0


In [10]:
expression_data_T = merged_data.T
# Step 1: Reset the column names using the first row
expression_data_T.columns = expression_data_T.iloc[0]
expression_data_T = expression_data_T.iloc[1:].reset_index(drop=True)
expression_data_T = expression_data_T.apply(pd.to_numeric, errors='ignore')
expression_data_T = expression_data_T[:38]
expression_data_T

GeneID,113219467,57801,9636,100288175,102465434,109623456,8510,107984872,112268220,284661,...,378949,378950,105377240,84559,401634,105377243,107987359,4572,4512,4574
0,-0.158632,1.070933,1.15215,0.499194,-0.342997,-0.697982,0.160668,-0.602311,-0.164399,-0.164399,...,-0.274721,-0.235702,-0.222988,-0.164399,-0.164399,-0.222988,-0.164399,-0.135784,-0.237304,-0.162382
1,1.379276,0.767235,0.761592,-0.765431,-0.342997,-0.697982,-1.27589,-0.602311,-0.164399,-0.164399,...,-0.274721,-0.235702,-0.222988,-0.164399,-0.164399,-0.222988,-0.164399,0.567824,1.302546,1.353526
2,5.355088,3.457126,2.062135,1.706335,-0.342997,-0.697982,2.435216,-0.602311,-0.164399,-0.164399,...,2.33513,-0.235702,-0.222988,-0.164399,-0.164399,-0.222988,-0.164399,3.460437,5.055768,5.39109
3,2.002351,0.897391,0.336385,-0.075635,-0.342997,0.697982,0.160668,-0.602311,-0.164399,-0.164399,...,-0.274721,-0.235702,-0.222988,-0.164399,-0.164399,-0.222988,-0.164399,1.42779,2.327118,1.941426
4,0.679553,1.070933,0.673147,0.61416,-0.342997,0.697982,0.579663,0.136006,-0.164399,-0.164399,...,-0.274721,-0.235702,-0.222988,-0.164399,-0.164399,-0.222988,-0.164399,2.131398,0.856884,0.658735
5,-0.213027,1.548171,0.828458,-0.018153,-0.342997,-0.697982,0.938803,0.136006,-0.164399,-0.164399,...,-0.274721,-0.235702,-0.222988,-0.164399,-0.164399,-0.222988,-0.164399,1.975041,-0.052044,-0.20611
6,0.0095,0.810621,-0.304008,1.01654,-0.342997,-0.697982,2.016221,2.350955,-0.164399,-0.164399,...,-0.274721,-0.235702,-0.222988,-0.164399,-0.164399,-0.222988,-0.164399,1.349611,-0.139682,-0.016622
7,-0.393521,-0.274013,-0.72496,-0.650465,-0.342997,-0.697982,-0.797037,0.136006,-0.164399,-0.164399,...,-0.274721,-0.235702,-0.222988,-0.164399,-0.164399,-0.222988,-0.164399,0.098752,-0.558824,-0.381023
8,-0.27484,0.289997,-0.605817,0.384228,-0.342997,-0.697982,0.45995,2.350955,-0.164399,-0.164399,...,-0.274721,-0.235702,-0.222988,-0.164399,-0.164399,-0.222988,-0.164399,0.724182,-0.258147,-0.269273
9,-0.334181,1.070933,-0.549893,1.59137,-0.342997,-0.697982,0.63952,-0.602311,-0.164399,-0.164399,...,-0.274721,-0.235702,-0.222988,-0.164399,-0.164399,-0.222988,-0.164399,-0.604856,-0.34289,-0.322719


In [11]:
expression_cols = merged_data.columns[:39]
#expression_cols
expression_data = merged_data[expression_cols].apply(pd.to_numeric, errors='coerce')
expression_data

Unnamed: 0,GeneID,GSM6745599,GSM6745600,GSM6745601,GSM6745602,GSM6745603,GSM6745604,GSM6745605,GSM6745606,GSM6745607,...,GSM6745628,GSM6745629,GSM6745630,GSM6745631,GSM6745632,GSM6745633,GSM6745634,GSM6745635,GSM6745636,GSM6745637
0,113219467,-0.158632,1.379276,5.355088,2.002351,0.679553,-0.213027,0.009500,-0.393521,-0.274840,...,-0.264950,-0.324291,-0.388576,-0.200665,-0.195719,-0.190774,-0.346543,-0.358906,-0.361378,-0.344071
1,57801,1.070933,0.767235,3.457126,0.897391,1.070933,1.548171,0.810621,-0.274013,0.289997,...,-0.707866,-0.404169,-0.577710,-0.490939,-0.794637,-0.751251,-1.011563,-0.751251,-0.534325,-0.274013
2,9636,1.152150,0.761592,2.062135,0.336385,0.673147,0.828458,-0.304008,-0.724960,-0.605817,...,0.269823,-0.448682,-0.379384,-0.138667,-0.593051,-0.522234,-0.619798,-0.663869,-0.503086,-0.636818
3,100288175,0.499194,-0.765431,1.706335,-0.075635,0.614160,-0.018153,1.016540,-0.650465,0.384228,...,-0.018153,-0.592982,-0.248084,-0.133118,-1.512709,-1.340260,-1.570192,-1.225294,-1.110328,-1.167811
4,102465434,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,...,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,2.915476,2.915476,2.915476
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4651,105377243,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,...,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988
4652,107987359,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,...,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,6.082763
4653,4572,-0.135784,0.567824,3.460437,1.427790,2.131398,1.975041,1.349611,0.098752,0.724182,...,-0.526678,-0.917571,-0.995750,0.020573,-0.057605,-0.448499,-0.839392,-0.683035,-0.448499,-0.604856
4654,4512,-0.237304,1.302546,5.055768,2.327118,0.856884,-0.052044,-0.139682,-0.558824,-0.258147,...,-0.286306,-0.433196,-0.606873,-0.096739,-0.143264,0.174674,-0.536076,-0.518510,-0.269883,-0.352416


In [12]:
expression_cols = merged_data.columns[1:39]
expression_data = merged_data[expression_cols].apply(pd.to_numeric, errors='coerce')
expression_data

Unnamed: 0,GSM6745599,GSM6745600,GSM6745601,GSM6745602,GSM6745603,GSM6745604,GSM6745605,GSM6745606,GSM6745607,GSM6745609,...,GSM6745628,GSM6745629,GSM6745630,GSM6745631,GSM6745632,GSM6745633,GSM6745634,GSM6745635,GSM6745636,GSM6745637
0,-0.158632,1.379276,5.355088,2.002351,0.679553,-0.213027,0.009500,-0.393521,-0.274840,-0.334181,...,-0.264950,-0.324291,-0.388576,-0.200665,-0.195719,-0.190774,-0.346543,-0.358906,-0.361378,-0.344071
1,1.070933,0.767235,3.457126,0.897391,1.070933,1.548171,0.810621,-0.274013,0.289997,1.070933,...,-0.707866,-0.404169,-0.577710,-0.490939,-0.794637,-0.751251,-1.011563,-0.751251,-0.534325,-0.274013
2,1.152150,0.761592,2.062135,0.336385,0.673147,0.828458,-0.304008,-0.724960,-0.605817,-0.549893,...,0.269823,-0.448682,-0.379384,-0.138667,-0.593051,-0.522234,-0.619798,-0.663869,-0.503086,-0.636818
3,0.499194,-0.765431,1.706335,-0.075635,0.614160,-0.018153,1.016540,-0.650465,0.384228,1.591370,...,-0.018153,-0.592982,-0.248084,-0.133118,-1.512709,-1.340260,-1.570192,-1.225294,-1.110328,-1.167811
4,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,...,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,2.915476,2.915476,2.915476
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4651,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,...,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988
4652,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,...,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,6.082763
4653,-0.135784,0.567824,3.460437,1.427790,2.131398,1.975041,1.349611,0.098752,0.724182,-0.604856,...,-0.526678,-0.917571,-0.995750,0.020573,-0.057605,-0.448499,-0.839392,-0.683035,-0.448499,-0.604856
4654,-0.237304,1.302546,5.055768,2.327118,0.856884,-0.052044,-0.139682,-0.558824,-0.258147,-0.342890,...,-0.286306,-0.433196,-0.606873,-0.096739,-0.143264,0.174674,-0.536076,-0.518510,-0.269883,-0.352416


In [13]:
pca_target_variance = 0.75
pca_full = PCA()
expression_data_filled = expression_data.apply(lambda row: row.bfill().ffill(), axis=1)
nan_counts = expression_data.isna().sum().sum()
nan_counts

pca_full.fit(expression_data)

# Determine number of components needed to explain 75% variance
cumulative_variance = pca_full.explained_variance_ratio_.cumsum()
n_components_75 = np.argmax(cumulative_variance >= pca_target_variance) + 1

# Apply PCA with the determined number of components
pca_final = PCA(n_components=n_components_75)
pca_result_final = pca_final.fit_transform(expression_data)

In [14]:
expression_data_filled

Unnamed: 0,GSM6745599,GSM6745600,GSM6745601,GSM6745602,GSM6745603,GSM6745604,GSM6745605,GSM6745606,GSM6745607,GSM6745609,...,GSM6745628,GSM6745629,GSM6745630,GSM6745631,GSM6745632,GSM6745633,GSM6745634,GSM6745635,GSM6745636,GSM6745637
0,-0.158632,1.379276,5.355088,2.002351,0.679553,-0.213027,0.009500,-0.393521,-0.274840,-0.334181,...,-0.264950,-0.324291,-0.388576,-0.200665,-0.195719,-0.190774,-0.346543,-0.358906,-0.361378,-0.344071
1,1.070933,0.767235,3.457126,0.897391,1.070933,1.548171,0.810621,-0.274013,0.289997,1.070933,...,-0.707866,-0.404169,-0.577710,-0.490939,-0.794637,-0.751251,-1.011563,-0.751251,-0.534325,-0.274013
2,1.152150,0.761592,2.062135,0.336385,0.673147,0.828458,-0.304008,-0.724960,-0.605817,-0.549893,...,0.269823,-0.448682,-0.379384,-0.138667,-0.593051,-0.522234,-0.619798,-0.663869,-0.503086,-0.636818
3,0.499194,-0.765431,1.706335,-0.075635,0.614160,-0.018153,1.016540,-0.650465,0.384228,1.591370,...,-0.018153,-0.592982,-0.248084,-0.133118,-1.512709,-1.340260,-1.570192,-1.225294,-1.110328,-1.167811
4,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,...,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,2.915476,2.915476,2.915476
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4651,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,...,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988
4652,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,...,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,6.082763
4653,-0.135784,0.567824,3.460437,1.427790,2.131398,1.975041,1.349611,0.098752,0.724182,-0.604856,...,-0.526678,-0.917571,-0.995750,0.020573,-0.057605,-0.448499,-0.839392,-0.683035,-0.448499,-0.604856
4654,-0.237304,1.302546,5.055768,2.327118,0.856884,-0.052044,-0.139682,-0.558824,-0.258147,-0.342890,...,-0.286306,-0.433196,-0.606873,-0.096739,-0.143264,0.174674,-0.536076,-0.518510,-0.269883,-0.352416


In [15]:
# Retrieve top contributing genes for each principal component
components_df = pd.DataFrame(
    pca_final.components_,
    columns=merged_data['GeneID'][:expression_data.shape[1]],
    index=[f"PC{i+1}" for i in range(n_components_75)]
)

In [16]:
components_df

GeneID,113219467,57801,9636,100288175,102465434,109623456,8510,107984872,112268220,284661,...,105376805,26870,29943,105376809,127707,51154,644068,30814,5320,26279
PC1,-0.034397,0.138742,-0.264705,0.025062,0.003796,-0.042319,-0.077518,0.124788,-0.074872,0.023672,...,-0.053555,0.089438,0.151901,-0.061128,0.118198,0.054578,0.187465,0.057647,0.036838,0.159415
PC2,-0.065762,-0.107845,-0.08009,-0.174027,-0.180051,-0.124951,-0.09841,-0.100538,-0.049386,-0.142808,...,-0.006629,-0.091397,-0.087516,-0.023358,0.519959,0.264382,0.345249,0.126039,0.186603,0.477684
PC3,0.037893,0.01738,0.08682,0.031085,0.041936,0.047165,0.046454,0.009759,0.031778,0.03127,...,-0.01602,-0.02354,-0.018421,-0.024175,-0.430821,-0.094422,-0.332662,0.015166,0.013255,0.81655
PC4,-0.029803,-0.028128,-0.073178,-0.053193,-0.073171,-0.067287,-0.057452,-0.02897,-0.020346,-0.029924,...,0.022878,0.013449,0.000797,0.031856,-0.631452,-0.002861,0.748151,0.00629,-0.013875,0.007467
PC5,-0.138683,-0.053234,-0.511585,-0.173487,-0.214557,-0.23197,-0.338442,-0.054666,-0.116477,-0.04486,...,0.109889,0.175037,0.091089,0.182911,-0.040395,-0.033446,-0.257498,-0.096661,-0.153314,0.05981
PC6,-0.026056,-0.03452,-0.274461,-0.09014,0.097427,0.098611,0.135271,-4.9e-05,-0.02437,-0.060653,...,0.000715,-0.032411,0.031163,-0.009066,-0.291282,0.742532,-0.240551,0.147142,0.330223,-0.163047
PC7,-0.031682,0.025641,0.587585,0.200201,-0.061661,-0.273903,-0.593375,-0.033817,-0.04607,-0.105197,...,-0.010443,0.048384,0.003829,0.045518,-0.086631,0.323233,-0.075922,0.036957,0.026589,-0.043333
PC8,-0.005512,-0.01386,-0.140664,-0.136458,0.910578,-0.153064,-0.279823,-0.020578,-0.01117,-0.011445,...,-0.008035,-0.038729,-0.069274,-0.023068,0.044852,-0.077346,0.059624,0.014993,0.000976,0.033976
PC9,0.014196,0.000752,-0.184956,0.236272,-0.033207,0.797946,-0.467163,-0.048314,0.03226,-0.092417,...,-0.045107,0.00712,-0.101555,-0.008658,0.021923,-0.033497,0.043466,0.008368,0.015932,0.015413
PC10,0.013298,-0.015102,-0.305613,0.850081,0.039802,-0.306051,0.136153,-0.015882,0.030721,-0.139468,...,-0.037364,-0.059413,-0.100986,-0.008532,0.022328,0.024611,0.022242,-0.013008,-0.003278,0.030668


In [17]:
pca_scores = expression_data.values.dot(components_df.values.T)
pca_scores_df = pd.DataFrame(
    pca_scores,
    index=expression_data.index,  # Samples from the original data
    columns=components_df.index   # Principal Components
)

pca_scores_df

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15
0,-1.610801,-1.124606,0.564234,-0.734643,-3.618651,-1.639161,3.557396,-0.315058,-0.577133,0.320267,-0.602765,-0.591625,0.255450,-0.039731,-0.019299
1,-2.911410,-1.735701,1.317734,-0.940699,-3.834264,-0.906068,0.620352,0.068730,0.497001,-0.565768,-0.297182,1.192298,0.402776,-0.073049,0.607345
2,-4.055493,-0.467207,0.203212,-0.044257,-0.944002,-0.452983,0.636862,0.398732,0.431519,-0.503465,-0.385622,-0.214303,2.793984,-0.206021,0.445154
3,-4.606622,-2.334393,0.495513,-0.121233,-0.091624,-0.927259,-0.545998,0.218752,-0.915920,-0.361597,-0.292452,1.310490,-0.127047,-0.225957,-0.703438
4,-0.867140,2.928030,2.638840,0.254106,0.111407,0.834895,0.033320,0.391681,0.470763,0.508671,2.952041,-0.843354,-1.964977,-0.119911,-1.168180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4651,-2.966374,0.410287,-0.243549,0.490846,1.811886,-0.503529,0.089607,0.365337,0.564906,0.850623,-0.135235,-0.191491,-3.090793,-0.147988,-2.121673
4652,0.995893,2.984167,5.101120,0.046649,0.373642,-1.018578,-0.270708,0.212255,0.096285,0.191589,-0.668701,0.161164,0.021907,-0.043157,-0.013271
4653,-2.687955,-1.520717,0.628180,-1.353141,-4.187248,-0.690604,0.610915,0.814484,0.975450,0.351347,-1.146660,-0.677757,-0.790476,0.610513,0.055030
4654,-2.110323,-0.949375,0.537370,-0.856538,-3.395351,-1.287146,3.615643,-0.100806,-0.169655,0.751010,-0.686807,-0.459572,-0.001929,0.077749,-0.043225


In [18]:
identifiers = d.tail(2)
identifiers

Unnamed: 0_level_0,GSM6745599,GSM6745600,GSM6745601,GSM6745602,GSM6745603,GSM6745604,GSM6745605,GSM6745606,GSM6745607,GSM6745609,...,Symbol,Description,Synonyms,GeneType,Status,ChrAcc,ChrStart,ChrStop,Orientation,Length
GeneID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Edited (1) or Unedited (0),1,1,1,1,1,1,1,1,1,1,...,,,,,,,,,,
editing mechanism,BE4,BE4,BE4,ABE8,ABE8,ABE8,Cas9,Cas9,Cas9,Utelectro,...,,,,,,,,,,


In [19]:
y = identifiers.head(1).iloc[:, :38]
y

Unnamed: 0_level_0,GSM6745599,GSM6745600,GSM6745601,GSM6745602,GSM6745603,GSM6745604,GSM6745605,GSM6745606,GSM6745607,GSM6745609,...,GSM6745628,GSM6745629,GSM6745630,GSM6745631,GSM6745632,GSM6745633,GSM6745634,GSM6745635,GSM6745636,GSM6745637
GeneID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Edited (1) or Unedited (0),1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,0,0,0,0,0,0


In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Prepare the data
# Assuming components_df is a pandas DataFrame (15x38)
X = components_df.T.values  # Transpose to get samples as rows (38x15)
y = identifiers.head(1).iloc[:, :38]
y = np.array(y).flatten()   # Ensure y is a 1D array (38,)
y = np.array(y).astype(int)  # Ensure y is integer type

print(f"X shape: {X.shape}, y shape: {y.shape}")

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a classifier (Random Forest as an example)
clf = RandomForestClassifier(class_weight = 'balanced', random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the classifier
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

X shape: (38, 15), y shape: (38,)
Accuracy: 0.75
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.75      1.00      0.86         6

    accuracy                           0.75         8
   macro avg       0.38      0.50      0.43         8
weighted avg       0.56      0.75      0.64         8



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
components_df

GeneID,113219467,57801,9636,100288175,102465434,109623456,8510,107984872,112268220,284661,...,105376805,26870,29943,105376809,127707,51154,644068,30814,5320,26279
PC1,-0.034397,0.138742,-0.264705,0.025062,0.003796,-0.042319,-0.077518,0.124788,-0.074872,0.023672,...,-0.053555,0.089438,0.151901,-0.061128,0.118198,0.054578,0.187465,0.057647,0.036838,0.159415
PC2,-0.065741,-0.107835,-0.080094,-0.17403,-0.180051,-0.124953,-0.098411,-0.10056,-0.049377,-0.142808,...,-0.006641,-0.091398,-0.087511,-0.023359,0.519959,0.264383,0.34525,0.126042,0.186603,0.477684
PC3,0.037809,0.017361,0.086836,0.031094,0.041939,0.047176,0.046455,0.009829,0.031754,0.031275,...,-0.015924,-0.023542,-0.018449,-0.024191,-0.430824,-0.094426,-0.332663,0.015143,0.013256,0.816548
PC4,-0.029968,-0.028174,-0.073176,-0.053177,-0.073155,-0.067272,-0.057446,-0.028841,-0.020347,-0.029939,...,0.022837,0.013447,0.000808,0.031865,-0.631448,-0.00286,0.748157,0.006272,-0.013879,0.007474
PC5,-0.138818,-0.05325,-0.51158,-0.173475,-0.214548,-0.231961,-0.338441,-0.054592,-0.116505,-0.044889,...,0.10988,0.175034,0.091081,0.182923,-0.040394,-0.033451,-0.2575,-0.09668,-0.153313,0.059813
PC6,-0.0262,-0.034436,-0.274419,-0.090167,0.097567,0.098702,0.135255,-1.5e-05,-0.024184,-0.060465,...,0.000489,-0.032383,0.031094,-0.009239,-0.291256,0.742593,-0.240518,0.147107,0.330217,-0.163029
PC7,-0.03213,0.025292,0.587704,0.200284,-0.061953,-0.273866,-0.593173,-0.033389,-0.045836,-0.105188,...,-0.010829,0.048491,0.003987,0.045656,-0.086616,0.323259,-0.075922,0.036998,0.026514,-0.043315
PC8,-0.009276,-0.014483,-0.140327,-0.135952,0.910711,-0.152466,-0.280067,-0.017704,-0.011143,-0.011687,...,-0.00943,-0.038769,-0.069328,-0.022713,0.044917,-0.077283,0.059734,0.014606,0.0007,0.034126
PC9,0.011466,0.000256,-0.184952,0.236782,-0.033604,0.798001,-0.466892,-0.046565,0.033334,-0.092772,...,-0.046842,0.007053,-0.101352,-0.008296,0.021961,-0.033335,0.043535,0.008362,0.015503,0.015524
PC10,0.011095,-0.016058,-0.305058,0.850304,0.040019,-0.305979,0.136432,-0.013384,0.029255,-0.139903,...,-0.034356,-0.059466,-0.10118,-0.008714,0.022286,0.024489,0.022181,-0.013997,-0.003085,0.030672


In [21]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Prepare the data
X = components_df.T.values

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create a pipeline with preprocessing, SMOTE, and classifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(k_neighbors = 3, random_state=42)), #SMOTE: balances dataset by generating synthetic samples for unedited class
    ('classifier', RandomForestClassifier(random_state=42))
])

# Define parameter grid for GridSearchCV
param_grid = {
    'classifier__n_estimators': [50, 100, 200], # testing across increments
    'classifier__max_depth': [None, 5, 10], # none for full depth (capture all patterns in data), #5 and 10: avoid overfitting
    'classifier__min_samples_split': [2, 5], # 2: max flexibility in splitting nodes, 5: more samples for a split -> reduce overfitting
    'classifier__min_samples_leaf': [1, 2], # single sample (more depth) or 2 samples (reduce overfitting)
    'classifier__class_weight': ['balanced', 'balanced_subsample'] # balanced: adjusts class weights based on sample freqs; subsample: recalculates weights 
                                                                    # per bootstrapped sample
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5, # 5-fold cross validation, balance b/w bias and variance 
    scoring='f1_macro', # equal considerations for both classes, averages the F1 scores of each class
    n_jobs=-1,
    verbose=1
)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Make predictions with the best model
y_pred = grid_search.predict(X_test)

# Print detailed evaluation
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Get feature importances
feature_importances = pd.DataFrame({
    'feature': range(X.shape[1]),
    'importance': grid_search.best_estimator_.named_steps['classifier'].feature_importances_
})
feature_importances = feature_importances.sort_values('importance', ascending=False)

print("\nTop 5 Most Important Features:")
print(feature_importances.head())

# Perform cross-validation to get a more robust estimate of model performance
cv_scores = cross_val_score(grid_search.best_estimator_, X, y, cv=5, scoring='f1_macro')
print("\nCross-validation scores:", cv_scores)
print("Average CV score:", cv_scores.mean())
print("CV score std:", cv_scores.std())

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best parameters: {'classifier__class_weight': 'balanced', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}
Best cross-validation score: 0.890909090909091

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.88      1.00      0.93         7

    accuracy                           0.88         8
   macro avg       0.44      0.50      0.47         8
weighted avg       0.77      0.88      0.82         8


Top 5 Most Important Features:
   feature  importance
1        1    0.315019
9        9    0.111693
2        2    0.095257
7        7    0.094935
5        5    0.064179


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Cross-validation scores: [1.         1.         0.79487179 1.         1.        ]
Average CV score: 0.9589743589743589
CV score std: 0.08205128205128207


In [22]:
# Print predictions for each sample
predictions = grid_search.predict(X)
for i, (true_label, predicted_label) in enumerate(zip(y, predictions)):
    print(f"Sample {i}: True label = {true_label}, Predicted label = {predicted_label}")

# Optional: Also show prediction probabilities
#probabilities = grid_search.predict_proba(X)
#for i, probs in enumerate(probabilities):
 #   print(f"Sample {i} probabilities: Class 0 = {probs[0]:.3f}, Class 1 = {probs[1]:.3f}")

Sample 0: True label = 1, Predicted label = 1
Sample 1: True label = 1, Predicted label = 1
Sample 2: True label = 1, Predicted label = 1
Sample 3: True label = 1, Predicted label = 1
Sample 4: True label = 1, Predicted label = 1
Sample 5: True label = 1, Predicted label = 1
Sample 6: True label = 1, Predicted label = 1
Sample 7: True label = 1, Predicted label = 1
Sample 8: True label = 1, Predicted label = 1
Sample 9: True label = 1, Predicted label = 1
Sample 10: True label = 1, Predicted label = 1
Sample 11: True label = 1, Predicted label = 1
Sample 12: True label = 1, Predicted label = 1
Sample 13: True label = 1, Predicted label = 1
Sample 14: True label = 1, Predicted label = 1
Sample 15: True label = 1, Predicted label = 1
Sample 16: True label = 1, Predicted label = 1
Sample 17: True label = 1, Predicted label = 1
Sample 18: True label = 1, Predicted label = 1
Sample 19: True label = 1, Predicted label = 1
Sample 20: True label = 1, Predicted label = 1
Sample 21: True label =

In [23]:
# Assuming y and predictions are lists or numpy arrays
for i, (true_label, predicted_label) in enumerate(zip(y[-6:], predictions[-6:]), start=len(y)-6):
    print(f"Sample {i}: True label = {true_label}, Predicted label = {predicted_label}")

Sample 32: True label = 0, Predicted label = 0
Sample 33: True label = 0, Predicted label = 0
Sample 34: True label = 0, Predicted label = 0
Sample 35: True label = 0, Predicted label = 1
Sample 36: True label = 0, Predicted label = 0
Sample 37: True label = 0, Predicted label = 0


Performs best when:
- balanced: model adjusts weights inversely proportional to class frequencies, helping address class imbalance.
- max_depth=None: capture all patterns in data
- min_samples_leaf=1 and min_samples_split=2: full depth, max flexibility in splitting nodes
- small number of trees sufficient for strong performance
The cross-validation score of 0.89 indicates the model performed well during training across folds, particularly in capturing the balance between precision and recall.

Precision, Recall, F1-Score (Support = 8 test samples):

- Class 0: Performance for this class is poor (F1=0.00), likely due to having only 1 sample in the test set. This is too small to generalize reliably.
- Class 1: The model handled this majority class well, achieving perfect recall (100%) and strong precision (88%), leading to a high F1-score of 0.93.

Macro-average is low because it treats all classes equally, including underperforming Class 0.
Weighted average accounts for class imbalance, resulting in a higher score (0.82) due to the better performance for the dominant Class 1.

high variability in performance across folds; decent model performance overall; high variability in accuracy b/w classes

In [24]:
components_df

GeneID,113219467,57801,9636,100288175,102465434,109623456,8510,107984872,112268220,284661,...,105376805,26870,29943,105376809,127707,51154,644068,30814,5320,26279
PC1,-0.034397,0.138742,-0.264705,0.025062,0.003796,-0.042319,-0.077518,0.124788,-0.074872,0.023672,...,-0.053555,0.089438,0.151901,-0.061128,0.118198,0.054578,0.187465,0.057647,0.036838,0.159415
PC2,-0.065762,-0.107845,-0.08009,-0.174027,-0.180051,-0.124951,-0.09841,-0.100538,-0.049386,-0.142808,...,-0.006629,-0.091397,-0.087516,-0.023358,0.519959,0.264382,0.345249,0.126039,0.186603,0.477684
PC3,0.037893,0.01738,0.08682,0.031085,0.041936,0.047165,0.046454,0.009759,0.031778,0.03127,...,-0.01602,-0.02354,-0.018421,-0.024175,-0.430821,-0.094422,-0.332662,0.015166,0.013255,0.81655
PC4,-0.029803,-0.028128,-0.073178,-0.053193,-0.073171,-0.067287,-0.057452,-0.02897,-0.020346,-0.029924,...,0.022878,0.013449,0.000797,0.031856,-0.631452,-0.002861,0.748151,0.00629,-0.013875,0.007467
PC5,-0.138683,-0.053234,-0.511585,-0.173487,-0.214557,-0.23197,-0.338442,-0.054666,-0.116477,-0.04486,...,0.109889,0.175037,0.091089,0.182911,-0.040395,-0.033446,-0.257498,-0.096661,-0.153314,0.05981
PC6,-0.026056,-0.03452,-0.274461,-0.09014,0.097427,0.098611,0.135271,-4.9e-05,-0.02437,-0.060653,...,0.000715,-0.032411,0.031163,-0.009066,-0.291282,0.742532,-0.240551,0.147142,0.330223,-0.163047
PC7,-0.031682,0.025641,0.587585,0.200201,-0.061661,-0.273903,-0.593375,-0.033817,-0.04607,-0.105197,...,-0.010443,0.048384,0.003829,0.045518,-0.086631,0.323233,-0.075922,0.036957,0.026589,-0.043333
PC8,-0.005512,-0.01386,-0.140664,-0.136458,0.910578,-0.153064,-0.279823,-0.020578,-0.01117,-0.011445,...,-0.008035,-0.038729,-0.069274,-0.023068,0.044852,-0.077346,0.059624,0.014993,0.000976,0.033976
PC9,0.014196,0.000752,-0.184956,0.236272,-0.033207,0.797946,-0.467163,-0.048314,0.03226,-0.092417,...,-0.045107,0.00712,-0.101555,-0.008658,0.021923,-0.033497,0.043466,0.008368,0.015932,0.015413
PC10,0.013298,-0.015102,-0.305613,0.850081,0.039802,-0.306051,0.136153,-0.015882,0.030721,-0.139468,...,-0.037364,-0.059413,-0.100986,-0.008532,0.022328,0.024611,0.022242,-0.013008,-0.003278,0.030668


In [70]:
merged_data

Unnamed: 0,GeneID,GSM6745599,GSM6745600,GSM6745601,GSM6745602,GSM6745603,GSM6745604,GSM6745605,GSM6745606,GSM6745607,...,Symbol,Description,Synonyms,GeneType,Status,ChrAcc,ChrStart,ChrStop,Orientation,Length
0,113219467,-0.158632,1.379276,5.355088,2.002351,0.679553,-0.213027,0.0095,-0.393521,-0.27484,...,MIR12136,microRNA 12136,,ncRNA,active,NC_000001.11,632615,632685,negative,71.0
1,57801,1.070933,0.767235,3.457126,0.897391,1.070933,1.548171,0.810621,-0.274013,0.289997,...,HES4,hes family bHLH transcription factor 4,bHLHb42,protein-coding,active,NC_000001.11,998962,1001052,negative,1920.0
2,9636,1.15215,0.761592,2.062135,0.336385,0.673147,0.828458,-0.304008,-0.72496,-0.605817,...,ISG15,ISG15 ubiquitin like modifier,G1P2|IFI15|IMD38|IP17|UCRP|hUCRP,protein-coding,active,NC_000001.11,1013497,1014540,positive,637.0
3,100288175,0.499194,-0.765431,1.706335,-0.075635,0.61416,-0.018153,1.01654,-0.650465,0.384228,...,LOC100288175,uncharacterized LOC100288175,,ncRNA,active,NC_000001.11,1059734,1066453,positive,974.0
4,102465434,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,...,MIR6726,microRNA 6726,hsa-mir-6726,ncRNA,active,NC_000001.11,1296110,1296170,negative,61.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4651,105377243,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,...,REREP2Y,arginine-glutamic acid dipeptide repeats pseud...,,pseudo,active,NC_000024.10,26196209,26235374,negative,2725.0
4652,107987359,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,...,LOC107987359,uncharacterized LOC107987359,,ncRNA,active,NT_167208.1,35630,43681,negative,745.0
4653,4572,-0.135784,0.567824,3.460437,1.42779,2.131398,1.975041,1.349611,0.098752,0.724182,...,TRNQ,tRNA-Gln,MTTQ,tRNA,active,NC_012920.1,4329,4400,negative,72.0
4654,4512,-0.237304,1.302546,5.055768,2.327118,0.856884,-0.052044,-0.139682,-0.558824,-0.258147,...,COX1,cytochrome c oxidase subunit I,COI|MTCO1,protein-coding,active,NC_012920.1,5904,7445,positive,1542.0


In [64]:
expression_cols = merged_data.columns[1:39]
expression_data = merged_data[expression_cols].apply(pd.to_numeric, errors='coerce')

pca_target_variance = 0.75
pca_full = PCA()
expression_data_filled = expression_data.apply(lambda row: row.bfill().ffill(), axis=1)
nan_counts = expression_data.isna().sum().sum()
nan_counts

pca_full.fit(expression_data)

# Determine number of components needed to explain 75% variance
cumulative_variance = pca_full.explained_variance_ratio_.cumsum()
n_components_75 = np.argmax(cumulative_variance >= pca_target_variance) + 1

# Apply PCA with the determined number of components
pca_final = PCA(n_components=n_components_75)
pca_result_final = pca_final.fit_transform(expression_data)

# Retrieve top contributing genes for each principal component
components_df = pd.DataFrame(
    pca_final.components_,
    columns=merged_data['GeneID'][:expression_data.shape[1]],
    index=[f"PC{i+1}" for i in range(n_components_75)]
)

def get_relevant_genes_with_contribution(pc, cumulative_threshold=0.9):
    # Absolute loadings for the specified principal component
    loadings = components_df.iloc[pc-1].abs().sort_values(ascending=False)

    # Calculate cumulative variance contribution by each gene
    cumulative_variance = loadings.cumsum() / loadings.sum()

    # Select genes up to the cumulative threshold
    relevant_genes = loadings[cumulative_variance <= cumulative_threshold]

    # Calculate the contribution percentage for each gene
    contribution_percentages = (relevant_genes / loadings.sum()) * 100

    return pd.DataFrame({
        'Loading': relevant_genes,
        'Contribution (%)': contribution_percentages
    })

d = {}
# Example usage
for pc in range(1, n_components_75 + 1):
    print(f"\nRelevant genes with contributions for PC{pc}:")
    rel_genes = get_relevant_genes_with_contribution(pc)
    print(len(rel_genes))
    #print(rel_genes)
    d[pc] = rel_genes


Relevant genes with contributions for PC1:
22

Relevant genes with contributions for PC2:
25

Relevant genes with contributions for PC3:
21

Relevant genes with contributions for PC4:
17

Relevant genes with contributions for PC5:
28

Relevant genes with contributions for PC6:
19

Relevant genes with contributions for PC7:
18

Relevant genes with contributions for PC8:
17

Relevant genes with contributions for PC9:
18

Relevant genes with contributions for PC10:
17

Relevant genes with contributions for PC11:
17

Relevant genes with contributions for PC12:
23

Relevant genes with contributions for PC13:
16

Relevant genes with contributions for PC14:
16

Relevant genes with contributions for PC15:
21


In [59]:
merged_data

Unnamed: 0,GeneID,GSM6745599,GSM6745600,GSM6745601,GSM6745602,GSM6745603,GSM6745604,GSM6745605,GSM6745606,GSM6745607,...,Symbol,Description,Synonyms,GeneType,Status,ChrAcc,ChrStart,ChrStop,Orientation,Length
0,113219467,-0.158632,1.379276,5.355088,2.002351,0.679553,-0.213027,0.0095,-0.393521,-0.27484,...,MIR12136,microRNA 12136,,ncRNA,active,NC_000001.11,632615,632685,negative,71.0
1,57801,1.070933,0.767235,3.457126,0.897391,1.070933,1.548171,0.810621,-0.274013,0.289997,...,HES4,hes family bHLH transcription factor 4,bHLHb42,protein-coding,active,NC_000001.11,998962,1001052,negative,1920.0
2,9636,1.15215,0.761592,2.062135,0.336385,0.673147,0.828458,-0.304008,-0.72496,-0.605817,...,ISG15,ISG15 ubiquitin like modifier,G1P2|IFI15|IMD38|IP17|UCRP|hUCRP,protein-coding,active,NC_000001.11,1013497,1014540,positive,637.0
3,100288175,0.499194,-0.765431,1.706335,-0.075635,0.61416,-0.018153,1.01654,-0.650465,0.384228,...,LOC100288175,uncharacterized LOC100288175,,ncRNA,active,NC_000001.11,1059734,1066453,positive,974.0
4,102465434,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,...,MIR6726,microRNA 6726,hsa-mir-6726,ncRNA,active,NC_000001.11,1296110,1296170,negative,61.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4651,105377243,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,...,REREP2Y,arginine-glutamic acid dipeptide repeats pseud...,,pseudo,active,NC_000024.10,26196209,26235374,negative,2725.0
4652,107987359,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,...,LOC107987359,uncharacterized LOC107987359,,ncRNA,active,NT_167208.1,35630,43681,negative,745.0
4653,4572,-0.135784,0.567824,3.460437,1.42779,2.131398,1.975041,1.349611,0.098752,0.724182,...,TRNQ,tRNA-Gln,MTTQ,tRNA,active,NC_012920.1,4329,4400,negative,72.0
4654,4512,-0.237304,1.302546,5.055768,2.327118,0.856884,-0.052044,-0.139682,-0.558824,-0.258147,...,COX1,cytochrome c oxidase subunit I,COI|MTCO1,protein-coding,active,NC_012920.1,5904,7445,positive,1542.0


In [74]:
def get_relevant_genes_with_contribution_genes(pc, cumulative_threshold=0.9):
    # Absolute loadings for the specified principal component
    loadings = components_df.iloc[pc-1].abs().sort_values(ascending=False)

    # Calculate cumulative variance contribution by each gene
    cumulative_variance = loadings.cumsum() / loadings.sum()

    # Select genes up to the cumulative threshold
    relevant_genes = loadings[cumulative_variance <= cumulative_threshold]

    # Calculate the contribution percentage for each gene
    contribution_percentages = (relevant_genes / loadings.sum()) * 100

    return relevant_genes

d = {}
# Example usage
for pc in range(1, n_components_75 + 1):
    print(f"\nRelevant genes with contributions for PC{pc}:")
    rel_genes = get_relevant_genes_with_contribution_genes(pc)
    print(rel_genes)
    #print(pc)
    d[pc] = rel_genes


Relevant genes with contributions for PC1:
GeneID
102724659    0.520019
105376737    0.404821
6060         0.336847
26869        0.281715
9636         0.264705
55092        0.248511
644068       0.187465
26279        0.159415
29943        0.151901
7799         0.142350
57801        0.138742
54751        0.132867
107984872    0.124788
127707       0.118198
155184       0.098019
26870        0.089438
105376691    0.082548
8510         0.077518
27129        0.076625
112268220    0.074872
63036        0.067704
105376809    0.061128
Name: PC1, dtype: float64

Relevant genes with contributions for PC2:
GeneID
127707       0.519959
26279        0.477684
644068       0.345250
51154        0.264382
5320         0.186602
102465434    0.180050
100288175    0.174026
284661       0.142806
6060         0.138240
30814        0.126041
109623456    0.124949
55092        0.115197
102724659    0.108263
57801        0.107846
105376737    0.101365
107984872    0.100524
8510         0.098411
26870        0

In [None]:
d[1].index

Int64Index([102724659, 105376737,      6060,     26869,      9636,     55092,
               644068,     26279,     29943,      7799,     57801,     54751,
            107984872,    127707,    155184,     26870, 105376691,      8510,
                27129, 112268220,     63036, 105376809],
           dtype='int64', name='GeneID')

In [None]:
d.items()

dict_items([(1, GeneID
102724659    0.520019
105376737    0.404821
6060         0.336847
26869        0.281715
9636         0.264705
55092        0.248511
644068       0.187465
26279        0.159415
29943        0.151901
7799         0.142350
57801        0.138742
54751        0.132867
107984872    0.124788
127707       0.118198
155184       0.098019
26870        0.089438
105376691    0.082548
8510         0.077518
27129        0.076625
112268220    0.074872
63036        0.067704
105376809    0.061128
Name: PC1, dtype: float64), (2, GeneID
127707       0.519959
26279        0.477684
644068       0.345250
51154        0.264383
5320         0.186603
102465434    0.180051
100288175    0.174030
284661       0.142808
6060         0.138240
30814        0.126042
109623456    0.124953
55092        0.115199
102724659    0.108262
57801        0.107835
105376737    0.101368
107984872    0.100560
8510         0.098411
26870        0.091398
155184       0.089014
29943        0.087511
106614088    0

In [66]:
lst = []
for k, gene in d.items():
    arr = d[k].index
    print(arr)
    for item in arr:
        if item not in lst:
            lst.append(item)

lst

Int64Index([102724659, 105376737,      6060,     26869,      9636,     55092,
               644068,     26279,     29943,      7799,     57801,     54751,
            107984872,    127707,    155184,     26870, 105376691,      8510,
                27129, 112268220,     63036, 105376809],
           dtype='int64', name='GeneID')
Int64Index([   127707,     26279,    644068,     51154,      5320, 102465434,
            100288175,    284661,      6060,     30814, 109623456,     55092,
            102724659,     57801, 105376737, 107984872,      8510,     26870,
               155184,     29943, 106614088,      9636,      1187,     26869,
                 1969],
           dtype='int64', name='GeneID')
Int64Index([    26279,    127707,    644068,     51154,      9636, 109623456,
                 8510, 102465434, 113219467, 102724659, 112268220,    284661,
            100288175, 105376691,     10630,     26829, 105376809,     26870,
            106614088,      7799, 105376739],
           

[102724659,
 105376737,
 6060,
 26869,
 9636,
 55092,
 644068,
 26279,
 29943,
 7799,
 57801,
 54751,
 107984872,
 127707,
 155184,
 26870,
 105376691,
 8510,
 27129,
 112268220,
 63036,
 105376809,
 51154,
 5320,
 102465434,
 100288175,
 284661,
 30814,
 109623456,
 106614088,
 1187,
 1969,
 113219467,
 10630,
 26829,
 105376739,
 26871,
 105376805]

In [65]:
merged_data

Unnamed: 0,GeneID,GSM6745599,GSM6745600,GSM6745601,GSM6745602,GSM6745603,GSM6745604,GSM6745605,GSM6745606,GSM6745607,...,Symbol,Description,Synonyms,GeneType,Status,ChrAcc,ChrStart,ChrStop,Orientation,Length
0,113219467,-0.158632,1.379276,5.355088,2.002351,0.679553,-0.213027,0.0095,-0.393521,-0.27484,...,MIR12136,microRNA 12136,,ncRNA,active,NC_000001.11,632615,632685,negative,71.0
1,57801,1.070933,0.767235,3.457126,0.897391,1.070933,1.548171,0.810621,-0.274013,0.289997,...,HES4,hes family bHLH transcription factor 4,bHLHb42,protein-coding,active,NC_000001.11,998962,1001052,negative,1920.0
2,9636,1.15215,0.761592,2.062135,0.336385,0.673147,0.828458,-0.304008,-0.72496,-0.605817,...,ISG15,ISG15 ubiquitin like modifier,G1P2|IFI15|IMD38|IP17|UCRP|hUCRP,protein-coding,active,NC_000001.11,1013497,1014540,positive,637.0
3,100288175,0.499194,-0.765431,1.706335,-0.075635,0.61416,-0.018153,1.01654,-0.650465,0.384228,...,LOC100288175,uncharacterized LOC100288175,,ncRNA,active,NC_000001.11,1059734,1066453,positive,974.0
4,102465434,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,...,MIR6726,microRNA 6726,hsa-mir-6726,ncRNA,active,NC_000001.11,1296110,1296170,negative,61.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4651,105377243,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,...,REREP2Y,arginine-glutamic acid dipeptide repeats pseud...,,pseudo,active,NC_000024.10,26196209,26235374,negative,2725.0
4652,107987359,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,...,LOC107987359,uncharacterized LOC107987359,,ncRNA,active,NT_167208.1,35630,43681,negative,745.0
4653,4572,-0.135784,0.567824,3.460437,1.42779,2.131398,1.975041,1.349611,0.098752,0.724182,...,TRNQ,tRNA-Gln,MTTQ,tRNA,active,NC_012920.1,4329,4400,negative,72.0
4654,4512,-0.237304,1.302546,5.055768,2.327118,0.856884,-0.052044,-0.139682,-0.558824,-0.258147,...,COX1,cytochrome c oxidase subunit I,COI|MTCO1,protein-coding,active,NC_012920.1,5904,7445,positive,1542.0


In [None]:
merged_data = merged_data[merged_data['GeneID'].isin(lst)]
merged_data

Unnamed: 0,GeneID,GSM6745599,GSM6745600,GSM6745601,GSM6745602,GSM6745603,GSM6745604,GSM6745605,GSM6745606,GSM6745607,...,Symbol,Description,Synonyms,GeneType,Status,ChrAcc,ChrStart,ChrStop,Orientation,Length
0,113219467,-0.158632,1.379276,5.355088,2.002351,0.679553,-0.213027,0.0095,-0.393521,-0.27484,...,MIR12136,microRNA 12136,,ncRNA,active,NC_000001.11,632615,632685,negative,71.0
1,57801,1.070933,0.767235,3.457126,0.897391,1.070933,1.548171,0.810621,-0.274013,0.289997,...,HES4,hes family bHLH transcription factor 4,bHLHb42,protein-coding,active,NC_000001.11,998962,1001052,negative,1920.0
2,9636,1.15215,0.761592,2.062135,0.336385,0.673147,0.828458,-0.304008,-0.72496,-0.605817,...,ISG15,ISG15 ubiquitin like modifier,G1P2|IFI15|IMD38|IP17|UCRP|hUCRP,protein-coding,active,NC_000001.11,1013497,1014540,positive,637.0
3,100288175,0.499194,-0.765431,1.706335,-0.075635,0.61416,-0.018153,1.01654,-0.650465,0.384228,...,LOC100288175,uncharacterized LOC100288175,,ncRNA,active,NC_000001.11,1059734,1066453,positive,974.0
4,102465434,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,...,MIR6726,microRNA 6726,hsa-mir-6726,ncRNA,active,NC_000001.11,1296110,1296170,negative,61.0
5,109623456,-0.697982,-0.697982,-0.697982,0.697982,0.697982,-0.697982,-0.697982,-0.697982,-0.697982,...,SNORD167,"small nucleolar RNA, C/D box 167",,snoRNA,active,NC_000001.11,1304729,1304812,negative,84.0
6,8510,0.160668,-1.27589,2.435216,0.160668,0.579663,0.938803,2.016221,-0.797037,0.45995,...,MMP23B,matrix metallopeptidase 23B,MIFR|MIFR-1|MMP22|MMP23A,protein-coding,active,NC_000001.11,1631681,1635638,positive,2764.0
7,107984872,-0.602311,-0.602311,-0.602311,-0.602311,0.136006,0.136006,2.350955,0.136006,2.350955,...,LOC107984872,uncharacterized LOC107984872,,ncRNA,active,NC_000001.11,1955857,1958986,positive,2799.0
8,112268220,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,...,LOC112268220,uncharacterized LOC112268220,,ncRNA,active,NC_000001.11,2763419,2767518,positive,996.0
9,284661,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,...,LINC01777,long intergenic non-protein coding RNA 1777,,ncRNA,active,NC_000001.11,4412051,4424684,positive,2074.0


In [75]:
remaining_genes = expression_data.columns.tolist()  # Start with all genes
expression_data_reduced = expression_data.copy()  # Copy the data to modify iteratively
final_results = {}  # To store relevant genes for each PC

for pc in range(1, n_components_75 + 1):
    # Fit PCA on the current reduced dataset
    pca_temp = PCA()
    pca_temp.fit(expression_data_reduced)
    n_components_temp = min(expression_data_reduced.shape[1], len(pca_temp.explained_variance_ratio_))
    
    # Stop if we run out of components to analyze
    if pc > n_components_temp:
        break
    
    # Update the components dataframe for the reduced dataset
    components_df = pd.DataFrame(
        pca_temp.components_,
        columns=expression_data_reduced.columns,  # Use current reduced dataset columns
        index=[f"PC{i+1}" for i in range(n_components_temp)]
    )

    # Get relevant genes for the current PC
    relevant_genes_df = get_relevant_genes_with_contribution(pc)
    relevant_genes = relevant_genes_df.index.tolist()

    # Store the results for this PC
    final_results[f"PC{pc}"] = relevant_genes

    # Drop the relevant genes for the current PC from the dataset
    expression_data_reduced = expression_data_reduced.drop(columns=relevant_genes, errors='ignore')
    remaining_genes = expression_data_reduced.columns.tolist()  # Update remaining genes list

    # Stop if no more genes remain
    if expression_data_reduced.empty:
        break

# Final output
for pc, genes in final_results.items():
    print(f"\nRelevant genes for {pc}: {len(genes)} genes")


Relevant genes for PC1: 22 genes

Relevant genes for PC2: 12 genes

Relevant genes for PC3: 1 genes


In [73]:
components_df

GeneID,113219467,57801,9636,100288175,102465434,109623456,8510,107984872,112268220,284661,...,105376805,26870,29943,105376809,127707,51154,644068,30814,5320,26279
PC1,-0.034397,0.138742,-0.264705,0.025062,0.003796,-0.042319,-0.077518,0.124788,-0.074872,0.023672,...,-0.053555,0.089438,0.151901,-0.061128,0.118198,0.054578,0.187465,0.057647,0.036838,0.159415
PC2,-0.065782,-0.107846,-0.08009,-0.174026,-0.18005,-0.124949,-0.098411,-0.100524,-0.049378,-0.142806,...,-0.006621,-0.091396,-0.087515,-0.023356,0.519959,0.264382,0.34525,0.126041,0.186602,0.477684
PC3,0.037899,0.017363,0.086825,0.03109,0.041938,0.04717,0.046466,0.00973,0.031738,0.031281,...,-0.015995,-0.02356,-0.018417,-0.024199,-0.43082,-0.09442,-0.33266,0.015141,0.013264,0.816551
PC4,-0.030007,-0.028179,-0.073161,-0.053167,-0.073154,-0.067269,-0.05743,-0.028866,-0.020425,-0.029928,...,0.022935,0.013403,0.000789,0.031849,-0.631451,-0.002861,0.748156,0.006193,-0.013862,0.007476
PC5,-0.138684,-0.053239,-0.511575,-0.173467,-0.214538,-0.231943,-0.338405,-0.05471,-0.116702,-0.044834,...,0.109737,0.174987,0.091139,0.182855,-0.040388,-0.033443,-0.257487,-0.096714,-0.153287,0.059819
PC6,-0.025573,-0.034352,-0.274494,-0.090151,0.097454,0.098579,0.135362,-0.00056,-0.024801,-0.0605,...,0.000535,-0.032655,0.031182,-0.009278,-0.291278,0.742604,-0.240532,0.146862,0.330262,-0.163039
PC7,-0.031499,0.025579,0.587604,0.200171,-0.061667,-0.273889,-0.593371,-0.033737,-0.045998,-0.105278,...,-0.010584,0.048545,0.00384,0.045439,-0.086615,0.323205,-0.075913,0.037164,0.026532,-0.043338
PC8,-0.008307,-0.014243,-0.140606,-0.13596,0.91075,-0.15235,-0.279757,-0.018774,-0.011693,-0.011438,...,-0.009607,-0.039062,-0.069271,-0.023084,0.04494,-0.077271,0.059787,0.014235,0.000922,0.034141
PC9,0.012006,0.000682,-0.184752,0.236725,-0.033619,0.798175,-0.466766,-0.04636,0.031566,-0.092616,...,-0.046003,0.006851,-0.101739,-0.00855,0.021914,-0.033322,0.043493,0.007542,0.015724,0.0155
PC10,0.011087,-0.01558,-0.305315,0.850141,0.039766,-0.306108,0.135987,-0.012669,0.031745,-0.140848,...,-0.035263,-0.058462,-0.101468,-0.007919,0.022277,0.02432,0.02211,-0.012337,-0.003627,0.030614


In [None]:
expression_cols = merged_data.columns[1:39]
expression_data = merged_data[expression_cols].apply(pd.to_numeric, errors='coerce')

pca_target_variance = 0.75
pca_full = PCA()
expression_data_filled = expression_data.apply(lambda row: row.bfill().ffill(), axis=1)
nan_counts = expression_data.isna().sum().sum()
nan_counts

pca_full.fit(expression_data)

# Determine number of components needed to explain 75% variance
cumulative_variance = pca_full.explained_variance_ratio_.cumsum()
n_components_75 = np.argmax(cumulative_variance >= pca_target_variance) + 1

# Apply PCA with the determined number of components
pca_final = PCA(n_components=n_components_75)
pca_result_final = pca_final.fit_transform(expression_data)

# Retrieve top contributing genes for each principal component
components_df = pd.DataFrame(
    pca_final.components_,
    columns=merged_data['GeneID'][:expression_data.shape[1]],
    index=[f"PC{i+1}" for i in range(n_components_75)]
)

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Step 1: Extract relevant genes for each PC and combine them into a set
relevant_genes_set = set()
for pc in range(1, n_components_75 + 1):
    relevant_genes = d[pc].index.tolist()  # Get the list of relevant genes for each PC
    relevant_genes_set.update(relevant_genes)  # Add the genes to the set

# Step 2: Filter the expression data to retain only the relevant genes
filtered_expression_data = expression_data[relevant_genes_set]

# Step 3: Prepare the labels (assuming you have a target variable like 'label_column')
# Replace 'label_column' with your actual target column
X_train, X_test, y_train, y_test = train_test_split(filtered_expression_data, merged_data['label_column'], test_size=0.2, random_state=42)

# Step 4: Build and train a classifier (e.g., Random Forest)
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

# Step 5: Evaluate the classifier
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the classifier: {accuracy:.4f}")

KeyError: "None of [Int64Index([    10630, 113219467, 102465434,     26871, 109623456,      1187,\n                 9636, 105376805,     26279, 105376809,      6060, 100288175,\n               155184,      1969, 102724659,     55092, 105376691, 112268220,\n                63036,      8510,      5320,     57801, 106614088,     26829,\n                51154,    127707,     30814,     54751, 105376737, 105376739,\n               644068, 107984872,     26869,     26870,      7799,    284661,\n                27129,     29943],\n           dtype='int64')] are in the [columns]"