In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np

df = pd.read_csv('../../data/GSE218462_raw_counts_GRCh38.p13_NCBI.tsv', sep='\t')
df = df.T
df.columns = df.iloc[0]
df = df[1:]

scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)
scaled_df = pd.DataFrame(scaled_data)

scaled_df.columns = df.columns
scaled_df.index = df.index
unedited = ['GSM6745632', 'GSM6745633', 'GSM6745634', 'GSM6745635', 'GSM6745636', 'GSM6745637']
scaled_df['Edited (1) or Unedited (0)'] = scaled_df.index.map(lambda gene: 0 if gene in unedited else 1)
mechanisms = {
    "BE4": ["GSM6745599", "GSM6745600", "GSM6745601", "GSM6745611", "GSM6745612", "GSM6745613"],
    "ABE8": ["GSM6745602", "GSM6745603", "GSM6745604", "GSM6745614", "GSM6745615", "GSM6745616"],
    "Cas9": ["GSM6745605", "GSM6745606", "GSM6745607", "GSM6745617", "GSM6745618", "GSM6745619"],
    "Utelectro": ["GSM6745608", "GSM6745609", "GSM6745610", "GSM6745620", "GSM6745621", "GSM6745622"],
    "dCas9": ["GSM6745623", "GSM6745624", "GSM6745625"],
    "BE4alone": ["GSM6745626", "GSM6745627", "GSM6745628"],
    "ABE8alone": ["GSM6745629", "GSM6745630", "GSM6745631"],
    "UT": ["GSM6745632", "GSM6745633", "GSM6745634", "GSM6745635", "GSM6745636", "GSM6745637"]
}

# Inverting the dictionary to map gene code to its corresponding key
mechanism_map = {gene: mechanism for mechanism, genes in mechanisms.items() for gene in genes}

scaled_df['editing mechanism'] = scaled_df.index.map(mechanism_map)

# print(scaled_df)

txt_file_path = '../EDA_sj/relevant_genes_1.6_250.txt'
with open(txt_file_path, 'r') as file:
    txt_data = file.read()

soham_gene_ids = txt_data.split(',')

column_names = scaled_df.columns.tolist()

column_names = [str(col) for col in scaled_df.columns.tolist()]
filtered_column_names = [col for col in scaled_df.columns if str(col) in soham_gene_ids]
scaled_df = scaled_df[filtered_column_names + ['Edited (1) or Unedited (0)', 'editing mechanism']]

# Output the filtered DataFrame
print(scaled_df)

GeneID      113219467     57801      9636  100288175  102465434  109623456  \
GSM6745599  -0.158632  1.070933  1.152150   0.499194  -0.342997  -0.697982   
GSM6745600   1.379276  0.767235  0.761592  -0.765431  -0.342997  -0.697982   
GSM6745601   5.355088  3.457126  2.062135   1.706335  -0.342997  -0.697982   
GSM6745602   2.002351  0.897391  0.336385  -0.075635  -0.342997   0.697982   
GSM6745603   0.679553  1.070933  0.673147   0.614160  -0.342997   0.697982   
GSM6745604  -0.213027  1.548171  0.828458  -0.018153  -0.342997  -0.697982   
GSM6745605   0.009500  0.810621 -0.304008   1.016540  -0.342997  -0.697982   
GSM6745606  -0.393521 -0.274013 -0.724960  -0.650465  -0.342997  -0.697982   
GSM6745607  -0.274840  0.289997 -0.605817   0.384228  -0.342997  -0.697982   
GSM6745609  -0.334181  1.070933 -0.549893   1.591370  -0.342997  -0.697982   
GSM6745610  -0.339126  2.329107 -0.600042   0.211779  -0.342997   0.697982   
GSM6745611  -0.299565 -0.577710  0.256450  -0.305567  -0.342997 

In [2]:
tsv_file_path = '../../data/Human.GRCh38.p13.annot.tsv'
tsv_df = pd.read_csv(tsv_file_path, sep='\t')

tsv_df

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,GeneID,Symbol,Description,Synonyms,GeneType,EnsemblGeneID,Status,ChrAcc,ChrStart,ChrStop,Orientation,Length,GOFunctionID,GOProcessID,GOComponentID,GOFunction,GOProcess,GOComponent
0,100287102,DDX11L1,DEAD/H-box helicase 11 like 1 (pseudogene),,pseudo,ENSG00000290825,active,NC_000001.11,11874,14409,positive,1652,,,,,,
1,653635,WASH7P,"WASP family homolog 7, pseudogene",FAM39F|WASH5P,pseudo,,active,NC_000001.11,14362,29370,negative,1769,,,,,,
2,102466751,MIR6859-1,microRNA 6859-1,hsa-mir-6859-1,ncRNA,ENSG00000278267,active,NC_000001.11,17369,17436,negative,68,,,,,,
3,107985730,MIR1302-2HG,MIR1302-2 host gene,,ncRNA,,active,NC_000001.11,29926,31295,positive,538,,,,,,
4,100302278,MIR1302-2,microRNA 1302-2,MIRN1302-2|hsa-mir-1302-2,ncRNA,ENSG00000284332,active,NC_000001.11,30366,30503,positive,138,,GO:0035195,,,miRNA-mediated gene silencing,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39371,4541,ND6,NADH dehydrogenase subunit 6,MTND6,protein-coding,,active,NC_012920.1,14149,14673,negative,525,GO:0008137,GO:0006120///GO:0009060///GO:0032981///GO:0035...,GO:0005739///GO:0005743///GO:0005747,NADH dehydrogenase (ubiquinone) activity,"mitochondrial electron transport, NADH to ubiq...",mitochondrion///mitochondrial inner membrane//...
39372,4556,TRNE,tRNA-Glu,MTTE,tRNA,,active,NC_012920.1,14674,14742,negative,69,,,,,,
39373,4519,CYTB,cytochrome b,MTCYB,protein-coding,,active,NC_012920.1,14747,15887,positive,1141,GO:0008121///GO:0046872,GO:0006122///GO:0045333///GO:1902600,GO:0005739///GO:0005743///GO:0005750///GO:0016020,ubiquinol-cytochrome-c reductase activity///me...,"mitochondrial electron transport, ubiquinol to...",mitochondrion///mitochondrial inner membrane//...
39374,4576,TRNT,tRNA-Thr,MTTT,tRNA,,active,NC_012920.1,15888,15953,positive,66,,,,,,


In [3]:
metadata_file_path = "../../data/Human.GRCh38.p13.annot.tsv"
metadata = pd.read_csv(metadata_file_path, sep='\t')

transposed_data = scaled_df.T
transposed_data = transposed_data.reset_index()
transposed_data.columns.values[0] = 'GeneID'


merged_data = transposed_data.merge(metadata, on='GeneID', how='left').set_index('GeneID')
merged_data = merged_data.iloc[:39378]

print("Merged Data (first few rows):")

# Optionally, save to a new file
output_file_path = "merged_gene_expression_with_metadata.csv"
merged_data.to_csv(output_file_path)
print(merged_data.columns)
# print(f"Merged data saved to {output_file_path}")

  exec(code_obj, self.user_global_ns, self.user_ns)


Merged Data (first few rows):
Index(['GSM6745599', 'GSM6745600', 'GSM6745601', 'GSM6745602', 'GSM6745603',
       'GSM6745604', 'GSM6745605', 'GSM6745606', 'GSM6745607', 'GSM6745609',
       'GSM6745610', 'GSM6745611', 'GSM6745612', 'GSM6745613', 'GSM6745614',
       'GSM6745615', 'GSM6745616', 'GSM6745617', 'GSM6745618', 'GSM6745619',
       'GSM6745620', 'GSM6745621', 'GSM6745622', 'GSM6745623', 'GSM6745624',
       'GSM6745625', 'GSM6745626', 'GSM6745627', 'GSM6745628', 'GSM6745629',
       'GSM6745630', 'GSM6745631', 'GSM6745632', 'GSM6745633', 'GSM6745634',
       'GSM6745635', 'GSM6745636', 'GSM6745637', 'Symbol', 'Description',
       'Synonyms', 'GeneType', 'EnsemblGeneID', 'Status', 'ChrAcc', 'ChrStart',
       'ChrStop', 'Orientation', 'Length', 'GOFunctionID', 'GOProcessID',
       'GOComponentID', 'GOFunction', 'GOProcess', 'GOComponent'],
      dtype='object')


In [4]:
merged_data = merged_data.iloc[:, :-6]
merged_data = merged_data.drop(columns=['EnsemblGeneID'])
merged_data
output_file_path = "testing.csv"
merged_data.to_csv(output_file_path)

In [5]:
merged_data_nulls = merged_data['Description'].isnull().sum()
merged_data_nulls

145

In [6]:
d = merged_data.copy()

In [7]:
# Delete any genes with missing descriptions
merged_data = merged_data.dropna(subset=['Description'])
print(f"Number of rows: {merged_data.shape[0]}, Number of columns: {merged_data.shape[1]}")
merged_data.reset_index(inplace=True) # Apparently, I had GeneID as the index, so I reset it to a column cuz it's easier to work with
print(merged_data)

Number of rows: 4656, Number of columns: 48
         GeneID GSM6745599 GSM6745600 GSM6745601 GSM6745602 GSM6745603  \
0     113219467  -0.158632   1.379276   5.355088   2.002351   0.679553   
1         57801   1.070933   0.767235   3.457126   0.897391   1.070933   
2          9636    1.15215   0.761592   2.062135   0.336385   0.673147   
3     100288175   0.499194  -0.765431   1.706335  -0.075635    0.61416   
4     102465434  -0.342997  -0.342997  -0.342997  -0.342997  -0.342997   
...         ...        ...        ...        ...        ...        ...   
4651  105377243  -0.222988  -0.222988  -0.222988  -0.222988  -0.222988   
4652  107987359  -0.164399  -0.164399  -0.164399  -0.164399  -0.164399   
4653       4572  -0.135784   0.567824   3.460437    1.42779   2.131398   
4654       4512  -0.237304   1.302546   5.055768   2.327118   0.856884   
4655       4574  -0.162382   1.353526    5.39109   1.941426   0.658735   

     GSM6745604 GSM6745605 GSM6745606 GSM6745607  ...        Symbol

In [8]:
merged_data

Unnamed: 0,GeneID,GSM6745599,GSM6745600,GSM6745601,GSM6745602,GSM6745603,GSM6745604,GSM6745605,GSM6745606,GSM6745607,...,Symbol,Description,Synonyms,GeneType,Status,ChrAcc,ChrStart,ChrStop,Orientation,Length
0,113219467,-0.158632,1.379276,5.355088,2.002351,0.679553,-0.213027,0.0095,-0.393521,-0.27484,...,MIR12136,microRNA 12136,,ncRNA,active,NC_000001.11,632615,632685,negative,71.0
1,57801,1.070933,0.767235,3.457126,0.897391,1.070933,1.548171,0.810621,-0.274013,0.289997,...,HES4,hes family bHLH transcription factor 4,bHLHb42,protein-coding,active,NC_000001.11,998962,1001052,negative,1920.0
2,9636,1.15215,0.761592,2.062135,0.336385,0.673147,0.828458,-0.304008,-0.72496,-0.605817,...,ISG15,ISG15 ubiquitin like modifier,G1P2|IFI15|IMD38|IP17|UCRP|hUCRP,protein-coding,active,NC_000001.11,1013497,1014540,positive,637.0
3,100288175,0.499194,-0.765431,1.706335,-0.075635,0.61416,-0.018153,1.01654,-0.650465,0.384228,...,LOC100288175,uncharacterized LOC100288175,,ncRNA,active,NC_000001.11,1059734,1066453,positive,974.0
4,102465434,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,...,MIR6726,microRNA 6726,hsa-mir-6726,ncRNA,active,NC_000001.11,1296110,1296170,negative,61.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4651,105377243,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,...,REREP2Y,arginine-glutamic acid dipeptide repeats pseud...,,pseudo,active,NC_000024.10,26196209,26235374,negative,2725.0
4652,107987359,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,...,LOC107987359,uncharacterized LOC107987359,,ncRNA,active,NT_167208.1,35630,43681,negative,745.0
4653,4572,-0.135784,0.567824,3.460437,1.42779,2.131398,1.975041,1.349611,0.098752,0.724182,...,TRNQ,tRNA-Gln,MTTQ,tRNA,active,NC_012920.1,4329,4400,negative,72.0
4654,4512,-0.237304,1.302546,5.055768,2.327118,0.856884,-0.052044,-0.139682,-0.558824,-0.258147,...,COX1,cytochrome c oxidase subunit I,COI|MTCO1,protein-coding,active,NC_012920.1,5904,7445,positive,1542.0


In [9]:
expression_data_T = merged_data.T
# Step 1: Reset the column names using the first row
expression_data_T.columns = expression_data_T.iloc[0]
expression_data_T = expression_data_T.iloc[1:].reset_index(drop=True)
expression_data_T = expression_data_T.apply(pd.to_numeric, errors='ignore')
expression_data_T = expression_data_T[:38]
expression_data_T

GeneID,113219467,57801,9636,100288175,102465434,109623456,8510,107984872,112268220,284661,...,378949,378950,105377240,84559,401634,105377243,107987359,4572,4512,4574
0,-0.158632,1.070933,1.15215,0.499194,-0.342997,-0.697982,0.160668,-0.602311,-0.164399,-0.164399,...,-0.274721,-0.235702,-0.222988,-0.164399,-0.164399,-0.222988,-0.164399,-0.135784,-0.237304,-0.162382
1,1.379276,0.767235,0.761592,-0.765431,-0.342997,-0.697982,-1.27589,-0.602311,-0.164399,-0.164399,...,-0.274721,-0.235702,-0.222988,-0.164399,-0.164399,-0.222988,-0.164399,0.567824,1.302546,1.353526
2,5.355088,3.457126,2.062135,1.706335,-0.342997,-0.697982,2.435216,-0.602311,-0.164399,-0.164399,...,2.33513,-0.235702,-0.222988,-0.164399,-0.164399,-0.222988,-0.164399,3.460437,5.055768,5.39109
3,2.002351,0.897391,0.336385,-0.075635,-0.342997,0.697982,0.160668,-0.602311,-0.164399,-0.164399,...,-0.274721,-0.235702,-0.222988,-0.164399,-0.164399,-0.222988,-0.164399,1.42779,2.327118,1.941426
4,0.679553,1.070933,0.673147,0.61416,-0.342997,0.697982,0.579663,0.136006,-0.164399,-0.164399,...,-0.274721,-0.235702,-0.222988,-0.164399,-0.164399,-0.222988,-0.164399,2.131398,0.856884,0.658735
5,-0.213027,1.548171,0.828458,-0.018153,-0.342997,-0.697982,0.938803,0.136006,-0.164399,-0.164399,...,-0.274721,-0.235702,-0.222988,-0.164399,-0.164399,-0.222988,-0.164399,1.975041,-0.052044,-0.20611
6,0.0095,0.810621,-0.304008,1.01654,-0.342997,-0.697982,2.016221,2.350955,-0.164399,-0.164399,...,-0.274721,-0.235702,-0.222988,-0.164399,-0.164399,-0.222988,-0.164399,1.349611,-0.139682,-0.016622
7,-0.393521,-0.274013,-0.72496,-0.650465,-0.342997,-0.697982,-0.797037,0.136006,-0.164399,-0.164399,...,-0.274721,-0.235702,-0.222988,-0.164399,-0.164399,-0.222988,-0.164399,0.098752,-0.558824,-0.381023
8,-0.27484,0.289997,-0.605817,0.384228,-0.342997,-0.697982,0.45995,2.350955,-0.164399,-0.164399,...,-0.274721,-0.235702,-0.222988,-0.164399,-0.164399,-0.222988,-0.164399,0.724182,-0.258147,-0.269273
9,-0.334181,1.070933,-0.549893,1.59137,-0.342997,-0.697982,0.63952,-0.602311,-0.164399,-0.164399,...,-0.274721,-0.235702,-0.222988,-0.164399,-0.164399,-0.222988,-0.164399,-0.604856,-0.34289,-0.322719


In [10]:
expression_cols = merged_data.columns[:39]
#expression_cols
expression_data = merged_data[expression_cols].apply(pd.to_numeric, errors='coerce')
expression_data

Unnamed: 0,GeneID,GSM6745599,GSM6745600,GSM6745601,GSM6745602,GSM6745603,GSM6745604,GSM6745605,GSM6745606,GSM6745607,...,GSM6745628,GSM6745629,GSM6745630,GSM6745631,GSM6745632,GSM6745633,GSM6745634,GSM6745635,GSM6745636,GSM6745637
0,113219467,-0.158632,1.379276,5.355088,2.002351,0.679553,-0.213027,0.009500,-0.393521,-0.274840,...,-0.264950,-0.324291,-0.388576,-0.200665,-0.195719,-0.190774,-0.346543,-0.358906,-0.361378,-0.344071
1,57801,1.070933,0.767235,3.457126,0.897391,1.070933,1.548171,0.810621,-0.274013,0.289997,...,-0.707866,-0.404169,-0.577710,-0.490939,-0.794637,-0.751251,-1.011563,-0.751251,-0.534325,-0.274013
2,9636,1.152150,0.761592,2.062135,0.336385,0.673147,0.828458,-0.304008,-0.724960,-0.605817,...,0.269823,-0.448682,-0.379384,-0.138667,-0.593051,-0.522234,-0.619798,-0.663869,-0.503086,-0.636818
3,100288175,0.499194,-0.765431,1.706335,-0.075635,0.614160,-0.018153,1.016540,-0.650465,0.384228,...,-0.018153,-0.592982,-0.248084,-0.133118,-1.512709,-1.340260,-1.570192,-1.225294,-1.110328,-1.167811
4,102465434,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,...,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,2.915476,2.915476,2.915476
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4651,105377243,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,...,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988
4652,107987359,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,...,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,6.082763
4653,4572,-0.135784,0.567824,3.460437,1.427790,2.131398,1.975041,1.349611,0.098752,0.724182,...,-0.526678,-0.917571,-0.995750,0.020573,-0.057605,-0.448499,-0.839392,-0.683035,-0.448499,-0.604856
4654,4512,-0.237304,1.302546,5.055768,2.327118,0.856884,-0.052044,-0.139682,-0.558824,-0.258147,...,-0.286306,-0.433196,-0.606873,-0.096739,-0.143264,0.174674,-0.536076,-0.518510,-0.269883,-0.352416


In [11]:
expression_cols = merged_data.columns[1:39]
expression_data = merged_data[expression_cols].apply(pd.to_numeric, errors='coerce')
expression_data

Unnamed: 0,GSM6745599,GSM6745600,GSM6745601,GSM6745602,GSM6745603,GSM6745604,GSM6745605,GSM6745606,GSM6745607,GSM6745609,...,GSM6745628,GSM6745629,GSM6745630,GSM6745631,GSM6745632,GSM6745633,GSM6745634,GSM6745635,GSM6745636,GSM6745637
0,-0.158632,1.379276,5.355088,2.002351,0.679553,-0.213027,0.009500,-0.393521,-0.274840,-0.334181,...,-0.264950,-0.324291,-0.388576,-0.200665,-0.195719,-0.190774,-0.346543,-0.358906,-0.361378,-0.344071
1,1.070933,0.767235,3.457126,0.897391,1.070933,1.548171,0.810621,-0.274013,0.289997,1.070933,...,-0.707866,-0.404169,-0.577710,-0.490939,-0.794637,-0.751251,-1.011563,-0.751251,-0.534325,-0.274013
2,1.152150,0.761592,2.062135,0.336385,0.673147,0.828458,-0.304008,-0.724960,-0.605817,-0.549893,...,0.269823,-0.448682,-0.379384,-0.138667,-0.593051,-0.522234,-0.619798,-0.663869,-0.503086,-0.636818
3,0.499194,-0.765431,1.706335,-0.075635,0.614160,-0.018153,1.016540,-0.650465,0.384228,1.591370,...,-0.018153,-0.592982,-0.248084,-0.133118,-1.512709,-1.340260,-1.570192,-1.225294,-1.110328,-1.167811
4,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,...,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,2.915476,2.915476,2.915476
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4651,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,...,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988
4652,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,...,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,6.082763
4653,-0.135784,0.567824,3.460437,1.427790,2.131398,1.975041,1.349611,0.098752,0.724182,-0.604856,...,-0.526678,-0.917571,-0.995750,0.020573,-0.057605,-0.448499,-0.839392,-0.683035,-0.448499,-0.604856
4654,-0.237304,1.302546,5.055768,2.327118,0.856884,-0.052044,-0.139682,-0.558824,-0.258147,-0.342890,...,-0.286306,-0.433196,-0.606873,-0.096739,-0.143264,0.174674,-0.536076,-0.518510,-0.269883,-0.352416


In [12]:
pca_target_variance = 0.75
pca_full = PCA()
expression_data_filled = expression_data.apply(lambda row: row.bfill().ffill(), axis=1)
nan_counts = expression_data.isna().sum().sum()
nan_counts

pca_full.fit(expression_data)

# Determine number of components needed to explain 75% variance
cumulative_variance = pca_full.explained_variance_ratio_.cumsum()
n_components_75 = np.argmax(cumulative_variance >= pca_target_variance) + 1

# Apply PCA with the determined number of components
pca_final = PCA(n_components=n_components_75)
pca_result_final = pca_final.fit_transform(expression_data)

In [13]:
# Retrieve top contributing genes for each principal component
components_df = pd.DataFrame(
    pca_final.components_,
    columns=merged_data['GeneID'][:expression_data.shape[1]],
    index=[f"PC{i+1}" for i in range(n_components_75)]
)

In [14]:
pca_scores = expression_data.values.dot(components_df.values.T)
pca_scores_df = pd.DataFrame(
    pca_scores,
    index=expression_data.index,  # Samples from the original data
    columns=components_df.index   # Principal Components
)

pca_scores_df

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15
0,-1.610801,-1.124601,0.564224,-0.734727,-3.618517,-1.642160,3.554775,-0.319615,-0.576515,0.317907,-0.604048,-0.587636,0.252575,-0.006866,-0.019470
1,-2.911410,-1.735713,1.317743,-0.940426,-3.834176,-0.904999,0.622104,0.069737,0.498821,-0.576462,-0.288624,1.177010,0.433593,-0.064411,0.567190
2,-4.055493,-0.467172,0.203044,-0.044119,-0.943726,-0.452800,0.634827,0.404158,0.436351,-0.495258,-0.407477,-0.215540,2.820361,-0.206225,0.456676
3,-4.606621,-2.334358,0.495360,-0.121762,-0.091382,-0.929582,-0.551514,0.220000,-0.915020,-0.358099,-0.298688,1.323217,-0.154980,-0.150400,-0.670888
4,-0.867140,2.928064,2.638784,0.253915,0.111597,0.830826,0.030582,0.395209,0.473722,0.530572,2.942740,-0.820274,-1.966113,-0.055807,-1.096644
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4651,-2.966374,0.410305,-0.243670,0.490874,1.812046,-0.508316,0.086657,0.370534,0.569015,0.873461,-0.147608,-0.147457,-3.060324,-0.003758,-2.012397
4652,0.995893,2.984163,5.101125,0.046704,0.373640,-1.017776,-0.270476,0.213154,0.096213,0.190008,-0.669202,0.162325,0.022308,-0.026206,-0.021736
4653,-2.687955,-1.520733,0.628312,-1.352734,-4.187140,-0.689558,0.615231,0.815830,0.977837,0.344002,-1.137483,-0.685559,-0.764796,0.648457,0.047816
4654,-2.110323,-0.949373,0.537358,-0.856578,-3.395222,-1.289364,3.614308,-0.106106,-0.169933,0.747963,-0.688980,-0.455598,-0.002413,0.120606,-0.049604


In [15]:
identifiers = d.tail(2)
identifiers

Unnamed: 0_level_0,GSM6745599,GSM6745600,GSM6745601,GSM6745602,GSM6745603,GSM6745604,GSM6745605,GSM6745606,GSM6745607,GSM6745609,...,Symbol,Description,Synonyms,GeneType,Status,ChrAcc,ChrStart,ChrStop,Orientation,Length
GeneID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Edited (1) or Unedited (0),1,1,1,1,1,1,1,1,1,1,...,,,,,,,,,,
editing mechanism,BE4,BE4,BE4,ABE8,ABE8,ABE8,Cas9,Cas9,Cas9,Utelectro,...,,,,,,,,,,


In [16]:
y = identifiers.head(1).iloc[:, :38]
y

Unnamed: 0_level_0,GSM6745599,GSM6745600,GSM6745601,GSM6745602,GSM6745603,GSM6745604,GSM6745605,GSM6745606,GSM6745607,GSM6745609,...,GSM6745628,GSM6745629,GSM6745630,GSM6745631,GSM6745632,GSM6745633,GSM6745634,GSM6745635,GSM6745636,GSM6745637
GeneID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Edited (1) or Unedited (0),1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,0,0,0,0,0,0


In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Prepare the data
# Assuming components_df is a pandas DataFrame (15x38)
X = components_df.T.values  # Transpose to get samples as rows (38x15)
y = identifiers.head(1).iloc[:, :38]
y = np.array(y).flatten()   # Ensure y is a 1D array (38,)
y = np.array(y).astype(int)  # Ensure y is integer type

print(f"X shape: {X.shape}, y shape: {y.shape}")

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a classifier (Random Forest as an example)
clf = RandomForestClassifier(class_weight = 'balanced', random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the classifier
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

X shape: (38, 15), y shape: (38,)
Accuracy: 0.75
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.75      1.00      0.86         6

    accuracy                           0.75         8
   macro avg       0.38      0.50      0.43         8
weighted avg       0.56      0.75      0.64         8



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Prepare the data
X = components_df.T.values

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create a pipeline with preprocessing, SMOTE, and classifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(k_neighbors = 3, random_state=42)), #SMOTE: balances dataset by generating synthetic samples for unedited class
    ('classifier', RandomForestClassifier(random_state=42))
])

# Define parameter grid for GridSearchCV
param_grid = {
    'classifier__n_estimators': [50, 100, 200], # testing across increments
    'classifier__max_depth': [None, 5, 10], # none for full depth (capture all patterns in data), #5 and 10: avoid overfitting
    'classifier__min_samples_split': [2, 5], # 2: max flexibility in splitting nodes, 5: more samples for a split -> reduce overfitting
    'classifier__min_samples_leaf': [1, 2], # single sample (more depth) or 2 samples (reduce overfitting)
    'classifier__class_weight': ['balanced', 'balanced_subsample'] # balanced: adjusts class weights based on sample freqs; subsample: recalculates weights 
                                                                    # per bootstrapped sample
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5, # 5-fold cross validation, balance b/w bias and variance 
    scoring='f1_macro', # equal considerations for both classes, averages the F1 scores of each class
    n_jobs=-1,
    verbose=1
)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Make predictions with the best model
y_pred = grid_search.predict(X_test)

# Print detailed evaluation
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Get feature importances
feature_importances = pd.DataFrame({
    'feature': range(X.shape[1]),
    'importance': grid_search.best_estimator_.named_steps['classifier'].feature_importances_
})
feature_importances = feature_importances.sort_values('importance', ascending=False)

print("\nTop 5 Most Important Features:")
print(feature_importances.head())

# Perform cross-validation to get a more robust estimate of model performance
cv_scores = cross_val_score(grid_search.best_estimator_, X, y, cv=5, scoring='f1_macro')
print("\nCross-validation scores:", cv_scores)
print("Average CV score:", cv_scores.mean())
print("CV score std:", cv_scores.std())

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best parameters: {'classifier__class_weight': 'balanced', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}
Best cross-validation score: 0.890909090909091

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.88      1.00      0.93         7

    accuracy                           0.88         8
   macro avg       0.44      0.50      0.47         8
weighted avg       0.77      0.88      0.82         8


Top 5 Most Important Features:
   feature  importance
1        1    0.310651
9        9    0.117182
7        7    0.100431
2        2    0.093922
5        5    0.064974


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Cross-validation scores: [1.         0.46666667 0.79487179 1.         1.        ]
Average CV score: 0.8523076923076923
CV score std: 0.2085458876248424


Performs best when:
- balanced: model adjusts weights inversely proportional to class frequencies, helping address class imbalance.
- max_depth=None: capture all patterns in data
- min_samples_leaf=1 and min_samples_split=2: full depth, max flexibility in splitting nodes
- small number of trees sufficient for strong performance
The cross-validation score of 0.89 indicates the model performed well during training across folds, particularly in capturing the balance between precision and recall.

Precision, Recall, F1-Score (Support = 8 test samples):

- Class 0: Performance for this class is poor (F1=0.00), likely due to having only 1 sample in the test set. This is too small to generalize reliably.
- Class 1: The model handled this majority class well, achieving perfect recall (100%) and strong precision (88%), leading to a high F1-score of 0.93.

Macro-average is low because it treats all classes equally, including underperforming Class 0.
Weighted average accounts for class imbalance, resulting in a higher score (0.82) due to the better performance for the dominant Class 1.

high variability in performance across folds; decent model performance overall; high variability in accuracy b/w classes