In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np

df = pd.read_csv('../../data/GSE218462_raw_counts_GRCh38.p13_NCBI.tsv', sep='\t')
df = df.T
df.columns = df.iloc[0]
df = df[1:]

scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)
scaled_df = pd.DataFrame(scaled_data)

scaled_df.columns = df.columns
scaled_df.index = df.index
unedited = ['GSM6745632', 'GSM6745633', 'GSM6745634', 'GSM6745635', 'GSM6745636', 'GSM6745637']
scaled_df['Edited (1) or Unedited (0)'] = scaled_df.index.map(lambda gene: 0 if gene in unedited else 1)
mechanisms = {
    "BE4": ["GSM6745599", "GSM6745600", "GSM6745601", "GSM6745611", "GSM6745612", "GSM6745613"],
    "ABE8": ["GSM6745602", "GSM6745603", "GSM6745604", "GSM6745614", "GSM6745615", "GSM6745616"],
    "Cas9": ["GSM6745605", "GSM6745606", "GSM6745607", "GSM6745617", "GSM6745618", "GSM6745619"],
    "Utelectro": ["GSM6745608", "GSM6745609", "GSM6745610", "GSM6745620", "GSM6745621", "GSM6745622"],
    "dCas9": ["GSM6745623", "GSM6745624", "GSM6745625"],
    "BE4alone": ["GSM6745626", "GSM6745627", "GSM6745628"],
    "ABE8alone": ["GSM6745629", "GSM6745630", "GSM6745631"],
    "UT": ["GSM6745632", "GSM6745633", "GSM6745634", "GSM6745635", "GSM6745636", "GSM6745637"]
}

# Inverting the dictionary to map gene code to its corresponding key
mechanism_map = {gene: mechanism for mechanism, genes in mechanisms.items() for gene in genes}

scaled_df['editing mechanism'] = scaled_df.index.map(mechanism_map)

# print(scaled_df)

txt_file_path = '../EDA_sj/relevant_genes_1.6_250.txt'
with open(txt_file_path, 'r') as file:
    txt_data = file.read()

soham_gene_ids = txt_data.split(',')

column_names = scaled_df.columns.tolist()

column_names = [str(col) for col in scaled_df.columns.tolist()]
filtered_column_names = [col for col in scaled_df.columns if str(col) in soham_gene_ids]
scaled_df = scaled_df[filtered_column_names + ['Edited (1) or Unedited (0)', 'editing mechanism']]

# Output the filtered DataFrame
#print(scaled_df)

In [2]:
#scaled_df

In [3]:
tsv_file_path = '../../data/Human.GRCh38.p13.annot.tsv'
tsv_df = pd.read_csv(tsv_file_path, sep='\t')

#tsv_df

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
metadata_file_path = "../../data/Human.GRCh38.p13.annot.tsv"
metadata = pd.read_csv(metadata_file_path, sep='\t')

transposed_data = scaled_df.T
transposed_data = transposed_data.reset_index()
transposed_data.columns.values[0] = 'GeneID'


merged_data = transposed_data.merge(metadata, on='GeneID', how='left').set_index('GeneID')
merged_data = merged_data.iloc[:39378]

print("Merged Data (first few rows):")

# Optionally, save to a new file
output_file_path = "merged_gene_expression_with_metadata.csv"
merged_data.to_csv(output_file_path)
print(merged_data.columns)
# print(f"Merged data saved to {output_file_path}")

Merged Data (first few rows):
Index(['GSM6745599', 'GSM6745600', 'GSM6745601', 'GSM6745602', 'GSM6745603',
       'GSM6745604', 'GSM6745605', 'GSM6745606', 'GSM6745607', 'GSM6745609',
       'GSM6745610', 'GSM6745611', 'GSM6745612', 'GSM6745613', 'GSM6745614',
       'GSM6745615', 'GSM6745616', 'GSM6745617', 'GSM6745618', 'GSM6745619',
       'GSM6745620', 'GSM6745621', 'GSM6745622', 'GSM6745623', 'GSM6745624',
       'GSM6745625', 'GSM6745626', 'GSM6745627', 'GSM6745628', 'GSM6745629',
       'GSM6745630', 'GSM6745631', 'GSM6745632', 'GSM6745633', 'GSM6745634',
       'GSM6745635', 'GSM6745636', 'GSM6745637', 'Symbol', 'Description',
       'Synonyms', 'GeneType', 'EnsemblGeneID', 'Status', 'ChrAcc', 'ChrStart',
       'ChrStop', 'Orientation', 'Length', 'GOFunctionID', 'GOProcessID',
       'GOComponentID', 'GOFunction', 'GOProcess', 'GOComponent'],
      dtype='object')


In [5]:
merged_data = merged_data.iloc[:, :-6]
merged_data = merged_data.drop(columns=['EnsemblGeneID'])
merged_data
output_file_path = "testing.csv"
merged_data.to_csv(output_file_path)

In [6]:
merged_data_nulls = merged_data['Description'].isnull().sum()
merged_data_nulls

145

In [7]:
df_copy = merged_data.copy()

In [8]:
#df_copy

In [9]:
# Delete any genes with missing descriptions
merged_data = merged_data.dropna(subset=['Description'])
print(f"Number of rows: {merged_data.shape[0]}, Number of columns: {merged_data.shape[1]}")
merged_data.reset_index(inplace=True) # Apparently, I had GeneID as the index, so I reset it to a column cuz it's easier to work with
#print(merged_data)

Number of rows: 4656, Number of columns: 48


In [10]:
#merged_data

In [11]:
expression_data_T = merged_data.T
# Step 1: Reset the column names using the first row
expression_data_T.columns = expression_data_T.iloc[0]
expression_data_T = expression_data_T.iloc[1:].reset_index(drop=True)
expression_data_T = expression_data_T.apply(pd.to_numeric, errors='ignore')
expression_data_T = expression_data_T[:38]
#expression_data_T

In [12]:
expression_cols = merged_data.columns[:39]
#expression_cols
expression_data = merged_data[expression_cols].apply(pd.to_numeric, errors='coerce')
#expression_data

In [13]:
gene_ids = expression_data[['GeneID']]
len(gene_ids)

4656

In [14]:
expression_cols = merged_data.columns[1:39]
#expression_cols
expression_data = merged_data[expression_cols].apply(pd.to_numeric, errors='coerce')
len(expression_data)

4656

In [15]:
pca_target_variance = 0.75
pca_full = PCA()
expression_data_filled = expression_data.apply(lambda row: row.bfill().ffill(), axis=1)
nan_counts = expression_data.isna().sum().sum()
nan_counts

pca_full.fit(expression_data)

# Determine number of components needed to explain 75% variance
cumulative_variance = pca_full.explained_variance_ratio_.cumsum()
n_components_75 = np.argmax(cumulative_variance >= pca_target_variance) + 1

# Apply PCA with the determined number of components
pca_final = PCA(n_components=n_components_75)
pca_result_final = pca_final.fit_transform(expression_data)

In [16]:
#expression_data_filled

In [17]:
# Retrieve top contributing genes for each principal component
components_df = pd.DataFrame(
    pca_final.components_,
    columns=merged_data['GeneID'][:expression_data.shape[1]],
    index=[f"PC{i+1}" for i in range(n_components_75)]
)

In [18]:
components_df2 = pd.DataFrame(
    data=pca_result_final,
    columns=[f'PC{i+1}' for i in range(n_components_75)],
    index=expression_data.index  # Retain the same indices as the original data
)

In [19]:
components_df2

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15
0,0.707622,-1.863062,0.411647,-0.767112,-3.223188,-1.476874,3.626903,-0.475525,-0.727924,0.144347,-0.411010,-0.607253,0.336086,-0.000246,-0.059222
1,-0.592987,-2.473887,1.165153,-0.973288,-3.439239,-0.745662,0.684155,-0.083073,0.345405,-0.742536,-0.100778,1.158453,0.485874,0.000662,0.610355
2,-1.737069,-1.205255,0.050398,-0.077252,-0.548921,-0.295293,0.702532,0.247976,0.270947,-0.688003,-0.197509,-0.260012,2.835824,-0.187446,0.502222
3,-2.288197,-3.072191,0.342819,-0.154070,0.302149,-0.767665,-0.482830,0.057858,-1.078175,-0.547660,-0.092566,1.272827,-0.094509,-0.194262,-0.667901
4,1.451283,2.189618,2.486210,0.221545,0.506669,0.996042,0.102117,0.232781,0.315427,0.332157,3.151352,-0.847174,-1.882244,-0.113130,-1.230138
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4651,-0.647951,-0.328400,-0.395984,0.459038,2.207655,-0.335294,0.164346,0.200308,0.419512,0.680058,0.050033,-0.189420,-2.987320,-0.078517,-2.197930
4652,3.314316,2.245816,4.948624,0.014318,0.768616,-0.855745,-0.202788,0.054677,-0.057553,0.013640,-0.477323,0.137395,0.091651,-0.008613,-0.046732
4653,-0.369532,-2.259361,0.475628,-1.385063,-3.791884,-0.527537,0.681076,0.661834,0.829407,0.179851,-0.948452,-0.684930,-0.653579,0.669854,0.006907
4654,0.208099,-1.687812,0.384780,-0.889051,-2.999897,-1.125283,3.684968,-0.260356,-0.321270,0.574399,-0.495559,-0.476817,0.076075,0.125734,-0.073056


In [20]:
components_df_merged = gene_ids.merge(components_df2, left_index=True, right_index=True, how='left')

In [21]:
components_df_merged

Unnamed: 0,GeneID,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15
0,113219467,0.707622,-1.863062,0.411647,-0.767112,-3.223188,-1.476874,3.626903,-0.475525,-0.727924,0.144347,-0.411010,-0.607253,0.336086,-0.000246,-0.059222
1,57801,-0.592987,-2.473887,1.165153,-0.973288,-3.439239,-0.745662,0.684155,-0.083073,0.345405,-0.742536,-0.100778,1.158453,0.485874,0.000662,0.610355
2,9636,-1.737069,-1.205255,0.050398,-0.077252,-0.548921,-0.295293,0.702532,0.247976,0.270947,-0.688003,-0.197509,-0.260012,2.835824,-0.187446,0.502222
3,100288175,-2.288197,-3.072191,0.342819,-0.154070,0.302149,-0.767665,-0.482830,0.057858,-1.078175,-0.547660,-0.092566,1.272827,-0.094509,-0.194262,-0.667901
4,102465434,1.451283,2.189618,2.486210,0.221545,0.506669,0.996042,0.102117,0.232781,0.315427,0.332157,3.151352,-0.847174,-1.882244,-0.113130,-1.230138
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4651,105377243,-0.647951,-0.328400,-0.395984,0.459038,2.207655,-0.335294,0.164346,0.200308,0.419512,0.680058,0.050033,-0.189420,-2.987320,-0.078517,-2.197930
4652,107987359,3.314316,2.245816,4.948624,0.014318,0.768616,-0.855745,-0.202788,0.054677,-0.057553,0.013640,-0.477323,0.137395,0.091651,-0.008613,-0.046732
4653,4572,-0.369532,-2.259361,0.475628,-1.385063,-3.791884,-0.527537,0.681076,0.661834,0.829407,0.179851,-0.948452,-0.684930,-0.653579,0.669854,0.006907
4654,4512,0.208099,-1.687812,0.384780,-0.889051,-2.999897,-1.125283,3.684968,-0.260356,-0.321270,0.574399,-0.495559,-0.476817,0.076075,0.125734,-0.073056


In [22]:
def get_relevant_genes_with_contribution_2(pc, cumulative_threshold=0.9):
    """
    Get relevant genes based on their contribution to the specified principal component.

    Args:
    pc (int): The principal component index (1-based, e.g., 1 for PC1).
    cumulative_threshold (float): The cumulative contribution threshold (default=0.9).

    Returns:
    pd.DataFrame: A DataFrame containing relevant genes, their loadings, and contribution percentages.
    """
    # Ensure the principal component exists in the DataFrame
    pc_column = f'PC{pc}'
    if pc_column not in components_df_merged.columns:
        raise ValueError(f"Principal Component {pc_column} not found in components_df_merged.")

    # Extract the absolute loadings for the specified principal component
    loadings = components_df_merged[[pc_column, 'GeneID']].copy()
    loadings[pc_column] = loadings[pc_column].abs()
    loadings = loadings.sort_values(by=pc_column, ascending=False)

    # Calculate cumulative variance contribution
    loadings['CumulativeVariance'] = loadings[pc_column].cumsum() / loadings[pc_column].sum()

    # Filter genes based on the cumulative threshold
    relevant_genes = loadings[loadings['CumulativeVariance'] <= cumulative_threshold]

    # Calculate the contribution percentage for each gene
    relevant_genes['Contribution (%)'] = (relevant_genes[pc_column] / loadings[pc_column].sum()) * 100

    # Return only relevant columns
    return relevant_genes[['GeneID', pc_column, 'Contribution (%)']]

In [23]:
d = {}
for pc in range(1, n_components_75 + 1):
    print(f"\nRelevant genes with contributions for PC{pc}:")
    rel_genes = get_relevant_genes_with_contribution_2(pc, 0.75)
    print(len(rel_genes))
    #print(rel_genes)
    #print(rel_genes['GeneID'].tolist())
    d[pc] = rel_genes['GeneID'].tolist()


Relevant genes with contributions for PC1:
2730

Relevant genes with contributions for PC2:
2352

Relevant genes with contributions for PC3:
989

Relevant genes with contributions for PC4:
929

Relevant genes with contributions for PC5:
2135

Relevant genes with contributions for PC6:
1708

Relevant genes with contributions for PC7:
1620

Relevant genes with contributions for PC8:
1260

Relevant genes with contributions for PC9:
1348

Relevant genes with contributions for PC10:
1250

Relevant genes with contributions for PC11:
1643

Relevant genes with contributions for PC12:
1812

Relevant genes with contributions for PC13:
1677

Relevant genes with contributions for PC14:
1191

Relevant genes with contributions for PC15:
1221


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_genes['Contribution (%)'] = (relevant_genes[pc_column] / loadings[pc_column].sum()) * 100


In [24]:
d

{1: [105378480,
  100507521,
  102465509,
  105379157,
  112267868,
  105377466,
  9201,
  105373968,
  105374264,
  344148,
  105374353,
  283214,
  105373426,
  105372639,
  105379640,
  107984411,
  9721,
  107985969,
  170062,
  101928167,
  107987338,
  2830,
  105369905,
  102724834,
  105375587,
  107985926,
  127707,
  105377509,
  406967,
  4067,
  339,
  27164,
  4354,
  105374817,
  51393,
  29015,
  105379011,
  6503,
  9948,
  2182,
  6555,
  947,
  11040,
  9844,
  389332,
  105371346,
  3507,
  8645,
  6016,
  100128682,
  101929777,
  3937,
  64844,
  4478,
  6774,
  50810,
  60313,
  8780,
  50809,
  22894,
  10678,
  90411,
  6483,
  5567,
  10137,
  80829,
  8439,
  54902,
  202018,
  3688,
  55041,
  7071,
  57862,
  5292,
  109,
  2589,
  6641,
  8749,
  27246,
  9874,
  25793,
  79720,
  1315,
  30815,
  26994,
  3059,
  10735,
  219771,
  54808,
  84146,
  8805,
  3098,
  100526693,
  4892,
  10623,
  53339,
  10096,
  9517,
  51719,
  100861514,
  121256,
  372,

In [25]:
unique_ids = set()

In [26]:
for v in d.values():
    for geneid in v: 
        unique_ids.add(geneid)

In [27]:
len(unique_ids)

4647

In [28]:
def get_top_k_genes(pc, k):
    """
    Get the top k genes based on their contribution to the specified principal component.

    Args:
    pc (int): The principal component index (1-based, e.g., 1 for PC1).
    k (int): The number of top genes to retrieve.

    Returns:
    pd.DataFrame: A DataFrame containing the top k genes, their loadings, and contribution percentages.
    """
    # Ensure the principal component exists in the DataFrame
    pc_column = f'PC{pc}'
    if pc_column not in components_df_merged.columns:
        raise ValueError(f"Principal Component {pc_column} not found in components_df_merged.")

    # Extract the absolute loadings for the specified principal component
    loadings = components_df_merged[[pc_column, 'GeneID']].copy()
    loadings[pc_column] = loadings[pc_column].abs()
    loadings = loadings.sort_values(by=pc_column, ascending=False)

    # Select the top k genes
    top_k_genes = loadings.head(k)

    # Calculate the contribution percentage for each gene
    total_loading = loadings[pc_column].sum()
    top_k_genes['Contribution (%)'] = (top_k_genes[pc_column] / total_loading) * 100

    # Return only relevant columns
    return top_k_genes[['GeneID', pc_column, 'Contribution (%)']]

## Top 300 genes for each PC

In [29]:
d_top_k = {}
for pc in range(1, n_components_75 + 1):
    print(f"\nRelevant genes with contributions for PC{pc}:")
    rel_genes_top = get_top_k_genes(pc, 300)
    #print(len(rel_genes_top))
    print(rel_genes_top)
    #print(rel_genes['GeneID'].tolist())
    d_top_k[pc] = rel_genes_top['GeneID'].tolist()


Relevant genes with contributions for PC1:
         GeneID       PC1  Contribution (%)
2541  105378480  4.336285          0.037115
2679  100507521  4.329503          0.037057
2192  102465509  4.227476          0.036183
1447  105379157  4.170397          0.035695
3236  112267868  4.109478          0.035173
...         ...       ...               ...
2484        310  3.548714          0.030374
929       55279  3.548711          0.030374
1503      10318  3.548558          0.030372
1452     133619  3.548280          0.030370
3846       8493  3.547527          0.030364

[300 rows x 3 columns]

Relevant genes with contributions for PC2:
         GeneID       PC2  Contribution (%)
3221       1053  4.856493          0.078830
2834       9573  4.820778          0.078251
2672        932  4.793611          0.077810
3181     643332  4.530921          0.073546
3975       5055  4.402059          0.071454
...         ...       ...               ...
783        1593  2.568346          0.041689
1617    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_k_genes['Contribution (%)'] = (top_k_genes[pc_column] / total_loading) * 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_k_genes['Contribution (%)'] = (top_k_genes[pc_column] / total_loading) * 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_k_genes['Contribution (%)'] = (top_k_ge

In [30]:
unique_ids_top_k = set()
for v in d_top_k.values():
    for geneid in v: 
        unique_ids_top_k.add(geneid)
len(unique_ids_top_k)

2864

In [31]:
filtered_components = components_df_merged[components_df_merged["GeneID"].isin(unique_ids_top_k)]
filtered_components

Unnamed: 0,GeneID,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15
0,113219467,0.707622,-1.863062,0.411647,-0.767112,-3.223188,-1.476874,3.626903,-0.475525,-0.727924,0.144347,-0.411010,-0.607253,0.336086,-0.000246,-0.059222
1,57801,-0.592987,-2.473887,1.165153,-0.973288,-3.439239,-0.745662,0.684155,-0.083073,0.345405,-0.742536,-0.100778,1.158453,0.485874,0.000662,0.610355
2,9636,-1.737069,-1.205255,0.050398,-0.077252,-0.548921,-0.295293,0.702532,0.247976,0.270947,-0.688003,-0.197509,-0.260012,2.835824,-0.187446,0.502222
3,100288175,-2.288197,-3.072191,0.342819,-0.154070,0.302149,-0.767665,-0.482830,0.057858,-1.078175,-0.547660,-0.092566,1.272827,-0.094509,-0.194262,-0.667901
4,102465434,1.451283,2.189618,2.486210,0.221545,0.506669,0.996042,0.102117,0.232781,0.315427,0.332157,3.151352,-0.847174,-1.882244,-0.113130,-1.230138
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4651,105377243,-0.647951,-0.328400,-0.395984,0.459038,2.207655,-0.335294,0.164346,0.200308,0.419512,0.680058,0.050033,-0.189420,-2.987320,-0.078517,-2.197930
4652,107987359,3.314316,2.245816,4.948624,0.014318,0.768616,-0.855745,-0.202788,0.054677,-0.057553,0.013640,-0.477323,0.137395,0.091651,-0.008613,-0.046732
4653,4572,-0.369532,-2.259361,0.475628,-1.385063,-3.791884,-0.527537,0.681076,0.661834,0.829407,0.179851,-0.948452,-0.684930,-0.653579,0.669854,0.006907
4654,4512,0.208099,-1.687812,0.384780,-0.889051,-2.999897,-1.125283,3.684968,-0.260356,-0.321270,0.574399,-0.495559,-0.476817,0.076075,0.125734,-0.073056


In [32]:
scaled_df["Edited (1) or Unedited (0)"].values

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0])

In [33]:
filtered_merged_data = merged_data[merged_data["GeneID"].isin(unique_ids_top_k)] 

In [34]:
filtered_merged_data

Unnamed: 0,GeneID,GSM6745599,GSM6745600,GSM6745601,GSM6745602,GSM6745603,GSM6745604,GSM6745605,GSM6745606,GSM6745607,...,Symbol,Description,Synonyms,GeneType,Status,ChrAcc,ChrStart,ChrStop,Orientation,Length
0,113219467,-0.158632,1.379276,5.355088,2.002351,0.679553,-0.213027,0.0095,-0.393521,-0.27484,...,MIR12136,microRNA 12136,,ncRNA,active,NC_000001.11,632615,632685,negative,71.0
1,57801,1.070933,0.767235,3.457126,0.897391,1.070933,1.548171,0.810621,-0.274013,0.289997,...,HES4,hes family bHLH transcription factor 4,bHLHb42,protein-coding,active,NC_000001.11,998962,1001052,negative,1920.0
2,9636,1.15215,0.761592,2.062135,0.336385,0.673147,0.828458,-0.304008,-0.72496,-0.605817,...,ISG15,ISG15 ubiquitin like modifier,G1P2|IFI15|IMD38|IP17|UCRP|hUCRP,protein-coding,active,NC_000001.11,1013497,1014540,positive,637.0
3,100288175,0.499194,-0.765431,1.706335,-0.075635,0.61416,-0.018153,1.01654,-0.650465,0.384228,...,LOC100288175,uncharacterized LOC100288175,,ncRNA,active,NC_000001.11,1059734,1066453,positive,974.0
4,102465434,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,...,MIR6726,microRNA 6726,hsa-mir-6726,ncRNA,active,NC_000001.11,1296110,1296170,negative,61.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4651,105377243,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,...,REREP2Y,arginine-glutamic acid dipeptide repeats pseud...,,pseudo,active,NC_000024.10,26196209,26235374,negative,2725.0
4652,107987359,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,...,LOC107987359,uncharacterized LOC107987359,,ncRNA,active,NT_167208.1,35630,43681,negative,745.0
4653,4572,-0.135784,0.567824,3.460437,1.42779,2.131398,1.975041,1.349611,0.098752,0.724182,...,TRNQ,tRNA-Gln,MTTQ,tRNA,active,NC_012920.1,4329,4400,negative,72.0
4654,4512,-0.237304,1.302546,5.055768,2.327118,0.856884,-0.052044,-0.139682,-0.558824,-0.258147,...,COX1,cytochrome c oxidase subunit I,COI|MTCO1,protein-coding,active,NC_012920.1,5904,7445,positive,1542.0


In [35]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Prepare the data
# Assuming components_df is a pandas DataFrame (15x38)
#X = components_df.T.values  # Transpose to get samples as rows (38x15)
X = (filtered_merged_data.iloc[:, 1:39]).T.values
identifiers = scaled_df["Edited (1) or Unedited (0)"].values
y = np.array(identifiers).flatten()   # Ensure y is a 1D array (38,)
y = np.array(y).astype(int)  # Ensure y is integer type

print(f"X shape: {X.shape}, y shape: {y.shape}")

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a classifier (Random Forest as an example)
clf = RandomForestClassifier(class_weight = 'balanced', random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the classifier
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

X shape: (38, 2864), y shape: (38,)
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         6

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8



### with smote

In [36]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create a pipeline with preprocessing, SMOTE, and classifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(k_neighbors = 3, random_state=42)), #SMOTE: balances dataset by generating synthetic samples for unedited class
    ('classifier', RandomForestClassifier(random_state=42))
])

# Define parameter grid for GridSearchCV
param_grid = {
    'classifier__n_estimators': [50, 100, 200], # testing across increments
    'classifier__max_depth': [None, 5, 10], # none for full depth (capture all patterns in data), #5 and 10: avoid overfitting
    'classifier__min_samples_split': [2, 5], # 2: max flexibility in splitting nodes, 5: more samples for a split -> reduce overfitting
    'classifier__min_samples_leaf': [1, 2], # single sample (more depth) or 2 samples (reduce overfitting)
    'classifier__class_weight': ['balanced', 'balanced_subsample'] # balanced: adjusts class weights based on sample freqs; subsample: recalculates weights 
                                                                    # per bootstrapped sample
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5, # 5-fold cross validation, balance b/w bias and variance 
    scoring='f1_macro', # equal considerations for both classes, averages the F1 scores of each class
    n_jobs=-1,
    verbose=1
)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Make predictions with the best model
y_pred = grid_search.predict(X_test)

# Print detailed evaluation
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Get feature importances
feature_importances = pd.DataFrame({
    'feature': range(X.shape[1]),
    'importance': grid_search.best_estimator_.named_steps['classifier'].feature_importances_
})
feature_importances = feature_importances.sort_values('importance', ascending=False)

print("\nTop 5 Most Important Features:")
print(feature_importances.head())

# Perform cross-validation to get a more robust estimate of model performance
cv_scores = cross_val_score(grid_search.best_estimator_, X, y, cv=5, scoring='f1_macro')
print("\nCross-validation scores:", cv_scores)
print("Average CV score:", cv_scores.mean())
print("CV score std:", cv_scores.std())

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best parameters: {'classifier__class_weight': 'balanced', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 50}
Best cross-validation score: 0.9555555555555555

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         7

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8


Top 5 Most Important Features:
      feature  importance
771       771    0.040000
1757     1757    0.038457
2708     2708    0.036909
1678     1678    0.036904
208       208    0.035714

Cross-validation scores: [1. 1. 1. 1. 1.]
Average CV score: 1.0
CV score std: 0.0


### no smote

In [37]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create a pipeline with preprocessing, SMOTE, and classifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    #('smote', SMOTE(k_neighbors = 3, random_state=42)), #SMOTE: balances dataset by generating synthetic samples for unedited class
    ('classifier', RandomForestClassifier(random_state=42))
])

# Define parameter grid for GridSearchCV
param_grid = {
    'classifier__n_estimators': [50, 100, 200], # testing across increments
    'classifier__max_depth': [None, 5, 10], # none for full depth (capture all patterns in data), #5 and 10: avoid overfitting
    'classifier__min_samples_split': [2, 5], # 2: max flexibility in splitting nodes, 5: more samples for a split -> reduce overfitting
    'classifier__min_samples_leaf': [1, 2], # single sample (more depth) or 2 samples (reduce overfitting)
    'classifier__class_weight': ['balanced', 'balanced_subsample'] # balanced: adjusts class weights based on sample freqs; subsample: recalculates weights 
                                                                    # per bootstrapped sample
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5, # 5-fold cross validation, balance b/w bias and variance 
    scoring='f1_macro', # equal considerations for both classes, averages the F1 scores of each class
    n_jobs=-1,
    verbose=1
)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Make predictions with the best model
y_pred = grid_search.predict(X_test)

# Print detailed evaluation
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Get feature importances
feature_importances = pd.DataFrame({
    'feature': range(X.shape[1]),
    'importance': grid_search.best_estimator_.named_steps['classifier'].feature_importances_
})
feature_importances = feature_importances.sort_values('importance', ascending=False)

print("\nTop 5 Most Important Features:")
print(feature_importances.head())

# Perform cross-validation to get a more robust estimate of model performance
cv_scores = cross_val_score(grid_search.best_estimator_, X, y, cv=5, scoring='f1_macro')
print("\nCross-validation scores:", cv_scores)
print("Average CV score:", cv_scores.mean())
print("CV score std:", cv_scores.std())

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best parameters: {'classifier__class_weight': 'balanced', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 50}
Best cross-validation score: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         7

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8


Top 5 Most Important Features:
      feature  importance
2339     2339    0.041538
1788     1788    0.040000
2530     2530    0.037274
2379     2379    0.037061
126       126    0.037037

Cross-validation scores: [1.         0.46666667 1.         1.         1.        ]
Average CV score: 0.8933333333333333
CV score std: 0.21333333333333335


In [37]:
feature_importances

Unnamed: 0,feature,importance
2609,2609,0.073872
1154,1154,0.038501
1721,1721,0.035726
19,19,0.035690
166,166,0.034122
...,...,...
969,969,0.000000
970,970,0.000000
971,971,0.000000
972,972,0.000000


In [38]:
filtered_merged_data

Unnamed: 0,GeneID,GSM6745599,GSM6745600,GSM6745601,GSM6745602,GSM6745603,GSM6745604,GSM6745605,GSM6745606,GSM6745607,...,Symbol,Description,Synonyms,GeneType,Status,ChrAcc,ChrStart,ChrStop,Orientation,Length
0,113219467,-0.158632,1.379276,5.355088,2.002351,0.679553,-0.213027,0.0095,-0.393521,-0.27484,...,MIR12136,microRNA 12136,,ncRNA,active,NC_000001.11,632615,632685,negative,71.0
1,57801,1.070933,0.767235,3.457126,0.897391,1.070933,1.548171,0.810621,-0.274013,0.289997,...,HES4,hes family bHLH transcription factor 4,bHLHb42,protein-coding,active,NC_000001.11,998962,1001052,negative,1920.0
2,9636,1.15215,0.761592,2.062135,0.336385,0.673147,0.828458,-0.304008,-0.72496,-0.605817,...,ISG15,ISG15 ubiquitin like modifier,G1P2|IFI15|IMD38|IP17|UCRP|hUCRP,protein-coding,active,NC_000001.11,1013497,1014540,positive,637.0
3,100288175,0.499194,-0.765431,1.706335,-0.075635,0.61416,-0.018153,1.01654,-0.650465,0.384228,...,LOC100288175,uncharacterized LOC100288175,,ncRNA,active,NC_000001.11,1059734,1066453,positive,974.0
4,102465434,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,...,MIR6726,microRNA 6726,hsa-mir-6726,ncRNA,active,NC_000001.11,1296110,1296170,negative,61.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4651,105377243,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,...,REREP2Y,arginine-glutamic acid dipeptide repeats pseud...,,pseudo,active,NC_000024.10,26196209,26235374,negative,2725.0
4652,107987359,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,...,LOC107987359,uncharacterized LOC107987359,,ncRNA,active,NT_167208.1,35630,43681,negative,745.0
4653,4572,-0.135784,0.567824,3.460437,1.42779,2.131398,1.975041,1.349611,0.098752,0.724182,...,TRNQ,tRNA-Gln,MTTQ,tRNA,active,NC_012920.1,4329,4400,negative,72.0
4654,4512,-0.237304,1.302546,5.055768,2.327118,0.856884,-0.052044,-0.139682,-0.558824,-0.258147,...,COX1,cytochrome c oxidase subunit I,COI|MTCO1,protein-coding,active,NC_012920.1,5904,7445,positive,1542.0


In [39]:
genes_lst = []
for feature in feature_importances['feature'].tolist():
    genes_id = filtered_merged_data['GeneID'].tolist()[feature]
    genes_lst.append(genes_id)

In [40]:
feature_importances['GeneID'] = genes_lst

In [41]:
feature_importances

Unnamed: 0,feature,importance,GeneID
2609,2609,0.073872,26051
1154,1154,0.038501,1278
1721,1721,0.035726,486
19,19,0.035690,26871
166,166,0.034122,6279
...,...,...,...
969,969,0.000000,101928519
970,970,0.000000,1871
971,971,0.000000,105374973
972,972,0.000000,102724749


## Top 50 genes for each PC

In [42]:
d_top_k = {}
for pc in range(1, n_components_75 + 1):
    #print(f"\nRelevant genes with contributions for PC{pc}:")
    rel_genes_top = get_top_k_genes(pc, 50)
    #print(len(rel_genes_top))
    #print(rel_genes_top)
    #print(rel_genes['GeneID'].tolist())
    d_top_k[pc] = rel_genes_top['GeneID'].tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_k_genes['Contribution (%)'] = (top_k_genes[pc_column] / total_loading) * 100


In [43]:
unique_ids_top_k = set()
for v in d_top_k.values():
    for geneid in v: 
        unique_ids_top_k.add(geneid)
len(unique_ids_top_k)

746

In [44]:
filtered_components = components_df_merged[components_df_merged["GeneID"].isin(unique_ids_top_k)]
filtered_components
filtered_merged_data = merged_data[merged_data["GeneID"].isin(unique_ids_top_k)] 

In [45]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Prepare the data
# Assuming components_df is a pandas DataFrame (15x38)
#X = components_df.T.values  # Transpose to get samples as rows (38x15)
X = (filtered_merged_data.iloc[:, 1:39]).T.values
identifiers = scaled_df["Edited (1) or Unedited (0)"].values
y = np.array(identifiers).flatten()   # Ensure y is a 1D array (38,)
y = np.array(y).astype(int)  # Ensure y is integer type

print(f"X shape: {X.shape}, y shape: {y.shape}")

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a classifier (Random Forest as an example)
clf = RandomForestClassifier(class_weight = 'balanced', random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the classifier
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

X shape: (38, 746), y shape: (38,)
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         6

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8



In [47]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create a pipeline with preprocessing, SMOTE, and classifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(k_neighbors = 3, random_state=42)), #SMOTE: balances dataset by generating synthetic samples for unedited class
    ('classifier', RandomForestClassifier(random_state=42))
])

# Define parameter grid for GridSearchCV
param_grid = {
    'classifier__n_estimators': [50, 100, 200], # testing across increments
    'classifier__max_depth': [None, 5, 10], # none for full depth (capture all patterns in data), #5 and 10: avoid overfitting
    'classifier__min_samples_split': [2, 5], # 2: max flexibility in splitting nodes, 5: more samples for a split -> reduce overfitting
    'classifier__min_samples_leaf': [1, 2], # single sample (more depth) or 2 samples (reduce overfitting)
    'classifier__class_weight': ['balanced', 'balanced_subsample'] # balanced: adjusts class weights based on sample freqs; subsample: recalculates weights 
                                                                    # per bootstrapped sample
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5, # 5-fold cross validation, balance b/w bias and variance 
    scoring='f1_macro', # equal considerations for both classes, averages the F1 scores of each class
    n_jobs=-1,
    verbose=1
)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Make predictions with the best model
y_pred = grid_search.predict(X_test)

# Print detailed evaluation
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Get feature importances
feature_importances = pd.DataFrame({
    'feature': range(X.shape[1]),
    'importance': grid_search.best_estimator_.named_steps['classifier'].feature_importances_
})
feature_importances = feature_importances.sort_values('importance', ascending=False)

print("\nTop 5 Most Important Features:")
print(feature_importances.head())

# Perform cross-validation to get a more robust estimate of model performance
cv_scores = cross_val_score(grid_search.best_estimator_, X, y, cv=5, scoring='f1_macro')
print("\nCross-validation scores:", cv_scores)
print("Average CV score:", cv_scores.mean())
print("CV score std:", cv_scores.std())

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best parameters: {'classifier__class_weight': 'balanced', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 50}
Best cross-validation score: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         7

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8


Top 5 Most Important Features:
     feature  importance
252      252    0.101047
632      632    0.059946
119      119    0.053158
517      517    0.047143
525      525    0.040000

Cross-validation scores: [1. 1. 1. 1. 1.]
Average CV score: 1.0
CV score std: 0.0


In [46]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create a pipeline with preprocessing, SMOTE, and classifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    #('smote', SMOTE(k_neighbors = 3, random_state=42)), #SMOTE: balances dataset by generating synthetic samples for unedited class
    ('classifier', RandomForestClassifier(random_state=42))
])

# Define parameter grid for GridSearchCV
param_grid = {
    'classifier__n_estimators': [50, 100, 200], # testing across increments
    'classifier__max_depth': [None, 5, 10], # none for full depth (capture all patterns in data), #5 and 10: avoid overfitting
    'classifier__min_samples_split': [2, 5], # 2: max flexibility in splitting nodes, 5: more samples for a split -> reduce overfitting
    'classifier__min_samples_leaf': [1, 2], # single sample (more depth) or 2 samples (reduce overfitting)
    'classifier__class_weight': ['balanced', 'balanced_subsample'] # balanced: adjusts class weights based on sample freqs; subsample: recalculates weights 
                                                                    # per bootstrapped sample
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5, # 5-fold cross validation, balance b/w bias and variance 
    scoring='f1_macro', # equal considerations for both classes, averages the F1 scores of each class
    n_jobs=-1,
    verbose=1
)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Make predictions with the best model
y_pred = grid_search.predict(X_test)

# Print detailed evaluation
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Get feature importances
feature_importances = pd.DataFrame({
    'feature': range(X.shape[1]),
    'importance': grid_search.best_estimator_.named_steps['classifier'].feature_importances_
})
feature_importances = feature_importances.sort_values('importance', ascending=False)

print("\nTop 5 Most Important Features:")
print(feature_importances.head())

# Perform cross-validation to get a more robust estimate of model performance
cv_scores = cross_val_score(grid_search.best_estimator_, X, y, cv=5, scoring='f1_macro')
print("\nCross-validation scores:", cv_scores)
print("Average CV score:", cv_scores.mean())
print("CV score std:", cv_scores.std())

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best parameters: {'classifier__class_weight': 'balanced', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 50}
Best cross-validation score: 0.8909090909090909

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         7

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8


Top 5 Most Important Features:
     feature  importance
346      346    0.082684
297      297    0.069185
639      639    0.060000
119      119    0.042812
443      443    0.040000

Cross-validation scores: [1.         1.         1.         0.46153846 1.        ]
Average CV score: 0.8923076923076924
CV score std: 0.2153846153846154


In [53]:
#feature_importances

In [54]:
genes_lst = []
for feature in feature_importances['feature'].tolist():
    genes_id = filtered_merged_data['GeneID'].tolist()[feature]
    genes_lst.append(genes_id)
feature_importances['GeneID'] = genes_lst
feature_importances

Unnamed: 0,feature,importance,GeneID
113,113,0.064511,2487
650,650,0.061399,116844
621,621,0.058457,8288
465,465,0.056904,339
642,642,0.048142,5055
...,...,...,...
264,264,0.000000,3482
265,265,0.000000,102724087
266,266,0.000000,102724152
267,267,0.000000,107986672


## Top 30 Genes for each PC

In [55]:
d_top_k = {}
for pc in range(1, n_components_75 + 1):
    #print(f"\nRelevant genes with contributions for PC{pc}:")
    rel_genes_top = get_top_k_genes(pc, 30)
    #print(len(rel_genes_top))
    #print(rel_genes_top)
    #print(rel_genes['GeneID'].tolist())
    d_top_k[pc] = rel_genes_top['GeneID'].tolist()

unique_ids_top_k = set()
for v in d_top_k.values():
    for geneid in v: 
        unique_ids_top_k.add(geneid)
print(len(unique_ids_top_k))

filtered_components = components_df_merged[components_df_merged["GeneID"].isin(unique_ids_top_k)]
filtered_components
filtered_merged_data = merged_data[merged_data["GeneID"].isin(unique_ids_top_k)] 

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Prepare the data
# Assuming components_df is a pandas DataFrame (15x38)
#X = components_df.T.values  # Transpose to get samples as rows (38x15)
X = (filtered_merged_data.iloc[:, 1:39]).T.values
identifiers = scaled_df["Edited (1) or Unedited (0)"].values
y = np.array(identifiers).flatten()   # Ensure y is a 1D array (38,)
y = np.array(y).astype(int)  # Ensure y is integer type

print(f"X shape: {X.shape}, y shape: {y.shape}")

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a classifier (Random Forest as an example)
clf = RandomForestClassifier(class_weight = 'balanced', random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the classifier
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

446
X shape: (38, 446), y shape: (38,)
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         6

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_k_genes['Contribution (%)'] = (top_k_genes[pc_column] / total_loading) * 100


In [58]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create a pipeline with preprocessing, SMOTE, and classifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    #('smote', SMOTE(k_neighbors = 3, random_state=42)), #SMOTE: balances dataset by generating synthetic samples for unedited class
    ('classifier', RandomForestClassifier(random_state=42))
])

# Define parameter grid for GridSearchCV
param_grid = {
    'classifier__n_estimators': [50, 100, 200], # testing across increments
    'classifier__max_depth': [None, 5, 10], # none for full depth (capture all patterns in data), #5 and 10: avoid overfitting
    'classifier__min_samples_split': [2, 5], # 2: max flexibility in splitting nodes, 5: more samples for a split -> reduce overfitting
    'classifier__min_samples_leaf': [1, 2], # single sample (more depth) or 2 samples (reduce overfitting)
    'classifier__class_weight': ['balanced', 'balanced_subsample'] # balanced: adjusts class weights based on sample freqs; subsample: recalculates weights 
                                                                    # per bootstrapped sample
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5, # 5-fold cross validation, balance b/w bias and variance 
    scoring='f1_macro', # equal considerations for both classes, averages the F1 scores of each class
    n_jobs=-1,
    verbose=1
)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Make predictions with the best model
y_pred = grid_search.predict(X_test)

# Print detailed evaluation
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Get feature importances
feature_importances = pd.DataFrame({
    'feature': range(X.shape[1]),
    'importance': grid_search.best_estimator_.named_steps['classifier'].feature_importances_
})
feature_importances = feature_importances.sort_values('importance', ascending=False)

print("\nTop 5 Most Important Features:")
print(feature_importances.head())

# Perform cross-validation to get a more robust estimate of model performance
cv_scores = cross_val_score(grid_search.best_estimator_, X, y, cv=5, scoring='f1_macro')
print("\nCross-validation scores:", cv_scores)
print("Average CV score:", cv_scores.mean())
print("CV score std:", cv_scores.std())

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best parameters: {'classifier__class_weight': 'balanced', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 50}
Best cross-validation score: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         7

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8


Top 5 Most Important Features:
     feature  importance
366      366    0.116131
427      427    0.100000
267      267    0.085492
317      317    0.064554
318      318    0.060000

Cross-validation scores: [1.         1.         0.79487179 1.         1.        ]
Average CV score: 0.9589743589743589
CV score std: 0.08205128205128207


In [57]:
genes_lst = []
for feature in feature_importances['feature'].tolist():
    genes_id = filtered_merged_data['GeneID'].tolist()[feature]
    genes_lst.append(genes_id)
feature_importances['GeneID'] = genes_lst
feature_importances

Unnamed: 0,feature,importance,GeneID
427,427,0.085495,7441
99,99,0.060000,2994
206,206,0.060000,84985
385,385,0.058355,5055
316,316,0.044274,6035
...,...,...,...
155,155,0.000000,102724152
154,154,0.000000,3482
153,153,0.000000,59351
152,152,0.000000,105377986


## Ignore everything below this- all old work

In [66]:
pca_scores = expression_data.values.dot(components_df.values.T)
pca_scores_df = pd.DataFrame(
    pca_scores,
    index=expression_data.index,  # Samples from the original data
    columns=components_df.index   # Principal Components
)

pca_scores_df

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15
0,-1.610801,-1.124612,0.564211,-0.734624,-3.618639,-1.639153,3.557678,-0.314250,-0.575037,0.321524,-0.599190,-0.594837,0.252738,-0.006920,-0.027782
1,-2.911410,-1.735694,1.317712,-0.940545,-3.833913,-0.907048,0.618616,0.073178,0.499551,-0.564525,-0.304401,1.191602,0.432260,-0.018935,0.626995
2,-4.055493,-0.467221,0.203235,-0.044084,-0.943576,-0.453179,0.635606,0.404088,0.433866,-0.497378,-0.395191,-0.204357,2.804860,-0.194958,0.440974
3,-4.606622,-2.334408,0.495480,-0.121445,-0.091666,-0.927872,-0.546720,0.215315,-0.919379,-0.365640,-0.297957,1.318070,-0.134467,-0.261727,-0.739012
4,-0.867140,2.928021,2.638838,0.254078,0.111445,0.834559,0.033122,0.390462,0.468481,0.510157,2.959354,-0.835093,-1.972993,-0.108533,-1.165114
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4651,-2.966373,0.410262,-0.243542,0.490986,1.811874,-0.502386,0.091627,0.365972,0.566013,0.855165,-0.123449,-0.191500,-3.095292,-0.111582,-2.157631
4652,0.995893,2.984168,5.101118,0.046689,0.373676,-1.018386,-0.270901,0.213141,0.096785,0.192280,-0.668920,0.161180,0.024012,-0.039370,-0.017529
4653,-2.687955,-1.520702,0.628166,-1.352839,-4.186906,-0.691076,0.609522,0.819615,0.978664,0.356809,-1.142182,-0.686211,-0.759652,0.675618,0.040650
4654,-2.110323,-0.949376,0.537346,-0.856502,-3.395312,-1.287285,3.615589,-0.099258,-0.166977,0.751454,-0.685482,-0.461239,0.000411,0.109145,-0.049747


In [67]:
identifiers = d.tail(2)
identifiers

AttributeError: 'dict' object has no attribute 'tail'

In [None]:
y = identifiers.head(1).iloc[:, :38]
y

Unnamed: 0_level_0,GSM6745599,GSM6745600,GSM6745601,GSM6745602,GSM6745603,GSM6745604,GSM6745605,GSM6745606,GSM6745607,GSM6745609,...,GSM6745628,GSM6745629,GSM6745630,GSM6745631,GSM6745632,GSM6745633,GSM6745634,GSM6745635,GSM6745636,GSM6745637
GeneID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Edited (1) or Unedited (0),1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,0,0,0,0,0,0


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Prepare the data
# Assuming components_df is a pandas DataFrame (15x38)
X = components_df.T.values  # Transpose to get samples as rows (38x15)
y = identifiers.head(1).iloc[:, :38]
y = np.array(y).flatten()   # Ensure y is a 1D array (38,)
y = np.array(y).astype(int)  # Ensure y is integer type

print(f"X shape: {X.shape}, y shape: {y.shape}")

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a classifier (Random Forest as an example)
clf = RandomForestClassifier(class_weight = 'balanced', random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the classifier
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

X shape: (38, 15), y shape: (38,)
Accuracy: 0.75
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.75      1.00      0.86         6

    accuracy                           0.75         8
   macro avg       0.38      0.50      0.43         8
weighted avg       0.56      0.75      0.64         8



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Prepare the data
X = components_df.T.values

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create a pipeline with preprocessing, SMOTE, and classifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(k_neighbors = 3, random_state=42)), #SMOTE: balances dataset by generating synthetic samples for unedited class
    ('classifier', RandomForestClassifier(random_state=42))
])

# Define parameter grid for GridSearchCV
param_grid = {
    'classifier__n_estimators': [50, 100, 200], # testing across increments
    'classifier__max_depth': [None, 5, 10], # none for full depth (capture all patterns in data), #5 and 10: avoid overfitting
    'classifier__min_samples_split': [2, 5], # 2: max flexibility in splitting nodes, 5: more samples for a split -> reduce overfitting
    'classifier__min_samples_leaf': [1, 2], # single sample (more depth) or 2 samples (reduce overfitting)
    'classifier__class_weight': ['balanced', 'balanced_subsample'] # balanced: adjusts class weights based on sample freqs; subsample: recalculates weights 
                                                                    # per bootstrapped sample
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5, # 5-fold cross validation, balance b/w bias and variance 
    scoring='f1_macro', # equal considerations for both classes, averages the F1 scores of each class
    n_jobs=-1,
    verbose=1
)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Make predictions with the best model
y_pred = grid_search.predict(X_test)

# Print detailed evaluation
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Get feature importances
feature_importances = pd.DataFrame({
    'feature': range(X.shape[1]),
    'importance': grid_search.best_estimator_.named_steps['classifier'].feature_importances_
})
feature_importances = feature_importances.sort_values('importance', ascending=False)

print("\nTop 5 Most Important Features:")
print(feature_importances.head())

# Perform cross-validation to get a more robust estimate of model performance
cv_scores = cross_val_score(grid_search.best_estimator_, X, y, cv=5, scoring='f1_macro')
print("\nCross-validation scores:", cv_scores)
print("Average CV score:", cv_scores.mean())
print("CV score std:", cv_scores.std())

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best parameters: {'classifier__class_weight': 'balanced', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 50}
Best cross-validation score: 0.890909090909091

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.88      1.00      0.93         7

    accuracy                           0.88         8
   macro avg       0.44      0.50      0.47         8
weighted avg       0.77      0.88      0.82         8


Top 5 Most Important Features:
   feature  importance
1        1    0.258732
9        9    0.148743
2        2    0.123382
7        7    0.093493
5        5    0.083315

Cross-validation scores: [1.         0.46666667 0.79487179 0.46153846 1.        ]
Average CV score: 0.7446153846153847
CV score std: 0.240979810154374


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Print predictions for each sample
predictions = grid_search.predict(X)
for i, (true_label, predicted_label) in enumerate(zip(y, predictions)):
    print(f"Sample {i}: True label = {true_label}, Predicted label = {predicted_label}")

# Optional: Also show prediction probabilities
#probabilities = grid_search.predict_proba(X)
#for i, probs in enumerate(probabilities):
 #   print(f"Sample {i} probabilities: Class 0 = {probs[0]:.3f}, Class 1 = {probs[1]:.3f}")

Sample 0: True label = 1, Predicted label = 1
Sample 1: True label = 1, Predicted label = 1
Sample 2: True label = 1, Predicted label = 1
Sample 3: True label = 1, Predicted label = 1
Sample 4: True label = 1, Predicted label = 1
Sample 5: True label = 1, Predicted label = 1
Sample 6: True label = 1, Predicted label = 1
Sample 7: True label = 1, Predicted label = 1
Sample 8: True label = 1, Predicted label = 1
Sample 9: True label = 1, Predicted label = 1
Sample 10: True label = 1, Predicted label = 1
Sample 11: True label = 1, Predicted label = 1
Sample 12: True label = 1, Predicted label = 1
Sample 13: True label = 1, Predicted label = 1
Sample 14: True label = 1, Predicted label = 1
Sample 15: True label = 1, Predicted label = 1
Sample 16: True label = 1, Predicted label = 1
Sample 17: True label = 1, Predicted label = 1
Sample 18: True label = 1, Predicted label = 1
Sample 19: True label = 1, Predicted label = 1
Sample 20: True label = 1, Predicted label = 1
Sample 21: True label =

In [None]:
# Assuming y and predictions are lists or numpy arrays
for i, (true_label, predicted_label) in enumerate(zip(y[-6:], predictions[-6:]), start=len(y)-6):
    print(f"Sample {i}: True label = {true_label}, Predicted label = {predicted_label}")

Sample 32: True label = 0, Predicted label = 0
Sample 33: True label = 0, Predicted label = 0
Sample 34: True label = 0, Predicted label = 0
Sample 35: True label = 0, Predicted label = 1
Sample 36: True label = 0, Predicted label = 0
Sample 37: True label = 0, Predicted label = 0


Performs best when:
- balanced: model adjusts weights inversely proportional to class frequencies, helping address class imbalance.
- max_depth=None: capture all patterns in data
- min_samples_leaf=1 and min_samples_split=2: full depth, max flexibility in splitting nodes
- small number of trees sufficient for strong performance
The cross-validation score of 0.89 indicates the model performed well during training across folds, particularly in capturing the balance between precision and recall.

Precision, Recall, F1-Score (Support = 8 test samples):

- Class 0: Performance for this class is poor (F1=0.00), likely due to having only 1 sample in the test set. This is too small to generalize reliably.
- Class 1: The model handled this majority class well, achieving perfect recall (100%) and strong precision (88%), leading to a high F1-score of 0.93.

Macro-average is low because it treats all classes equally, including underperforming Class 0.
Weighted average accounts for class imbalance, resulting in a higher score (0.82) due to the better performance for the dominant Class 1.

high variability in performance across folds; decent model performance overall; high variability in accuracy b/w classes

In [None]:
components_df

GeneID,113219467,57801,9636,100288175,102465434,109623456,8510,107984872,112268220,284661,...,105376805,26870,29943,105376809,127707,51154,644068,30814,5320,26279
PC1,-0.034397,0.138742,-0.264705,0.025062,0.003796,-0.042319,-0.077518,0.124788,-0.074872,0.023672,...,-0.053555,0.089438,0.151901,-0.061128,0.118198,0.054578,0.187465,0.057647,0.036838,0.159415
PC2,-0.065775,-0.107842,-0.080089,-0.174025,-0.180049,-0.124947,-0.098408,-0.100521,-0.049392,-0.142804,...,-0.006629,-0.091391,-0.087517,-0.023358,0.51996,0.264383,0.34525,0.126037,0.186603,0.477684
PC3,0.037886,0.017371,0.086822,0.031088,0.041937,0.047169,0.046456,0.009776,0.031783,0.031275,...,-0.01596,-0.023535,-0.018428,-0.024184,-0.430821,-0.09442,-0.332662,0.015163,0.013255,0.81655
PC4,-0.029987,-0.028162,-0.073164,-0.053165,-0.073151,-0.06726,-0.057442,-0.028813,-0.020357,-0.029909,...,0.022848,0.013459,0.000791,0.031855,-0.631449,-0.002856,0.748156,0.006269,-0.013871,0.007472
PC5,-0.138815,-0.053242,-0.51158,-0.173473,-0.214538,-0.231945,-0.33843,-0.054539,-0.116546,-0.044876,...,0.109898,0.17506,0.091093,0.182922,-0.040388,-0.033445,-0.257495,-0.096685,-0.153315,0.059818
PC6,-0.026412,-0.034524,-0.274417,-0.090281,0.097494,0.098483,0.135104,-0.000188,-0.024059,-0.060805,...,0.000358,-0.032685,0.031164,-0.009056,-0.291311,0.742572,-0.240574,0.147101,0.33016,-0.163049
PC7,-0.032348,0.025621,0.587717,0.200223,-0.061321,-0.273622,-0.593387,-0.033052,-0.046025,-0.105156,...,-0.010043,0.049153,0.003711,0.045504,-0.08649,0.323171,-0.075797,0.036878,0.026461,-0.043237
PC8,-0.007754,-0.014081,-0.1408,-0.135983,0.910701,-0.152411,-0.279748,-0.018734,-0.011237,-0.011385,...,-0.00929,-0.039075,-0.069418,-0.023081,0.044965,-0.077482,0.059792,0.014656,0.000821,0.034117
PC9,0.012098,0.000246,-0.184938,0.236798,-0.033599,0.798121,-0.466779,-0.0466,0.032275,-0.092523,...,-0.046319,0.006612,-0.101537,-0.008624,0.021914,-0.033455,0.043499,0.007977,0.015745,0.015482
PC10,0.010012,-0.015331,-0.305256,0.850177,0.039789,-0.306029,0.136489,-0.013013,0.029662,-0.14067,...,-0.035947,-0.058759,-0.101025,-0.008113,0.022276,0.024629,0.02218,-0.013967,-0.003291,0.030713


In [None]:
merged_data

Unnamed: 0,GeneID,GSM6745599,GSM6745600,GSM6745601,GSM6745602,GSM6745603,GSM6745604,GSM6745605,GSM6745606,GSM6745607,...,Symbol,Description,Synonyms,GeneType,Status,ChrAcc,ChrStart,ChrStop,Orientation,Length
0,113219467,-0.158632,1.379276,5.355088,2.002351,0.679553,-0.213027,0.0095,-0.393521,-0.27484,...,MIR12136,microRNA 12136,,ncRNA,active,NC_000001.11,632615,632685,negative,71.0
1,57801,1.070933,0.767235,3.457126,0.897391,1.070933,1.548171,0.810621,-0.274013,0.289997,...,HES4,hes family bHLH transcription factor 4,bHLHb42,protein-coding,active,NC_000001.11,998962,1001052,negative,1920.0
2,9636,1.15215,0.761592,2.062135,0.336385,0.673147,0.828458,-0.304008,-0.72496,-0.605817,...,ISG15,ISG15 ubiquitin like modifier,G1P2|IFI15|IMD38|IP17|UCRP|hUCRP,protein-coding,active,NC_000001.11,1013497,1014540,positive,637.0
3,100288175,0.499194,-0.765431,1.706335,-0.075635,0.61416,-0.018153,1.01654,-0.650465,0.384228,...,LOC100288175,uncharacterized LOC100288175,,ncRNA,active,NC_000001.11,1059734,1066453,positive,974.0
4,102465434,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,...,MIR6726,microRNA 6726,hsa-mir-6726,ncRNA,active,NC_000001.11,1296110,1296170,negative,61.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4651,105377243,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,...,REREP2Y,arginine-glutamic acid dipeptide repeats pseud...,,pseudo,active,NC_000024.10,26196209,26235374,negative,2725.0
4652,107987359,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,...,LOC107987359,uncharacterized LOC107987359,,ncRNA,active,NT_167208.1,35630,43681,negative,745.0
4653,4572,-0.135784,0.567824,3.460437,1.42779,2.131398,1.975041,1.349611,0.098752,0.724182,...,TRNQ,tRNA-Gln,MTTQ,tRNA,active,NC_012920.1,4329,4400,negative,72.0
4654,4512,-0.237304,1.302546,5.055768,2.327118,0.856884,-0.052044,-0.139682,-0.558824,-0.258147,...,COX1,cytochrome c oxidase subunit I,COI|MTCO1,protein-coding,active,NC_012920.1,5904,7445,positive,1542.0


In [None]:
expression_cols = merged_data.columns[1:39]
expression_data = merged_data[expression_cols].apply(pd.to_numeric, errors='coerce')

pca_target_variance = 0.75
pca_full = PCA()
expression_data_filled = expression_data.apply(lambda row: row.bfill().ffill(), axis=1)
nan_counts = expression_data.isna().sum().sum()
nan_counts

pca_full.fit(expression_data)

# Determine number of components needed to explain 75% variance
cumulative_variance = pca_full.explained_variance_ratio_.cumsum()
n_components_75 = np.argmax(cumulative_variance >= pca_target_variance) + 1

# Apply PCA with the determined number of components
pca_final = PCA(n_components=n_components_75)
pca_result_final = pca_final.fit_transform(expression_data)

# Retrieve top contributing genes for each principal component
components_df = pd.DataFrame(
    pca_final.components_,
    columns=merged_data['GeneID'][:expression_data.shape[1]],
    index=[f"PC{i+1}" for i in range(n_components_75)]
)

def get_relevant_genes_with_contribution(pc, cumulative_threshold=0.9):
    # Absolute loadings for the specified principal component
    loadings = components_df.iloc[pc-1].abs().sort_values(ascending=False)

    # Calculate cumulative variance contribution by each gene
    cumulative_variance = loadings.cumsum() / loadings.sum()

    # Select genes up to the cumulative threshold
    relevant_genes = loadings[cumulative_variance <= cumulative_threshold]

    # Calculate the contribution percentage for each gene
    contribution_percentages = (relevant_genes / loadings.sum()) * 100

    return pd.DataFrame({
        'Loading': relevant_genes,
        'Contribution (%)': contribution_percentages
    })

d = {}
# Example usage
for pc in range(1, n_components_75 + 1):
    print(f"\nRelevant genes with contributions for PC{pc}:")
    rel_genes = get_relevant_genes_with_contribution(pc)
    print(len(rel_genes))
    #print(rel_genes)
    d[pc] = rel_genes


Relevant genes with contributions for PC1:
22

Relevant genes with contributions for PC2:
25

Relevant genes with contributions for PC3:
21

Relevant genes with contributions for PC4:
17

Relevant genes with contributions for PC5:
28

Relevant genes with contributions for PC6:
19

Relevant genes with contributions for PC7:
18

Relevant genes with contributions for PC8:
17

Relevant genes with contributions for PC9:
18

Relevant genes with contributions for PC10:
17

Relevant genes with contributions for PC11:
17

Relevant genes with contributions for PC12:
23

Relevant genes with contributions for PC13:
16

Relevant genes with contributions for PC14:
15

Relevant genes with contributions for PC15:
18


In [None]:
merged_data

Unnamed: 0,GeneID,GSM6745599,GSM6745600,GSM6745601,GSM6745602,GSM6745603,GSM6745604,GSM6745605,GSM6745606,GSM6745607,...,Symbol,Description,Synonyms,GeneType,Status,ChrAcc,ChrStart,ChrStop,Orientation,Length
0,113219467,-0.158632,1.379276,5.355088,2.002351,0.679553,-0.213027,0.0095,-0.393521,-0.27484,...,MIR12136,microRNA 12136,,ncRNA,active,NC_000001.11,632615,632685,negative,71.0
1,57801,1.070933,0.767235,3.457126,0.897391,1.070933,1.548171,0.810621,-0.274013,0.289997,...,HES4,hes family bHLH transcription factor 4,bHLHb42,protein-coding,active,NC_000001.11,998962,1001052,negative,1920.0
2,9636,1.15215,0.761592,2.062135,0.336385,0.673147,0.828458,-0.304008,-0.72496,-0.605817,...,ISG15,ISG15 ubiquitin like modifier,G1P2|IFI15|IMD38|IP17|UCRP|hUCRP,protein-coding,active,NC_000001.11,1013497,1014540,positive,637.0
3,100288175,0.499194,-0.765431,1.706335,-0.075635,0.61416,-0.018153,1.01654,-0.650465,0.384228,...,LOC100288175,uncharacterized LOC100288175,,ncRNA,active,NC_000001.11,1059734,1066453,positive,974.0
4,102465434,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,...,MIR6726,microRNA 6726,hsa-mir-6726,ncRNA,active,NC_000001.11,1296110,1296170,negative,61.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4651,105377243,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,...,REREP2Y,arginine-glutamic acid dipeptide repeats pseud...,,pseudo,active,NC_000024.10,26196209,26235374,negative,2725.0
4652,107987359,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,...,LOC107987359,uncharacterized LOC107987359,,ncRNA,active,NT_167208.1,35630,43681,negative,745.0
4653,4572,-0.135784,0.567824,3.460437,1.42779,2.131398,1.975041,1.349611,0.098752,0.724182,...,TRNQ,tRNA-Gln,MTTQ,tRNA,active,NC_012920.1,4329,4400,negative,72.0
4654,4512,-0.237304,1.302546,5.055768,2.327118,0.856884,-0.052044,-0.139682,-0.558824,-0.258147,...,COX1,cytochrome c oxidase subunit I,COI|MTCO1,protein-coding,active,NC_012920.1,5904,7445,positive,1542.0


In [None]:
def get_relevant_genes_with_contribution_genes(pc, cumulative_threshold=0.9):
    # Absolute loadings for the specified principal component
    loadings = components_df.iloc[pc-1].abs().sort_values(ascending=False)

    # Calculate cumulative variance contribution by each gene
    cumulative_variance = loadings.cumsum() / loadings.sum()

    # Select genes up to the cumulative threshold
    relevant_genes = loadings[cumulative_variance <= cumulative_threshold]

    # Calculate the contribution percentage for each gene
    contribution_percentages = (relevant_genes / loadings.sum()) * 100

    return relevant_genes

d = {}
# Example usage
for pc in range(1, n_components_75 + 1):
    print(f"\nRelevant genes with contributions for PC{pc}:")
    rel_genes = get_relevant_genes_with_contribution_genes(pc)
    print(rel_genes)
    #print(pc)
    d[pc] = rel_genes


Relevant genes with contributions for PC1:
GeneID
102724659    0.520019
105376737    0.404821
6060         0.336847
26869        0.281714
9636         0.264705
55092        0.248511
644068       0.187465
26279        0.159415
29943        0.151901
7799         0.142350
57801        0.138742
54751        0.132867
107984872    0.124788
127707       0.118198
155184       0.098019
26870        0.089438
105376691    0.082548
8510         0.077518
27129        0.076625
112268220    0.074872
63036        0.067704
105376809    0.061128
Name: PC1, dtype: float64

Relevant genes with contributions for PC2:
GeneID
127707       0.519960
26279        0.477686
644068       0.345250
51154        0.264382
5320         0.186605
102465434    0.180048
100288175    0.174025
284661       0.142815
6060         0.138238
30814        0.126029
109623456    0.124944
55092        0.115195
102724659    0.108266
57801        0.107856
105376737    0.101369
107984872    0.100484
8510         0.098409
26870        0

In [None]:
d[1].index

Int64Index([102724659, 105376737,      6060,     26869,      9636,     55092,
               644068,     26279,     29943,      7799,     57801,     54751,
            107984872,    127707,    155184,     26870, 105376691,      8510,
                27129, 112268220,     63036, 105376809],
           dtype='int64', name='GeneID')

In [None]:
d.items()

dict_items([(1, GeneID
102724659    0.520019
105376737    0.404821
6060         0.336847
26869        0.281714
9636         0.264705
55092        0.248511
644068       0.187465
26279        0.159415
29943        0.151901
7799         0.142350
57801        0.138742
54751        0.132867
107984872    0.124788
127707       0.118198
155184       0.098019
26870        0.089438
105376691    0.082548
8510         0.077518
27129        0.076625
112268220    0.074872
63036        0.067704
105376809    0.061128
Name: PC1, dtype: float64), (2, GeneID
127707       0.519960
26279        0.477686
644068       0.345250
51154        0.264382
5320         0.186605
102465434    0.180048
100288175    0.174025
284661       0.142815
6060         0.138238
30814        0.126029
109623456    0.124944
55092        0.115195
102724659    0.108266
57801        0.107856
105376737    0.101369
107984872    0.100484
8510         0.098409
26870        0.091395
155184       0.089016
29943        0.087510
106614088    0

In [None]:
lst = []
for k, gene in d.items():
    arr = d[k].index
    print(arr)
    for item in arr:
        if item not in lst:
            lst.append(item)

lst

Int64Index([102724659, 105376737,      6060,     26869,      9636,     55092,
               644068,     26279,     29943,      7799,     57801,     54751,
            107984872,    127707,    155184,     26870, 105376691,      8510,
                27129, 112268220,     63036, 105376809],
           dtype='int64', name='GeneID')
Int64Index([   127707,     26279,    644068,     51154,      5320, 102465434,
            100288175,    284661,      6060,     30814, 109623456,     55092,
            102724659,     57801, 105376737, 107984872,      8510,     26870,
               155184,     29943, 106614088,      9636,      1187,     26869,
                 1969],
           dtype='int64', name='GeneID')
Int64Index([    26279,    127707,    644068,     51154,      9636, 109623456,
                 8510, 102465434, 113219467, 102724659, 112268220,    284661,
            100288175, 105376691,     10630,     26829, 105376809,     26870,
            106614088,      7799, 105376739],
           

[102724659,
 105376737,
 6060,
 26869,
 9636,
 55092,
 644068,
 26279,
 29943,
 7799,
 57801,
 54751,
 107984872,
 127707,
 155184,
 26870,
 105376691,
 8510,
 27129,
 112268220,
 63036,
 105376809,
 51154,
 5320,
 102465434,
 100288175,
 284661,
 30814,
 109623456,
 106614088,
 1187,
 1969,
 113219467,
 10630,
 26829,
 105376739,
 26871,
 105376805]

In [None]:
merged_data

Unnamed: 0,GeneID,GSM6745599,GSM6745600,GSM6745601,GSM6745602,GSM6745603,GSM6745604,GSM6745605,GSM6745606,GSM6745607,...,Symbol,Description,Synonyms,GeneType,Status,ChrAcc,ChrStart,ChrStop,Orientation,Length
0,113219467,-0.158632,1.379276,5.355088,2.002351,0.679553,-0.213027,0.0095,-0.393521,-0.27484,...,MIR12136,microRNA 12136,,ncRNA,active,NC_000001.11,632615,632685,negative,71.0
1,57801,1.070933,0.767235,3.457126,0.897391,1.070933,1.548171,0.810621,-0.274013,0.289997,...,HES4,hes family bHLH transcription factor 4,bHLHb42,protein-coding,active,NC_000001.11,998962,1001052,negative,1920.0
2,9636,1.15215,0.761592,2.062135,0.336385,0.673147,0.828458,-0.304008,-0.72496,-0.605817,...,ISG15,ISG15 ubiquitin like modifier,G1P2|IFI15|IMD38|IP17|UCRP|hUCRP,protein-coding,active,NC_000001.11,1013497,1014540,positive,637.0
3,100288175,0.499194,-0.765431,1.706335,-0.075635,0.61416,-0.018153,1.01654,-0.650465,0.384228,...,LOC100288175,uncharacterized LOC100288175,,ncRNA,active,NC_000001.11,1059734,1066453,positive,974.0
4,102465434,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,...,MIR6726,microRNA 6726,hsa-mir-6726,ncRNA,active,NC_000001.11,1296110,1296170,negative,61.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4651,105377243,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,-0.222988,...,REREP2Y,arginine-glutamic acid dipeptide repeats pseud...,,pseudo,active,NC_000024.10,26196209,26235374,negative,2725.0
4652,107987359,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,...,LOC107987359,uncharacterized LOC107987359,,ncRNA,active,NT_167208.1,35630,43681,negative,745.0
4653,4572,-0.135784,0.567824,3.460437,1.42779,2.131398,1.975041,1.349611,0.098752,0.724182,...,TRNQ,tRNA-Gln,MTTQ,tRNA,active,NC_012920.1,4329,4400,negative,72.0
4654,4512,-0.237304,1.302546,5.055768,2.327118,0.856884,-0.052044,-0.139682,-0.558824,-0.258147,...,COX1,cytochrome c oxidase subunit I,COI|MTCO1,protein-coding,active,NC_012920.1,5904,7445,positive,1542.0


In [None]:
merged_data = merged_data[merged_data['GeneID'].isin(lst)]
merged_data

Unnamed: 0,GeneID,GSM6745599,GSM6745600,GSM6745601,GSM6745602,GSM6745603,GSM6745604,GSM6745605,GSM6745606,GSM6745607,...,Symbol,Description,Synonyms,GeneType,Status,ChrAcc,ChrStart,ChrStop,Orientation,Length
0,113219467,-0.158632,1.379276,5.355088,2.002351,0.679553,-0.213027,0.0095,-0.393521,-0.27484,...,MIR12136,microRNA 12136,,ncRNA,active,NC_000001.11,632615,632685,negative,71.0
1,57801,1.070933,0.767235,3.457126,0.897391,1.070933,1.548171,0.810621,-0.274013,0.289997,...,HES4,hes family bHLH transcription factor 4,bHLHb42,protein-coding,active,NC_000001.11,998962,1001052,negative,1920.0
2,9636,1.15215,0.761592,2.062135,0.336385,0.673147,0.828458,-0.304008,-0.72496,-0.605817,...,ISG15,ISG15 ubiquitin like modifier,G1P2|IFI15|IMD38|IP17|UCRP|hUCRP,protein-coding,active,NC_000001.11,1013497,1014540,positive,637.0
3,100288175,0.499194,-0.765431,1.706335,-0.075635,0.61416,-0.018153,1.01654,-0.650465,0.384228,...,LOC100288175,uncharacterized LOC100288175,,ncRNA,active,NC_000001.11,1059734,1066453,positive,974.0
4,102465434,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,-0.342997,...,MIR6726,microRNA 6726,hsa-mir-6726,ncRNA,active,NC_000001.11,1296110,1296170,negative,61.0
5,109623456,-0.697982,-0.697982,-0.697982,0.697982,0.697982,-0.697982,-0.697982,-0.697982,-0.697982,...,SNORD167,"small nucleolar RNA, C/D box 167",,snoRNA,active,NC_000001.11,1304729,1304812,negative,84.0
6,8510,0.160668,-1.27589,2.435216,0.160668,0.579663,0.938803,2.016221,-0.797037,0.45995,...,MMP23B,matrix metallopeptidase 23B,MIFR|MIFR-1|MMP22|MMP23A,protein-coding,active,NC_000001.11,1631681,1635638,positive,2764.0
7,107984872,-0.602311,-0.602311,-0.602311,-0.602311,0.136006,0.136006,2.350955,0.136006,2.350955,...,LOC107984872,uncharacterized LOC107984872,,ncRNA,active,NC_000001.11,1955857,1958986,positive,2799.0
8,112268220,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,...,LOC112268220,uncharacterized LOC112268220,,ncRNA,active,NC_000001.11,2763419,2767518,positive,996.0
9,284661,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,-0.164399,...,LINC01777,long intergenic non-protein coding RNA 1777,,ncRNA,active,NC_000001.11,4412051,4424684,positive,2074.0


In [None]:
remaining_genes = expression_data.columns.tolist()  # Start with all genes
expression_data_reduced = expression_data.copy()  # Copy the data to modify iteratively
final_results = {}  # To store relevant genes for each PC

for pc in range(1, n_components_75 + 1):
    # Fit PCA on the current reduced dataset
    pca_temp = PCA()
    pca_temp.fit(expression_data_reduced)
    n_components_temp = min(expression_data_reduced.shape[1], len(pca_temp.explained_variance_ratio_))
    
    # Stop if we run out of components to analyze
    if pc > n_components_temp:
        break
    
    # Update the components dataframe for the reduced dataset
    components_df = pd.DataFrame(
        pca_temp.components_,
        columns=expression_data_reduced.columns,  # Use current reduced dataset columns
        index=[f"PC{i+1}" for i in range(n_components_temp)]
    )

    # Get relevant genes for the current PC
    relevant_genes_df = get_relevant_genes_with_contribution(pc)
    relevant_genes = relevant_genes_df.index.tolist()

    # Store the results for this PC
    final_results[f"PC{pc}"] = relevant_genes

    # Drop the relevant genes for the current PC from the dataset
    expression_data_reduced = expression_data_reduced.drop(columns=relevant_genes, errors='ignore')
    remaining_genes = expression_data_reduced.columns.tolist()  # Update remaining genes list

    # Stop if no more genes remain
    if expression_data_reduced.empty:
        break

# Final output
for pc, genes in final_results.items():
    print(f"\nRelevant genes for {pc}: {len(genes)} genes")


Relevant genes for PC1: 22 genes

Relevant genes for PC2: 12 genes

Relevant genes for PC3: 1 genes


In [None]:
components_df

Unnamed: 0,GSM6745599,GSM6745609,GSM6745614,GSM6745617
PC1,0.060443,0.994332,-0.027039,-0.083188
PC2,0.825387,-0.087452,-0.475876,-0.290913
PC3,0.480868,-0.010714,0.874468,-0.062903
PC4,0.289566,0.059515,-0.090091,0.951048


In [None]:
expression_cols = merged_data.columns[1:39]
expression_data = merged_data[expression_cols].apply(pd.to_numeric, errors='coerce')

pca_target_variance = 0.75
pca_full = PCA()
expression_data_filled = expression_data.apply(lambda row: row.bfill().ffill(), axis=1)
nan_counts = expression_data.isna().sum().sum()
nan_counts

pca_full.fit(expression_data)

# Determine number of components needed to explain 75% variance
cumulative_variance = pca_full.explained_variance_ratio_.cumsum()
n_components_75 = np.argmax(cumulative_variance >= pca_target_variance) + 1

# Apply PCA with the determined number of components
pca_final = PCA(n_components=n_components_75)
pca_result_final = pca_final.fit_transform(expression_data)

# Retrieve top contributing genes for each principal component
components_df = pd.DataFrame(
    pca_final.components_,
    columns=merged_data['GeneID'][:expression_data.shape[1]],
    index=[f"PC{i+1}" for i in range(n_components_75)]
)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Step 1: Extract relevant genes for each PC and combine them into a set
relevant_genes_set = set()
for pc in range(1, n_components_75 + 1):
    relevant_genes = d[pc].index.tolist()  # Get the list of relevant genes for each PC
    relevant_genes_set.update(relevant_genes)  # Add the genes to the set

# Step 2: Filter the expression data to retain only the relevant genes
filtered_expression_data = expression_data[relevant_genes_set]

# Step 3: Prepare the labels (assuming you have a target variable like 'label_column')
# Replace 'label_column' with your actual target column
X_train, X_test, y_train, y_test = train_test_split(filtered_expression_data, merged_data['label_column'], test_size=0.2, random_state=42)

# Step 4: Build and train a classifier (e.g., Random Forest)
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

# Step 5: Evaluate the classifier
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the classifier: {accuracy:.4f}")

KeyError: "None of [Int64Index([    10630, 113219467, 102465434,     26871, 109623456,      1187,\n                 9636, 105376805,     26279, 105376809,      6060, 100288175,\n               155184,      1969, 102724659,     55092, 105376691, 112268220,\n                63036,      8510,      5320,     57801, 106614088,     26829,\n                51154,    127707,     30814,     54751, 105376737, 105376739,\n               644068, 107984872,     26869,     26870,      7799,    284661,\n                27129,     29943],\n           dtype='int64')] are in the [columns]"