In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the raw count matrix. The first column of the file holds the gene
# identifiers ("GeneID") and every other column is one sample, so use that
# column as the index and transpose: one row per sample, one column per gene.
# Reading the ids as the index (instead of promoting a transposed data row to
# the header) keeps them out of the value block entirely.
df = pd.read_csv(
    '../../data/GSE218462_raw_counts_GRCh38.p13_NCBI.tsv',
    sep='\t',
    index_col=0,
).T

# Standardise every gene (column) across samples with StandardScaler, then
# wrap the resulting array back into a frame carrying the same labels as `df`.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)
scaled_df = pd.DataFrame(scaled_data, index=df.index, columns=df.columns)

# Sample accessions (GSM ids) grouped by editing mechanism / treatment group.
mechanisms = {
    "BE4": ["GSM6745599", "GSM6745600", "GSM6745601", "GSM6745611", "GSM6745612", "GSM6745613"],
    "ABE8": ["GSM6745602", "GSM6745603", "GSM6745604", "GSM6745614", "GSM6745615", "GSM6745616"],
    "Cas9": ["GSM6745605", "GSM6745606", "GSM6745607", "GSM6745617", "GSM6745618", "GSM6745619"],
    "Utelectro": ["GSM6745608", "GSM6745609", "GSM6745610", "GSM6745620", "GSM6745621", "GSM6745622"],
    "dCas9": ["GSM6745623", "GSM6745624", "GSM6745625"],
    "BE4alone": ["GSM6745626", "GSM6745627", "GSM6745628"],
    "ABE8alone": ["GSM6745629", "GSM6745630", "GSM6745631"],
    "UT": ["GSM6745632", "GSM6745633", "GSM6745634", "GSM6745635", "GSM6745636", "GSM6745637"]
}

# The unedited samples are exactly the "UT" group; derive the list from
# `mechanisms` so the two definitions cannot drift apart.
unedited = list(mechanisms["UT"])

# Binary label per sample: 0 for the unedited group, 1 for every other sample.
# NOTE(review): every non-"UT" group (including "Utelectro", "dCas9" and the
# "*alone" groups) is labelled 1 here - confirm that is the intended split.
scaled_df['Edited (1) or Unedited (0)'] = (~scaled_df.index.isin(unedited)).astype('int64')

# Invert the grouping so each sample accession maps to its mechanism name.
mechanism_map = {
    sample_id: mechanism
    for mechanism, sample_ids in mechanisms.items()
    for sample_id in sample_ids
}

# Categorical label per sample; accessions missing from `mechanisms` get NaN.
scaled_df['editing mechanism'] = scaled_df.index.map(mechanism_map)

scaled_df

GeneID,100287102,653635,102466751,107985730,100302278,645520,79501,100996442,729737,102725121,...,4575,4568,4540,4541,4556,4519,4576,4571,Edited (1) or Unedited (0),editing mechanism
GSM6745599,0.212625,-0.318248,-0.042237,0.184916,1.06341,-0.29277,-0.235702,0.117663,0.140983,-0.006097,...,-0.267632,0.466206,-0.318275,-0.133587,0.136065,-0.60444,-0.270274,-0.375282,1,BE4
GSM6745600,-1.311857,-1.500716,-1.647234,-0.818915,1.06341,-0.29277,-0.235702,-1.159822,-0.267197,-1.511998,...,0.007233,1.309818,-0.981598,-0.819478,-0.337051,-1.184536,-1.084285,-0.450273,1,BE4
GSM6745601,1.127315,1.33452,0.492762,-0.818915,-0.773389,-0.29277,-0.235702,1.760144,1.722682,0.688935,...,0.96926,2.15343,1.484606,1.848533,1.74128,0.865277,1.357749,1.874436,1,BE4
GSM6745602,0.365073,-0.318248,-0.577236,0.184916,-0.773389,-0.29277,-0.235702,1.121401,0.498141,0.22558,...,0.282098,2.715838,0.638045,1.031012,0.913327,0.141225,0.041475,0.237141,1,ABE8
GSM6745603,0.66997,0.11174,-0.042237,1.188747,-0.773389,-0.29277,-0.235702,0.39141,-0.165152,0.804773,...,0.831828,1.028614,0.434101,0.974346,1.293509,0.187012,0.110753,0.624593,1,ABE8
GSM6745604,-0.092271,0.622352,1.56276,0.184916,-0.773389,-0.29277,-0.235702,0.756406,0.804276,-0.121935,...,1.381557,0.185003,1.132972,1.712771,2.053874,0.594337,1.115277,1.137029,1,ABE8
GSM6745605,0.66997,0.622352,-1.112235,-0.818915,-0.773389,-0.29277,-0.235702,0.026414,-0.012084,0.22558,...,2.206152,0.466206,-0.349242,0.224115,0.7866,-0.666955,-0.339551,0.862063,1,Cas9
GSM6745606,-0.702064,-1.231973,-0.577236,-1.822745,1.06341,3.41565,-0.235702,-1.159822,-0.828445,-1.048644,...,-0.817361,-0.096201,-0.984529,-0.787604,-0.413087,-1.281728,-0.997688,-0.900216,1,Cas9
GSM6745607,0.517522,0.138615,1.56276,1.188747,-0.773389,-0.29277,-0.235702,0.208912,-0.012084,0.457258,...,-0.130199,0.185003,0.108853,0.397654,0.474005,-0.268787,-0.443467,-0.125313,1,Cas9
GSM6745609,-0.092271,-0.801985,-1.112235,-0.818915,-0.773389,-0.29277,-0.235702,-0.886075,-1.185602,0.109742,...,-1.367091,-0.658609,-0.198254,-0.324244,-0.945342,-0.94986,-1.188201,-0.95021,1,Utelectro


In [22]:

# Load the human gene annotation table (one row per GeneID) for inspection.
tsv_file_path = '../../data/Human.GRCh38.p13.annot.tsv'

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which is what triggers a DtypeWarning on columns
# with mixed content.
# NOTE(review): the warning echoed under this cell is presumably that
# DtypeWarning - confirm, and consider passing an explicit `dtype=` mapping.
tsv_df = pd.read_csv(tsv_file_path, sep='\t', low_memory=False)

tsv_df

  tsv_df = pd.read_csv(tsv_file_path, sep='\t')


Unnamed: 0,GeneID,Symbol,Description,Synonyms,GeneType,EnsemblGeneID,Status,ChrAcc,ChrStart,ChrStop,Orientation,Length,GOFunctionID,GOProcessID,GOComponentID,GOFunction,GOProcess,GOComponent
0,100287102,DDX11L1,DEAD/H-box helicase 11 like 1 (pseudogene),,pseudo,ENSG00000290825,active,NC_000001.11,11874,14409,positive,1652,,,,,,
1,653635,WASH7P,"WASP family homolog 7, pseudogene",FAM39F|WASH5P,pseudo,,active,NC_000001.11,14362,29370,negative,1769,,,,,,
2,102466751,MIR6859-1,microRNA 6859-1,hsa-mir-6859-1,ncRNA,ENSG00000278267,active,NC_000001.11,17369,17436,negative,68,,,,,,
3,107985730,MIR1302-2HG,MIR1302-2 host gene,,ncRNA,,active,NC_000001.11,29926,31295,positive,538,,,,,,
4,100302278,MIR1302-2,microRNA 1302-2,MIRN1302-2|hsa-mir-1302-2,ncRNA,ENSG00000284332,active,NC_000001.11,30366,30503,positive,138,,GO:0035195,,,miRNA-mediated gene silencing,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39371,4541,ND6,NADH dehydrogenase subunit 6,MTND6,protein-coding,,active,NC_012920.1,14149,14673,negative,525,GO:0008137,GO:0006120///GO:0009060///GO:0032981///GO:0035...,GO:0005739///GO:0005743///GO:0005747,NADH dehydrogenase (ubiquinone) activity,"mitochondrial electron transport, NADH to ubiq...",mitochondrion///mitochondrial inner membrane//...
39372,4556,TRNE,tRNA-Glu,MTTE,tRNA,,active,NC_012920.1,14674,14742,negative,69,,,,,,
39373,4519,CYTB,cytochrome b,MTCYB,protein-coding,,active,NC_012920.1,14747,15887,positive,1141,GO:0008121///GO:0046872,GO:0006122///GO:0045333///GO:1902600,GO:0005739///GO:0005743///GO:0005750///GO:0016020,ubiquinol-cytochrome-c reductase activity///me...,"mitochondrial electron transport, ubiquinol to...",mitochondrion///mitochondrial inner membrane//...
39374,4576,TRNT,tRNA-Thr,MTTT,tRNA,,active,NC_012920.1,15888,15953,positive,66,,,,,,


In [25]:
# Append gene annotation to the scaled expression table.
#
# Layout of `final_df`: the sample rows of `scaled_df` come first, followed by
# one row per (gene, annotation field) pair named "<GeneID>_<field>". Each such
# row holds the annotation value in that gene's column and pd.NA elsewhere.
#
# NOTE(review): this layout adds (annotated genes x annotation fields) rows,
# each as wide as `scaled_df`; for a full annotation table that is very large.
# A single row per annotation field would carry the same information.
metadata_file_path = "../../data/Human.GRCh38.p13.annot.tsv"
# low_memory=False: infer one dtype per column from the whole file rather than
# chunk by chunk (the usual cause of a DtypeWarning on this kind of read).
metadata = pd.read_csv(metadata_file_path, sep='\t', low_memory=False)
# {GeneID: {field: value}}; to_dict('index') requires the GeneIDs to be unique.
metadata_dict = metadata.set_index("GeneID").to_dict('index')

# The two label columns added to scaled_df are per-sample labels, not genes.
label_columns = ['Edited (1) or Unedited (0)', 'editing mechanism']
gene_ids = [column for column in scaled_df.columns if column not in label_columns]

# Expression rows: iterrows() already yields one Series per sample, named by
# the sample id, so they can be collected as-is.
combined_rows = [sample_row for _, sample_row in scaled_df.iterrows()]

# Metadata rows for each gene that has an annotation entry
for gene_id in gene_ids:
    gene_metadata = metadata_dict.get(gene_id)
    if gene_metadata is None:
        continue
    for meta_key, meta_value in gene_metadata.items():
        # The row must be indexed by scaled_df's columns: with a default
        # positional index, pd.DataFrame() below would align the value under
        # an integer label instead of under the gene's own column.
        metadata_row = pd.Series(
            pd.NA,
            index=scaled_df.columns,
            dtype=object,
            name=f"{gene_id}_{meta_key}",
        )
        metadata_row.loc[gene_id] = meta_value
        combined_rows.append(metadata_row)

# Combine all rows into a single DataFrame (rows share one column index)
final_df = pd.DataFrame(combined_rows)

# Display the first few rows of the merged data
print("Final DataFrame (first few rows):")
print(final_df.head())

# Optionally, save to a new file
output_file_path = "merged_gene_expression_with_metadata_rows.csv"
final_df.to_csv(output_file_path)
print(f"Final data saved to {output_file_path}")


  metadata = pd.read_csv(metadata_file_path, sep='\t')


Merged Data (first few rows):
GeneID     100287102    653635 102466751 107985730 100302278   645520  \
GSM6745599  0.212625 -0.318248 -0.042237  0.184916   1.06341 -0.29277   
GSM6745600 -1.311857 -1.500716 -1.647234 -0.818915   1.06341 -0.29277   
GSM6745601  1.127315   1.33452  0.492762 -0.818915 -0.773389 -0.29277   
GSM6745602  0.365073 -0.318248 -0.577236  0.184916 -0.773389 -0.29277   
GSM6745603   0.66997   0.11174 -0.042237  1.188747 -0.773389 -0.29277   

GeneID         79501 100996442    729737 102725121  ... 101928043_Status  \
GSM6745599 -0.235702  0.117663  0.140983 -0.006097  ...             <NA>   
GSM6745600 -0.235702 -1.159822 -0.267197 -1.511998  ...             <NA>   
GSM6745601 -0.235702  1.760144  1.722682  0.688935  ...             <NA>   
GSM6745602 -0.235702  1.121401  0.498141   0.22558  ...             <NA>   
GSM6745603 -0.235702   0.39141 -0.165152  0.804773  ...             <NA>   

GeneID     101928043_ChrAcc 101928043_ChrStart 101928043_ChrStop  \
GSM674