In [None]:
#gene level pre-processing

import pandas as pd

def filter_genes(path_exprs, size=2000):
    """Load a tab-separated expression table and keep its most variable rows.

    Parameters
    ----------
    path_exprs : path or file-like
        Tab-separated table; its first column becomes the row index.
    size : int, default 2000
        Maximum number of rows to keep, ranked by per-row standard deviation.

    Returns
    -------
    pandas.DataFrame
        The selected rows, ordered from the largest to the smallest
        standard deviation.
    """
    table = pd.read_csv(path_exprs, sep='\t', index_col=0)

    # When a row label is repeated, only its first occurrence survives.
    is_repeat = table.index.duplicated(keep='first')
    table = table.loc[~is_repeat]

    # Note: this count is taken after repeated labels have been removed.
    print("Original number of genes:", len(table))

    row_spread = table.std(axis=1)
    most_variable = row_spread.nlargest(size).index
    return table.loc[most_variable]

def process_column(df):
    """Cut every column label at its first '.' and return the same frame.

    The frame is modified in place; the returned object is `df` itself.
    Labels without a '.' are left unchanged.
    """
    df.columns = [label.partition('.')[0] for label in df.columns]
    return df

# Gene-level run: keep the 2000 most variable rows, cut the column labels
# at their first '.', and save the result as a comma-separated file.
exprs = process_column(
    filter_genes(
        '/Users/yazgisert/Desktop/D_Cardio_Data/Human_DCM_EMB_deseq2_normalized_gene_counts.csv',
        size=2000,
    )
)
exprs.to_csv('/Users/yazgisert/Desktop/genes_cardio_top2000.csv', sep=',')


In [None]:
#transcript level work

In [6]:
import pandas as pd
import numpy as np
from scipy import stats

def filter_genes(path_exprs, log2=True, zscores=True, size=5000):
    """Load a comma-separated expression table and keep its most variable rows.

    Processing order: optional log2 transform, removal of repeated row
    labels, selection of the `size` rows with the largest standard
    deviation, optional per-row z-scoring.

    Parameters
    ----------
    path_exprs : path or file-like
        Comma-separated table; its first column becomes the row index.
    log2 : bool, default True
        Apply log2 to every value. If the table minimum is <= 0, all values
        are first shifted so that the minimum becomes 1.
    zscores : bool, default True
        Standardise each selected row with `scipy.stats.zscore`.
    size : int, default 5000
        Maximum number of rows to keep.

    Returns
    -------
    pandas.DataFrame
        The selected rows (unique labels), ordered from the largest to the
        smallest standard deviation.
    """
    exprs = pd.read_csv(path_exprs, index_col=0)

    # Reported on the raw table, before any row is removed.
    print("Original number of genes:", exprs.shape[0])

    if log2:
        minimal = exprs.min().min()
        if minimal <= 0:
            # Shift so the smallest value maps to 1, i.e. log2 gives 0.
            exprs = exprs + np.abs(minimal - 1)
        exprs = np.log2(exprs)

    # Repeated labels must be removed BEFORE ranking. Otherwise they take up
    # several of the top-`size` slots, `.loc` returns every matching row for
    # each repeated label, and the final table ends up with fewer than
    # `size` distinct rows.
    exprs = exprs.loc[~exprs.index.duplicated(keep='first')]

    top_genes = exprs.std(axis=1).nlargest(size).index
    exprs = exprs.loc[top_genes]

    if zscores:
        # zscore returns a plain array, so the labels are re-attached here.
        exprs = pd.DataFrame(
            stats.zscore(exprs, axis=1),
            index=exprs.index,
            columns=exprs.columns,
        )

    return exprs

def process_column(df):
    """Cut every column label at its first '-' and return the same frame.

    The frame is modified in place; the returned object is `df` itself.
    Labels without a '-' are left unchanged.
    """
    df.columns = [label.partition('-')[0] for label in df.columns]
    return df

# Transcript-level run: log2 + z-score, keep the 2000 most variable rows,
# cut the column labels at their first '-', and save as CSV.
exprs = process_column(
    filter_genes(
        '/Users/yazgisert/Desktop/D_Cardio_Data/final_dominant_transcripts.csv',
        log2=True,
        zscores=True,
        size=2000,
    )
)
exprs.to_csv('/Users/yazgisert/Desktop/D_Cardio_Data/dominant_cardio_top2000.csv', sep=',')


Original number of genes: 60929


In [2]:
import pandas as pd

# Paths used by the following cells.
transcript_file = '/Users/yazgisert/Desktop/D_Cardio_Data/Human_DCM_EMB_salmon_tpms.tsv'
output_path = '/Users/yazgisert/Desktop/D_Cardio_Data/dominant_transcripts.csv'

# `mart_export` ends up bound to the loaded table, not to its path.
mart_export = pd.read_csv('/Users/yazgisert/Desktop/tez/mart_export_trans.csv')

# Transcript ID -> gene ID lookup; if a transcript ID is listed more than
# once, the last row wins.
enst_to_gene_mapping = {
    enst: ensg
    for enst, ensg in zip(
        mart_export['Transcript stable ID'],
        mart_export['Gene stable ID'],
    )
}
                        

In [3]:
# Tab-separated transcript table; dtype={0: str} requests string dtype
# for the first column.
transcript_data = pd.read_csv(transcript_file, sep='\t', dtype={0: str})

# Attach the gene ID found for each value of the first column; values that
# are absent from the mapping yield NaN.
transcript_data = transcript_data.assign(
    **{'Gene stable ID': transcript_data.iloc[:, 0].map(enst_to_gene_mapping)}
)

transcript_data.to_csv(output_path, index=False)

In [8]:
# Render the annotated transcript table in the notebook output.
display(transcript_data)

Unnamed: 0,Name,21Mar37559_A01-L1_S1_tmps,21Mar37559_A02-L1_S2_tmps,21Mar37559_A03-L1_S3_tmps,21Mar37559_A04-L1_S4_tmps,21Mar37559_A05-L1_S5_tmps,21Mar37559_A06-L1_S1_tmps,21Mar37559_A07-L1_S2_tmps,21Mar37559_A08-L1_S3_tmps,21Mar37559_A09-L1_S4_tmps,...,21Mar37560_G06-L1_S32_tmps,21Mar37560_G07-L1_S33_tmps,21Mar37560_H01-L1_S37_tmps,21Mar37560_H02-L1_S38_tmps,21Mar37560_H03-L1_S34_tmps,21Mar37560_H04-L1_S35_tmps,21Mar37560_H05-L1_S36_tmps,21Mar37560_H06-L1_S37_tmps,21Mar37560_H07-L1_S38_tmps,Gene stable ID
0,ENST00000456328,0.008081,0.007859,0.009108,0.072325,0.031019,0.019091,0.122064,0.000000,0.046177,...,0.015854,0.087377,0.089179,0.066125,0.035786,0.035851,0.032555,0.000000,0.060475,ENSG00000290825
1,ENST00000450305,0.012661,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,ENSG00000223972
2,ENST00000488147,0.996537,1.274606,0.634398,2.131830,2.909487,1.166722,1.582639,1.014172,1.622711,...,0.868454,1.190456,1.420899,1.233528,1.666160,0.881181,1.837196,1.278904,1.408571,ENSG00000227232
3,ENST00000619216,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,ENSG00000278267
4,ENST00000473358,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.016705,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,ENSG00000243485
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236182,ENST00000361681,12716.932103,11446.049968,7088.634828,9352.719197,9355.584313,10224.526306,11926.052710,10633.266476,14367.727040,...,13579.165048,10983.950340,9267.455843,11709.006490,9009.237326,10333.945241,9730.253100,10605.797218,11273.076990,ENSG00000198695
236183,ENST00000387459,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,ENSG00000210194
236184,ENST00000361789,24733.558783,22371.250656,19497.791026,17873.520279,21075.514314,20372.400380,18390.957128,16607.843306,16078.700582,...,22551.602153,18441.167714,16503.328460,19957.564249,17645.622233,28018.366937,17101.124272,27054.282174,21248.741935,ENSG00000198727
236185,ENST00000387460,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,ENSG00000210195


In [4]:
file = '/Users/yazgisert/Desktop/D_Cardio_Data/dominant_transcripts.csv'
output = '/Users/yazgisert/Desktop/D_Cardio_Data/processed_dominant_transcripts.csv'

df = pd.read_csv(file)

# Row-wise median over every column except the first and the last one
# (presumably the ID column and 'Gene stable ID' - verify against the file).
df['median_expression'] = df.iloc[:, 1:-1].median(axis=1)

# For each 'Gene stable ID', keep the single row with the highest median.
best_rows = df.groupby('Gene stable ID')['median_expression'].idxmax()
grouped_df = df.loc[best_rows]

grouped_df.to_csv(output, index=False)

In [5]:
df = pd.read_csv('/Users/yazgisert/Desktop/D_Cardio_Data/processed_dominant_transcripts.csv')
martdf = pd.read_csv('/Users/yazgisert/Desktop/tez/mart_export_trans.csv')

# Transcript ID -> gene ID lookup; for a repeated transcript ID the last
# row wins.
enst_to_gene_mapping = {
    enst: ensg
    for enst, ensg in zip(martdf['Transcript stable ID'], martdf['Gene stable ID'])
}

# Replace the values in 'Name' by their mapped gene ID (NaN if unmapped),
# then discard the two helper columns.
df['Name'] = df['Name'].map(enst_to_gene_mapping)
df = df.drop(columns=['Gene stable ID', 'median_expression'])

output_path = '/Users/yazgisert/Desktop/D_Cardio_Data/final_dominant_transcripts.csv'
df.to_csv(output_path, index=False)

In [9]:
#metadata filtering

import pandas as pd

def extract(input_file, output_file):
    """Copy the RNA_seq_name and LVEF columns of a tab-separated table to a CSV.

    Parameters
    ----------
    input_file : path or file-like
        Tab-separated table that contains both columns.
    output_file : path or file-like
        Destination of the comma-separated result (written without the index).

    Raises
    ------
    KeyError
        If either column is absent from the input table.
    """
    wanted = ["RNA_seq_name", "LVEF"]
    metadata = pd.read_csv(input_file, delimiter='\t')
    metadata[wanted].to_csv(output_file, index=False)


# Write the RNA_seq_name / LVEF columns of the metadata table to a CSV.
extract(
    '/Users/yazgisert/Desktop/tez/D_Cardio_Data/Human_DCM_EMB_metadata',
    '/Users/yazgisert/Desktop/tez/D_Cardio_Data/LVEF_metadata.csv',
)


In [15]:

df = pd.read_csv('/Users/yazgisert/Desktop/tez/D_Cardio_Data/LVEDD_Henry_metadata.csv')

# Largest LVEDD_Henry first.
df = df.sort_values(by='LVEDD_Henry', ascending=False)

part_length = len(df) // 3

# The thirds must be taken by POSITION in the sorted table. After sorting,
# the row labels are permuted, so label-based `.loc[:part_length]` slices
# (which are also end-inclusive) would not select the top third.
# Labels are built in a separate object-dtype Series so that strings are
# never written into the numeric column.
tertile = pd.Series('P3', index=df.index, dtype=object)   # last third + remainder
tertile.iloc[:part_length] = 'P1'                         # first third
tertile.iloc[part_length:2 * part_length] = 'P2'          # second third
df['LVEDD_Henry'] = tertile

df.to_csv('/Users/yazgisert/Desktop/tez/D_Cardio_Data/LVEDD_Henry_modified.csv', index=False)

    

In [20]:
import pandas as pd

df = pd.read_csv('/Users/yazgisert/Desktop/tez/D_Cardio_Data/LVEF_metadata.csv')

# Missing values become 0 first, then everything is parsed as a number
# (unparseable entries become NaN). The result is kept in its own Series:
# calling `fillna(..., inplace=True)` on `df['LVEF']` acts on a temporary
# selection, which is deprecated and leaves `df` untouched under
# Copy-on-Write.
# NOTE(review): filling missing LVEF with 0 puts those samples in 'P1' -
# confirm this is intended.
lvef = pd.to_numeric(df['LVEF'].fillna(0), errors='coerce')

# '' is kept for rows whose value could not be parsed.
# NOTE(review): the middle range is labelled 'P3' and the top range 'P2',
# exactly as in the original cell - confirm the ordering is intended.
lvef_label = pd.Series('', index=df.index, dtype=object)
lvef_label[lvef < 20] = 'P1'
lvef_label[(lvef >= 20) & (lvef <= 40)] = 'P3'
lvef_label[lvef > 40] = 'P2'

# Replace the numeric column by the labels; as before, the new 'LVEF'
# column ends up as the last column.
df = df.drop(columns=['LVEF'])
df['LVEF'] = lvef_label

df.to_csv('/Users/yazgisert/Desktop/tez/D_Cardio_Data/LVEF_modified.csv', index=False)



In [8]:
import openpyxl

wb = openpyxl.load_workbook('/Users/yazgisert/Desktop/tez/D_Cardio_Data/dominant_isoform/sex/biclusters_183826b6b7689c63b4df65eea4174c52.xlsx')

# try/finally guarantees the workbook is closed even if writing fails.
try:
    with open('/Users/yazgisert/Desktop/tez/D_Cardio_Data/dominant_isoform/sex/query.txt', 'w') as text_file:
        # Sheets bicluster59 .. bicluster108; sheets that do not exist are skipped.
        for sheet_index in range(59, 109):
            sheet_name = f"bicluster{sheet_index}"
            if sheet_name not in wb.sheetnames:
                continue
            sheet = wb[sheet_name]
            # First column only, starting below the header row.
            for row in sheet.iter_rows(min_row=2, max_col=1, values_only=True):
                value = row[0]
                if value is None:
                    # Empty cell: do not write the literal text "None".
                    continue
                text_file.write(str(value) + '\n')
            # Blank line separates consecutive biclusters.
            text_file.write('\n')
finally:
    wb.close()



In [101]:
import openpyxl
import os

excel_file = '/Users/yazgisert/Desktop/tez/D_Cardio_Data/dominant_isoform/Maggic_Score/biclusters_d09750d0bb1467e0239040d4be048b3e.xlsx'
wb = openpyxl.load_workbook(excel_file)

sheet_name = "bicluster112"

# The output file is written next to the workbook. os.path.join keeps the
# path relative when the workbook path has no directory part, whereas
# f'{dir}/{name}' would then point at the filesystem root.
output_directory = os.path.dirname(excel_file)
output_file = os.path.join(output_directory, f'{sheet_name}_output.txt')

# try/finally guarantees the workbook is closed even if writing fails.
try:
    # As before, the output file is created even when the sheet is absent.
    with open(output_file, 'w') as text_file:
        if sheet_name in wb.sheetnames:
            sheet = wb[sheet_name]

            # First column only, starting below the header row.
            for row in sheet.iter_rows(min_row=2, max_col=1, values_only=True):
                value = row[0]
                if value is None:
                    # Empty cell: do not write the literal text "None".
                    continue
                text_file.write(str(value) + '\n')
finally:
    wb.close()


In [3]:
#ID Conversion for BiCoN 
import pandas as pd
def process_column(df):
    """Cut every column label at its first '.' and return the same frame.

    The frame is modified in place; the returned object is `df` itself.
    Labels without a '.' are left unchanged.
    """
    df.columns = [label.partition('.')[0] for label in df.columns]
    return df

path_exprs = '/Users/yazgisert/Desktop/tez/D_Cardio_Data/Human_DCM_EMB_deseq2_normalized_gene_counts.csv'

# Read the tab-separated table (first column as index), cut the column
# labels at their first '.', and save the result as a comma-separated file.
exprs = process_column(pd.read_csv(path_exprs, sep='\t', index_col=0))
exprs.to_csv('/Users/yazgisert/Desktop/d_cardio_gene.csv', sep=',')


In [10]:
dataframe = pd.read_csv('/Users/yazgisert/Desktop/d_cardio_gene.csv')
# Whatever the first column is called, it becomes 'sample'.
dataframe = dataframe.rename(columns={dataframe.columns[0]: 'sample'})

ensemble_ids = dataframe['sample'].tolist()

# Every ID starts out unresolved; -1 is the "not found yet" marker.
entries = dict.fromkeys(ensemble_ids, -1)

with open('/Users/yazgisert/Desktop/tez/mart_export.csv', 'r') as file:
    for line in file:
        cols = line.strip().split(',')
        # First field is the lookup key, last field the replacement value;
        # only the first matching line is used for each ID.
        if entries.get(cols[0]) == -1:
            entries[cols[0]] = cols[-1]

# Discard IDs that were never found or whose replacement is empty.
entries = {key: value for key, value in entries.items() if value not in (-1, '')}

# IDs without a replacement turn into NaN here and are dropped below.
dataframe['sample'] = dataframe['sample'].map(entries)

dataframe = dataframe.dropna(subset=["sample"])
if "Unnamed: 0" in dataframe.columns:
    dataframe = dataframe.drop(columns="Unnamed: 0")

dataframe.to_csv('/Users/yazgisert/Desktop/d_cardio_gene_modified.csv', index=False)


In [11]:
dataframe = pd.read_csv('/Users/yazgisert/Desktop/tez/D_Cardio_Data/final_dominant_transcripts.csv')
# Whatever the first column is called, it becomes 'sample'.
dataframe = dataframe.rename(columns={dataframe.columns[0]: 'sample'})

ensemble_ids = dataframe['sample'].tolist()

# Every ID starts out unresolved; -1 is the "not found yet" marker.
entries = dict.fromkeys(ensemble_ids, -1)

with open('/Users/yazgisert/Desktop/tez/mart_export.csv', 'r') as file:
    for line in file:
        cols = line.strip().split(',')
        # First field is the lookup key, last field the replacement value;
        # only the first matching line is used for each ID.
        if entries.get(cols[0]) == -1:
            entries[cols[0]] = cols[-1]

# Discard IDs that were never found or whose replacement is empty.
entries = {key: value for key, value in entries.items() if value not in (-1, '')}

# IDs without a replacement turn into NaN here and are dropped below.
dataframe['sample'] = dataframe['sample'].map(entries)

dataframe = dataframe.dropna(subset=["sample"])
if "Unnamed: 0" in dataframe.columns:
    dataframe = dataframe.drop(columns="Unnamed: 0")

dataframe.to_csv('/Users/yazgisert/Desktop/d_cardio_transcr_modified.csv', index=False)