### Calculate GIC score

In [1]:
# Human

import math
import pandas as pd
from tqdm import tqdm


# All 64 nucleotide triplets over the lower-case alphabet, enumerated with
# the first base varying slowest and in a/t/c/g order.
BASES = ['a', 't', 'c', 'g']
TRIPLETS = [
    first + second + third
    for first in BASES
    for second in BASES
    for third in BASES
]

# Shared per-triplet count table; it starts out empty and is (re)filled by
# stat3mer on every call.
mers = {}

# obtain the corresponding relationship between lncRNA and transcript
def readlnc_transID(filename):
    """Load the lncRNA/transcript table and group transcript ids by lncRNA.

    The CSV is read without a header row; its two columns are taken as the
    lncRNA id and the transcript id, in that order.

    Returns a dict mapping each lncRNA id to the list of its transcript ids.
    """
    table = pd.read_csv(filename, header=None, names=['lncRNA_id', 'transcript_id'])
    transcripts_by_lnc = table.groupby('lncRNA_id')['transcript_id'].agg(list)
    return transcripts_by_lnc.to_dict()

# Get the secondary structure minimum free energy of each transcript of lncRNA
def readlnc_mfe(filename):
    """Load the per-transcript minimum free energy (MFE) table.

    The CSV must provide the columns ``transcript_id`` and ``MFE``.

    Returns a dict mapping each transcript id to its MFE value.
    """
    table = pd.read_csv(filename)
    mfe_by_transcript = table.set_index('transcript_id')['MFE']
    return mfe_by_transcript.to_dict()

# Get transcript sequence
def readfasta(filename):
    """Parse a FASTA file into a dict of ``{record id: sequence text}``.

    The record id is the header line with surrounding whitespace removed,
    taken after the leading ``>`` (and cut at a second ``>`` if one occurs).
    Sequence lines are concatenated exactly as read, so line breaks are
    preserved and left for the caller to strip.  If the same id appears more
    than once, the later record replaces the earlier one.

    Blank lines before the first header are ignored.

    Raises:
        ValueError: if non-blank sequence text appears before any header.
    """
    pieces = {}     # record id -> list of raw sequence lines
    current = None  # line list of the record currently being read
    # The context manager guarantees the handle is closed, also on errors.
    with open(filename, 'r') as handle:
        for line in handle:
            if line.startswith('>'):
                ID = line.strip().split('>', 2)[1]
                current = pieces[ID] = []
            elif current is not None:
                current.append(line)
            elif line.strip():
                raise ValueError(
                    "sequence data found before the first FASTA header in %r" % (filename,)
                )
    # Join once per record instead of growing a string line by line.
    return {ID: ''.join(lines) for ID, lines in pieces.items()}

#sliding window
def slidingWindow(seq, l, win, step=1):
    """Cut *seq* into windows of *win* characters, advancing *step* each time.

    ``l`` is the sequence length as supplied by the caller.  When
    ``win >= l`` the sequence itself is returned unchanged (a string, not a
    list).  Otherwise a list of full-width windows is returned; when
    ``l - win`` is not a multiple of *step*, one extra window holding the
    last *win* characters is appended so the tail is covered as well.
    """
    remainder = (l - win) % step
    if win >= l:
        return seq

    windows = []
    pos = 0
    while True:
        piece = seq[pos:pos + win]
        # Stop at the first slice that is not a full window.
        if len(piece) != win:
            break
        windows.append(piece)
        pos += step
    if remainder > 0:
        windows.append(seq[(l - win):])
    return windows
 
#calculate the frequency of the triplet
def stat3mer(seq, l):
    """Return the relative frequency of every triplet in ``TRIPLETS`` in *seq*.

    Every overlapping 3-character window of *seq* is counted once, and each
    count is divided by ``l - 2``, the number of such windows when ``l`` is
    the length of *seq* (which is what the caller in this file passes).
    Windows that are not listed in ``TRIPLETS`` are ignored.

    Sequences shorter than three characters contain no triplet, so every
    frequency is ``0.0`` for them (instead of dividing by zero).

    The module-level ``mers`` table is refreshed with the raw counts, as
    before.

    Returns a dict mapping each triplet to its frequency as a float.
    """
    from collections import Counter

    # One pass over the sequence instead of one full scan per triplet.
    counts = Counter(seq[i:i + 3] for i in range(len(seq) - 2))
    num3mer = float(l - 2)

    freq = {}
    for triplet in TRIPLETS:
        mers[triplet] = counts[triplet]
        freq[triplet] = mers[triplet]/num3mer if num3mer > 0 else 0.0
    return freq


if __name__ == '__main__':

    lnc_trans_path = './human/filtered_lnc_trans.csv'
    MFE_path = './human/trans_MFE.csv'
    seq_path = './human/transcript_sequences.fasta'

    lnc_transID = readlnc_transID(lnc_trans_path)
    trans_mfe = readlnc_mfe(MFE_path)
    trans_seq = readfasta(seq_path)

    # Parameter settings of GIC: the intercept and the coefficients of the
    # logistic-regression model, keyed by feature name.
    LRMODEL_FEATURES_7 = ['intercept', 'length', 'mfe/L', 'cga', 'gcg', 'tcg', 'acg', 'tca']
    LRMODEL_COEFS_7 = [0.7417, 2.612e-04, 4.295, 48.66, 15.64, 76.23, -1.113, -60.29]
    LRMODEL_7 = dict(zip(LRMODEL_FEATURES_7, LRMODEL_COEFS_7))

    # Compute the feature values of each transcript.
    features_trans = {}  # transcript id -> feature dict
    # The loop variable is not called `id`: at notebook/module scope that
    # would overwrite the builtin for everything that runs afterwards.
    for trans_id, seq in tqdm(trans_seq.items(), desc="Processing Transcripts"):
        if trans_id not in trans_mfe:
            continue  # no MFE value for this transcript
        # readfasta keeps line breaks; the triplets are lower-case.
        seq = seq.replace('\n', '').lower()
        length = len(seq)
        feature = {
            'trans': trans_id,
            'length': length,
            'mfe/L': trans_mfe[trans_id]/length,
        }
        freq = stat3mer(seq, length)
        for item in TRIPLETS:
            feature[item] = freq[item]
        features_trans[trans_id] = feature

    # Compute the feature values and the GIC score of each lncRNA.
    lncRNA_GIC_score = {}  # lncRNA id -> GIC score
    for lnc, transcripts in lnc_transID.items():
        # A lncRNA is scored only when every one of its transcripts has features.
        if any(trans not in features_trans for trans in transcripts):
            continue

        # Sum the features over the transcripts ...
        feature = {'length': 0, 'mfe/L': 0}
        for item in TRIPLETS:
            feature[item] = 0
        for trans in transcripts:
            tranfeature = features_trans[trans]
            feature['length'] += tranfeature['length']
            feature['mfe/L'] += tranfeature['mfe/L']
            for item in TRIPLETS:
                feature[item] += tranfeature[item]

        # ... and turn the sums into per-lncRNA means.
        n_trans = len(transcripts)
        feature['lncRNA_id'] = lnc
        feature['length'] = feature['length']/n_trans
        feature['mfe/L'] = feature['mfe/L']/n_trans
        for item in TRIPLETS:
            feature[item] = feature[item]/n_trans

        # Logistic model: GIC = exp(tmp) / (exp(tmp) + 1).
        tmp = LRMODEL_7['intercept']+sum([feature[_]*LRMODEL_7[_] for _ in LRMODEL_FEATURES_7[1:]])
        try:
            odds = math.exp(tmp)
        except OverflowError:
            # exp() overflows a double for very large tmp; the logistic
            # value is 1.0 to double precision there.
            gic = 1.0
        else:
            gic = odds/(odds+1)
        feature['GIC_score'] = gic
        lncRNA_GIC_score[lnc] = feature['GIC_score']

    # Sort all lncRNAs by GIC score, highest first.
    lncRNA_GIC_score = sorted(lncRNA_GIC_score.items(), key=lambda d: d[1], reverse=True)

    # Convert the lncRNA GIC scores to a DataFrame
    lncRNA_scores_df_sorted = pd.DataFrame(lncRNA_GIC_score, columns=['lncRNA_id', 'GIC_score'])

    # Write the sorted results to a CSV file
    sorted_GIC_path = './human/sorted_GIC_score.csv'
    lncRNA_scores_df_sorted.to_csv(sorted_GIC_path, index=False)

    print("GIC scores have been calculated and saved successfully.")
    print(f"Output file is located at: {sorted_GIC_path}")

Processing Transcripts: 100%|██████████| 163300/163300 [02:02<00:00, 1330.57it/s]


GIC scores have been calculated and saved successfully.
Output file is located at: ./human/sorted_GIC_score.csv


In [2]:
# Mouse

import math
import pandas as pd
from tqdm import tqdm


# All 64 nucleotide triplets over the lower-case alphabet, enumerated with
# the first base varying slowest and in a/t/c/g order.
BASES = ['a', 't', 'c', 'g']
TRIPLETS = [
    first + second + third
    for first in BASES
    for second in BASES
    for third in BASES
]

# Shared per-triplet count table; it starts out empty and is (re)filled by
# stat3mer on every call.
mers = {}

# obtain the corresponding relationship between lncRNA and transcript
def readlnc_transID(filename):
    """Load the lncRNA/transcript table and group transcript ids by lncRNA.

    The CSV is read without a header row; its two columns are taken as the
    lncRNA id and the transcript id, in that order.

    Returns a dict mapping each lncRNA id to the list of its transcript ids.
    """
    table = pd.read_csv(filename, header=None, names=['lncRNA_id', 'transcript_id'])
    transcripts_by_lnc = table.groupby('lncRNA_id')['transcript_id'].agg(list)
    return transcripts_by_lnc.to_dict()

# Get the secondary structure minimum free energy of each transcript of lncRNA
def readlnc_mfe(filename):
    """Load the per-transcript minimum free energy (MFE) table.

    The CSV must provide the columns ``transcript_id`` and ``MFE``.

    Returns a dict mapping each transcript id to its MFE value.
    """
    table = pd.read_csv(filename)
    mfe_by_transcript = table.set_index('transcript_id')['MFE']
    return mfe_by_transcript.to_dict()

# Get transcript sequence
def readfasta(filename):
    """Parse a FASTA file into a dict of ``{record id: sequence text}``.

    The record id is the header line with surrounding whitespace removed,
    taken after the leading ``>`` (and cut at a second ``>`` if one occurs).
    Sequence lines are concatenated exactly as read, so line breaks are
    preserved and left for the caller to strip.  If the same id appears more
    than once, the later record replaces the earlier one.

    Blank lines before the first header are ignored.

    Raises:
        ValueError: if non-blank sequence text appears before any header.
    """
    pieces = {}     # record id -> list of raw sequence lines
    current = None  # line list of the record currently being read
    # The context manager guarantees the handle is closed, also on errors.
    with open(filename, 'r') as handle:
        for line in handle:
            if line.startswith('>'):
                ID = line.strip().split('>', 2)[1]
                current = pieces[ID] = []
            elif current is not None:
                current.append(line)
            elif line.strip():
                raise ValueError(
                    "sequence data found before the first FASTA header in %r" % (filename,)
                )
    # Join once per record instead of growing a string line by line.
    return {ID: ''.join(lines) for ID, lines in pieces.items()}

#sliding window
def slidingWindow(seq, l, win, step=1):
    """Cut *seq* into windows of *win* characters, advancing *step* each time.

    ``l`` is the sequence length as supplied by the caller.  When
    ``win >= l`` the sequence itself is returned unchanged (a string, not a
    list).  Otherwise a list of full-width windows is returned; when
    ``l - win`` is not a multiple of *step*, one extra window holding the
    last *win* characters is appended so the tail is covered as well.
    """
    remainder = (l - win) % step
    if win >= l:
        return seq

    windows = []
    pos = 0
    while True:
        piece = seq[pos:pos + win]
        # Stop at the first slice that is not a full window.
        if len(piece) != win:
            break
        windows.append(piece)
        pos += step
    if remainder > 0:
        windows.append(seq[(l - win):])
    return windows
 
#calculate the frequency of the triplet
def stat3mer(seq, l):
    """Return the relative frequency of every triplet in ``TRIPLETS`` in *seq*.

    Every overlapping 3-character window of *seq* is counted once, and each
    count is divided by ``l - 2``, the number of such windows when ``l`` is
    the length of *seq* (which is what the caller in this file passes).
    Windows that are not listed in ``TRIPLETS`` are ignored.

    Sequences shorter than three characters contain no triplet, so every
    frequency is ``0.0`` for them (instead of dividing by zero).

    The module-level ``mers`` table is refreshed with the raw counts, as
    before.

    Returns a dict mapping each triplet to its frequency as a float.
    """
    from collections import Counter

    # One pass over the sequence instead of one full scan per triplet.
    counts = Counter(seq[i:i + 3] for i in range(len(seq) - 2))
    num3mer = float(l - 2)

    freq = {}
    for triplet in TRIPLETS:
        mers[triplet] = counts[triplet]
        freq[triplet] = mers[triplet]/num3mer if num3mer > 0 else 0.0
    return freq


if __name__ == '__main__':

    lnc_trans_path = './mouse/filtered_lnc_trans.csv'
    MFE_path = './mouse/trans_MFE.csv'
    seq_path = './mouse/transcript_sequences.fasta'

    lnc_transID = readlnc_transID(lnc_trans_path)
    trans_mfe = readlnc_mfe(MFE_path)
    trans_seq = readfasta(seq_path)

    # Parameter settings of GIC: the intercept and the coefficients of the
    # logistic-regression model, keyed by feature name.
    LRMODEL_FEATURES_7 = ['intercept', 'length', 'mfe/L', 'cga', 'gcg', 'tcg', 'acg', 'tca']
    LRMODEL_COEFS_7 = [0.1625, 2.638e-04, 2.194, 19.88, 37.59, 50.37, 35.44, -64.66]
    LRMODEL_7 = dict(zip(LRMODEL_FEATURES_7, LRMODEL_COEFS_7))

    # Compute the feature values of each transcript.
    features_trans = {}  # transcript id -> feature dict
    # The loop variable is not called `id`: at notebook/module scope that
    # would overwrite the builtin for everything that runs afterwards.
    for trans_id, seq in tqdm(trans_seq.items(), desc="Processing Transcripts"):
        if trans_id not in trans_mfe:
            continue  # no MFE value for this transcript
        # readfasta keeps line breaks; the triplets are lower-case.
        seq = seq.replace('\n', '').lower()
        length = len(seq)
        feature = {
            'trans': trans_id,
            'length': length,
            'mfe/L': trans_mfe[trans_id]/length,
        }
        freq = stat3mer(seq, length)
        for item in TRIPLETS:
            feature[item] = freq[item]
        features_trans[trans_id] = feature

    # Compute the feature values and the GIC score of each lncRNA.
    lncRNA_GIC_score = {}  # lncRNA id -> GIC score
    for lnc, transcripts in lnc_transID.items():
        # A lncRNA is scored only when every one of its transcripts has features.
        if any(trans not in features_trans for trans in transcripts):
            continue

        # Sum the features over the transcripts ...
        feature = {'length': 0, 'mfe/L': 0}
        for item in TRIPLETS:
            feature[item] = 0
        for trans in transcripts:
            tranfeature = features_trans[trans]
            feature['length'] += tranfeature['length']
            feature['mfe/L'] += tranfeature['mfe/L']
            for item in TRIPLETS:
                feature[item] += tranfeature[item]

        # ... and turn the sums into per-lncRNA means.
        n_trans = len(transcripts)
        feature['lncRNA_id'] = lnc
        feature['length'] = feature['length']/n_trans
        feature['mfe/L'] = feature['mfe/L']/n_trans
        for item in TRIPLETS:
            feature[item] = feature[item]/n_trans

        # Logistic model: GIC = exp(tmp) / (exp(tmp) + 1).
        tmp = LRMODEL_7['intercept']+sum([feature[_]*LRMODEL_7[_] for _ in LRMODEL_FEATURES_7[1:]])
        try:
            odds = math.exp(tmp)
        except OverflowError:
            # exp() overflows a double for very large tmp; the logistic
            # value is 1.0 to double precision there.
            gic = 1.0
        else:
            gic = odds/(odds+1)
        feature['GIC_score'] = gic
        lncRNA_GIC_score[lnc] = feature['GIC_score']

    # Sort all lncRNAs by GIC score, highest first.
    lncRNA_GIC_score = sorted(lncRNA_GIC_score.items(), key=lambda d: d[1], reverse=True)

    # Convert the lncRNA GIC scores to a DataFrame
    lncRNA_scores_df_sorted = pd.DataFrame(lncRNA_GIC_score, columns=['lncRNA_id', 'GIC_score'])

    # Write the sorted results to a CSV file
    sorted_GIC_path = './mouse/sorted_GIC_score.csv'
    lncRNA_scores_df_sorted.to_csv(sorted_GIC_path, index=False)

    print("GIC scores have been calculated and saved successfully.")
    print(f"Output file is located at: {sorted_GIC_path}")

Processing Transcripts: 100%|██████████| 56374/56374 [01:23<00:00, 675.32it/s] 


GIC scores have been calculated and saved successfully.
Output file is located at: ./mouse/sorted_GIC_score.csv


### Get essential lncRNA samples and non-essential lncRNA samples

In [3]:
import pandas as pd

# Columns of the raw essential-lncRNA table that are carried over.
kept_columns = [
    "Noncode_id", "target", "gene_name", "ensembl_id", "Organism",
    "cancer_related", "disease_related", "vivo", "vitro",
]
esslnc = pd.read_csv("../../data/raw/esslnc.csv")
esslnc = esslnc[kept_columns]

# Split the table by organism.
organism = esslnc['Organism']
human_esslnc = esslnc.loc[organism == 'Human']
mouse_esslnc = esslnc.loc[organism == 'Mouse']

# Write one benchmark file per organism, without the index column.
human_esslnc.to_csv('../../data/benchmark/human/human_esslnc.csv', index=False)
mouse_esslnc.to_csv('../../data/benchmark/mouse/mouse_esslnc.csv', index=False)

In [None]:
# Human
import pandas as pd

# Load the essential-lncRNA table, the lncRNA table and the annotation table.
ess_lnc = pd.read_csv("../../data/benchmark/human/human_esslnc.csv")
lncRNA = pd.read_csv("../../data/LPI/human/lncRNA.csv")
annotation = pd.read_csv("../../annotate/human/valid_heart_annotation.csv")

# Keep only the lncRNAs whose id also occurs in the annotation table; the
# copy lets columns be added later without touching `lncRNA`.
has_annotation = lncRNA['lncRNA_id'].isin(annotation['lncRNA_id'])
annotated_lncRNA = lncRNA.loc[has_annotation].copy()

# Reduce each Noncode id to the part before its first '.'.
noncode_parts = ess_lnc['Noncode_id'].str.split('.')
ess_lnc['Noncode_id'] = noncode_parts.str[0]
def is_essential(id, name):
    """Return 1 if the lncRNA is listed in the module-level ``ess_lnc`` table, else 0.

    ``id`` is looked up in the ``Noncode_id`` and ``ensembl_id`` columns and
    ``name`` in the ``gene_name`` and ``target`` columns; a hit in any of
    them counts.  ``ess_lnc`` is read at call time.

    A value of ``'-'`` or a missing value (NaN/None) means "no identifier"
    and is never matched.  The missing-value check matters because
    ``Series.isin([nan])`` is true for NaN cells, so a missing id would
    otherwise match every essential entry that lacks that field.
    """
    def _listed(value, columns):
        # Skip placeholder and missing identifiers before comparing.
        if pd.isna(value) or value == '-':
            return False
        return any(ess_lnc[col].isin([value]).any() for col in columns)

    if _listed(id, ['Noncode_id', 'ensembl_id']):
        return 1
    if _listed(name, ['gene_name', 'target']):
        return 1
    return 0

# Flag every annotated lncRNA as essential (1) or not (0) from its gene id
# and gene name.
essential_flags = annotated_lncRNA.apply(
    lambda row: is_essential(row['gene_id'], row['gene_name']),
    axis=1,
)
annotated_lncRNA.loc[:, 'essential'] = essential_flags

# Essential samples: the ids of the flagged lncRNAs.
ess_lnc = annotated_lncRNA.loc[annotated_lncRNA['essential'] == 1, ['lncRNA_id']]
ess_lnc.to_csv('../../data/benchmark/human/ess_lnc.csv', index=False)

lnc_GIC = pd.read_csv("./human/sorted_GIC_score.csv")

# Candidates for the non-essential set: scored lncRNAs that are annotated
# but not flagged as essential.
scored_ids = lnc_GIC['lncRNA_id']
is_flagged = scored_ids.isin(ess_lnc['lncRNA_id'])
is_annotated = scored_ids.isin(annotated_lncRNA['lncRNA_id'])
unlabel_lnc = lnc_GIC[~is_flagged & is_annotated]

ess_counts = len(ess_lnc)

# Non-essential samples: the last `ess_counts` candidates in file order, so
# both sets have the same size.
noness_lnc = unlabel_lnc[['lncRNA_id']].tail(ess_counts).reset_index(drop=True)

noness_lnc.to_csv("../../data/benchmark/human/noness_lnc.csv", index=False)

In [None]:
# Mouse
import pandas as pd

# Load the essential-lncRNA table, the lncRNA table and the annotation table.
ess_lnc = pd.read_csv("../../data/benchmark/mouse/mouse_esslnc.csv")
lncRNA = pd.read_csv("../../data/LPI/mouse/lncRNA.csv")
annotation = pd.read_csv("../../annotate/mouse/valid_heart_annotation.csv")

# Keep only the lncRNAs whose id also occurs in the annotation table; the
# copy lets columns be added later without touching `lncRNA`.
has_annotation = lncRNA['lncRNA_id'].isin(annotation['lncRNA_id'])
annotated_lncRNA = lncRNA.loc[has_annotation].copy()

# Reduce each Noncode id to the part before its first '.'.
noncode_parts = ess_lnc['Noncode_id'].str.split('.')
ess_lnc['Noncode_id'] = noncode_parts.str[0]
def is_essential(id, name):
    """Return 1 if the lncRNA is listed in the module-level ``ess_lnc`` table, else 0.

    ``id`` is looked up in the ``Noncode_id`` and ``ensembl_id`` columns and
    ``name`` in the ``gene_name`` and ``target`` columns; a hit in any of
    them counts.  ``ess_lnc`` is read at call time.

    A value of ``'-'`` or a missing value (NaN/None) means "no identifier"
    and is never matched.  The missing-value check matters because
    ``Series.isin([nan])`` is true for NaN cells, so a missing id would
    otherwise match every essential entry that lacks that field.
    """
    def _listed(value, columns):
        # Skip placeholder and missing identifiers before comparing.
        if pd.isna(value) or value == '-':
            return False
        return any(ess_lnc[col].isin([value]).any() for col in columns)

    if _listed(id, ['Noncode_id', 'ensembl_id']):
        return 1
    if _listed(name, ['gene_name', 'target']):
        return 1
    return 0

# Flag every annotated lncRNA as essential (1) or not (0) from its gene id
# and gene name.
essential_flags = annotated_lncRNA.apply(
    lambda row: is_essential(row['gene_id'], row['gene_name']),
    axis=1,
)
annotated_lncRNA.loc[:, 'essential'] = essential_flags

# Essential samples: the ids of the flagged lncRNAs.
ess_lnc = annotated_lncRNA.loc[annotated_lncRNA['essential'] == 1, ['lncRNA_id']]
ess_lnc.to_csv('../../data/benchmark/mouse/ess_lnc.csv', index=False)

lnc_GIC = pd.read_csv("./mouse/sorted_GIC_score.csv")

# Candidates for the non-essential set: scored lncRNAs that are annotated
# but not flagged as essential.
scored_ids = lnc_GIC['lncRNA_id']
is_flagged = scored_ids.isin(ess_lnc['lncRNA_id'])
is_annotated = scored_ids.isin(annotated_lncRNA['lncRNA_id'])
unlabel_lnc = lnc_GIC[~is_flagged & is_annotated]

ess_counts = len(ess_lnc)

# Non-essential samples: the last `ess_counts` candidates in file order, so
# both sets have the same size.
noness_lnc = unlabel_lnc[['lncRNA_id']].tail(ess_counts).reset_index(drop=True)

noness_lnc.to_csv("../../data/benchmark/mouse/noness_lnc.csv", index=False)