In [1]:
from collections import Counter

import pandas as pd

In [2]:
# ------------------------------------------------------------------
# Load the regulatory network edge list
# ------------------------------------------------------------------
# Tab-separated file; the rendered frame shows TF, target and importance columns.
networkDS = pd.read_csv('./data/adjDataset.tsv', delimiter='\t')
networkDS

Unnamed: 0,TF,target,importance
0,RPL6,RPL9,4.349956e+02
1,RPL6,RPL5,4.060449e+02
2,RPL6,RPS3A,3.924514e+02
3,RPL6,RPL18,3.697235e+02
4,RPL6,RPL13,3.601602e+02
...,...,...,...
183202,GTF3A,WDR47,4.335981e-10
183203,GTPBP6,RAP1GAP2,3.715861e-10
183204,GTPBP6,ADAMTS19,3.431918e-10
183205,SMARCB1,CCDC170,1.186833e-10


In [3]:
# ------------------------------------------------------------------
# Gene lists
# ------------------------------------------------------------------

# HSA21 genes: keep every hgnc_symbol entry whose string form is not 'nan'
HSA21genesDataframe = pd.read_csv("./data/HSA21_genes_biomaRt_conversion.csv")
HSA21genes = list(
    filter(lambda symbol: str(symbol) != 'nan', HSA21genesDataframe["hgnc_symbol"])
)
len(HSA21genes)

225

In [4]:
# Second HSA21 source: again keep hgnc_symbol entries whose string form is not 'nan'
HSA21genesDataframeFull = pd.read_csv("./data/fullHSA21genes.csv")
HSA21genesFull = list(
    filter(lambda symbol: str(symbol) != 'nan', HSA21genesDataframeFull["hgnc_symbol"])
)

# Merge the first list into the second one.
# `count` tallies genes already present; the others are appended in order.
count = 0
for geneA in HSA21genes:
    if geneA not in HSA21genesFull:
        HSA21genesFull.append(geneA)
        continue
    count += 1

# From here on HSA21genes refers to the merged list
HSA21genes = HSA21genesFull
len(HSA21genes)

259

In [5]:
# Import Transcription Factors
# Read inside a context manager so the file handle is closed as soon as the
# content has been loaded, instead of being left open until garbage collection.
with open("./data/TFs.txt", "r") as tfFile:
    # Split on "\n" exactly as before: if the file ends with a newline the
    # last entry is an empty string, which is kept so the length is unchanged.
    TFsList = tfFile.read().split("\n")
len(TFsList)

1840

In [6]:
# HSA21 genes that are also listed as transcription factors (HSA21genes order kept)
HSA21_TFs = []
for gene in HSA21genes:
    if gene not in TFsList:
        continue
    HSA21_TFs.append(gene)

# Hand-curated regulator symbols.
# NOTE(review): each row looks like a group of aliases for one gene, to be confirmed;
# "NRIP1" appears twice in the last row.
HSA21Regulators = [
    "ZNF294", "LTN1", "RNF160",
    "ZNF295", "ZBTB21", "KIAA1227",
    "Pred65", "ZNF355P", "PRED65",
    "ZNF298", "PRDM15",
    "APECED",
    "KIAA0136", "MORC3", "ZCWCC3", "NXP2",
    "GCFC", "PAXBP1", "GCFC1",
    "SON", "NREBP", "BASS1",
    "PKNOX1", "PREP1",
    "HSF2BP", "MEILB2", "POF19",
    "NRIP1", "RIP140", "NRIP1",
]

# The curated list is currently NOT merged into HSA21_TFs (merge left disabled):
# HSA21_TFs += HSA21Regulators
HSA21_TFs

['PRDM15',
 'SOD1',
 'GABPA',
 'BACH1',
 'ERG',
 'ETS2',
 'RUNX1',
 'SIM2',
 'HLCS',
 'TFF3',
 'PKNOX1',
 'U2AF1',
 'AIRE',
 'OLIG1',
 'ADARB1',
 'OLIG2']

In [7]:
# Neurodevelopmental-disorder (NDD) genes: keep rows flagged as high confidence
neuroDDdf = pd.read_csv('./data/NDDgenes.csv')
isHighConfidence = neuroDDdf["High Confidence NDD genes"].eq(True)
neuroDDdf = neuroDDdf.loc[isHighConfidence]
neuroDD = neuroDDdf["Symbol"].to_list()
len(neuroDD)

1586

In [8]:
# Neurodevelopmental genes.
# No GO-term sub-selection is applied: every entry of the "Gene" column is kept.
neuroDevGenesDF = pd.read_csv('./data/NervousSystemDevelopmentGO.tsv', delimiter='\t')
neuroDevGenes = neuroDevGenesDF.loc[:, "Gene"].to_list()
len(neuroDevGenes)

4815

In [9]:
# Per-gene table indexed by gene symbol, plus a row-wise mean over all columns
regulatedGenes = pd.read_csv('./data/superGeneOrderDataframe-pval<0.01.csv').set_index('Gene')
regulatedGenes['Mean'] = regulatedGenes.mean(axis='columns')

# Lookup: gene symbol -> row mean
upDownRegulated = regulatedGenes['Mean'].to_dict()
regulatedGenes

Unnamed: 0_level_0,IN_2,IN_1,IN,nIN_1,nIN,nEN_1,nEN,IPC,MGE,RG,Mean
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
LINC01409,0.347271,-0.663075,-0.668930,0.855479,-2.944556,0.425603,0.174239,-0.126681,0.095841,-0.064386,-0.256919
LINC01128,0.345923,-2.846940,0.591948,0.402350,-2.859292,0.828230,0.712282,-1.413110,-0.098942,-0.012728,-0.435028
AL645608.6,1.217709,0.099018,-0.154763,1.322247,-1.647330,0.896042,0.543718,-0.211704,-0.814285,-0.147440,0.110321
SAMD11,-0.336866,-0.467202,-0.241348,0.353962,-1.370589,1.800136,0.165134,-0.555332,0.240605,-0.696171,-0.110767
NOC2L,-0.148234,-0.217521,-0.487569,0.522000,1.077892,0.529973,-0.147666,-0.142678,-0.488490,-0.114286,0.038342
...,...,...,...,...,...,...,...,...,...,...,...
MT-CYB,0.195976,-0.272919,0.199545,0.281428,-0.304296,0.310947,-0.177389,0.045330,0.097074,-0.038290,0.033741
AC011043.1,-1.000879,-0.105622,0.699476,0.987021,-2.839346,0.454375,0.246574,-0.028093,0.449371,0.094934,-0.104219
AL592183.1,-1.028316,-1.009383,-3.073957,-2.159972,-1.080623,-1.078441,-2.316533,-2.339817,-0.874355,-1.421793,-1.638319
AC240274.1,-0.065734,0.568319,0.523530,0.587644,-2.673716,0.299896,0.592940,0.474852,0.518068,0.487736,0.131354


In [10]:
# ------------------------------------------------------------------
# Gene-list annotations for the network
# ------------------------------------------------------------------

# Keep only edges whose importance is strictly greater than 50
networkDS = networkDS.loc[networkDS["importance"].gt(50)]

# Label -> gene list. Insertion order matters: the labelling loop further down
# keeps the LAST matching label for a gene.
listDict = dict(
    GeneralTF=TFsList,
    NeuroDev=neuroDevGenes,
    NeuroDisease=neuroDD,
    HSA21gene=HSA21genes,
    HSA21TF=HSA21_TFs,
)
networkDS

Unnamed: 0,TF,target,importance
0,RPL6,RPL9,434.995598
1,RPL6,RPL5,406.044919
2,RPL6,RPS3A,392.451358
3,RPL6,RPL18,369.723475
4,RPL6,RPL13,360.160228
...,...,...,...
1640,MEIS2,ZHX3,50.052399
1641,RBFOX2,KIRREL3,50.047622
1642,RPL6,GTF3A,50.043773
1643,PBX1,SORCS3,50.032963


In [11]:
# Count, for every TF, how many rows of the filtered network list it in "TF".
listOfTFs = networkDS["TF"].tolist()

# Counter tallies in a single pass; the former list.count() call per element
# rescanned the whole list each time (quadratic). Keys keep first-appearance
# order, matching the dict the original loop produced.
keyRegulators = dict(Counter(listOfTFs))

In [12]:
# Tabulate the per-TF counts, largest counts first
keyRegulatorsDataFrame = (
    pd.DataFrame(keyRegulators.items(), columns=['Gene', 'RegulatedGenes'])
    .sort_values(by='RegulatedGenes', ascending=False)
)
keyRegulatorsDataFrame

Unnamed: 0,Gene,RegulatedGenes
0,RPL6,151
4,YBX1,151
7,SOX4,135
18,CELF4,93
53,RBFOX2,92
...,...,...
63,SMARCB1,1
57,BHLHE22,1
38,GLIS3,1
21,ID1,1


In [23]:
# Keep regulators with more than 5 regulated genes.
# NOTE(review): the variable name says "Above30" but the cutoff applied is 5;
# the name is left as is because a later cell reads keyRegulatorsAbove30.
# NOTE(review): this cell's execution count (23) is out of sequence, so the
# saved output may not match a fresh top-to-bottom run; re-run to confirm.
keyRegulatorsAbove30 = keyRegulatorsDataFrame[keyRegulatorsDataFrame["RegulatedGenes"]>5]
keyRegulatorsAbove30

Unnamed: 0,Gene,RegulatedGenes
15,RPL6,159
26,RPS4X,134
25,RPL35,112
10,RPS10,109
92,YBX1,92
...,...,...
53,ZBTB20,6
52,PKM,6
35,ZNF76,6
165,ETV6,6


In [25]:
# Gene symbols that passed the cutoff above, as a plain list
keyRegulatorsAbove30.loc[:, "Gene"].tolist()

['RPL6',
 'RPS4X',
 'RPL35',
 'RPS10',
 'YBX1',
 'HMGB2',
 'HNRNPA1',
 'SOX4',
 'CELF4',
 'CREB5',
 'RBFOX2',
 'ZFHX3',
 'LHX1',
 'GLI3',
 'ENO1',
 'EBF1',
 'PBX3',
 'DLX5',
 'MEIS2',
 'HMGB1',
 'SOX11',
 'SIX3',
 'NFIB',
 'FEZ1',
 'TPI1',
 'NFIA',
 'BCL11B',
 'TRIM33',
 'CCDC25',
 'MEIS1',
 'ZIC1',
 'SRP9',
 'MNX1',
 'TCF12',
 'UQCRB',
 'DLX2',
 'H2AFZ',
 'ZNF362',
 'EZR',
 'NEUROD2',
 'CTCFL',
 'PRDX5',
 'MXD3',
 'SOX5',
 'ZNF93',
 'PPARG',
 'NR2F2',
 'HES6',
 'NR2F1',
 'FOXO1',
 'JUN',
 'RAD21',
 'SREBF2',
 'STAT1',
 'LEF1',
 'DMRT1',
 'ESRRG',
 'NHLH1',
 'HIVEP3',
 'YWHAE',
 'LMX1A',
 'HSPA5',
 'DTL',
 'ZBTB20',
 'PKM',
 'ZNF76',
 'ETV6',
 'EEF1D']

In [13]:
# Regulators with strictly more than 50 regulated genes
keyRegulatorsAbove50 = keyRegulatorsDataFrame.loc[
    keyRegulatorsDataFrame["RegulatedGenes"].gt(50)
]
keyRegulatorsAbove50

Unnamed: 0,Gene,RegulatedGenes
15,RPL6,159
26,RPS4X,134
25,RPL35,112
10,RPS10,109
92,YBX1,92
22,HMGB2,76
42,HNRNPA1,66
19,SOX4,66
16,CELF4,63


In [14]:
# Empty annotation table; its four columns are filled in by the following cells
labellingColumns = ['name', 'shared name', 'type', 'avgExpression']
labellingTable = pd.DataFrame(columns=labellingColumns)
labellingTable

Unnamed: 0,name,shared name,type,avgExpression


In [15]:
# Genes to label: the >50 regulators first, then the HSA21 genes
genesLabelled = keyRegulatorsAbove50["Gene"].to_list()
genesLabelled.extend(HSA21genes)

# Drop repeats while keeping the first occurrence and the original order
genesLabelled = list(dict.fromkeys(genesLabelled))

# Both naming columns carry the same symbols
for column in ("name", "shared name"):
    labellingTable[column] = genesLabelled
labellingTable

Unnamed: 0,name,shared name,type,avgExpression
0,RPL6,RPL6,,
1,RPS4X,RPS4X,,
2,RPL35,RPL35,,
3,RPS10,RPS10,,
4,YBX1,YBX1,,
...,...,...,...,...
262,H2BS1,H2BS1,,
263,CFAP298-TCP10L,CFAP298-TCP10L,,
264,FAM243B,FAM243B,,
265,GATD3B,GATD3B,,


In [16]:
# Annotate every labelled gene with a list-membership type and its mean value.
listOfGenes = labellingTable["name"].tolist()

# Build each membership set once so every per-gene lookup is O(1) instead of
# a scan through the full list.
membershipSets = {key: set(members) for key, members in listDict.items()}

Gene_Type = []
geneExpression = []

for gene in listOfGenes:
    # A gene may belong to several lists: the LAST matching key, in listDict
    # order, wins. The string "None" marks genes found in no list.
    value = "None"
    for key, members in membershipSets.items():
        if gene in members:
            value = key
    Gene_Type.append(value)
    # dict.get replaces the former bare `except:`, which would also have
    # silenced unrelated errors; only a missing gene yields the sentinel.
    geneExpression.append(upDownRegulated.get(gene, "Not Known"))

labellingTable["type"] = Gene_Type
labellingTable["avgExpression"] = geneExpression
labellingTable

Unnamed: 0,name,shared name,type,avgExpression
0,RPL6,RPL6,GeneralTF,-0.272356
1,RPS4X,RPS4X,GeneralTF,-0.162235
2,RPL35,RPL35,GeneralTF,-0.292019
3,RPS10,RPS10,GeneralTF,-0.09295
4,YBX1,YBX1,GeneralTF,-0.104825
...,...,...,...,...
262,H2BS1,H2BS1,HSA21gene,Not Known
263,CFAP298-TCP10L,CFAP298-TCP10L,HSA21gene,Not Known
264,FAM243B,FAM243B,HSA21gene,Not Known
265,GATD3B,GATD3B,HSA21gene,-0.560153


In [17]:
# Discard genes whose avgExpression is the "Not Known" sentinel
hasExpression = labellingTable["avgExpression"].ne("Not Known")
labellingTable = labellingTable.loc[hasExpression]
labellingTable

Unnamed: 0,name,shared name,type,avgExpression
0,RPL6,RPL6,GeneralTF,-0.272356
1,RPS4X,RPS4X,GeneralTF,-0.162235
2,RPL35,RPL35,GeneralTF,-0.292019
3,RPS10,RPS10,GeneralTF,-0.09295
4,YBX1,YBX1,GeneralTF,-0.104825
...,...,...,...,...
256,CFAP298,CFAP298,HSA21gene,0.13549
257,GATD3A,GATD3A,HSA21gene,3.181376
258,CFAP410,CFAP410,HSA21gene,-0.140809
260,GET1,GET1,HSA21gene,0.539401


In [18]:
# Newline-separated gene names (one per line) for the text export.
# The earlier `output = ""` initialisation was a dead store and is dropped.
output = '\n'.join(labellingTable["name"].to_list())

In [19]:
# Write the gene-name list to a text file in the working directory
with open('STRINGgenesKeyRegulators.txt', mode='w') as geneListFile:
    geneListFile.write(output)

In [20]:
# Save the annotation table as CSV, leaving out the row index
labellingTable.to_csv(path_or_buf="labellingTableSTRINGgenesKeyRegulators.csv", index=False)