In [1]:
using CSV, DataFrames
using LightGraphs

## Load data

In [2]:
# Load the gene -> TF association table (columns: gene_id, tf_hgnc_id, post_prob).
# `DataFrame(CSV.File(path))` replaces the sink-less `CSV.read(path)`, which was
# deprecated in CSV.jl 0.7 and later removed; this form works on old and new releases.
gene_tf = DataFrame(CSV.File("../data/fantom5_cat/gene_tf_map.csv"))
first(gene_tf, 10)

Unnamed: 0_level_0,gene_id,tf_hgnc_id,post_prob
Unnamed: 0_level_1,String,String,Float64
1,CATG00000000004.1,HGNC:11241,0.104402
2,CATG00000000004.1,HGNC:11291,0.278845
3,CATG00000000004.1,HGNC:16736,0.333713
4,CATG00000000004.1,HGNC:18562,0.120572
5,CATG00000000004.1,HGNC:4170,0.324268
6,CATG00000000004.1,HGNC:7566,0.356093
7,CATG00000000004.1,HGNC:7786,0.122919
8,CATG00000000004.1,HGNC:7794,0.139591
9,CATG00000000004.1,HGNC:7804,0.213821
10,CATG00000000004.1,HGNC:7978,0.14079


In [3]:
# Load the TF list (columns: gene_id, hgnc_id, is_autoregulatory).
# `DataFrame(CSV.File(path))` replaces the sink-less `CSV.read(path)`, which was
# deprecated in CSV.jl 0.7 and later removed; this form works on old and new releases.
tf = DataFrame(CSV.File("../data/fantom5_cat/tf_list.csv"))
first(tf, 10)

Unnamed: 0_level_0,gene_id,hgnc_id,is_autoregulatory
Unnamed: 0_level_1,String,String,String
1,ENSG00000151702.12,HGNC:3749,f
2,ENSG00000158711.9,HGNC:3326,t
3,ENSG00000137203.6,HGNC:11742,t
4,ENSG00000068305.13,HGNC:6993,t
5,ENSG00000091831.17,HGNC:3467,t
6,ENSG00000131759.13,HGNC:9864,t
7,ENSG00000177932.6,HGNC:16736,f
8,ENSG00000163064.6,HGNC:3342,f
9,ENSG00000166478.5,HGNC:12928,t
10,ENSG00000139515.5,HGNC:6107,t


In [4]:
# Distinct HGNC ids of the TFs in the TF list.
tf_set = Set(tf[!, :hgnc_id])
length(tf_set)

1990

## Map HGNC ids to gene ids

In [5]:
# Lookup table from a TF's HGNC id to its gene id, built by pairing the
# `hgnc_id` and `gene_id` columns of `tf` row by row.
hgnc2id = Dict(zip(tf[!, :hgnc_id], tf[!, :gene_id]))

Dict{String,String} with 1990 entries:
  "HGNC:16960" => "ENSG00000138136.5"
  "HGNC:6876"  => "ENSG00000112062.6"
  "HGNC:3490"  => "ENSG00000006468.9"
  "HGNC:29158" => "ENSG00000064932.11"
  "HGNC:18039" => "ENSG00000117139.12"
  "HGNC:6347"  => "ENSG00000127528.5"
  "HGNC:3436"  => "ENSG00000175595.10"
  "HGNC:16806" => "ENSG00000104517.8"
  "HGNC:12892" => "ENSG00000062370.12"
  "HGNC:3093"  => "ENSG00000127334.10"
  "HGNC:18348" => "ENSG00000127666.8"
  "HGNC:3115"  => "ENSG00000112242.10"
  "HGNC:13075" => "ENSG00000162702.7"
  "HGNC:13077" => "ENSG00000167637.12"
  "HGNC:28495" => "ENSG00000176083.13"
  "HGNC:25741" => "ENSG00000198093.6"
  "HGNC:11715" => "ENSG00000074219.9"
  "HGNC:13198" => "ENSG00000198205.5"
  "HGNC:18320" => "ENSG00000165244.6"
  "HGNC:30729" => "ENSG00000178662.11"
  "HGNC:14625" => "ENSG00000170779.10"
  "HGNC:9189"  => "ENSG00000102978.8"
  "HGNC:24787" => "ENSG00000172888.7"
  "HGNC:34032" => "ENSG00000185869.9"
  "HGNC:23508" => "ENSG00000165730.10"


In [6]:
# Keep only the rows whose TF HGNC id is in tf_set; rows that reference a TF
# missing from the TF list are dropped.
gene_tf = gene_tf[in.(gene_tf[!, :tf_hgnc_id], Ref(tf_set)), :]

Unnamed: 0_level_0,gene_id,tf_hgnc_id,post_prob
Unnamed: 0_level_1,String,String,Float64
1,CATG00000000004.1,HGNC:11241,0.104402
2,CATG00000000004.1,HGNC:11291,0.278845
3,CATG00000000004.1,HGNC:16736,0.333713
4,CATG00000000004.1,HGNC:18562,0.120572
5,CATG00000000004.1,HGNC:4170,0.324268
6,CATG00000000004.1,HGNC:7566,0.356093
7,CATG00000000004.1,HGNC:7786,0.122919
8,CATG00000000004.1,HGNC:7794,0.139591
9,CATG00000000004.1,HGNC:7804,0.213821
10,CATG00000000004.1,HGNC:7978,0.14079


In [7]:
# Translate each row's TF HGNC id into the TF's gene id and store it as the
# new `tf_id` column. An id that is not a key of hgnc2id raises a KeyError.
gene_tf[!, :tf_id] = [hgnc2id[hgnc] for hgnc in gene_tf[!, :tf_hgnc_id]]
first(gene_tf, 10)

Unnamed: 0_level_0,gene_id,tf_hgnc_id,post_prob,tf_id
Unnamed: 0_level_1,String,String,Float64,String
1,CATG00000000004.1,HGNC:11241,0.104402,ENSG00000066336.7
2,CATG00000000004.1,HGNC:11291,0.278845,ENSG00000112658.6
3,CATG00000000004.1,HGNC:16736,0.333713,ENSG00000177932.6
4,CATG00000000004.1,HGNC:18562,0.120572,ENSG00000163497.2
5,CATG00000000004.1,HGNC:4170,0.324268,ENSG00000102145.9
6,CATG00000000004.1,HGNC:7566,0.356093,ENSG00000111046.3
7,CATG00000000004.1,HGNC:7786,0.122919,ENSG00000141905.13
8,CATG00000000004.1,HGNC:7794,0.139591,ENSG00000109320.7
9,CATG00000000004.1,HGNC:7804,0.213821,ENSG00000001167.10
10,CATG00000000004.1,HGNC:7978,0.14079,ENSG00000113580.10


In [8]:
# Distinct gene ids appearing in the `gene_id` column of gene_tf.
gene_set = Set(gene_tf[!, :gene_id])
length(gene_set)

42140

In [9]:
# NOTE: this rebinds tf_set. From here on it holds the distinct TF *gene* ids
# found in gene_tf, no longer the HGNC ids taken from the TF list.
tf_set = Set(gene_tf[!, :tf_id])
length(tf_set)

117

## Check that the gene set includes every TF

In [10]:
# true only if every TF gene id is already a member of gene_set.
tf_set ⊆ gene_set

false

In [11]:
# Add the TF gene ids that are not yet in gene_set, so every TF gets a vertex.
gene_set = union(gene_set, tf_set)

Set(["ENSG00000007541.10", "ENSG00000100228.8", "ENSG00000213005.2", "ENSG00000183484.7", "ENSG00000251655.2", "CATG00000029625.1", "CATG00000109254.1", "CATG00000050206.1", "ENSG00000261441.1", "ENSG00000205730.5"  …  "ENSG00000129270.11", "ENSG00000234474.2", "ENSG00000235859.4", "CATG00000077874.1", "ENSG00000258973.1", "ENSG00000197976.6", "ENSG00000168779.15", "ENSG00000233839.1", "ENSG00000264608.1", "CATG00000083902.1", "CATG00000096583.1"])

## Make sorted gene list

In [12]:
# Rebind gene_set from a Set to a sorted Vector of gene ids; a gene's position
# in this vector is used below as its vertex number.
gene_set = sort!(collect(gene_set))

42141-element Array{String,1}:
 "CATG00000000004.1"
 "CATG00000000008.1"
 "CATG00000000010.1"
 "CATG00000000011.1"
 "CATG00000000018.1"
 "CATG00000000020.1"
 "CATG00000000025.1"
 "CATG00000000026.1"
 "CATG00000000027.1"
 "CATG00000000031.1"
 "CATG00000000068.1"
 "CATG00000000070.1"
 "CATG00000000075.1"
 ⋮                  
 "ENSG00000273437.1"
 "ENSG00000273439.1"
 "ENSG00000273443.1"
 "ENSG00000273451.1"
 "ENSG00000273456.1"
 "ENSG00000273464.1"
 "ENSG00000273472.1"
 "ENSG00000273473.1"
 "ENSG00000273481.1"
 "ENSG00000273492.1"
 "ENSGR0000169100.8"
 "ENSGR0000225661.2"

In [13]:
# Map each gene id to its 1-based position in gene_set.
gene2num = Dict(zip(gene_set, 1:length(gene_set)))

Dict{String,Int64} with 42141 entries:
  "ENSG00000007541.10" => 14631
  "ENSG00000100228.8"  => 16663
  "ENSG00000213005.2"  => 32419
  "ENSG00000183484.7"  => 29184
  "ENSG00000264608.1"  => 40586
  "CATG00000029625.1"  => 3568
  "CATG00000050206.1"  => 6426
  "CATG00000109254.1"  => 13338
  "ENSG00000205730.5"  => 31913
  "ENSG00000261441.1"  => 40169
  "CATG00000004484.1"  => 523
  "ENSG00000224276.1"  => 33383
  "CATG00000055094.1"  => 7086
  "CATG00000056050.1"  => 7228
  "CATG00000014732.1"  => 1805
  "ENSG00000196109.6"  => 30389
  "ENSG00000273176.1"  => 42069
  "ENSG00000064655.14" => 15334
  "ENSG00000087589.12" => 16192
  "ENSG00000223597.2"  => 33226
  "CATG00000078853.1"  => 9768
  "ENSG00000241720.2"  => 36683
  "CATG00000029808.1"  => 3592
  "ENSG00000140718.14" => 22329
  "CATG00000003309.1"  => 420
  ⋮                    => ⋮

## Make simple directed graph

In [14]:
# Empty directed graph with one vertex per entry of gene_set and no edges yet.
dg = SimpleDiGraph{Int}(length(gene_set))

{42141, 0} directed simple Int64 graph

In [15]:
# Add one directed edge TF -> target gene for every row of gene_tf.
#
# The body deliberately performs no assignments: in a notebook (soft scope) an
# assignment such as `tf = ...` inside this top-level loop would overwrite the
# global `tf` DataFrame with an integer and break re-running earlier cells.
# The destructured loop variables are always local to the loop.
for (target_id, regulator_id) in zip(gene_tf[!, :gene_id], gene_tf[!, :tf_id])
    # Vertex numbers come from gene2num; the edge points from the TF to the gene.
    add_edge!(dg, gene2num[regulator_id], gene2num[target_id])
end

In [16]:
# Display the graph summary to check the vertex and edge counts after insertion.
dg

{42141, 2067895} directed simple Int64 graph

In [17]:
using JLD2

In [18]:
# Persist the results to two JLD2 files: the directed TF -> gene graph, and the
# sorted gene list whose positions are the vertex numbers used in `dg`.
@save "../results/tf_gene_network.jld2" dg
@save "../results/gene_set.jld2" gene_set