In [4]:
import json
import os

import numpy as np
import pandas as pd

In [5]:
with open("encode_chip_seq_matrix.json") as f:
    encode_matrix = json.load(f)


In [6]:
encode_tf_list = [i["key"] for i in encode_matrix["facets"][6]["terms"]]

In [7]:
with open("/home/ubuntu/s3/jaspar_profile/names.json") as f:
    jaspar_meta = json.load(f)

In [8]:
# let's do this: for each TF in the ENCODE list, find the list of profile names and profile base names in JASPAR, store that in a dict

tf_profile_dict = {}
tf_profile_base_dict = {}
tf_profile_dict_agnostic = {}
tf_profile_base_dict_agnostic = {}

for tf in encode_tf_list:
    tf_profile_dict[tf] = set()
    tf_profile_base_dict[tf] = set()
    tf_profile_dict_agnostic[tf] = set()
    tf_profile_base_dict_agnostic[tf] = set()
    for profile, tf_ in jaspar_meta.items():
        if tf == tf_:
            tf_profile_dict[tf].add(profile)
            tf_profile_base_dict[tf].add(profile.split(".")[0])
        if tf.lower() == tf_.lower():
            tf_profile_dict_agnostic[tf].add(profile)
            tf_profile_base_dict_agnostic[tf].add(profile.split(".")[0])

    

In [9]:
# find those TFs that have exactly one profile base name
for tf, profiles in tf_profile_base_dict.items():
    if len(profiles) == 1:
        print(tf, list(profiles)[0])

NR3C1 MA0113
REST MA0138
CEBPB MA0466
MYC MA0147
MAX MA0058
FOSL2 MA0478
YY1 MA0095
CREB1 MA0018
SP1 MA0079
ZBTB33 MA0527
ATF3 MA0605
EGR1 MA0162
ELF1 MA0473
GABPA MA0062
RELA MA0107
USF1 MA0093
FOXA1 MA0148
Cebpa MA0102
HES2 MA0616
MAZ MA1522
NRF1 MA0506
SRF MA0083
TCF12 MA1648
USF2 MA0526
ZNF274 MA1592
TEAD4 MA0809
ATF2 MA1632
ESR1 MA0112
MAFK MA0496
RFX5 MA0510
TCF7L2 MA0523
ZNF24 MA1124
BHLHE40 MA0464
E2F1 MA0024
E2F4 MA0470
ELK1 MA0028
ESRRA MA0592
GATA2 MA0036
GATA3 MA0037
IRF1 MA0050
MXI1 MA1108
STAT1 MA0137
ZNF143 MA0088
ETV6 MA0645
MNT MA0825
ELF4 MA0641
ETS1 MA0098
FOSL1 MA0477
FOXA2 MA0047
FOXK2 MA1103
IKZF1 MA1508
TBP MA0108
ATF4 MA0833
ATF7 MA0834
CREM MA0609
E2F6 MA0471
IRF3 MA1418
MYBL2 MA0777
NFYA MA0060
NFYB MA0502
NR2F2 MA1111
NR2F6 MA1539
PBX3 MA1114
SP2 MA0516
STAT3 MA0144
ZBTB7A MA0750
ZNF263 MA0528
BACH1 MA1633
CUX1 MA0754
E2F8 MA0865
FOXP1 MA0481
GATA1 MA0035
IRF2 MA0051
KLF16 MA0741
MEF2A MA0052
MLX MA0663
NFATC3 MA0625
NFE2 MA0841
NFE2L2 MA0150
PATZ1 MA1961
PAX

In [10]:
# create a df where the index is the TF name, the two columns are the profile names and profile base names, and the values are lists of names
df = pd.DataFrame.from_dict(
    {k: [list(v), list(tf_profile_base_dict[k])] for k, v in tf_profile_dict.items()},
    orient="index",
    columns=["profiles", "profile_bases"],
)


In [39]:
df.profile_bases.map(len).value_counts()

0    799
1    322
2     25
3      3
4      1
Name: profile_bases, dtype: int64

In [40]:
df.index[df.profile_bases.map(len).argmax()]

'CTCF'

In [41]:
df_chosen = df[df.profile_bases.map(len) == 1].copy()
profiles = []
for i in df_chosen.itertuples():
    profile_base = i.profile_bases[0]
    versions = [int(j.split(".")[1]) for j in i.profiles if j.split(".")[0] == profile_base]
    profiles.append(profile_base + "." + str(max(versions)))

df_chosen["profile"] = profiles
df_chosen = df_chosen[["profile"]]
df_chosen.reset_index(inplace=True)
df_chosen.rename(columns={"index": "tf"}, inplace=True)
df_chosen.to_csv("chosen_tfs.csv", index=False)

In [42]:
# split into three parts
df_chosen.iloc[:100].to_csv("chosen_tfs_1.csv", index=False)
df_chosen.iloc[100:200].to_csv("chosen_tfs_2.csv", index=False)
df_chosen.iloc[200:].to_csv("chosen_tfs_3.csv", index=False)

In [43]:
df.to_pickle("tf_profile_dict.pkl")

In [None]:
for i in df[df.profile_bases.map(len) == 1].itertuples():
    os.makedirs(f"./data_scan/hg38/{i.Index}", exist_ok=True)

In [59]:
# for those with no profile, try to find the captial-agnostic version
df_zero = df[df.profile_bases.map(len) == 0].copy()
df_zero.index = df_zero.index
df_zero["profiles_agnostic"] = df_zero.index.map(
    lambda x: tf_profile_dict_agnostic[x.upper()]
    if x.upper() in tf_profile_dict_agnostic
    else set()
)
df_zero["profile_bases_agnostic"] = df_zero.profiles_agnostic.map(
    lambda x: set([i.split(".")[0] for i in x])
)
df_zero.profile_bases_agnostic.map(len).value_counts()


0    758
1     40
2      1
Name: profile_bases_agnostic, dtype: int64

In [61]:
df_zero.iloc[df_zero.profile_bases_agnostic.map(len).argmax()]

profiles                                    []
profile_bases                               []
profiles_agnostic         {MA0204.1, MA2001.1}
profile_bases_agnostic        {MA2001, MA0204}
Name: SIX4, dtype: object

## Get JASPAR profiles on the hg38 page

In [1]:
# find all hg38 profiles
import requests
from bs4 import BeautifulSoup

r = requests.get("http://expdata.cmmt.ubc.ca/JASPAR/downloads/UCSC_tracks/2022/hg38/")
soup = BeautifulSoup(r.text, "html.parser")
hg38_profiles = [i.text for i in soup.find_all("a") if i.text.endswith(".tsv.gz")]

In [2]:
hg38_profiles = [i[:-7] for i in hg38_profiles]

In [3]:
with open("/home/ubuntu/s3/jaspar_profile/names_hg38.txt", "w") as f:
    print(*hg38_profiles, sep="\n", file=f)

In [12]:
# intersect encode TFs with hg38 profiles
jaspar_meta_hg38 = {i: jaspar_meta[i] for i in hg38_profiles}

In [13]:
# let's do this: for each TF in the ENCODE list, find the list of profile names and profile base names in JASPAR, store that in a dict
jaspar_meta = jaspar_meta_hg38

tf_profile_dict = {}
tf_profile_base_dict = {}
tf_profile_dict_agnostic = {}
tf_profile_base_dict_agnostic = {}

for tf in encode_tf_list:
    tf_profile_dict[tf] = set()
    tf_profile_base_dict[tf] = set()
    tf_profile_dict_agnostic[tf] = set()
    tf_profile_base_dict_agnostic[tf] = set()
    for profile, tf_ in jaspar_meta.items():
        if tf == tf_:
            tf_profile_dict[tf].add(profile)
            tf_profile_base_dict[tf].add(profile.split(".")[0])
        if tf.lower() == tf_.lower():
            tf_profile_dict_agnostic[tf].add(profile)
            tf_profile_base_dict_agnostic[tf].add(profile.split(".")[0])

In [14]:
# find those TFs that have exactly one profile base name
for tf, profiles in tf_profile_base_dict.items():
    if len(profiles) == 1:
        print(tf, list(profiles)[0])

JUN MA0488
NR3C1 MA0113
REST MA0138
CEBPB MA0466
MYC MA0147
MAX MA0058
FOSL2 MA0478
CREB1 MA0018
SP1 MA0079
ZBTB33 MA0527
ATF3 MA0605
EGR1 MA0162
ELF1 MA0473
GABPA MA0062
RELA MA0107
USF1 MA0093
FOXA1 MA0148
HES2 MA0616
MAZ MA1522
SRF MA0083
TCF12 MA1648
USF2 MA0526
ZNF274 MA1592
TEAD4 MA0809
ATF2 MA1632
ESR1 MA0112
MAFK MA0496
RFX5 MA0510
TCF7L2 MA0523
ZNF24 MA1124
BHLHE40 MA0464
E2F1 MA0024
E2F4 MA0470
ELK1 MA0028
ESRRA MA0592
GATA2 MA0036
MXI1 MA1108
STAT1 MA0137
ZNF143 MA0088
ETV6 MA0645
MNT MA0825
ELF4 MA0641
ETS1 MA0098
FOSL1 MA0477
FOXA2 MA0047
FOXK2 MA1103
IKZF1 MA1508
TBP MA0108
ATF4 MA0833
ATF7 MA0834
CREM MA0609
E2F6 MA0471
IRF3 MA1418
MYBL2 MA0777
NFYA MA0060
NFYB MA0502
NR2F2 MA1111
NR2F6 MA1539
PBX3 MA1114
RFX1 MA0509
SP2 MA0516
STAT3 MA0144
ZBTB7A MA0750
ZNF263 MA0528
BACH1 MA1633
CUX1 MA0754
E2F8 MA0865
FOXP1 MA0481
GATA1 MA0035
IRF2 MA0051
KLF16 MA0741
MEF2A MA0052
MLX MA0663
NFATC3 MA0625
NFE2 MA0841
PATZ1 MA1961
PAX5 MA0014
PKNOX1 MA0782
POU5F1 MA1115
SMAD3 MA0795
TC

In [15]:
# create a df where the index is the TF name, the two columns are the profile names and profile base names, and the values are lists of names
df = pd.DataFrame.from_dict(
    {k: [list(v), list(tf_profile_base_dict[k])] for k, v in tf_profile_dict.items()},
    orient="index",
    columns=["profiles", "profile_bases"],
)
df.profile_bases.map(len).value_counts()


0    820
1    307
2     19
3      4
Name: profile_bases, dtype: int64

In [16]:
df_chosen = df[df.profile_bases.map(len) == 1].copy()
profiles = []
for i in df_chosen.itertuples():
    profile_base = i.profile_bases[0]
    versions = [int(j.split(".")[1]) for j in i.profiles if j.split(".")[0] == profile_base]
    profiles.append(profile_base + "." + str(max(versions)))

df_chosen["profile"] = profiles
df_chosen = df_chosen[["profile"]]
df_chosen.reset_index(inplace=True)
df_chosen.rename(columns={"index": "tf"}, inplace=True)

In [17]:
df_chosen.shape

(307, 2)

In [18]:
df_chosen.to_csv("chosen_tfs.csv", index=False)
# split into three parts
df_chosen.iloc[:100].to_csv("chosen_tfs_1.csv", index=False)
df_chosen.iloc[100:200].to_csv("chosen_tfs_2.csv", index=False)
df_chosen.iloc[200:].to_csv("chosen_tfs_3.csv", index=False)

In [20]:
# for those with no profile, try to find the captial-agnostic version
df_zero = df[df.profile_bases.map(len) == 0].copy()
df_zero.index = df_zero.index
df_zero["profiles_agnostic"] = df_zero.index.map(
    lambda x: tf_profile_dict_agnostic[x.upper()]
    if x.upper() in tf_profile_dict_agnostic
    else set()
)
df_zero["profile_bases_agnostic"] = df_zero.profiles_agnostic.map(
    lambda x: set([i.split(".")[0] for i in x])
)
df_zero.profile_bases_agnostic.map(len).value_counts()

0    761
1     59
Name: profile_bases_agnostic, dtype: int64

In [23]:
df_chosen.shape

(307, 2)

In [25]:
for i in df_chosen.itertuples():
    os.makedirs(f"./data_scan/hg38/{i.tf}", exist_ok=True)