# Motif Family Definitions for *S. cerevisiae*

In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
%run -i notebook_setup.py

In [4]:
from paper_utilities import motifs

In [5]:
# Root of the yeast polyadenylation project tree.
# Defaults to the original cluster location; set the POLYA_YEAST_PROJECT environment
# variable to run this notebook against a copy of the project stored elsewhere,
# without editing the code.
PROJECT = os.environ.get("POLYA_YEAST_PROJECT", "/projects/b1080/eks/polyadenylation/yeast")

# Every file written by this notebook goes under OUTDIR; it is created on demand and
# re-running the cell is harmless because exist_ok is set.
OUTDIR  = os.path.join(PROJECT, 'manuscript', 'analysis', 'resources')
os.makedirs(OUTDIR, exist_ok = True)


## K-mer definitions

In [6]:
# Enumerate every DNA k-mer over the alphabet ACGT for k = 4, 5 and 6.
# itertools.product emits tuples in the order of its (sorted) input, so each list is
# already in lexicographic A < C < G < T order.
all_4mers, all_5mers, all_6mers = (
    ["".join(bases) for bases in itertools.product('ACGT', repeat = size)]
    for size in (4, 5, 6)
)

# Write each k-mer list to OUTDIR as a single column with no header and no index,
# i.e. one motif per line.
for kmers, list_name in (
    (all_4mers, 'motif_list.4mers.txt'),
    (all_5mers, 'motif_list.5mers.txt'),
    (all_6mers, 'motif_list.6mers.txt'),
):
    kmer_column = pd.DataFrame.from_dict({'motif' : kmers}, orient = 'columns')
    kmer_column.to_csv(os.path.join(OUTDIR, list_name), sep = "\t", index = False, header = False)


## S. cerevisiae

In [7]:
# Colours (hex RGB strings) keyed by motif family label.
# The `_d0` / `_d1` / `_d2` suffixes match the tier labels used as keys of
# `motif_priority` in the Hamming-distance cells of this notebook, where they hold
# the motifs at distance 0, 1 and 2 from the family archetype.
# Each family uses one hue (A-rich: blue, T-rich: green, TA/TA-rich: red) with a
# lighter shade for each higher tier; 'Other' is a near-white grey.
# NOTE(review): not referenced in the visible part of this notebook; presumably
# consumed by plotting code elsewhere - confirm before renaming keys.
family_cmap = {
    'A-rich_d0'     : '#2278B5',
    'A-rich_d1'     : '#6AB1E3',
    'A-rich_d2'     : '#9CCBEC',
    'T-rich_d0'     : '#2FA148',
    'T-rich_d1'     : '#73D689',
    'T-rich_d2'     : '#A1E4B0',
    'TA/TA-rich_d0' : '#D62A28',
    'TA/TA-rich_d1' : '#E77F7E',
    'TA/TA-rich_d2' : '#EFAAA9',
    'Other'         : '#F7F8F8',
}


### By pattern

In [8]:
## Identify motif patterns

# Hexamer variants. Not referenced again in the visible part of this notebook; kept
# because notebook variables are global and other cells may read it.
variants = ['TATAAA', 'AGTAAA', 'TTTAAA', 'AATACA', 'CATAAA', 'GATAAA', 'AATATA', 'AATGAA', 'AAGAAA']

# Tetramer cores for each pattern-based family: a 6mer is a candidate for a family
# when it contains at least one of that family's cores as a substring.
tata_patterns = ['TATA','ATAT']
tgta_patterns = ['TGTA','ATGT','TATG','GTAT','GATA','ATAG','TAGA','AGAT']
tcta_patterns = ['TCTA','ATCT','TATC','CTAT','CATA','ATAC','TACA','ACAT']
taaa_patterns = ['TAAA','AAAT','ATAA','AATA']
ttaa_patterns = ['TTAA','AATT']
gt_patterns   = ['GTGT','TGTG','GTCT','CTGT','TCTG','TGTC']


## Build candidate motif lists (a 6mer may appear in several lists at this stage)

# Every list is filtered directly from all_6mers, so it contains each qualifying 6mer
# once and in all_6mers order. The previous list(set(...)) construction yielded an
# arbitrary order that changes between kernel sessions (string hashing is randomized),
# which made the pickled 'priority' lists differ from run to run even though the
# family assignment itself was identical.

# Composition-based candidates: at least four of the six positions are the same base.
a_rich = [x for x in all_6mers if x.count('A') >= 4]
t_rich = [x for x in all_6mers if x.count('T') >= 4]
g_rich = [x for x in all_6mers if x.count('G') >= 4]

# Pattern-based candidates: the 6mer contains any core of the family.
tata_rich = [x for x in all_6mers if any(p in x for p in tata_patterns)]
tgta_rich = [x for x in all_6mers if any(p in x for p in tgta_patterns)]
tcta_rich = [x for x in all_6mers if any(p in x for p in tcta_patterns)]
taaa_rich = [x for x in all_6mers if any(p in x for p in taaa_patterns)]
ttaa_rich = [x for x in all_6mers if any(p in x for p in ttaa_patterns)]

# G/T candidates additionally exclude anything containing the TGTA core.
gt_rich   = [x for x in all_6mers if ("TGTA" not in x) and any(p in x for p in gt_patterns)]

# Mixed T/A candidates: (>= 3 A and >= 2 T) or (>= 3 T and >= 2 A).
ta_rich = [x for x in all_6mers if ((x.count('A') >= 3) and (x.count('T') >= 2)) or ((x.count('T') >= 3) and (x.count('A') >= 2))]


## Assign motifs to families based on priority

# Insertion order encodes priority: families are applied top to bottom and a later
# entry overwrites an earlier one, so 'TA/TA-rich' takes precedence over every family
# listed above it and 'G-rich' has the lowest precedence.
motif_priority = {
    'G-rich'      : g_rich,
    'G/T-rich'    : gt_rich,
    'T/A-rich'    : ta_rich,
    'A-rich'      : a_rich,
    'T-rich'      : t_rich,
    'TT/AA-rich'  : ttaa_rich,
    'TA/AA-rich'  : taaa_rich,
    'TC/TA-rich'  : tcta_rich,
    'TG/TA-rich'  : tgta_rich,
    'TA/TA-rich'  : tata_rich,
}

# Final family of every 6mer; anything never claimed by a family stays 'Other'.
motif_family = {}

for motif in all_6mers:
    motif_family[motif] = 'Other'

for mf,mlist in motif_priority.items():
    for m in mlist:
        motif_family[m] = mf


## Output list of motifs in each family for use in R

# Non-overlapping membership: the motifs whose *final* family is mf, sorted.
motif_unique = {}

for mf in motif_priority.keys():
    motif_unique[mf] = sorted([k for k,v in motif_family.items() if (v == mf)])
    print(mf, "\t", len(motif_unique[mf]))

# Print one R vector assignment per family. The R variable name is the family label
# lower-cased with '-' replaced by '_' and '/' removed (e.g. 'TA/TA-rich' -> 'tata_rich').
for mf,ml in motif_unique.items():
    outstring = "\n" + mf.replace("-","_").replace("/","").lower() + " <- c('" + "', '".join(ml) + "')"
    print(outstring)


## Record motif family definitions

pattern_definitions = {
    'priority'   : motif_priority,   # overlapping candidate lists, in priority order
    'family'     : motif_family,     # 6mer -> final family label (or 'Other')
    'no_overlap' : motif_unique,     # family label -> sorted 6mers finally assigned to it
}

with open(os.path.join(OUTDIR, 'motif_definitions.scer.6mers.patterns.pickle'), mode = 'wb') as handle:
    pickle.dump(pattern_definitions, handle)

# to_csv is called with its defaults, so this file is comma-separated and includes the
# row index as a first column, despite the .txt extension.
(pd.DataFrame.from_dict(motif_family, orient = 'index').reset_index()
 .rename(columns = {'index' : 'motif', 0 : 'family'})
 .to_csv(os.path.join(OUTDIR, 'motif_families.scer.6mers.patterns.txt'))
)


G-rich 	 150
G/T-rich 	 164
T/A-rich 	 56
A-rich 	 89
T-rich 	 129
TT/AA-rich 	 76
TA/AA-rich 	 106
TC/TA-rich 	 250
TG/TA-rich 	 286
TA/TA-rich 	 80

g_rich <- c('AAGGGG', 'ACGGGG', 'AGAGGG', 'AGCGGG', 'AGGAGG', 'AGGCGG', 'AGGGAG', 'AGGGCG', 'AGGGGA', 'AGGGGC', 'AGGGGG', 'AGGGGT', 'AGGGTG', 'AGGTGG', 'AGTGGG', 'ATGGGG', 'CAGGGG', 'CCGGGG', 'CGAGGG', 'CGCGGG', 'CGGAGG', 'CGGCGG', 'CGGGAG', 'CGGGCG', 'CGGGGA', 'CGGGGC', 'CGGGGG', 'CGGGGT', 'CGGGTG', 'CGGTGG', 'CGTGGG', 'CTGGGG', 'GAAGGG', 'GACGGG', 'GAGAGG', 'GAGCGG', 'GAGGAG', 'GAGGCG', 'GAGGGA', 'GAGGGC', 'GAGGGG', 'GAGGGT', 'GAGGTG', 'GAGTGG', 'GATGGG', 'GCAGGG', 'GCCGGG', 'GCGAGG', 'GCGCGG', 'GCGGAG', 'GCGGCG', 'GCGGGA', 'GCGGGC', 'GCGGGG', 'GCGGGT', 'GCGGTG', 'GCGTGG', 'GCTGGG', 'GGAAGG', 'GGACGG', 'GGAGAG', 'GGAGCG', 'GGAGGA', 'GGAGGC', 'GGAGGG', 'GGAGGT', 'GGAGTG', 'GGATGG', 'GGCAGG', 'GGCCGG', 'GGCGAG', 'GGCGCG', 'GGCGGA', 'GGCGGC', 'GGCGGG', 'GGCGGT', 'GGCGTG', 'GGCTGG', 'GGGAAG', 'GGGACG', 'GGGAGA', 'GGGAGC', 'GGGAGG', 'GGGAGT

### By Hamming distance to archetypical motifs

#### 6mers

In [9]:
## Identify archetypical motifs and assign motifs to their most similar archetype

archetypes = np.asarray(['TATATA','ATATAT','AAAAAA','TTTTTT'])

# Query the helper once per 6mer, in all_6mers order. Each result is unpacked into three
# fields, named here family / distance / matches; their exact semantics are defined by
# paper_utilities.motifs.closest_archetype, which is not shown in this notebook.
archetype_hits = [motifs.closest_archetype(seq, archetypes, kmer=6, max_mismatch=2) for seq in all_6mers]
closest_family, closest_distance, closest_matches = zip(*archetype_hits)
closest_family   = np.asarray(closest_family)
closest_distance = np.asarray(closest_distance, dtype = float)

# 6mers whose 'matches' field holds more than one entry.
ambiguous_motifs = [seq for seq, hits in zip(all_6mers, closest_matches) if len(hits) > 1]

# Shared pieces for the per-family tier selection below.
kmer_array     = np.asarray(all_6mers)
tier_distances = (0, 1, 2)


## Define family of TA/TA-rich motifs

# Both alternating archetypes feed the same family.
in_tatata = (closest_family == 'TATATA') | (closest_family == 'ATATAT')
fam_tatata_d0, fam_tatata_d1, fam_tatata_d2 = [
    sorted(kmer_array[in_tatata & (closest_distance == dist)].tolist()) for dist in tier_distances
]

for tier in (fam_tatata_d0, fam_tatata_d1, fam_tatata_d2):
    print(len(tier), tier[:5])


## Define family of A-rich motifs

in_a_rich = (closest_family == 'AAAAAA')
fam_a_rich_d0, fam_a_rich_d1, fam_a_rich_d2 = [
    sorted(kmer_array[in_a_rich & (closest_distance == dist)].tolist()) for dist in tier_distances
]

for tier in (fam_a_rich_d0, fam_a_rich_d1, fam_a_rich_d2):
    print(len(tier), tier[:5])


## Define family of T-rich motifs

in_t_rich = (closest_family == 'TTTTTT')
fam_t_rich_d0, fam_t_rich_d1, fam_t_rich_d2 = [
    sorted(kmer_array[in_t_rich & (closest_distance == dist)].tolist()) for dist in tier_distances
]

for tier in (fam_t_rich_d0, fam_t_rich_d1, fam_t_rich_d2):
    print(len(tier), tier[:5])


## Assign motifs to families based on priority

# Per family: the three distance tiers followed by their union. The tier lists are
# already sorted, so list() just takes an independent copy.
motif_priority = {

    'A-rich_d0'   : list(fam_a_rich_d0),
    'A-rich_d1'   : list(fam_a_rich_d1),
    'A-rich_d2'   : list(fam_a_rich_d2),
    'A-rich'      : sorted(fam_a_rich_d0 + fam_a_rich_d1 + fam_a_rich_d2),

    'T-rich_d0'   : list(fam_t_rich_d0),
    'T-rich_d1'   : list(fam_t_rich_d1),
    'T-rich_d2'   : list(fam_t_rich_d2),
    'T-rich'      : sorted(fam_t_rich_d0 + fam_t_rich_d1 + fam_t_rich_d2),

    'TA/TA-rich_d0' : list(fam_tatata_d0),
    'TA/TA-rich_d1' : list(fam_tatata_d1),
    'TA/TA-rich_d2' : list(fam_tatata_d2),
    'TA/TA-rich'    : sorted(fam_tatata_d0 + fam_tatata_d1 + fam_tatata_d2),

}

# Entries are applied in motif_priority order and later ones overwrite earlier ones.
# Each aggregate entry follows its own tiers, so motif_family ends up holding only the
# aggregate labels (or 'Other'), while motif_hamming keeps the `_dN` tier label.
motif_family  = dict.fromkeys(all_6mers, 'Other')
motif_hamming = {}

for label, members in motif_priority.items():
    is_tier_label = "_d" in label
    for seq in members:
        motif_family[seq] = label
        if is_tier_label:
            motif_hamming[seq] = label


## Output list of motifs in each family for use in R

# Motifs whose final family equals each label; the `_dN` labels therefore map to empty
# lists here, and only the aggregate families are counted in the printout.
motif_unique = {}

for label in motif_priority:
    motif_unique[label] = sorted(seq for seq, fam in motif_family.items() if fam == label)
    if "_d" not in label:
        print(label, "\t", len(motif_unique[label]))

# R vector assignments: aggregate families first (from motif_unique), then the distance
# tiers (from motif_priority). The R name is the label lower-cased, '-' -> '_', '/' removed.
for label, members in motif_unique.items():
    if "_d" in label:
        continue
    r_name = label.replace("-","_").replace("/","").lower()
    print("\n" + r_name + " <- c('" + "', '".join(members) + "')")

for label, members in motif_priority.items():
    if "_d" not in label:
        continue
    r_name = label.replace("-","_").replace("/","").lower()
    print("\n" + r_name + " <- c('" + "', '".join(members) + "')")


## Record motif family definitions

hamming_definitions = {
    'priority'   : motif_priority,
    'family'     : motif_family,
    'no_overlap' : motif_unique,
    'hamming'    : motif_hamming,
}

pickle_path = os.path.join(OUTDIR, 'motif_definitions.scer.6mers.distance.pickle')
with open(pickle_path, mode = 'wb') as pickle_file:
    pickle.dump(hamming_definitions, pickle_file)

# Two-column table (motif, family) written with to_csv defaults.
family_table = pd.DataFrame.from_dict(motif_family, orient = 'index').reset_index()
family_table = family_table.rename(columns = {'index' : 'motif', 0 : 'family'})
family_table.to_csv(os.path.join(OUTDIR, 'motif_families.scer.6mers.distance.txt'))


Conflicting family assignment: AAACAT equidistant to ['ATATAT' 'AAAAAA'] ... assigning to AAAAAA based on content
Conflicting family assignment: AAAGAT equidistant to ['ATATAT' 'AAAAAA'] ... assigning to AAAAAA based on content
Conflicting family assignment: AAATAC equidistant to ['ATATAT' 'AAAAAA'] ... assigning to ATATAT based on content
Conflicting family assignment: AAATAG equidistant to ['ATATAT' 'AAAAAA'] ... assigning to ATATAT based on content
Conflicting family assignment: AACATA equidistant to ['TATATA' 'AAAAAA'] ... assigning to TATATA based on priority
Conflicting family assignment: AAGATA equidistant to ['TATATA' 'AAAAAA'] ... assigning to TATATA based on priority
Conflicting family assignment: AATACA equidistant to ['TATATA' 'AAAAAA'] ... assigning to TATATA based on priority
Conflicting family assignment: AATAGA equidistant to ['TATATA' 'AAAAAA'] ... assigning to TATATA based on priority
Conflicting family assignment: ACAAAT equidistant to ['ATATAT' 'AAAAAA'] ... assigni

#### 5mers

In [10]:
## Identify archetypical motifs and assign motifs to their most similar archetype

archetypes = np.asarray(['TATAT','ATATA','AAAAA','TTTTT'])

# Query the helper once per 5mer, in all_5mers order. Each result is unpacked into three
# fields, named here family / distance / matches; their exact semantics are defined by
# paper_utilities.motifs.closest_archetype, which is not shown in this notebook.
archetype_hits = [motifs.closest_archetype(seq, archetypes, kmer=5, max_mismatch=2) for seq in all_5mers]
closest_family, closest_distance, closest_matches = zip(*archetype_hits)
closest_family   = np.asarray(closest_family)
closest_distance = np.asarray(closest_distance, dtype = float)

# 5mers whose 'matches' field holds more than one entry.
ambiguous_motifs = [seq for seq, hits in zip(all_5mers, closest_matches) if len(hits) > 1]

# Shared pieces for the per-family tier selection below.
kmer_array     = np.asarray(all_5mers)
tier_distances = (0, 1, 2)


## Define family of TA/TA-rich motifs

# Both alternating archetypes feed the same family.
in_tatata = (closest_family == 'TATAT') | (closest_family == 'ATATA')
fam_tatata_d0, fam_tatata_d1, fam_tatata_d2 = [
    sorted(kmer_array[in_tatata & (closest_distance == dist)].tolist()) for dist in tier_distances
]

for tier in (fam_tatata_d0, fam_tatata_d1, fam_tatata_d2):
    print(len(tier), tier[:5])


## Define family of A-rich motifs

in_a_rich = (closest_family == 'AAAAA')
fam_a_rich_d0, fam_a_rich_d1, fam_a_rich_d2 = [
    sorted(kmer_array[in_a_rich & (closest_distance == dist)].tolist()) for dist in tier_distances
]

for tier in (fam_a_rich_d0, fam_a_rich_d1, fam_a_rich_d2):
    print(len(tier), tier[:5])


## Define family of T-rich motifs

in_t_rich = (closest_family == 'TTTTT')
fam_t_rich_d0, fam_t_rich_d1, fam_t_rich_d2 = [
    sorted(kmer_array[in_t_rich & (closest_distance == dist)].tolist()) for dist in tier_distances
]

for tier in (fam_t_rich_d0, fam_t_rich_d1, fam_t_rich_d2):
    print(len(tier), tier[:5])


## Assign motifs to families based on priority

# Per family: the three distance tiers followed by their union. The tier lists are
# already sorted, so list() just takes an independent copy.
motif_priority = {

    'A-rich_d0'   : list(fam_a_rich_d0),
    'A-rich_d1'   : list(fam_a_rich_d1),
    'A-rich_d2'   : list(fam_a_rich_d2),
    'A-rich'      : sorted(fam_a_rich_d0 + fam_a_rich_d1 + fam_a_rich_d2),

    'T-rich_d0'   : list(fam_t_rich_d0),
    'T-rich_d1'   : list(fam_t_rich_d1),
    'T-rich_d2'   : list(fam_t_rich_d2),
    'T-rich'      : sorted(fam_t_rich_d0 + fam_t_rich_d1 + fam_t_rich_d2),

    'TA/TA-rich_d0' : list(fam_tatata_d0),
    'TA/TA-rich_d1' : list(fam_tatata_d1),
    'TA/TA-rich_d2' : list(fam_tatata_d2),
    'TA/TA-rich'    : sorted(fam_tatata_d0 + fam_tatata_d1 + fam_tatata_d2),

}

# Entries are applied in motif_priority order and later ones overwrite earlier ones.
# Each aggregate entry follows its own tiers, so motif_family ends up holding only the
# aggregate labels (or 'Other'), while motif_hamming keeps the `_dN` tier label.
motif_family  = dict.fromkeys(all_5mers, 'Other')
motif_hamming = {}

for label, members in motif_priority.items():
    is_tier_label = "_d" in label
    for seq in members:
        motif_family[seq] = label
        if is_tier_label:
            motif_hamming[seq] = label


## Output list of motifs in each family for use in R

# Motifs whose final family equals each label; the `_dN` labels therefore map to empty
# lists here, and only the aggregate families are counted in the printout.
motif_unique = {}

for label in motif_priority:
    motif_unique[label] = sorted(seq for seq, fam in motif_family.items() if fam == label)
    if "_d" not in label:
        print(label, "\t", len(motif_unique[label]))

# R vector assignments: aggregate families first (from motif_unique), then the distance
# tiers (from motif_priority). The R name is the label lower-cased, '-' -> '_', '/' removed.
for label, members in motif_unique.items():
    if "_d" in label:
        continue
    r_name = label.replace("-","_").replace("/","").lower()
    print("\n" + r_name + " <- c('" + "', '".join(members) + "')")

for label, members in motif_priority.items():
    if "_d" not in label:
        continue
    r_name = label.replace("-","_").replace("/","").lower()
    print("\n" + r_name + " <- c('" + "', '".join(members) + "')")


## Record motif family definitions

hamming_definitions = {
    'priority'   : motif_priority,
    'family'     : motif_family,
    'no_overlap' : motif_unique,
    'hamming'    : motif_hamming,
}

pickle_path = os.path.join(OUTDIR, 'motif_definitions.scer.5mers.distance.pickle')
with open(pickle_path, mode = 'wb') as pickle_file:
    pickle.dump(hamming_definitions, pickle_file)

# Two-column table (motif, family) written with to_csv defaults.
family_table = pd.DataFrame.from_dict(motif_family, orient = 'index').reset_index()
family_table = family_table.rename(columns = {'index' : 'motif', 0 : 'family'})
family_table.to_csv(os.path.join(OUTDIR, 'motif_families.scer.5mers.distance.txt'))


Conflicting family assignment: AAATA equidistant to ['ATATA' 'AAAAA'] ... assigning to ATATA based on content
Conflicting family assignment: AAATC equidistant to ['ATATA' 'AAAAA'] ... assigning to ATATA based on content
Conflicting family assignment: AAATG equidistant to ['ATATA' 'AAAAA'] ... assigning to ATATA based on content
Conflicting family assignment: AAATT equidistant to ['ATATA' 'AAAAA'] ... assigning to ATATA based on content
Conflicting family assignment: AACAT equidistant to ['TATAT' 'AAAAA'] ... assigning to AAAAA based on content
Conflicting family assignment: AACTA equidistant to ['ATATA' 'AAAAA'] ... assigning to AAAAA based on content
Conflicting family assignment: AAGAT equidistant to ['TATAT' 'AAAAA'] ... assigning to AAAAA based on content
Conflicting family assignment: AAGTA equidistant to ['ATATA' 'AAAAA'] ... assigning to AAAAA based on content
Conflicting family assignment: AATAC equidistant to ['TATAT' 'AAAAA'] ... assigning to TATAT based on content
Conflictin

a_rich <- c('AAAAA', 'AAAAC', 'AAAAG', 'AAAAT', 'AAACA', 'AAACC', 'AAACG', 'AAACT', 'AAAGA', 'AAAGC', 'AAAGG', 'AAAGT', 'AACAA', 'AACAC', 'AACAG', 'AACAT', 'AACCA', 'AACGA', 'AACTA', 'AAGAA', 'AAGAC', 'AAGAG', 'AAGAT', 'AAGCA', 'AAGGA', 'AAGTA', 'AATAA', 'AATCA', 'AATGA', 'ACAAA', 'ACAAC', 'ACAAG', 'ACAAT', 'ACCAA', 'ACGAA', 'ACTAA', 'AGAAA', 'AGAAC', 'AGAAG', 'AGAAT', 'AGCAA', 'AGGAA', 'AGTAA', 'CAAAA', 'CAAAC', 'CAAAG', 'CAAAT', 'CAACA', 'CAAGA', 'CACAA', 'CAGAA', 'CCAAA', 'CGAAA', 'CTAAA', 'GAAAA', 'GAAAC', 'GAAAG', 'GAAAT', 'GAACA', 'GAAGA', 'GACAA', 'GAGAA', 'GCAAA', 'GGAAA', 'GTAAA', 'TAAAA', 'TAACA', 'TAAGA', 'TCAAA', 'TGAAA', 'TTAAA')

t_rich <- c('AATTT', 'ACTTT', 'AGTTT', 'ATTCT', 'ATTGT', 'ATTTT', 'CATTT', 'CCTTT', 'CGTTT', 'CTCTT', 'CTGTT', 'CTTCT', 'CTTGT', 'CTTTA', 'CTTTC', 'CTTTG', 'CTTTT', 'GATTT', 'GCTTT', 'GGTTT', 'GTCTT', 'GTGTT', 'GTTCT', 'GTTGT', 'GTTTA', 'GTTTC', 'GTTTG', 'GTTTT', 'TCATT', 'TCCTT', 'TCGTT', 'TCTTA', 'TCTTC', 'TCTTG', 'TCTTT', 'TGATT', 'TGCTT', 'TG

#### 4mers

In [11]:
## Identify archetypical motifs and assign motifs to their most similar archetype

archetypes = np.asarray(['TATA','ATAT','AAAA','TTTT'])

# Query the helper once per 4mer, in all_4mers order. Each result is unpacked into three
# fields, named here family / distance / matches; their exact semantics are defined by
# paper_utilities.motifs.closest_archetype, which is not shown in this notebook.
archetype_hits = [motifs.closest_archetype(seq, archetypes, kmer=4, max_mismatch=2) for seq in all_4mers]
closest_family, closest_distance, closest_matches = zip(*archetype_hits)
closest_family   = np.asarray(closest_family)
closest_distance = np.asarray(closest_distance, dtype = float)

# 4mers whose 'matches' field holds more than one entry.
ambiguous_motifs = [seq for seq, hits in zip(all_4mers, closest_matches) if len(hits) > 1]

# Shared pieces for the per-family tier selection below.
kmer_array     = np.asarray(all_4mers)
tier_distances = (0, 1, 2)


## Define family of TA/TA-rich motifs

# Both alternating archetypes feed the same family.
in_tatata = (closest_family == 'TATA') | (closest_family == 'ATAT')
fam_tatata_d0, fam_tatata_d1, fam_tatata_d2 = [
    sorted(kmer_array[in_tatata & (closest_distance == dist)].tolist()) for dist in tier_distances
]

for tier in (fam_tatata_d0, fam_tatata_d1, fam_tatata_d2):
    print(len(tier), tier[:5])


## Define family of A-rich motifs

in_a_rich = (closest_family == 'AAAA')
fam_a_rich_d0, fam_a_rich_d1, fam_a_rich_d2 = [
    sorted(kmer_array[in_a_rich & (closest_distance == dist)].tolist()) for dist in tier_distances
]

for tier in (fam_a_rich_d0, fam_a_rich_d1, fam_a_rich_d2):
    print(len(tier), tier[:5])


## Define family of T-rich motifs

in_t_rich = (closest_family == 'TTTT')
fam_t_rich_d0, fam_t_rich_d1, fam_t_rich_d2 = [
    sorted(kmer_array[in_t_rich & (closest_distance == dist)].tolist()) for dist in tier_distances
]

for tier in (fam_t_rich_d0, fam_t_rich_d1, fam_t_rich_d2):
    print(len(tier), tier[:5])


## Assign motifs to families based on priority

# Per family: the three distance tiers followed by their union. The tier lists are
# already sorted, so list() just takes an independent copy.
motif_priority = {

    'A-rich_d0'   : list(fam_a_rich_d0),
    'A-rich_d1'   : list(fam_a_rich_d1),
    'A-rich_d2'   : list(fam_a_rich_d2),
    'A-rich'      : sorted(fam_a_rich_d0 + fam_a_rich_d1 + fam_a_rich_d2),

    'T-rich_d0'   : list(fam_t_rich_d0),
    'T-rich_d1'   : list(fam_t_rich_d1),
    'T-rich_d2'   : list(fam_t_rich_d2),
    'T-rich'      : sorted(fam_t_rich_d0 + fam_t_rich_d1 + fam_t_rich_d2),

    'TA/TA-rich_d0' : list(fam_tatata_d0),
    'TA/TA-rich_d1' : list(fam_tatata_d1),
    'TA/TA-rich_d2' : list(fam_tatata_d2),
    'TA/TA-rich'    : sorted(fam_tatata_d0 + fam_tatata_d1 + fam_tatata_d2),

}

# Entries are applied in motif_priority order and later ones overwrite earlier ones.
# Each aggregate entry follows its own tiers, so motif_family ends up holding only the
# aggregate labels (or 'Other'), while motif_hamming keeps the `_dN` tier label.
motif_family  = dict.fromkeys(all_4mers, 'Other')
motif_hamming = {}

for label, members in motif_priority.items():
    is_tier_label = "_d" in label
    for seq in members:
        motif_family[seq] = label
        if is_tier_label:
            motif_hamming[seq] = label


## Output list of motifs in each family for use in R

# Motifs whose final family equals each label; the `_dN` labels therefore map to empty
# lists here, and only the aggregate families are counted in the printout.
motif_unique = {}

for label in motif_priority:
    motif_unique[label] = sorted(seq for seq, fam in motif_family.items() if fam == label)
    if "_d" not in label:
        print(label, "\t", len(motif_unique[label]))

# R vector assignments: aggregate families first (from motif_unique), then the distance
# tiers (from motif_priority). The R name is the label lower-cased, '-' -> '_', '/' removed.
for label, members in motif_unique.items():
    if "_d" in label:
        continue
    r_name = label.replace("-","_").replace("/","").lower()
    print("\n" + r_name + " <- c('" + "', '".join(members) + "')")

for label, members in motif_priority.items():
    if "_d" not in label:
        continue
    r_name = label.replace("-","_").replace("/","").lower()
    print("\n" + r_name + " <- c('" + "', '".join(members) + "')")


## Record motif family definitions

hamming_definitions = {
    'priority'   : motif_priority,
    'family'     : motif_family,
    'no_overlap' : motif_unique,
    'hamming'    : motif_hamming,
}

pickle_path = os.path.join(OUTDIR, 'motif_definitions.scer.4mers.distance.pickle')
with open(pickle_path, mode = 'wb') as pickle_file:
    pickle.dump(hamming_definitions, pickle_file)

# Two-column table (motif, family) written with to_csv defaults.
family_table = pd.DataFrame.from_dict(motif_family, orient = 'index').reset_index()
family_table = family_table.rename(columns = {'index' : 'motif', 0 : 'family'})
family_table.to_csv(os.path.join(OUTDIR, 'motif_families.scer.4mers.distance.txt'))


Conflicting family assignment: AAAT equidistant to ['ATAT' 'AAAA'] ... assigning to ATAT based on content
Conflicting family assignment: AACT equidistant to ['ATAT' 'AAAA'] ... assigning to AAAA based on content
Conflicting family assignment: AAGT equidistant to ['ATAT' 'AAAA'] ... assigning to AAAA based on content
Conflicting family assignment: AATA equidistant to ['TATA' 'AAAA'] ... assigning to TATA based on content
Conflicting family assignment: AATC equidistant to ['TATA' 'AAAA'] ... assigning to AAAA based on content
Conflicting family assignment: AATG equidistant to ['TATA' 'AAAA'] ... assigning to AAAA based on content
Conflicting family assignment: AATT equidistant to ['TATA' 'ATAT' 'AAAA' 'TTTT'] ... assigning to ATAT based on content
Conflicting family assignment: ACAC equidistant to ['ATAT' 'AAAA'] ... assigning to ATAT based on priority
Conflicting family assignment: ACAG equidistant to ['ATAT' 'AAAA'] ... assigning to ATAT based on priority
Conflicting family assignment: