# Motif Family Definitions for *S. pombe*

In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
%run -i notebook_setup.py

In [4]:
from paper_utilities import motifs

In [5]:
# Root of the project tree. The default is the original hard-coded path; set the
# POMBE_MOTIFS_PROJECT environment variable to point the notebook at a different
# location without editing this cell.
PROJECT = os.environ.get("POMBE_MOTIFS_PROJECT", "/projects/b1080/eks/polyadenylation/yeast")

# Directory that receives every resource file written by this notebook.
# It is created on first use; exist_ok keeps re-runs from failing.
OUTDIR  = os.path.join(PROJECT, 'manuscript', 'analysis', 'resources')
os.makedirs(OUTDIR, exist_ok = True)


## K-mer definitions

In [6]:
# Enumerate every DNA k-mer over the alphabet ACGT, in itertools.product order.
all_4mers = ["".join(bases) for bases in itertools.product('ACGT', repeat = 4)]
all_5mers = ["".join(bases) for bases in itertools.product('ACGT', repeat = 5)]
all_6mers = ["".join(bases) for bases in itertools.product('ACGT', repeat = 6)]

# Write each list as a single-column file: one motif per line, no header, no index.
for _k, _kmers in ((4, all_4mers), (5, all_5mers), (6, all_6mers)):
    _list_path = os.path.join(OUTDIR, f'motif_list.{_k}mers.txt')
    pd.DataFrame({'motif' : _kmers}).to_csv(_list_path, sep = "\t", index = False, header = False)


## S. pombe

In [7]:
# Hex colours per motif family. Each Hamming-distance ramp starts at the family's
# base colour (distance 0) and gets lighter as the distance grows.
_a_rich_shades = ['#2278B5', '#6AB1E3', '#9CCBEC', '#CDE5F5']   # d0 .. d3
_t_rich_shades = ['#2FA148', '#73D689', '#A1E4B0']              # d0 .. d2
_gta_colour    = '#FCB316'

family_cmap = {
    # The aggregate family shares the colour of its distance-0 member.
    'A-rich' : _a_rich_shades[0],
    **{f'A-rich_d{dist}' : shade for dist, shade in enumerate(_a_rich_shades)},
    'T-rich' : _t_rich_shades[0],
    **{f'T-rich_d{dist}' : shade for dist, shade in enumerate(_t_rich_shades)},
    'GTA'          : _gta_colour,
    'GTA_d0'       : _gta_colour,
    'GTA_controls' : '#6D6E71',
    'Other'        : '#F7F8F8',
}


### By Hamming distance to archetypical motifs

#### 6mers

In [8]:
## Identify archetypical motifs and assign motifs to their most similar archetype

# Array form of the 6-mer list so the boolean masks below can index it directly.
kmer_array = np.asarray(all_6mers)

archetypes = np.asarray(['AAAAAA','TTTTTT'])

# motifs.closest_archetype returns one (family, distance, matches) triple per motif;
# zip(*...) transposes the triples into three parallel tuples.
# NOTE(review): max_mismatch=[3,2] looks like a per-archetype cap (3 for A-rich, 2 for
# T-rich), in line with the d0-d3 / d0-d2 families built below -- confirm in paper_utilities.motifs.
closest_family,closest_distance,closest_matches = zip(*[motifs.closest_archetype(motif, archetypes, max_mismatch=[3,2], kmer=6) for motif in all_6mers])

closest_family   = np.asarray(closest_family)
closest_distance = np.asarray(closest_distance, dtype = float)

# A motif is reported as ambiguous when its `matches` entry holds more than one item.
ambiguous_motifs = [h for h,m in zip(all_6mers, closest_matches) if len(m) > 1]
print(f"\nAmbiguous motifs (N = {len(ambiguous_motifs)}): {','.join(ambiguous_motifs)}")


## Identify motifs containing the GTA element

# "Controls" contain G, T and A somewhere but never the contiguous GTA triplet.
contains_GTA = np.asarray([("GTA" in motif) for motif in all_6mers])
controls_GTA = np.asarray([("G" in motif) and ("T" in motif) and ("A" in motif) and ("GTA" not in motif) for motif in all_6mers])

print(sum(contains_GTA))
print(sum(controls_GTA))

## Identify motifs containing the TAG element

contains_TAG = np.asarray([("TAG" in motif) for motif in all_6mers])
controls_TAG = np.asarray([("G" in motif) and ("T" in motif) and ("A" in motif) and ("TAG" not in motif) for motif in all_6mers])

print(sum(contains_TAG))
print(sum(controls_TAG))


## Define family of A-rich motifs

# One list per Hamming distance (0-3) from AAAAAA; GTA-containing motifs are excluded here.
# TAG-containing motifs are NOT excluded at this point; they are re-labelled later by priority.
fam_a_rich_d0, fam_a_rich_d1, fam_a_rich_d2, fam_a_rich_d3 = (
    sorted(kmer_array[(closest_family == 'AAAAAA') & (closest_distance == dist) & ~contains_GTA].tolist())
    for dist in range(4)
)

print(len(fam_a_rich_d0), fam_a_rich_d0[:5])
print(len(fam_a_rich_d1), fam_a_rich_d1[:5])
print(len(fam_a_rich_d2), fam_a_rich_d2[:5])
print(len(fam_a_rich_d3), fam_a_rich_d3[:5])


## Define family of T-rich motifs

# One list per Hamming distance (0-2) from TTTTTT, again without GTA-containing motifs.
fam_t_rich_d0, fam_t_rich_d1, fam_t_rich_d2 = (
    sorted(kmer_array[(closest_family == 'TTTTTT') & (closest_distance == dist) & ~contains_GTA].tolist())
    for dist in range(3)
)

print(len(fam_t_rich_d0), fam_t_rich_d0[:5])
print(len(fam_t_rich_d1), fam_t_rich_d1[:5])
print(len(fam_t_rich_d2), fam_t_rich_d2[:5])

## Define family of motifs containing the GTA element

# .tolist() converts numpy.str_ scalars to plain str, matching the A-/T-rich lists, so the
# pickled definitions below hold only built-in strings.
fam_contains_GTA = sorted(kmer_array[contains_GTA].tolist())
fam_controls_GTA = sorted(kmer_array[controls_GTA].tolist())

print(len(fam_contains_GTA))
print(len(fam_controls_GTA))

## Define family of motifs containing the TAG element

fam_contains_TAG = sorted(kmer_array[contains_TAG].tolist())
fam_controls_TAG = sorted(kmer_array[controls_TAG].tolist())

print(len(fam_contains_TAG))
print(len(fam_controls_TAG))

## Define family of motifs containing both GTA and TAG elements

fam_contains_GTA_TAG = sorted(kmer_array[contains_GTA & contains_TAG].tolist())
print(len(fam_contains_GTA_TAG))

## Assign motifs to families based on priority

# Insertion order is the priority order: a motif listed under several keys keeps the label
# of the LAST key that lists it, so later entries win.
motif_priority = {
    
#     'TAG_controls' : sorted(fam_controls_TAG),
#     'GTA_controls' : sorted(fam_controls_GTA),

    'T-rich_d0'   : sorted(fam_t_rich_d0),
    'T-rich_d1'   : sorted(fam_t_rich_d1),
    'T-rich_d2'   : sorted(fam_t_rich_d2),
    'T-rich'      : sorted(fam_t_rich_d0 + fam_t_rich_d1 + fam_t_rich_d2),
    
    'A-rich_d0'   : sorted(fam_a_rich_d0),
    'A-rich_d1'   : sorted(fam_a_rich_d1),
    'A-rich_d2'   : sorted(fam_a_rich_d2),
    'A-rich_d3'   : sorted(fam_a_rich_d3),
    'A-rich'      : sorted(fam_a_rich_d0 + fam_a_rich_d1 + fam_a_rich_d2 + fam_a_rich_d3),
    
    'TAG_d0'      : sorted(fam_contains_TAG),
    'TAG'         : sorted(fam_contains_TAG),
    
    'GTA_d0'      : sorted(fam_contains_GTA),
    'GTA'         : sorted(fam_contains_GTA),
    
    'GTA-TAG_d0'  : sorted(fam_contains_GTA_TAG),
    'GTA-TAG'     : sorted(fam_contains_GTA_TAG),
    
}

# motif_family : motif -> winning label over all keys ('Other' when no key lists it).
# motif_hamming: motif -> winning label among the distance-resolved ("_d") keys only.
motif_family = dict.fromkeys(all_6mers, 'Other')
motif_hamming = {}

for mf,mlist in motif_priority.items():
    is_distance_key = "_d" in mf
    for m in mlist:
        motif_family[m] = mf
        if is_distance_key:
            motif_hamming[m] = mf

## Output list of motifs in each family for use in R

# motif_unique: label -> motifs whose final label is exactly that key.
# Every "_d" list is re-labelled by the aggregate key that follows it in motif_priority,
# so the "_d" entries of motif_unique come out empty; only aggregate counts are printed.
motif_unique = {}

for mf in motif_priority.keys():
    motif_unique[mf] = sorted([k for k,v in motif_family.items() if (v == mf)])
    if ("_d" not in mf):
        print(mf, "\t", len(motif_unique[mf]))
    
# Aggregate families: non-overlapping membership, printed as R character vectors.
for mf,ml in motif_unique.items():
    if ("_d" not in mf):
        outstring = "\n" + mf.replace("-","_").replace("/","").lower() + " <- c('" + "', '".join(ml) + "')"
        print(outstring)

# Distance-resolved families: taken from motif_priority, i.e. BEFORE overlap resolution.
for mf,ml in motif_priority.items():
    if ("_d" in mf):
        outstring = "\n" + mf.replace("-","_").replace("/","").lower() + " <- c('" + "', '".join(ml) + "')"
        print(outstring)

## Record motif family definitions

hamming_definitions = {
    'priority'   : motif_priority,
    'family'     : motif_family,
    'no_overlap' : motif_unique,
    'hamming'    : motif_hamming,
}

with open(os.path.join(OUTDIR, 'motif_definitions.spom.6mers.distance.pickle'), mode = 'wb') as handle:
    pickle.dump(hamming_definitions, handle)

# NOTE(review): unlike the tab-separated motif_list.*.txt files, this call uses the to_csv
# defaults, so the file is comma-separated and carries a leading integer index column.
# Left unchanged because downstream readers may depend on that layout -- confirm.
(pd.DataFrame.from_dict(motif_family, orient = 'index').reset_index()
 .rename(columns = {'index' : 'motif', 0 : 'family'})
 .to_csv(os.path.join(OUTDIR, 'motif_families.spom.6mers.distance.txt'))
)


Ambiguous motifs (N = 0): 
255
1845
255
1845
1 ['AAAAAA']
18 ['AAAAAC', 'AAAAAG', 'AAAAAT', 'AAAACA', 'AAAAGA']
131 ['AAAACC', 'AAAACG', 'AAAACT', 'AAAAGC', 'AAAAGG']
504 ['AAACCC', 'AAACCG', 'AAACCT', 'AAACGC', 'AAACGG']
1 ['TTTTTT']
18 ['ATTTTT', 'CTTTTT', 'GTTTTT', 'TATTTT', 'TCTTTT']
131 ['AATTTT', 'ACTTTT', 'AGTTTT', 'ATATTT', 'ATCTTT']
255
1845
255
1845
56
T-rich 	 146
A-rich 	 619
TAG 	 199
GTA 	 199
GTA-TAG 	 56

t_rich <- c('AATTTT', 'ACTTTT', 'AGTTTT', 'ATATTT', 'ATCTTT', 'ATGTTT', 'ATTATT', 'ATTCTT', 'ATTGTT', 'ATTTAT', 'ATTTCT', 'ATTTGT', 'ATTTTA', 'ATTTTC', 'ATTTTG', 'ATTTTT', 'CATTTT', 'CCTTTT', 'CGTTTT', 'CTATTT', 'CTCTTT', 'CTGTTT', 'CTTATT', 'CTTCTT', 'CTTGTT', 'CTTTAT', 'CTTTCT', 'CTTTGT', 'CTTTTA', 'CTTTTC', 'CTTTTG', 'CTTTTT', 'GATTTT', 'GCTTTT', 'GGTTTT', 'GTCTTT', 'GTGTTT', 'GTTATT', 'GTTCTT', 'GTTGTT', 'GTTTAT', 'GTTTCT', 'GTTTGT', 'GTTTTA', 'GTTTTC', 'GTTTTG', 'GTTTTT', 'TAATTT', 'TACTTT', 'TATATT', 'TATCTT', 'TATGTT', 'TATTAT', 'TATTCT', 'TATTGT', 'TATTTA', 'T

In [9]:
## Identify archetypical motifs and assign motifs to their most similar archetype

# Array form of the 5-mer list so the boolean masks below can index it directly.
kmer_array = np.asarray(all_5mers)

archetypes = np.asarray(['AAAAA','TTTTT'])

# motifs.closest_archetype returns one (family, distance, matches) triple per motif;
# zip(*...) transposes the triples into three parallel tuples.
# NOTE(review): max_mismatch=[3,2] looks like a per-archetype cap (3 for A-rich, 2 for
# T-rich), in line with the d0-d3 / d0-d2 families built below -- confirm in paper_utilities.motifs.
closest_family,closest_distance,closest_matches = zip(*[motifs.closest_archetype(motif, archetypes, max_mismatch=[3,2], kmer=5) for motif in all_5mers])

closest_family   = np.asarray(closest_family)
closest_distance = np.asarray(closest_distance, dtype = float)

# A motif is reported as ambiguous when its `matches` entry holds more than one item.
ambiguous_motifs = [h for h,m in zip(all_5mers, closest_matches) if len(m) > 1]
print(f"\nAmbiguous motifs (N = {len(ambiguous_motifs)}): {','.join(ambiguous_motifs)}")


## Identify motifs containing the GTA element

# "Controls" contain G, T and A somewhere but never the contiguous GTA triplet.
contains_GTA = np.asarray([("GTA" in motif) for motif in all_5mers])
controls_GTA = np.asarray([("G" in motif) and ("T" in motif) and ("A" in motif) and ("GTA" not in motif) for motif in all_5mers])

print(sum(contains_GTA))
print(sum(controls_GTA))


## Identify motifs containing the TAG element

contains_TAG = np.asarray([("TAG" in motif) for motif in all_5mers])
controls_TAG = np.asarray([("G" in motif) and ("T" in motif) and ("A" in motif) and ("TAG" not in motif) for motif in all_5mers])

print(sum(contains_TAG))
print(sum(controls_TAG))


## Define family of A-rich motifs

# One list per Hamming distance (0-3) from AAAAA; GTA-containing motifs are excluded here.
# TAG-containing motifs are NOT excluded at this point; they are re-labelled later by priority.
fam_a_rich_d0, fam_a_rich_d1, fam_a_rich_d2, fam_a_rich_d3 = (
    sorted(kmer_array[(closest_family == 'AAAAA') & (closest_distance == dist) & ~contains_GTA].tolist())
    for dist in range(4)
)

print(len(fam_a_rich_d0), fam_a_rich_d0[:5])
print(len(fam_a_rich_d1), fam_a_rich_d1[:5])
print(len(fam_a_rich_d2), fam_a_rich_d2[:5])
print(len(fam_a_rich_d3), fam_a_rich_d3[:5])


## Define family of T-rich motifs

# One list per Hamming distance (0-2) from TTTTT, again without GTA-containing motifs.
fam_t_rich_d0, fam_t_rich_d1, fam_t_rich_d2 = (
    sorted(kmer_array[(closest_family == 'TTTTT') & (closest_distance == dist) & ~contains_GTA].tolist())
    for dist in range(3)
)

print(len(fam_t_rich_d0), fam_t_rich_d0[:5])
print(len(fam_t_rich_d1), fam_t_rich_d1[:5])
print(len(fam_t_rich_d2), fam_t_rich_d2[:5])


## Define family of motifs containing the GTA element

# .tolist() converts numpy.str_ scalars to plain str, matching the A-/T-rich lists, so the
# pickled definitions below hold only built-in strings.
fam_contains_GTA = sorted(kmer_array[contains_GTA].tolist())
fam_controls_GTA = sorted(kmer_array[controls_GTA].tolist())

print(len(fam_contains_GTA))
print(len(fam_controls_GTA))


## Define family of motifs containing the TAG element

fam_contains_TAG = sorted(kmer_array[contains_TAG].tolist())
fam_controls_TAG = sorted(kmer_array[controls_TAG].tolist())

print(len(fam_contains_TAG))
print(len(fam_controls_TAG))

## Define family of motifs containing both GTA and TAG elements

fam_contains_GTA_TAG = sorted(kmer_array[contains_GTA & contains_TAG].tolist())
print(len(fam_contains_GTA_TAG))


## Assign motifs to families based on priority

# Insertion order is the priority order: a motif listed under several keys keeps the label
# of the LAST key that lists it, so later entries win.
motif_priority = {
    
#     'GTA_controls' : sorted(fam_controls_GTA),

    'T-rich_d0'   : sorted(fam_t_rich_d0),
    'T-rich_d1'   : sorted(fam_t_rich_d1),
    'T-rich_d2'   : sorted(fam_t_rich_d2),
    'T-rich'      : sorted(fam_t_rich_d0 + fam_t_rich_d1 + fam_t_rich_d2),
    
    'A-rich_d0'   : sorted(fam_a_rich_d0),
    'A-rich_d1'   : sorted(fam_a_rich_d1),
    'A-rich_d2'   : sorted(fam_a_rich_d2),
    'A-rich_d3'   : sorted(fam_a_rich_d3),
    'A-rich'      : sorted(fam_a_rich_d0 + fam_a_rich_d1 + fam_a_rich_d2 + fam_a_rich_d3),
    
    'TAG_d0'      : sorted(fam_contains_TAG),
    'TAG'         : sorted(fam_contains_TAG),
    
    'GTA_d0'      : sorted(fam_contains_GTA),
    'GTA'         : sorted(fam_contains_GTA),
    
    'GTA-TAG_d0'  : sorted(fam_contains_GTA_TAG),
    'GTA-TAG'     : sorted(fam_contains_GTA_TAG),
    
}

# motif_family : motif -> winning label over all keys ('Other' when no key lists it).
# motif_hamming: motif -> winning label among the distance-resolved ("_d") keys only.
motif_family = dict.fromkeys(all_5mers, 'Other')
motif_hamming = {}

for mf,mlist in motif_priority.items():
    is_distance_key = "_d" in mf
    for m in mlist:
        motif_family[m] = mf
        if is_distance_key:
            motif_hamming[m] = mf

## Output list of motifs in each family for use in R

# motif_unique: label -> motifs whose final label is exactly that key.
# Every "_d" list is re-labelled by the aggregate key that follows it in motif_priority,
# so the "_d" entries of motif_unique come out empty; only aggregate counts are printed.
motif_unique = {}

for mf in motif_priority.keys():
    motif_unique[mf] = sorted([k for k,v in motif_family.items() if (v == mf)])
    if ("_d" not in mf):
        print(mf, "\t", len(motif_unique[mf]))
    
# Aggregate families: non-overlapping membership, printed as R character vectors.
for mf,ml in motif_unique.items():
    if ("_d" not in mf):
        outstring = "\n" + mf.replace("-","_").replace("/","").lower() + " <- c('" + "', '".join(ml) + "')"
        print(outstring)

# Distance-resolved families: taken from motif_priority, i.e. BEFORE overlap resolution.
for mf,ml in motif_priority.items():
    if ("_d" in mf):
        outstring = "\n" + mf.replace("-","_").replace("/","").lower() + " <- c('" + "', '".join(ml) + "')"
        print(outstring)

## Record motif family definitions

hamming_definitions = {
    'priority'   : motif_priority,
    'family'     : motif_family,
    'no_overlap' : motif_unique,
    'hamming'    : motif_hamming,
}

with open(os.path.join(OUTDIR, 'motif_definitions.spom.5mers.distance.pickle'), mode = 'wb') as handle:
    pickle.dump(hamming_definitions, handle)

# NOTE(review): unlike the tab-separated motif_list.*.txt files, this call uses the to_csv
# defaults, so the file is comma-separated and carries a leading integer index column.
# Left unchanged because downstream readers may depend on that layout -- confirm.
(pd.DataFrame.from_dict(motif_family, orient = 'index').reset_index()
 .rename(columns = {'index' : 'motif', 0 : 'family'})
 .to_csv(os.path.join(OUTDIR, 'motif_families.spom.5mers.distance.txt'))
)


Ambiguous motifs (N = 0): 
48
342
48
342
1 ['AAAAA']
15 ['AAAAC', 'AAAAG', 'AAAAT', 'AAACA', 'AAAGA']
87 ['AAACC', 'AAACG', 'AAACT', 'AAAGC', 'AAAGG']
242 ['AACCC', 'AACCG', 'AACCT', 'AACGC', 'AACGG']
1 ['TTTTT']
15 ['ATTTT', 'CTTTT', 'GTTTT', 'TATTT', 'TCTTT']
87 ['AATTT', 'ACTTT', 'AGTTT', 'ATATT', 'ATCTT']
48
342
48
342
9
T-rich 	 100
A-rich 	 327
TAG 	 39
GTA 	 39
GTA-TAG 	 9

t_rich <- c('AATTT', 'ACTTT', 'AGTTT', 'ATATT', 'ATCTT', 'ATGTT', 'ATTAT', 'ATTCT', 'ATTGT', 'ATTTA', 'ATTTC', 'ATTTG', 'ATTTT', 'CATTT', 'CCTTT', 'CGTTT', 'CTATT', 'CTCTT', 'CTGTT', 'CTTAT', 'CTTCT', 'CTTGT', 'CTTTA', 'CTTTC', 'CTTTG', 'CTTTT', 'GATTT', 'GCTTT', 'GGTTT', 'GTCTT', 'GTGTT', 'GTTAT', 'GTTCT', 'GTTGT', 'GTTTA', 'GTTTC', 'GTTTG', 'GTTTT', 'TAATT', 'TACTT', 'TATAT', 'TATCT', 'TATGT', 'TATTA', 'TATTC', 'TATTG', 'TATTT', 'TCATT', 'TCCTT', 'TCGTT', 'TCTAT', 'TCTCT', 'TCTGT', 'TCTTA', 'TCTTC', 'TCTTG', 'TCTTT', 'TGATT', 'TGCTT', 'TGGTT', 'TGTCT', 'TGTGT', 'TGTTA', 'TGTTC', 'TGTTG', 'TGTTT', 'TTAAT', 

In [10]:
## Identify archetypical motifs and assign motifs to their most similar archetype

# Array form of the 4-mer list so the boolean masks below can index it directly.
kmer_array = np.asarray(all_4mers)

archetypes = np.asarray(['AAAA','TTTT'])

# motifs.closest_archetype returns one (family, distance, matches) triple per motif;
# zip(*...) transposes the triples into three parallel tuples.
# NOTE(review): max_mismatch=[3,2] looks like a per-archetype cap (3 for A-rich, 2 for
# T-rich), in line with the d0-d3 / d0-d2 families built below -- confirm in paper_utilities.motifs.
# NOTE(review): at k=4 some motifs are equidistant from both archetypes; the helper appears
# to resolve these itself and print a "Conflicting family assignment" line for each -- confirm.
closest_family,closest_distance,closest_matches = zip(*[motifs.closest_archetype(motif, archetypes, max_mismatch=[3,2], kmer=4) for motif in all_4mers])

closest_family   = np.asarray(closest_family)
closest_distance = np.asarray(closest_distance, dtype = float)

# A motif is reported as ambiguous when its `matches` entry holds more than one item.
ambiguous_motifs = [h for h,m in zip(all_4mers, closest_matches) if len(m) > 1]
print(f"\nAmbiguous motifs (N = {len(ambiguous_motifs)}): {','.join(ambiguous_motifs)}")


## Identify motifs containing the GTA element

# "Controls" contain G, T and A somewhere but never the contiguous GTA triplet.
contains_GTA = np.asarray([("GTA" in motif) for motif in all_4mers])
controls_GTA = np.asarray([("G" in motif) and ("T" in motif) and ("A" in motif) and ("GTA" not in motif) for motif in all_4mers])

print(sum(contains_GTA))
print(sum(controls_GTA))


## Identify motifs containing the TAG element

contains_TAG = np.asarray([("TAG" in motif) for motif in all_4mers])
controls_TAG = np.asarray([("G" in motif) and ("T" in motif) and ("A" in motif) and ("TAG" not in motif) for motif in all_4mers])

print(sum(contains_TAG))
print(sum(controls_TAG))


## Define family of A-rich motifs

# One list per Hamming distance (0-3) from AAAA; GTA-containing motifs are excluded here.
# TAG-containing motifs are NOT excluded at this point; they are re-labelled later by priority.
fam_a_rich_d0, fam_a_rich_d1, fam_a_rich_d2, fam_a_rich_d3 = (
    sorted(kmer_array[(closest_family == 'AAAA') & (closest_distance == dist) & ~contains_GTA].tolist())
    for dist in range(4)
)

print(len(fam_a_rich_d0), fam_a_rich_d0[:5])
print(len(fam_a_rich_d1), fam_a_rich_d1[:5])
print(len(fam_a_rich_d2), fam_a_rich_d2[:5])
print(len(fam_a_rich_d3), fam_a_rich_d3[:5])


## Define family of T-rich motifs

# One list per Hamming distance (0-2) from TTTT, again without GTA-containing motifs.
fam_t_rich_d0, fam_t_rich_d1, fam_t_rich_d2 = (
    sorted(kmer_array[(closest_family == 'TTTT') & (closest_distance == dist) & ~contains_GTA].tolist())
    for dist in range(3)
)

print(len(fam_t_rich_d0), fam_t_rich_d0[:5])
print(len(fam_t_rich_d1), fam_t_rich_d1[:5])
print(len(fam_t_rich_d2), fam_t_rich_d2[:5])


## Define family of motifs containing the GTA element

# .tolist() converts numpy.str_ scalars to plain str, matching the A-/T-rich lists, so the
# pickled definitions below hold only built-in strings.
fam_contains_GTA = sorted(kmer_array[contains_GTA].tolist())
fam_controls_GTA = sorted(kmer_array[controls_GTA].tolist())

print(len(fam_contains_GTA))
print(len(fam_controls_GTA))


## Define family of motifs containing the TAG element

fam_contains_TAG = sorted(kmer_array[contains_TAG].tolist())
fam_controls_TAG = sorted(kmer_array[controls_TAG].tolist())

print(len(fam_contains_TAG))
print(len(fam_controls_TAG))

## Define family of motifs containing both GTA and TAG elements

fam_contains_GTA_TAG = sorted(kmer_array[contains_GTA & contains_TAG].tolist())
print(len(fam_contains_GTA_TAG))


## Assign motifs to families based on priority

# Insertion order is the priority order: a motif listed under several keys keeps the label
# of the LAST key that lists it, so later entries win.
motif_priority = {
    
#     'GTA_controls' : sorted(fam_controls_GTA),

    'T-rich_d0'   : sorted(fam_t_rich_d0),
    'T-rich_d1'   : sorted(fam_t_rich_d1),
    'T-rich_d2'   : sorted(fam_t_rich_d2),
    'T-rich'      : sorted(fam_t_rich_d0 + fam_t_rich_d1 + fam_t_rich_d2),
    
    'A-rich_d0'   : sorted(fam_a_rich_d0),
    'A-rich_d1'   : sorted(fam_a_rich_d1),
    'A-rich_d2'   : sorted(fam_a_rich_d2),
    'A-rich_d3'   : sorted(fam_a_rich_d3),
    'A-rich'      : sorted(fam_a_rich_d0 + fam_a_rich_d1 + fam_a_rich_d2 + fam_a_rich_d3),
    
    'TAG_d0'      : sorted(fam_contains_TAG),
    'TAG'         : sorted(fam_contains_TAG),
    
    'GTA_d0'      : sorted(fam_contains_GTA),
    'GTA'         : sorted(fam_contains_GTA),
    
    'GTA-TAG_d0'  : sorted(fam_contains_GTA_TAG),
    'GTA-TAG'     : sorted(fam_contains_GTA_TAG),
    
}

# motif_family : motif -> winning label over all keys ('Other' when no key lists it).
# motif_hamming: motif -> winning label among the distance-resolved ("_d") keys only.
motif_family = dict.fromkeys(all_4mers, 'Other')
motif_hamming = {}

for mf,mlist in motif_priority.items():
    is_distance_key = "_d" in mf
    for m in mlist:
        motif_family[m] = mf
        if is_distance_key:
            motif_hamming[m] = mf

## Output list of motifs in each family for use in R

# motif_unique: label -> motifs whose final label is exactly that key.
# Every "_d" list is re-labelled by the aggregate key that follows it in motif_priority,
# so the "_d" entries of motif_unique come out empty; only aggregate counts are printed.
motif_unique = {}

for mf in motif_priority.keys():
    motif_unique[mf] = sorted([k for k,v in motif_family.items() if (v == mf)])
    if ("_d" not in mf):
        print(mf, "\t", len(motif_unique[mf]))
    
# Aggregate families: non-overlapping membership, printed as R character vectors.
for mf,ml in motif_unique.items():
    if ("_d" not in mf):
        outstring = "\n" + mf.replace("-","_").replace("/","").lower() + " <- c('" + "', '".join(ml) + "')"
        print(outstring)

# Distance-resolved families: taken from motif_priority, i.e. BEFORE overlap resolution.
for mf,ml in motif_priority.items():
    if ("_d" in mf):
        outstring = "\n" + mf.replace("-","_").replace("/","").lower() + " <- c('" + "', '".join(ml) + "')"
        print(outstring)

## Record motif family definitions

hamming_definitions = {
    'priority'   : motif_priority,
    'family'     : motif_family,
    'no_overlap' : motif_unique,
    'hamming'    : motif_hamming,
}

with open(os.path.join(OUTDIR, 'motif_definitions.spom.4mers.distance.pickle'), mode = 'wb') as handle:
    pickle.dump(hamming_definitions, handle)

# NOTE(review): unlike the tab-separated motif_list.*.txt files, this call uses the to_csv
# defaults, so the file is comma-separated and carries a leading integer index column.
# Left unchanged because downstream readers may depend on that layout -- confirm.
(pd.DataFrame.from_dict(motif_family, orient = 'index').reset_index()
 .rename(columns = {'index' : 'motif', 0 : 'family'})
 .to_csv(os.path.join(OUTDIR, 'motif_families.spom.4mers.distance.txt'))
)

Conflicting family assignment: AATT equidistant to ['AAAA' 'TTTT'] ... assigning to AAAA based on content
Conflicting family assignment: ATAT equidistant to ['AAAA' 'TTTT'] ... assigning to AAAA based on priority
Conflicting family assignment: ATTA equidistant to ['AAAA' 'TTTT'] ... assigning to TTTT based on content
Conflicting family assignment: TAAT equidistant to ['AAAA' 'TTTT'] ... assigning to AAAA based on content
Conflicting family assignment: TATA equidistant to ['AAAA' 'TTTT'] ... assigning to AAAA based on priority
Conflicting family assignment: TTAA equidistant to ['AAAA' 'TTTT'] ... assigning to AAAA based on content

Ambiguous motifs (N = 6): AATT,ATAT,ATTA,TAAT,TATA,TTAA
8
52
8
52
1 ['AAAA']
12 ['AAAC', 'AAAG', 'AAAT', 'AACA', 'AAGA']
51 ['AACC', 'AACG', 'AACT', 'AAGC', 'AAGG']
76 ['ACCC', 'ACCG', 'ACCT', 'ACGC', 'ACGG']
1 ['TTTT']
12 ['ATTT', 'CTTT', 'GTTT', 'TATT', 'TCTT']
47 ['ACTT', 'AGTT', 'ATCT', 'ATGT', 'ATTA']
8
52
8
52
1
T-rich 	 58
A-rich 	 135
TAG 	 7
GTA 	 7
