# Motif Family Definitions for *A. thaliana*

In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
%run -i notebook_setup.py

In [4]:
from paper_utilities import motifs

In [5]:
## Output location: every resource file written by this notebook goes to
## <PROJECT>/manuscript/analysis/resources, which is created if missing.
## NOTE(review): the project directory is named "yeast" although this notebook
## defines A. thaliana motif families -- confirm this is the intended location.
PROJECT = "/projects/b1080/eks/polyadenylation/yeast"
OUTDIR  = os.path.join(PROJECT, *('manuscript', 'analysis', 'resources'))

os.makedirs(OUTDIR, exist_ok = True)


## K-mer definitions

In [6]:
## Enumerate every DNA k-mer for k = 4, 5 and 6. itertools.product walks the
## alphabet 'ACGT' position by position, so each list is in lexicographic order
## and contains each k-mer exactly once.
all_4mers = ["".join(letters) for letters in itertools.product('ACGT', repeat = 4)]
all_5mers = ["".join(letters) for letters in itertools.product('ACGT', repeat = 5)]
all_6mers = ["".join(letters) for letters in itertools.product('ACGT', repeat = 6)]

## Write each list to OUTDIR as a one-column text file (one motif per line,
## no header row and no index column).
for kmer_list, kmer_file in (
    (all_4mers, 'motif_list.4mers.txt'),
    (all_5mers, 'motif_list.5mers.txt'),
    (all_6mers, 'motif_list.6mers.txt'),
):
    kmer_table = pd.DataFrame({'motif' : kmer_list})
    kmer_table.to_csv(os.path.join(OUTDIR, kmer_file), sep = "\t", index = False, header = False)


## A. thaliana

In [7]:
## Hex colour code for each motif family label, including the 'Other'
## catch-all. Insertion order is A-rich, T-rich, TGTA-containing, Other.
family_cmap = dict([
    ('A-rich',          '#2278B5'),
    ('T-rich',          '#2FA148'),
    ('TGTA-containing', '#D65F5F'),
    ('Other',           '#F7F8F8'),
])

### By pattern

##### 6mers

In [8]:
## Identify motif patterns
##
## Candidate 6-mers for each family, chosen by simple sequence rules:
##   A-rich          : at least 4 of the 6 positions are 'A'
##   T-rich          : at least 4 of the 6 positions are 'T'
##   TGTA-containing : the substring 'TGTA' occurs anywhere in the motif
## The candidate lists may overlap (e.g. 'TGTATT' is both TGTA-containing and
## T-rich); overlaps are resolved in the next step.
##
## `all_6mers` already holds each motif exactly once, so a plain comprehension
## is enough. It keeps the candidates in the order of `all_6mers`; wrapping
## them in set() would reorder them differently from one kernel session to the
## next, making the pickled 'priority' lists below non-reproducible.

a_rich    = [x for x in all_6mers if (x.count('A') >= 4)]
t_rich    = [x for x in all_6mers if (x.count('T') >= 4)]
tgta_rich = [x for x in all_6mers if ('TGTA' in x)]

## Assign motifs to families based on priority
##
## Families are applied in the order listed and every assignment overwrites
## the previous one, so LATER entries win:
##   TGTA-containing > A-rich > T-rich > Other (default)

motif_priority = {
    'T-rich'          : t_rich,
    'A-rich'          : a_rich,
    'TGTA-containing' : tgta_rich,
}

## Start with every motif in the catch-all family ...
motif_family = dict.fromkeys(all_6mers, 'Other')

## ... then overwrite with the named families, lowest priority first.
for mf,mlist in motif_priority.items():
    for m in mlist:
        motif_family[m] = mf


## Output list of motifs in each family for use in R
##
## `motif_unique` maps each named family to the sorted motifs that ended up in
## it after priority resolution (so the lists are disjoint). 'Other' is not
## included because only the keys of `motif_priority` are iterated.

motif_unique = {}

for mf in motif_priority.keys():
    motif_unique[mf] = sorted([k for k,v in motif_family.items() if (v == mf)])
    print(mf, "\t", len(motif_unique[mf]))

## Print one R vector assignment per family, e.g. "t_rich <- c('AATTTT', ...)".
## The R variable name is the family label lower-cased with '-' -> '_' and '/' removed.
for mf,ml in motif_unique.items():
    outstring = "\n" + mf.replace("-","_").replace("/","").lower() + " <- c('" + "', '".join(ml) + "')"
    print(outstring)


## Record motif family definitions
##
##   'priority'   : overlapping candidate lists, in order of increasing priority
##   'family'     : motif -> final family label (including 'Other')
##   'no_overlap' : family -> sorted list of motifs finally assigned to it

pattern_definitions = {
    'priority'   : motif_priority,
    'family'     : motif_family,
    'no_overlap' : motif_unique,
}

with open(os.path.join(OUTDIR, 'motif_definitions.atha.6mers.patterns.pickle'), mode = 'wb') as handle:
    pickle.dump(pattern_definitions, handle)

## NOTE(review): unlike the motif_list.*.txt files, this table is written with
## to_csv defaults (comma-separated, header row, integer index as first column).
## Left unchanged so existing readers keep working -- confirm this is intended.
(pd.DataFrame.from_dict(motif_family, orient = 'index').reset_index()
 .rename(columns = {'index' : 'motif', 0 : 'family'})
 .to_csv(os.path.join(OUTDIR, 'motif_families.atha.6mers.patterns.txt'))
)


T-rich 	 151
A-rich 	 154
TGTA-containing 	 48

t_rich <- c('AATTTT', 'ACTTTT', 'AGTTTT', 'ATATTT', 'ATCTTT', 'ATGTTT', 'ATTATT', 'ATTCTT', 'ATTGTT', 'ATTTAT', 'ATTTCT', 'ATTTGT', 'ATTTTA', 'ATTTTC', 'ATTTTG', 'ATTTTT', 'CATTTT', 'CCTTTT', 'CGTTTT', 'CTATTT', 'CTCTTT', 'CTGTTT', 'CTTATT', 'CTTCTT', 'CTTGTT', 'CTTTAT', 'CTTTCT', 'CTTTGT', 'CTTTTA', 'CTTTTC', 'CTTTTG', 'CTTTTT', 'GATTTT', 'GCTTTT', 'GGTTTT', 'GTATTT', 'GTCTTT', 'GTGTTT', 'GTTATT', 'GTTCTT', 'GTTGTT', 'GTTTAT', 'GTTTCT', 'GTTTGT', 'GTTTTA', 'GTTTTC', 'GTTTTG', 'GTTTTT', 'TAATTT', 'TACTTT', 'TAGTTT', 'TATATT', 'TATCTT', 'TATGTT', 'TATTAT', 'TATTCT', 'TATTGT', 'TATTTA', 'TATTTC', 'TATTTG', 'TATTTT', 'TCATTT', 'TCCTTT', 'TCGTTT', 'TCTATT', 'TCTCTT', 'TCTGTT', 'TCTTAT', 'TCTTCT', 'TCTTGT', 'TCTTTA', 'TCTTTC', 'TCTTTG', 'TCTTTT', 'TGATTT', 'TGCTTT', 'TGGTTT', 'TGTCTT', 'TGTGTT', 'TGTTAT', 'TGTTCT', 'TGTTGT', 'TGTTTA', 'TGTTTC', 'TGTTTG', 'TGTTTT', 'TTAATT', 'TTACTT', 'TTAGTT', 'TTATAT', 'TTATCT', 'TTATGT', 'TTATTA', 'TTATTC', 

##### 5mers

In [9]:
## Identify motif patterns
##
## Candidate 5-mers for each family, chosen by simple sequence rules:
##   A-rich          : at least 3 of the 5 positions are 'A'
##   T-rich          : at least 3 of the 5 positions are 'T'
##   TGTA-containing : the substring 'TGTA' occurs anywhere in the motif
## The candidate lists may overlap (e.g. 'TGTAT' is both TGTA-containing and
## T-rich); overlaps are resolved in the next step.
##
## `all_5mers` already holds each motif exactly once, so a plain comprehension
## is enough. It keeps the candidates in the order of `all_5mers`; wrapping
## them in set() would reorder them differently from one kernel session to the
## next, making the pickled 'priority' lists below non-reproducible.

a_rich    = [x for x in all_5mers if (x.count('A') >= 3)]
t_rich    = [x for x in all_5mers if (x.count('T') >= 3)]
tgta_rich = [x for x in all_5mers if ('TGTA' in x)]

## Assign motifs to families based on priority
##
## Families are applied in the order listed and every assignment overwrites
## the previous one, so LATER entries win:
##   TGTA-containing > A-rich > T-rich > Other (default)

motif_priority = {
    'T-rich'          : t_rich,
    'A-rich'          : a_rich,
    'TGTA-containing' : tgta_rich,
}

## Start with every motif in the catch-all family ...
motif_family = dict.fromkeys(all_5mers, 'Other')

## ... then overwrite with the named families, lowest priority first.
for mf,mlist in motif_priority.items():
    for m in mlist:
        motif_family[m] = mf


## Output list of motifs in each family for use in R
##
## `motif_unique` maps each named family to the sorted motifs that ended up in
## it after priority resolution (so the lists are disjoint). 'Other' is not
## included because only the keys of `motif_priority` are iterated.

motif_unique = {}

for mf in motif_priority.keys():
    motif_unique[mf] = sorted([k for k,v in motif_family.items() if (v == mf)])
    print(mf, "\t", len(motif_unique[mf]))

## Print one R vector assignment per family, e.g. "t_rich <- c('AATTT', ...)".
## The R variable name is the family label lower-cased with '-' -> '_' and '/' removed.
for mf,ml in motif_unique.items():
    outstring = "\n" + mf.replace("-","_").replace("/","").lower() + " <- c('" + "', '".join(ml) + "')"
    print(outstring)


## Record motif family definitions
##
##   'priority'   : overlapping candidate lists, in order of increasing priority
##   'family'     : motif -> final family label (including 'Other')
##   'no_overlap' : family -> sorted list of motifs finally assigned to it

pattern_definitions = {
    'priority'   : motif_priority,
    'family'     : motif_family,
    'no_overlap' : motif_unique,
}

with open(os.path.join(OUTDIR, 'motif_definitions.atha.5mers.patterns.pickle'), mode = 'wb') as handle:
    pickle.dump(pattern_definitions, handle)

## NOTE(review): unlike the motif_list.*.txt files, this table is written with
## to_csv defaults (comma-separated, header row, integer index as first column).
## Left unchanged so existing readers keep working -- confirm this is intended.
(pd.DataFrame.from_dict(motif_family, orient = 'index').reset_index()
 .rename(columns = {'index' : 'motif', 0 : 'family'})
 .to_csv(os.path.join(OUTDIR, 'motif_families.atha.5mers.patterns.txt'))
)


T-rich 	 104
A-rich 	 106
TGTA-containing 	 8

t_rich <- c('AATTT', 'ACTTT', 'AGTTT', 'ATATT', 'ATCTT', 'ATGTT', 'ATTAT', 'ATTCT', 'ATTGT', 'ATTTA', 'ATTTC', 'ATTTG', 'ATTTT', 'CATTT', 'CCTTT', 'CGTTT', 'CTATT', 'CTCTT', 'CTGTT', 'CTTAT', 'CTTCT', 'CTTGT', 'CTTTA', 'CTTTC', 'CTTTG', 'CTTTT', 'GATTT', 'GCTTT', 'GGTTT', 'GTATT', 'GTCTT', 'GTGTT', 'GTTAT', 'GTTCT', 'GTTGT', 'GTTTA', 'GTTTC', 'GTTTG', 'GTTTT', 'TAATT', 'TACTT', 'TAGTT', 'TATAT', 'TATCT', 'TATGT', 'TATTA', 'TATTC', 'TATTG', 'TATTT', 'TCATT', 'TCCTT', 'TCGTT', 'TCTAT', 'TCTCT', 'TCTGT', 'TCTTA', 'TCTTC', 'TCTTG', 'TCTTT', 'TGATT', 'TGCTT', 'TGGTT', 'TGTCT', 'TGTGT', 'TGTTA', 'TGTTC', 'TGTTG', 'TGTTT', 'TTAAT', 'TTACT', 'TTAGT', 'TTATA', 'TTATC', 'TTATG', 'TTATT', 'TTCAT', 'TTCCT', 'TTCGT', 'TTCTA', 'TTCTC', 'TTCTG', 'TTCTT', 'TTGAT', 'TTGCT', 'TTGGT', 'TTGTC', 'TTGTG', 'TTGTT', 'TTTAA', 'TTTAC', 'TTTAG', 'TTTAT', 'TTTCA', 'TTTCC', 'TTTCG', 'TTTCT', 'TTTGA', 'TTTGC', 'TTTGG', 'TTTGT', 'TTTTA', 'TTTTC', 'TTTTG', 'TTTTT')

a_ri

##### 4mers

In [10]:
## Identify motif patterns
##
## Candidate 4-mers for each family, chosen by simple sequence rules:
##   A-rich          : at least 2 of the 4 positions are 'A'
##   T-rich          : at least 2 of the 4 positions are 'T'
##   TGTA-containing : the substring 'TGTA' occurs in the motif (for a 4-mer
##                     this can only be 'TGTA' itself)
## The candidate lists may overlap (e.g. 'AATT' is both A-rich and T-rich);
## overlaps are resolved in the next step.
##
## `all_4mers` already holds each motif exactly once, so a plain comprehension
## is enough. It keeps the candidates in the order of `all_4mers`; wrapping
## them in set() would reorder them differently from one kernel session to the
## next, making the pickled 'priority' lists below non-reproducible.

a_rich    = [x for x in all_4mers if (x.count('A') >= 2)]
t_rich    = [x for x in all_4mers if (x.count('T') >= 2)]
tgta_rich = [x for x in all_4mers if ('TGTA' in x)]

## Assign motifs to families based on priority
##
## Families are applied in the order listed and every assignment overwrites
## the previous one, so LATER entries win:
##   TGTA-containing > A-rich > T-rich > Other (default)

motif_priority = {
    'T-rich'          : t_rich,
    'A-rich'          : a_rich,
    'TGTA-containing' : tgta_rich,
}

## Start with every motif in the catch-all family ...
motif_family = dict.fromkeys(all_4mers, 'Other')

## ... then overwrite with the named families, lowest priority first.
for mf,mlist in motif_priority.items():
    for m in mlist:
        motif_family[m] = mf


## Output list of motifs in each family for use in R
##
## `motif_unique` maps each named family to the sorted motifs that ended up in
## it after priority resolution (so the lists are disjoint). 'Other' is not
## included because only the keys of `motif_priority` are iterated.

motif_unique = {}

for mf in motif_priority.keys():
    motif_unique[mf] = sorted([k for k,v in motif_family.items() if (v == mf)])
    print(mf, "\t", len(motif_unique[mf]))

## Print one R vector assignment per family, e.g. "t_rich <- c('ACTT', ...)".
## The R variable name is the family label lower-cased with '-' -> '_' and '/' removed.
for mf,ml in motif_unique.items():
    outstring = "\n" + mf.replace("-","_").replace("/","").lower() + " <- c('" + "', '".join(ml) + "')"
    print(outstring)


## Record motif family definitions
##
##   'priority'   : overlapping candidate lists, in order of increasing priority
##   'family'     : motif -> final family label (including 'Other')
##   'no_overlap' : family -> sorted list of motifs finally assigned to it

pattern_definitions = {
    'priority'   : motif_priority,
    'family'     : motif_family,
    'no_overlap' : motif_unique,
}

with open(os.path.join(OUTDIR, 'motif_definitions.atha.4mers.patterns.pickle'), mode = 'wb') as handle:
    pickle.dump(pattern_definitions, handle)

## NOTE(review): unlike the motif_list.*.txt files, this table is written with
## to_csv defaults (comma-separated, header row, integer index as first column).
## Left unchanged so existing readers keep working -- confirm this is intended.
(pd.DataFrame.from_dict(motif_family, orient = 'index').reset_index()
 .rename(columns = {'index' : 'motif', 0 : 'family'})
 .to_csv(os.path.join(OUTDIR, 'motif_families.atha.4mers.patterns.txt'))
)


T-rich 	 60
A-rich 	 67
TGTA-containing 	 1

t_rich <- c('ACTT', 'AGTT', 'ATCT', 'ATGT', 'ATTC', 'ATTG', 'ATTT', 'CATT', 'CCTT', 'CGTT', 'CTAT', 'CTCT', 'CTGT', 'CTTA', 'CTTC', 'CTTG', 'CTTT', 'GATT', 'GCTT', 'GGTT', 'GTAT', 'GTCT', 'GTGT', 'GTTA', 'GTTC', 'GTTG', 'GTTT', 'TACT', 'TAGT', 'TATC', 'TATG', 'TATT', 'TCAT', 'TCCT', 'TCGT', 'TCTA', 'TCTC', 'TCTG', 'TCTT', 'TGAT', 'TGCT', 'TGGT', 'TGTC', 'TGTG', 'TGTT', 'TTAC', 'TTAG', 'TTAT', 'TTCA', 'TTCC', 'TTCG', 'TTCT', 'TTGA', 'TTGC', 'TTGG', 'TTGT', 'TTTA', 'TTTC', 'TTTG', 'TTTT')

a_rich <- c('AAAA', 'AAAC', 'AAAG', 'AAAT', 'AACA', 'AACC', 'AACG', 'AACT', 'AAGA', 'AAGC', 'AAGG', 'AAGT', 'AATA', 'AATC', 'AATG', 'AATT', 'ACAA', 'ACAC', 'ACAG', 'ACAT', 'ACCA', 'ACGA', 'ACTA', 'AGAA', 'AGAC', 'AGAG', 'AGAT', 'AGCA', 'AGGA', 'AGTA', 'ATAA', 'ATAC', 'ATAG', 'ATAT', 'ATCA', 'ATGA', 'ATTA', 'CAAA', 'CAAC', 'CAAG', 'CAAT', 'CACA', 'CAGA', 'CATA', 'CCAA', 'CGAA', 'CTAA', 'GAAA', 'GAAC', 'GAAG', 'GAAT', 'GACA', 'GAGA', 'GATA', 'GCAA', 'GGAA', 'G