# Filtering motifs important for PolyaStrength scores

**Purpose:** To create a Python object containing all summary motif importance information, which facilitates downstream analysis and visualization.


In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
%run -i notebook_setup.py

In [4]:
from paper_utilities import motifs

In [5]:
# Project root; every pickle this notebook reads or writes is located in OUTDIR
PROJECT = "/projects/b1080/eks/polyadenylation/yeast"
OUTDIR  = os.path.join(PROJECT, *('manuscript', 'analysis', 'resources'))

# Create the resources folder on first use (no error if it is already there)
os.makedirs(OUTDIR, exist_ok = True)


# Processing motif disruption results

## Model Configurations

In [6]:
# Nested dictionary: species -> configuration name -> settings
# (the 'modeltype' and 'golden_lines' settings are read further down)
configurations_path = os.path.join(OUTDIR, 'polyastrength_configurations.pickle')

with open(configurations_path, mode = 'rb') as handle:
    configurations = pickle.load(handle)


## Motif family definitions by species

In [7]:
## S. cerevisiae motif family definitions, keyed by definition type and then by k-mer length

scer_definitions = {'patterns' : {}, 'distance' : {}}

# Pattern-based families exist only for 6mers ...
scer_patterns_path = os.path.join(OUTDIR, 'motif_definitions.scer.6mers.patterns.pickle')
with open(scer_patterns_path, mode = 'rb') as handle:
    scer_definitions['patterns'][6] = pickle.load(handle)

# ... so 5mers and 4mers get an empty 'family' table, which makes every lookup fall through to its default
scer_definitions['patterns'].update({5 : {'family' : {}}, 4 : {'family' : {}}})

# Hamming-distance families are available for all three k-mer lengths
for kmer in (4, 5, 6):
    scer_distance_path = os.path.join(OUTDIR, f'motif_definitions.scer.{kmer}mers.distance.pickle')
    with open(scer_distance_path, mode = 'rb') as handle:
        scer_definitions['distance'][kmer] = pickle.load(handle)


In [8]:
## S. pombe motif family definitions (Hamming-distance definitions only)

spom_definitions = {'distance' : {}}
spom_distance = spom_definitions['distance']   # alias, filled in place below

# One pickle per k-mer length
for kmer in (4, 5, 6):
    spom_distance_path = os.path.join(OUTDIR, f'motif_definitions.spom.{kmer}mers.distance.pickle')
    with open(spom_distance_path, mode = 'rb') as handle:
        spom_distance[kmer] = pickle.load(handle)


In [9]:
## A. thaliana motif family definitions (pattern-based definitions only)

atha_definitions = {'patterns' : {}}
atha_patterns = atha_definitions['patterns']   # alias, filled in place below

# By pattern - one pickle per k-mer length
for kmer in (4, 5, 6):
    atha_patterns_path = os.path.join(OUTDIR, f'motif_definitions.atha.{kmer}mers.patterns.pickle')
    with open(atha_patterns_path, mode = 'rb') as handle:
        atha_patterns[kmer] = pickle.load(handle)


## Processed motif importance profiles

In [10]:
# Table of motif disruption profiles; its 'species' and 'testMotif' columns are used below
disruption_profiles_path = os.path.join(OUTDIR, "polyastrength_motifs.disruption_profiles.pickle")

with open(disruption_profiles_path, mode = 'rb') as handle:
    motif_summaries = pickle.load(handle)


In [11]:
# Significant motifs, indexed as significant_motifs[species][modeltype][configuration]
significant_motifs_path = os.path.join(OUTDIR, "polyastrength_motifs.significant_motifs.pickle")

with open(significant_motifs_path, mode = 'rb') as handle:
    significant_motifs = pickle.load(handle)
    

## Assign individual motifs to families

In [12]:
## Assign motif families to individual motif importance profiles

# S. cerevisiae
# The family lookups are evaluated for the S. cerevisiae rows only: motifs from other
# species never need to exist in (or match the k-mer lengths of) the S. cerevisiae tables.
is_scer     = (motif_summaries['species'] == 'saccharomyces_cerevisiae')
scer_motifs = motif_summaries.loc[is_scer, 'testMotif']

# (column to fill, definition type, table within that definition)
for column, group, key in [('overallFamily', 'distance', 'family'),
                           ('hammingFamily', 'distance', 'hamming'),
                           ('patternFamily', 'patterns', 'family')]:

    # Motifs absent from a table are labelled 'NA'; group/key are bound as defaults so
    # each lambda uses the values of its own iteration
    motif_summaries.loc[is_scer, column] = scer_motifs.apply(
        lambda x, group = group, key = key : scer_definitions[group][len(x)][key].get(x, 'NA'))


## Record results

# Persist the motif summary table next to the other resources
summaries_path = os.path.join(OUTDIR, "polyastrength_motifs.summaries.pickle")

with open(summaries_path, mode = 'wb') as handle:
    pickle.dump(motif_summaries, handle)
    

## Compile Python dictionary with significant motif data for downstream analysis

In [13]:
# Hamming-distance families for S. cerevisiae: the three archetypes first, then each
# archetype split by distance suffix (_d0, _d1, _d2)
scer_archetypes = ('A-rich', 'T-rich', 'TA/TA-rich')
scer_dist_mfams = list(scer_archetypes) + [
    f'{family}_d{distance}' for family in scer_archetypes for distance in (0, 1, 2)
]

# Pattern-based families for S. cerevisiae
scer_patt_mfams = [
    'G-rich', 'G/T-rich', 'T/A-rich', 'A-rich', 'T-rich',
    'TT/AA-rich', 'TA/AA-rich', 'TC/TA-rich', 'TG/TA-rich', 'TA/TA-rich',
]

# Dinucleotides used to subdivide the TA/TA-rich family
scer_cont_mfams = ['T' + second for second in 'AGCT']


In [14]:
nested_dict = lambda: defaultdict(nested_dict)
mfam_importance = nested_dict()

# Importance variable handed to motifs.summarize_family for each model type
mvar_by_modeltype = {
    'polyaid_classification' : 'differenceClassification',
    'polyaclassifier'        : 'differenceClassification',
    'polyaid_cleavage'       : 'l2fcEntropy',
    'polyastrength'          : 'differenceStrength',
}


def _plain_dict(node):
    """Return a copy of `node` in which every nested defaultdict is a plain dict.

    Values that are not defaultdicts (including the per-family dicts) are returned unchanged.
    """
    if isinstance(node, defaultdict):
        return {key : _plain_dict(value) for key, value in node.items()}
    return node


def _record_family(store, label, mlist, mot_dis_dir, mvar, ntotal):
    """Store one motif family under store[label] and attach its summary.

    store       : dictionary of the current species / model type / configuration
    label       : key of the family, e.g. 'hamming_A-rich'
    mlist       : significant motifs that belong to the family
    mot_dis_dir : motif disruption directory, passed to motifs.summarize_family
    mvar        : importance variable, passed to motifs.summarize_family
    ntotal      : 'golden_lines' value of the configuration, passed to motifs.summarize_family

    If motifs.summarize_family raises ValueError, the entry keeps only its 'motifs'
    list (no 'data' key) and the reason is printed instead of being dropped silently.
    """
    store[label] = {'motifs' : mlist}
    try:
        store[label]['data'] = motifs.summarize_family(mot_dis_dir, mlist, 'position', mvar, ntotal)
    except ValueError as error:
        print("\t\tno summary recorded for", label, "-", error)


for (s,sd) in configurations.items():
    for (c,cd) in sd.items():

        mt     = cd['modeltype']
        ntotal = cd['golden_lines']

        print("\n" + "\t".join([str(_) for _ in [s,mt,c,ntotal]]))

        mot_dis_dir = os.path.join(PROJECT, s, 'analysis', 'motif_analysis', 'motif_disruption', c)

        sigmots = sorted(significant_motifs[s][mt][c])
        print("Number of significant motifs:", len(sigmots))

        # Fail loudly on an unknown model type rather than reusing the variable
        # left over from a previous configuration
        if mt not in mvar_by_modeltype:
            raise ValueError(f"Unrecognized modeltype '{mt}' for configuration '{c}'")
        mvar = mvar_by_modeltype[mt]

        ## Fetch species-specific motif family definitions
        # NOTE(review): the spom_* and atha_* family lists are not defined in the cells
        # shown above - presumably they come from notebook_setup.py or another cell; confirm
        # before adding those species to `configurations`.

        if (s == "saccharomyces_cerevisiae"):
            mfams_dist = scer_dist_mfams
            mfams_patt = scer_patt_mfams
            mdefs = scer_definitions

        elif (s == "schizosaccharomyces_pombe"):
            mfams_dist = spom_dist_mfams
            mfams_patt = spom_patt_mfams
            mdefs = spom_definitions

        elif (s == 'arabidopsis_thaliana'):
            mfams_dist = atha_dist_mfams
            mfams_patt = atha_patt_mfams
            mdefs = atha_definitions

        else:
            # Same reasoning as for the model type: never fall back to stale definitions
            raise ValueError(f"No motif family definitions available for species '{s}'")

        # Nothing to summarize (and no motif to read the k-mer length from)
        if not sigmots:
            print("\tno significant motifs - skipping family summaries")
            continue

        sigmot_set = set(sigmots)      # constant-time membership tests in the filters below
        kmer_len   = len(sigmots[0])   # all motifs of a configuration are taken to share this length

        ## Motif definitions using hamming distances to archetypical motifs

        for mfam in mfams_dist:

            if ("_d" in mfam):
                # Distance-specific family: select motifs by their 'hamming' label
                mlist = [m for m,f in mdefs['distance'][kmer_len]['hamming'].items() if (m in sigmot_set) and (f == mfam)]
            else:
                # Whole family: use its 'no_overlap' membership list
                mlist = [m for m in mdefs['distance'][kmer_len]['no_overlap'][mfam] if (m in sigmot_set)]
            print("\thamming", mfam, "\t", len(mlist))

            _record_family(mfam_importance[s][mt][c], f'hamming_{mfam}', mlist, mot_dis_dir, mvar, ntotal)

        ## Motif definitions using known patterns

        if (s == 'arabidopsis_thaliana') or ((s == 'saccharomyces_cerevisiae') and ('kmers-6' in c)):

            for mfam in mfams_patt:

                mlist = [m for m in mdefs['patterns'][kmer_len]['no_overlap'][mfam] if (m in sigmot_set)]
                print("\tpattern", mfam, "\t", len(mlist))

                _record_family(mfam_importance[s][mt][c], f'pattern_{mfam}', mlist, mot_dis_dir, mvar, ntotal)

        ## Special analysis of UA-rich elements in S. cerevisiae 6mers

        if ((s == 'saccharomyces_cerevisiae') and ('kmers-6' in c)):

            for mfam in scer_cont_mfams:

                # TA/TA-rich motifs containing the dinucleotide in either orientation
                mlist = [m for m in mdefs['distance'][kmer_len]['no_overlap']['TA/TA-rich'] if (m in sigmot_set) and ((mfam in m) or (mfam[::-1] in m))]
                print("\tcontains", mfam, "\t", len(mlist))

                _record_family(mfam_importance[s][mt][c], f'contains_{mfam}', mlist, mot_dis_dir, mvar, ntotal)

# The default_factory of the nested defaultdict is a lambda, which the standard pickle
# module cannot serialize; write a plain-dict copy with the same keys and values instead.
# `mfam_importance` itself stays a nested defaultdict inside this kernel.
with open(os.path.join(OUTDIR, 'polyastrength_motifs.family_profiles.pickle'), mode = 'wb') as handle:
    pickle.dump(_plain_dict(mfam_importance), handle)



saccharomyces_cerevisiae	polyastrength	polyastrength_kmers-6	9725
Number of significant motifs: 246
	hamming A-rich 	 39
	hamming T-rich 	 32
	hamming TA/TA-rich 	 74
	hamming A-rich_d0 	 1
	hamming A-rich_d1 	 15


  ste_vec[i] = std_vec[i] / np.sqrt(freq_vec[i])


	hamming A-rich_d2 	 23
	hamming T-rich_d0 	 1
	hamming T-rich_d1 	 11
	hamming T-rich_d2 	 20
	hamming TA/TA-rich_d0 	 2
	hamming TA/TA-rich_d1 	 25
	hamming TA/TA-rich_d2 	 47
	pattern G-rich 	 32
	pattern G/T-rich 	 2
	pattern T/A-rich 	 1
	pattern A-rich 	 23
	pattern T-rich 	 32
	pattern TT/AA-rich 	 3
	pattern TA/AA-rich 	 21
	pattern TC/TA-rich 	 9
	pattern TG/TA-rich 	 36
	pattern TA/TA-rich 	 30
	contains TA 	 74
	contains TG 	 31
	contains TC 	 5
	contains TT 	 10
