# Filtering motifs important for PolyaStrength scores

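**Purpose:** To identify motifs that contribute significantly to polyA site strength according to PolyaStrength. We employ a two-step filtering procedure: a motif must (1) exceed the background importance threshold and (2) be significant within at least one positional region.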


In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
%run -i notebook_setup.py

In [4]:
from paper_utilities import helpers, motifs

In [5]:
# Register tqdm with pandas so that `progress_apply` (used when scoring motifs below) is available
tqdm.tqdm.pandas()

In [6]:
# Root of the yeast polyadenylation project. Defaults to the original cluster
# location; set POLYA_YEAST_PROJECT to run the notebook against another copy.
PROJECT = os.environ.get("POLYA_YEAST_PROJECT", "/projects/b1080/eks/polyadenylation/yeast")

# Resource directory that the cells below read motif definitions from and
# write their result pickles to.
OUTDIR  = os.path.join(PROJECT, 'manuscript', 'analysis', 'resources')
os.makedirs(OUTDIR, exist_ok = True)


# Processing motif disruption results

## Model Configurations

In [7]:
## Fetch line counts-1 for golden files before running this step

# One row per trained model configuration whose motif disruption
# results are processed in this notebook.
trained_configs = pd.DataFrame({
    'species'      : ['saccharomyces_cerevisiae'],
    'description'  : ['polyastrength_kmers-6'],
    'modeltype'    : ['polyastrength'],
    'golden_lines' : [9725],
}).sort_values(['species','modeltype'])

trained_configs


Unnamed: 0,species,description,modeltype,golden_lines
0,saccharomyces_cerevisiae,polyastrength_kmers-6,polyastrength,9725


In [8]:
## Make dictionary of configurations for easy downstream use

# Nested lookup: species -> description -> remaining configuration fields
configurations = {
    species : {}
    for species in ('saccharomyces_cerevisiae', 'schizosaccharomyces_pombe', 'arabidopsis_thaliana')
}

for _, config_row in trained_configs.iterrows():

    # The two lookup keys are not repeated inside the stored settings
    settings = {
        field : value
        for field, value in config_row.to_dict().items()
        if field not in ('species', 'description')
    }

    configurations[config_row['species']][config_row['description']] = settings

config_path = os.path.join(OUTDIR, 'polyastrength_configurations.pickle')

with open(config_path, mode = 'wb') as handle:
    pickle.dump(configurations, handle)
    

## Motif family definitions by species

In [9]:
## Load motif family definitions for S. cerevisiae

def _read_resource_pickle(filename):
    """Unpickle and return the object stored in OUTDIR under `filename`."""
    with open(os.path.join(OUTDIR, filename), mode = 'rb') as handle:
        return pickle.load(handle)


scer_definitions = {
    # By pattern - we only look at pattern families for 6mers,
    # so the 5mer and 4mer entries are empty placeholders
    'patterns' : {
        6 : _read_resource_pickle('motif_definitions.scer.6mers.patterns.pickle'),
        5 : {'family':{}},
        4 : {'family':{}},
    },
    'distance' : {},
}

# By Hamming distance
for kmer in [4,5,6]:
    scer_definitions['distance'][kmer] = _read_resource_pickle(f'motif_definitions.scer.{kmer}mers.distance.pickle')


## Motif importance by position

In [10]:
def _parse_float_vector(text, negate = False):
    """Parse a bracketed, ', '-separated string (e.g. "[0.1, 0.2]") into a float array.

    Parameters
    ----------
    text : str
        String representation of a list of numbers.
    negate : bool
        When True, the parsed values are multiplied by -1.

    Returns
    -------
    numpy.ndarray of floats.
    """
    values = np.asarray([float(x) for x in text.strip("][").split(", ")])
    return -1*values if negate else values


## Load the motif disruption summary for each trained configuration

summary_frames = []

for i,config in trained_configs.iterrows():

    motif_dir  = os.path.join(PROJECT, config['species'], "analysis", "motif_analysis", "motif_disruption")
    motif_file = os.path.join(motif_dir, f"motif_disruption.{config.description}.summary.txt")

    try:
        motif_summary = pd.read_csv(motif_file, sep = "\t")
    except FileNotFoundError:
        # Configurations without results are still skipped, but reported rather than silently dropped
        print(f"WARNING: skipping {config['description']}, summary file not found: {motif_file}")
        continue

    motif_summary['species'] = config.species
    motif_summary['modeltype'] = config.modeltype
    motif_summary['config'] = config.description

    summary_frames.append(motif_summary)
    print(f"{i}: {config['species']:25} {config['modeltype']} {config['description']} {motif_summary.shape}")

# Fail with an actionable message instead of pandas' "No objects to concatenate"
if not summary_frames:
    raise FileNotFoundError("No motif disruption summary files were found for any configuration in trained_configs.")

motif_summaries = pd.concat(summary_frames, sort = False, ignore_index = True)
print(f"Removing {(motif_summaries['xs'].str.len() <= 2).sum()} motifs due to incorrect xs length...")

motif_summaries = motif_summaries.loc[motif_summaries['xs'].str.len() > 2].copy()
print(f"Motif summaries shape: {motif_summaries.shape}")


## Load motif importance profiles to numeric vectors

motif_summaries['kmer'] = motif_summaries['testMotif'].str.len()

# Positions are stored as a plain list of ints
motif_summaries['xs']   = motif_summaries['xs'].apply(lambda xs : [int(float(x)) for x in xs.strip("][").split(", ")])

# (column, negate): the mean/median columns are sign-flipped on load,
# the frequency and standard deviation columns are parsed unchanged
vector_columns = [
    ('freqs',         False),
    ('means',         True),
    ('medians',       True),
    ('meanWeights',   True),
    ('medianWeights', True),
    ('stds',          False),
    ('stdWeights',    False),
]

for column, negate in vector_columns:
    motif_summaries[column] = motif_summaries[column].apply(lambda text, negate = negate : _parse_float_vector(text, negate))

# Derived from the standard deviations and frequencies by helpers.std_to_ste
motif_summaries['stes']       = motif_summaries.apply(lambda row: helpers.std_to_ste(row['stds'], row['freqs']), axis = 1)
motif_summaries['steWeights'] = motif_summaries.apply(lambda row: helpers.std_to_ste(row['stdWeights'], row['freqs']), axis = 1)


## Calculate peak motif importance scores

measure = "mean"

# The same set of scores is computed for the per-site and the weighted profiles
for label, column in [('persite', f'{measure}s'), ('weights', f'{measure}Weights')]:

    motif_summaries[f'peak_{label}_score'] = motif_summaries[column].apply(lambda x : np.max(x))
    motif_summaries[f'auc_{label}_score']  = motif_summaries[column].apply(lambda x : np.sum(x))

    # Each result is indexed as (position, score) below
    windows = motif_summaries.progress_apply(lambda row: motifs.peak_importance_window(row['xs'], row[column], region_size = 500, window_size = 20, direction = 'positive'), axis = 1)

    motif_summaries[f'extreme_{label}_position'] = windows.apply(lambda x : x[0])
    motif_summaries[f'extreme_{label}_score']    = windows.apply(lambda x : x[1])

    # NOTE(review): despite the column name this is sign(x) * log10(|x|), not a log1p
    # transform; scores of exactly 0 become NaN. Left unchanged so the saved values
    # stay comparable - confirm whether log10(1 + |x|) was intended.
    motif_summaries[f'extreme_{label}_log1p']    = np.sign(motif_summaries[f'extreme_{label}_score']) * np.log10(np.abs(motif_summaries[f'extreme_{label}_score']))


## Record results

with open(os.path.join(OUTDIR, "polyastrength_motifs.disruption_profiles.pickle"), mode = 'wb') as handle:
    pickle.dump(motif_summaries, handle)
    

0: saccharomyces_cerevisiae  polyastrength polyastrength_kmers-6 (4096, 12)
Removing 0 motifs due to incorrect xs length...
Motif summaries shape: (4096, 12)


  ste_vec[i] = std_vec[i] / np.sqrt(freq_vec[i])
100%|██████████| 4096/4096 [00:03<00:00, 1268.27it/s]
100%|██████████| 4096/4096 [00:03<00:00, 1266.62it/s]


In [11]:
# Sanity check: number of motif rows per species / model type / motif length
motif_summaries.groupby(['species','modeltype','kmer']).size()

species                   modeltype      kmer
saccharomyces_cerevisiae  polyastrength  6       4096
dtype: int64

#### Compile motif importance profiles into matrices

In [12]:
# Importance matrices built with weighted = True, nested as species -> modeltype -> config
hmdfs_sum = dict()

model_keys = motif_summaries[['species','modeltype','config']].drop_duplicates()

for s, mt, c in model_keys.itertuples(index = False, name = None):
    print("\t".join([s,mt,c]))

    per_model = hmdfs_sum.setdefault(s, dict()).setdefault(mt, dict())

    # Rows belonging to this species / model type / configuration
    in_model = (
        (motif_summaries['species'] == s)
        & (motif_summaries['modeltype'] == mt)
        & (motif_summaries['config'] == c)
    )

    per_model[c] = motifs.make_hmdf(motif_summaries.loc[in_model], weighted = True, xwindow = (-250,250))

with open(os.path.join(OUTDIR, "polyastrength_motifs.hmdfs.sum.pickle"), mode = 'wb') as handle:
    pickle.dump(hmdfs_sum, handle)
    

saccharomyces_cerevisiae	polyastrength	polyastrength_kmers-6


4096it [00:05, 742.45it/s]


In [13]:
# Importance matrices built with weighted = False, nested as species -> modeltype -> config.
# NOTE(review): this rebinds `hmdfs_sum`, replacing the weighted matrices from the previous
# cell; the background and region filtering cells below read `hmdfs_sum`, so when run in
# order they operate on these per-site matrices - confirm that this is intended.
hmdfs_sum = dict()

model_keys = motif_summaries[['species','modeltype','config']].drop_duplicates()

for s, mt, c in model_keys.itertuples(index = False, name = None):
    print("\t".join([s,mt,c]))

    per_model = hmdfs_sum.setdefault(s, dict()).setdefault(mt, dict())

    # Rows belonging to this species / model type / configuration
    in_model = (
        (motif_summaries['species'] == s)
        & (motif_summaries['modeltype'] == mt)
        & (motif_summaries['config'] == c)
    )

    per_model[c] = motifs.make_hmdf(motif_summaries.loc[in_model], weighted = False, xwindow = (-250,250))

with open(os.path.join(OUTDIR, "polyastrength_motifs.hmdfs.per_site.pickle"), mode = 'wb') as handle:
    pickle.dump(hmdfs_sum, handle)
    

saccharomyces_cerevisiae	polyastrength	polyastrength_kmers-6


4096it [00:05, 756.44it/s]


#### Identify motifs significant over background

In [14]:
# Result of the background importance filter, nested as species -> modeltype -> config
background_motifs = dict()

model_keys = motif_summaries[['species','modeltype','config']].drop_duplicates()

for s, mt, c in model_keys.itertuples(index = False, name = None):
    print("\n\n", "\t".join([s,mt,c]))

    per_model = background_motifs.setdefault(s, dict()).setdefault(mt, dict())

    # Rows belonging to this species / model type / configuration
    in_model = (
        (motif_summaries['species'] == s)
        & (motif_summaries['modeltype'] == mt)
        & (motif_summaries['config'] == c)
    )
    msum = motif_summaries.loc[in_model]

    filter_settings = dict(
        center_idx = 250,
        exclude    = (-80,30),
        sliding    = 20,
        niter      = 1000,
        alpha      = 0.001, ## Equivalent to 99.9%ile
        direction  = 'positive',
    )

    per_model[c] = motifs.filter_by_importance(msum, hmdfs_sum[s][mt][c], OUTDIR, mt, c, **filter_settings)

with open(os.path.join(OUTDIR, "polyastrength_motifs.background_motifs.pickle"), mode = 'wb') as handle:
    pickle.dump(background_motifs, handle)
    


#  saccharomyces_cerevisiae	polyastrength	polyastrength_kmers-6
# 100%|██████████| 1000/1000 [00:29<00:00, 33.42it/s]

# High Values:	Percentile = 99.50:	Value = 39.370
# Low Values:	Percentile = 0.50:	Value = -5.523
# Number of motifs more extreme than background thresholds: 770 out of 4096




 saccharomyces_cerevisiae	polyastrength	polyastrength_kmers-6


100%|██████████| 1000/1000 [00:41<00:00, 24.23it/s]


High Values:	alpha=0.001, percentile=99.900: cutoff_value=0.576
Low Values :	alpha=0.001, percentile= 0.100: cutoff_value=-2.357
Number of motifs above background threshold: 4023 out of 4096


#### Identify regionally-significant motifs

In [15]:
# Result of the regional filter, nested as species -> modeltype -> config
region_motifs = defaultdict(dict)

# Position windows that are tested separately
REGIONS = [(-250,-120),(-120,-80),(-80,-40),(-40,0),(0,40),(40,80),(80,120),(120,250)]

model_keys = motif_summaries[['species','modeltype','config']].drop_duplicates()

for s, mt, c in model_keys.itertuples(index = False, name = None):
    print("\n\n" + "\t".join([s,mt,c]))

    # Indexing the defaultdict creates the per-species dict on first use
    per_model = region_motifs[s].setdefault(mt, dict())

    # Only motifs that passed the background filter are considered
    candidates = hmdfs_sum[s][mt][c].loc[background_motifs[s][mt][c]]

    per_model[c] = motifs.filter_by_region(candidates,
                                           OUTDIR, mt, s, c,
                                           regions = REGIONS,
                                           sliding = 20,
                                           alpha = 0.001,
                                           overedges = True,
                                           verbose = False
                                          )

    # Union of the motif keys reported for any region
    regmots = sorted({motif for hits in per_model[c].values() for motif in hits.keys()})
    print("Number of regionally-significant motifs:", len(regmots))

with open(os.path.join(OUTDIR, "polyastrength_motifs.region_motifs.pickle"), mode = 'wb') as handle:
    pickle.dump(region_motifs, handle)




saccharomyces_cerevisiae	polyastrength	polyastrength_kmers-6
Region=(-250, -120): alpha=0.001, percentile=99.900, cutoff_value=     1.231, n_passing= 69
Region=(-120, -80) : alpha=0.001, percentile=99.900, cutoff_value=     1.903, n_passing= 51
Region=(-80, -40)  : alpha=0.001, percentile=99.900, cutoff_value=    20.840, n_passing= 37
Region=(-40, 0)    : alpha=0.001, percentile=99.900, cutoff_value=    23.651, n_passing= 26
Region=(0, 40)     : alpha=0.001, percentile=99.900, cutoff_value=     7.833, n_passing= 34
Region=(40, 80)    : alpha=0.001, percentile=99.900, cutoff_value=     1.462, n_passing= 18
Region=(80, 120)   : alpha=0.001, percentile=99.900, cutoff_value=     1.225, n_passing= 30
Region=(120, 250)  : alpha=0.001, percentile=99.900, cutoff_value=     1.260, n_passing= 63
Number of regionally-significant motifs: 246


#### Combine significance filtering requirements

In [16]:
# Motifs passing both filters, nested as species -> modeltype -> config
significant_motifs = defaultdict(dict)

model_keys = motif_summaries[['species','modeltype','config']].drop_duplicates()

for s, mt, c in model_keys.itertuples(index = False, name = None):

    # Indexing the defaultdict creates the per-species dict on first use
    per_model = significant_motifs[s].setdefault(mt, dict())

    background_hits = background_motifs[s][mt][c]

    # Union of the motif keys reported for any region
    regional_hits = sorted({motif for hits in region_motifs[s][mt][c].values() for motif in hits.keys()})

    # Keep the regional motifs that are also in the background-filtered set
    combined_hits = sorted(set(filter(lambda motif : motif in background_hits, regional_hits)))

    print(f"{s:25} {c}:\tbgd={len(background_hits):4d}\treg={len(regional_hits):4d}\tsig={len(combined_hits):4d}")

    per_model[c] = combined_hits

with open(os.path.join(OUTDIR, "polyastrength_motifs.significant_motifs.pickle"), mode = 'wb') as handle:
    pickle.dump(significant_motifs, handle)


saccharomyces_cerevisiae  polyastrength_kmers-6:	bgd=4023	reg= 246	sig= 246


In [17]:
# List the significant motifs of every species / model type / configuration
model_keys = motif_summaries[['species','modeltype','config']].drop_duplicates()

for s, mt, c in model_keys.itertuples(index = False, name = None):
    motif_list = ','.join(significant_motifs[s][mt][c])
    print(f"{s:25} {c}:\n\t{motif_list}")


saccharomyces_cerevisiae  polyastrength_kmers-6:
	AAAAAA,AAAAAG,AAAAAT,AAAAGA,AAAAGG,AAAATA,AAACAA,AAAGAA,AAAGAG,AAAGCG,AAAGGA,AAATAA,AAATAG,AACAAA,AACGAA,AACGCA,AACTAA,AAGAAA,AAGAAG,AAGAAT,AAGGAA,AAGGAG,AAGGGG,AAGTAG,AATAAA,AATAAC,AATAAT,AATAGA,AATATA,AATGTA,AATTAC,ACAAAA,ACAAAG,ACAAAT,ACATAA,ACATAG,ACATAT,ACGAGC,ACGGGG,ACGTAA,ACGTAC,ACGTAG,ACGTAT,AGAAAA,AGATAA,AGATAG,AGATAT,AGATGT,AGGACC,AGGAGA,AGGGAC,AGGGAG,AGGGGA,AGGGGC,AGGTAA,AGTATA,ATAAAA,ATAAAT,ATAAGA,ATAAGT,ATAATA,ATAATT,ATACAA,ATACAT,ATAGAT,ATAGGC,ATAGGG,ATAGGT,ATAGTG,ATATAA,ATATAC,ATATAG,ATATAT,ATATGT,ATGTAA,ATGTAC,ATGTAG,ATGTAT,CAAGGG,CAATGC,CAGGGA,CATGTA,CCGGGG,CCTTTT,CGAAAA,CGAAGA,CGAGGA,CGGAGA,CGGATG,CGGGGA,CGTAGA,CTACGT,CTATAT,CTATGT,CTCTTT,CTGTGT,CTTCTT,CTTTCT,CTTTTC,CTTTTG,CTTTTT,GAAAAA,GAAAAG,GAAAGG,GAAGAG,GAAGGA,GAATGA,GAATGG,GAGAAG,GAGGAG,GAGGCG,GAGGGA,GAGGGC,GAGGGG,GATATA,GATATT,GATGTA,GCAGAG,GCAGGG,GCATAT,GCCGGA,GCCTAT,GCCTGA,GCGGAA,GCGGAG,GCGTGG,GCTTTT,GGAAAA,GGAAAG,GGAAGG,GGAATG,GGACAG,GGAGAG,GGAGCC,GGAGGA,GGAGG