# Filtering motifs important for PolyaStrength scores

**Purpose:** To create a table summarizing the motif importance scores of significant motifs.


In [1]:
# Load IPython's autoreload extension (it is configured by a following %autoreload call).
%load_ext autoreload

In [2]:
# Mode 2: reload all imported modules before each cell executes, so edits to
# helper modules are picked up without restarting the kernel.
%autoreload 2

In [3]:
# Execute notebook_setup.py inside this notebook's own namespace (-i), so the
# names it defines become available to the cells below.
# NOTE(review): `os`, `pickle`, and `pd` are used later but never imported in
# this notebook - presumably they are provided by notebook_setup.py; confirm.
%run -i notebook_setup.py

In [4]:
from paper_utilities import motifs
from functools import reduce

In [5]:
# Root of the polyadenylation project tree. The original cluster location is
# kept as the default; set the POLYA_PROJECT_DIR environment variable to run
# this notebook against a copy of the project stored elsewhere.
PROJECT = os.environ.get("POLYA_PROJECT_DIR", "/projects/b1080/eks/polyadenylation/yeast")

# Shared resources directory: the pickled inputs are read from here and the
# motif importance tables are written here. Created if it does not exist yet.
OUTDIR  = os.path.join(PROJECT, 'manuscript', 'analysis', 'resources')
os.makedirs(OUTDIR, exist_ok = True)


# Processing motif disruption results

## Model Configurations

In [6]:
# Read the pickled model configurations from the resources directory.
configurations_path = os.path.join(OUTDIR, 'polyastrength_configurations.pickle')
with open(configurations_path, 'rb') as pickle_file:
    configurations = pickle.load(pickle_file)


## Motif family definitions by species

In [7]:
## Load motif family definitions for S. cerevisiae

# Two groupings are kept side by side, each keyed by k-mer size:
#   'patterns' - pattern-based families
#   'distance' - Hamming-distance families
scer_definitions = {'patterns' : {}, 'distance' : {}}

# By pattern - we only look at pattern families for 6mers
# (plain string literal: the file name has no placeholders, so no f-string is needed)
with open(os.path.join(OUTDIR, 'motif_definitions.scer.6mers.patterns.pickle'), mode = 'rb') as handle:
    scer_definitions['patterns'][6] = pickle.load(handle)

# 5mers and 4mers get a placeholder holding an empty 'family' mapping
scer_definitions['patterns'][5] = {'family':{}}
scer_definitions['patterns'][4] = {'family':{}}

# By Hamming distance - one pickle per k-mer size
for kmer in [4,5,6]:
    with open(os.path.join(OUTDIR, f'motif_definitions.scer.{kmer}mers.distance.pickle'), mode = 'rb') as handle:
        scer_definitions['distance'][kmer] = pickle.load(handle)


In [8]:
## S. pombe motif family definitions

# Only the Hamming-distance grouping is loaded for this species, keyed by k-mer size.
spom_definitions = {'distance' : {}}

for kmer in (4, 5, 6):
    spom_distance_path = os.path.join(OUTDIR, f'motif_definitions.spom.{kmer}mers.distance.pickle')
    with open(spom_distance_path, 'rb') as pickle_file:
        spom_definitions['distance'][kmer] = pickle.load(pickle_file)


In [9]:
## A. thaliana motif family definitions

# Only the pattern-based grouping is loaded for this species (the files read
# here are the '*.patterns.pickle' ones), keyed by k-mer size.
atha_definitions = {'patterns' : {}}

for kmer in (4, 5, 6):
    atha_patterns_path = os.path.join(OUTDIR, f'motif_definitions.atha.{kmer}mers.patterns.pickle')
    with open(atha_patterns_path, 'rb') as pickle_file:
        atha_definitions['patterns'][kmer] = pickle.load(pickle_file)


## Processed motif importance profiles

In [10]:
# Read the pickled motif summaries; the compile step below filters them by
# their 'species' and 'kmer' columns.
motif_summaries_path = os.path.join(OUTDIR, "polyastrength_motifs.summaries.pickle")
with open(motif_summaries_path, 'rb') as pickle_file:
    motif_summaries = pickle.load(pickle_file)


In [11]:
# Read the pickled summed importance profiles; the compile step below indexes
# them as hmdfs_sum[species][modeltype][configuration].
hmdfs_sum_path = os.path.join(OUTDIR, "polyastrength_motifs.hmdfs.sum.pickle")
with open(hmdfs_sum_path, 'rb') as pickle_file:
    hmdfs_sum = pickle.load(pickle_file)
    

In [12]:
# Read the pickled collections of significant motifs; the compile step below
# indexes them as significant_motifs[species][modeltype][configuration].
significant_motifs_path = os.path.join(OUTDIR, "polyastrength_motifs.significant_motifs.pickle")
with open(significant_motifs_path, 'rb') as pickle_file:
    significant_motifs = pickle.load(pickle_file)
    

## Compile summary table of importance scores for individual motifs

In [13]:
## Species-specific settings used when compiling the tables:
##   'family_column' - column of motif_summaries that supplies each motif's family label
##   'regions'       - (start, end) position windows passed one at a time to motifs.region_data
## Keeping them in one lookup means an unexpected species fails loudly instead of
## silently reusing the settings left over from the previous loop iteration.
SPECIES_SETTINGS = {
    'saccharomyces_cerevisiae' : {
        'family_column' : 'hammingFamily',
        'regions'       : [(-120,-26),(-25,-16),(-15,-6),(-5,1),(2,15)],
    },
    'schizosaccharomyces_pombe' : {
        'family_column' : 'hammingFamily',
        'regions'       : [(-80,-31),(-30,-16),(-15,-6),(-5,1),(2,14),(15,60)],
    },
    'arabidopsis_thaliana' : {
        'family_column' : 'overallFamily',
        'regions'       : [(-150,-31),(-30,-16),(-15,-6),(-5,1),(2,15)],
    },
}

## Column labels -250..250 selected from each summed importance profile
## (loop-invariant, so built once).
PROFILE_POSITIONS = list(range(-250,251))

for s,sd in configurations.items():
    for (c,cd) in sd.items():

        ## Fail early on a species without settings

        if (s not in SPECIES_SETTINGS):
            raise ValueError(f"No motif family column or regions defined for species '{s}' (config={c})")

        ## Specify model input details

        k      = float(c.split("-")[-1])   # k-mer size is the text after the last '-' in the configuration name
        mt     = cd['modeltype']
        ntotal = cd['golden_lines']

        mot_dis_dir = os.path.join(PROJECT, s, 'analysis', 'motif_analysis', 'motif_disruption', c)

        sigmots = sorted(significant_motifs[s][mt][c])

        print(f"\n{s}: config={c}, kmer={int(k)}, golden_sites={ntotal}, sig_motifs={len(sigmots)}")

        ## Fetch species-specific motif family definitions

        msum = motif_summaries.loc[(motif_summaries['species'] == s) & (motif_summaries['kmer'] == k)]

        # motif -> family label, taken from the column configured for this species
        mfams = dict(zip(msum['testMotif'], msum[SPECIES_SETTINGS[s]['family_column']]))

        ## Compile motif importance by species and model in different regions

        mdata_reg = []

        mregs = SPECIES_SETTINGS[s]['regions']

        for mreg in mregs:
            # NOTE(review): region_data is a project helper; it presumably summarises the mean
            # 'differenceStrength' over the region's positions with a 95% interval - confirm.
            mreg_dt = motifs.region_data(OUTDIR, mot_dis_dir, sigmots, mreg, datatype = "summary_ind", xvariable = "position", yvariable = "differenceStrength", measure = "mean", measure_error = 95, mdict = mfams)

            # Tag the score and interval columns with the region bounds so every region keeps its own columns after merging
            mreg_dt = mreg_dt[['motif','family','score','conf_hi','conf_lo']].rename(columns = {'score' : f'score_{mreg[0]}_{mreg[1]}', 'conf_lo' : f'conf_lo_{mreg[0]}_{mreg[1]}', 'conf_hi' : f'conf_hi_{mreg[0]}_{mreg[1]}'}).copy()
            print(f"Region {str(mreg):12}: {mreg_dt.shape}")
            mdata_reg.append(mreg_dt)

        # pd.merge defaults to an inner join, so only motif/family pairs present in every region are kept
        mdata_reg = reduce(lambda x, y: pd.merge(x, y, on = ['motif','family']), mdata_reg)
        print(f"\nMerged  : {mdata_reg.shape}")

        ## Sum importance profiles

        # NOTE(review): assumes hmdfs_sum[s][mt][c] is a DataFrame whose index is named
        # 'testMotif' and whose columns are integer positions - confirm.
        hmdf_sum_plot = hmdfs_sum[s][mt][c][PROFILE_POSITIONS]
        mdata = mdata_reg.merge(hmdf_sum_plot.loc[sigmots].reset_index().rename(columns = {'testMotif':'motif'}), on = 'motif')
        print(f"Compiled: {mdata.shape}")

        # Missing values are written as 0; one tab-delimited table per species / model type / configuration
        mdata.fillna(0).sort_values('motif').to_csv(os.path.join(OUTDIR, f'motif_importance_table.{s}.{mt}.{c}.txt'), sep = "\t", index = False)



saccharomyces_cerevisiae: config=polyastrength_kmers-6, kmer=6, golden_sites=9725, sig_motifs=246
Region (-120, -26) : (246, 5)


  return std / np.sqrt(self.sum_weights - 1)
  return self.sum / self.sum_weights
  return self.sumsquares / (self.sum_weights - self.ddof)
  return std / np.sqrt(self.sum_weights - 1)


Region (-25, -16)  : (246, 5)
Region (-15, -6)   : (246, 5)
Region (-5, 1)     : (246, 5)
Region (2, 15)     : (246, 5)

Merged  : (246, 17)
Compiled: (246, 518)
