# Analysis of PolyaClassifier site definition via unbiased heatmaps

**Purpose**: To prepare matrices containing motif importance scores surrounding cleavage sites, which will be visualized in a clustered heatmap using R to identify motif families.


In [1]:
# Load IPython's autoreload extension so the `%autoreload` magic is available.
%load_ext autoreload

In [2]:
# Mode 2: reload imported modules before each cell executes, so edits to local
# modules take effect without restarting the kernel.
%autoreload 2

In [3]:
# Execute the shared setup script inside this notebook's namespace (`-i`), so
# the names it defines are available to the cells that follow.
# NOTE(review): `os` and `pickle` are used below without being imported in this
# notebook, so they are presumably provided by notebook_setup.py -- confirm.
%run -i notebook_setup.py

## IMPORTS AND SETUP

In [4]:
from paper_utilities import motifs

In [5]:
## Project root on the shared filesystem; every other path is derived from it.
PROJECT = "/projects/b1080/eks/polyadenylation/yeast"

## Inputs are read from the shared resources folder; this notebook writes to
## its own folder, which sits beside it under manuscript/analysis.
RESOURCES = os.path.join(PROJECT, 'manuscript', 'analysis', 'resources')
OUTDIR = os.path.join(PROJECT, 'manuscript', 'analysis', 'polyaclassifier_motif_importance_heatmaps')

## Create the output folder if needed; nothing happens when it already exists.
os.makedirs(OUTDIR, exist_ok = True)


In [6]:
## Read the pickled model configuration information from the resources folder

configurations_path = os.path.join(RESOURCES, 'polyaclassifier_configurations.pickle')

with open(configurations_path, mode = 'rb') as handle:
    configurations = pickle.load(handle)


In [7]:
def _load_resource_pickle(filename):
    """Unpickle `filename` from the RESOURCES directory and return its contents."""
    with open(os.path.join(RESOURCES, filename), mode = 'rb') as handle:
        return pickle.load(handle)


## Processed motif importance data: summed and per-site variants

hmdfs_sum = _load_resource_pickle("polyaclassifier_motifs.hmdfs.sum.pickle")
hmdfs_per = _load_resource_pickle("polyaclassifier_motifs.hmdfs.per_site.pickle")

## Motifs flagged as important; used later to pick which rows are exported

significant_motifs = _load_resource_pickle("polyaclassifier_motifs.significant_motifs.pickle")


## Prepare data for motif importance heatmaps

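The cell below writes tab-delimited matrices (per-site and summed importance scores for the significant motifs) for each species and model configuration to `OUTDIR`. The heatmaps themselves are plotted from these files in R using `pheatmap`.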


In [8]:
## Export, for every species / model type / configuration, the per-site and
## summed motif importance matrices restricted to the significant motifs and to
## a species-specific window of positions. One tab-delimited file is written to
## OUTDIR per matrix; these files are the input to the R heatmap script.
##
## NOTE(review): the matrices are presumably pandas DataFrames with motifs as
## row labels and integer positions as column labels -- confirm.

## Window of positions exported for each species, as (first, last) column
## labels. Both ends are inclusive.
HEATMAP_REGIONS = {
    'saccharomyces_cerevisiae'  : (-150, 100),
    'schizosaccharomyces_pombe' : (-100, 100),
    'arabidopsis_thaliana'      : (-250, 100),
}

for s in hmdfs_per.keys():

    ## Fail loudly for a species without a configured window. Otherwise the
    ## window of the previously processed species would be reused silently
    ## (or a NameError raised if this were the first species).
    if s not in HEATMAP_REGIONS:
        raise ValueError(f"No heatmap region defined for species '{s}'; add it to HEATMAP_REGIONS")

    ## The window depends only on the species, so build the column labels once.
    region = HEATMAP_REGIONS[s]
    positions = list(range(region[0], region[1] + 1))

    for mt in hmdfs_per[s].keys():
        for c in hmdfs_per[s][mt].keys():

            sigmots = significant_motifs[s][mt][c]
            print(f"{s:25} {c} sig={len(sigmots):4d}")

            ## NOTE(review): output file names do not include `mt`, so two model
            ## types sharing a configuration name would overwrite each other.

            ## Per-site importance: keep the window columns, then the significant motif rows
            hmdf_per_plot = hmdfs_per[s][mt][c][positions]
            hmdf_per_plot.loc[sigmots].to_csv(os.path.join(OUTDIR, f"hmdf_per.{s}.{c}.txt"), sep = "\t", index = True, header = True)

            ## Summed importance: same window and same motifs
            hmdf_sum_plot = hmdfs_sum[s][mt][c][positions]
            hmdf_sum_plot.loc[sigmots].to_csv(os.path.join(OUTDIR, f"hmdf_sum.{s}.{c}.txt"), sep = "\t", index = True, header = True)


arabidopsis_thaliana      polyaclassifier_bagging3_kmers-4 sig=  22
arabidopsis_thaliana      polyaclassifier_bagging3_kmers-5 sig=  72
arabidopsis_thaliana      polyaclassifier_bagging3_kmers-6 sig= 255
saccharomyces_cerevisiae  polyaclassifier_bagging3_kmers-4 sig=  15
saccharomyces_cerevisiae  polyaclassifier_bagging3_kmers-5 sig=  51
saccharomyces_cerevisiae  polyaclassifier_bagging3_kmers-6 sig= 137
schizosaccharomyces_pombe polyaclassifier_bagging3_kmers-4 sig=  24
schizosaccharomyces_pombe polyaclassifier_bagging3_kmers-5 sig=  72
schizosaccharomyces_pombe polyaclassifier_bagging3_kmers-6 sig= 230
