# Filtering motifs important for PolyaClassifier site definition

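**Purpose:** To identify motifs that contribute significantly to cleavage site formation according to PolyaClassifier. We employ a two-step filtering procedure: motifs are first required to be significant over background, and the survivors are then required to be significant within at least one positional region.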

In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
%run -i notebook_setup.py

In [4]:
from paper_utilities import helpers, motifs

In [5]:
# Register tqdm's pandas integration; the `progress_apply` calls further down depend on it.
tqdm.tqdm.pandas()

In [6]:
# Root of the project tree; per-species analysis folders are resolved relative to it.
PROJECT = "/projects/b1080/eks/polyadenylation/yeast"
# Output directory for the pickles written by the cells below.
OUTDIR  = os.path.join(PROJECT, 'manuscript', 'analysis', 'resources')
# exist_ok keeps the cell re-runnable when the directory is already present.
os.makedirs(OUTDIR, exist_ok = True)


# Processing motif disruption results

## Model Configurations

In [7]:
## golden_lines is entered by hand: the line count of each species' golden file minus one
## (presumably to discount a header line - confirm). Update these before running this step.

golden_line_counts = {
    'saccharomyces_cerevisiae'  : 11673,
    'schizosaccharomyces_pombe' : 2492,
    'arabidopsis_thaliana'      : 9358,
}

kmer_sizes = (4, 5, 6)

## One row per (species, k-mer size) model configuration
config_rows = [
    {
        'species'      : species,
        'description'  : f'polyaclassifier_bagging3_kmers-{kmer_size}',
        'modeltype'    : 'polyaclassifier',
        'golden_lines' : n_lines,
    }
    for species, n_lines in golden_line_counts.items()
    for kmer_size in kmer_sizes
]

trained_configs = pd.DataFrame(config_rows)
trained_configs = trained_configs.sort_values(['species','modeltype']).reset_index(drop = True)

trained_configs


Unnamed: 0,species,description,modeltype,golden_lines
0,arabidopsis_thaliana,polyaclassifier_bagging3_kmers-4,polyaclassifier,9358
1,arabidopsis_thaliana,polyaclassifier_bagging3_kmers-5,polyaclassifier,9358
2,arabidopsis_thaliana,polyaclassifier_bagging3_kmers-6,polyaclassifier,9358
3,saccharomyces_cerevisiae,polyaclassifier_bagging3_kmers-4,polyaclassifier,11673
4,saccharomyces_cerevisiae,polyaclassifier_bagging3_kmers-5,polyaclassifier,11673
5,saccharomyces_cerevisiae,polyaclassifier_bagging3_kmers-6,polyaclassifier,11673
6,schizosaccharomyces_pombe,polyaclassifier_bagging3_kmers-4,polyaclassifier,2492
7,schizosaccharomyces_pombe,polyaclassifier_bagging3_kmers-5,polyaclassifier,2492
8,schizosaccharomyces_pombe,polyaclassifier_bagging3_kmers-6,polyaclassifier,2492


In [8]:
## Nest the per-model settings as configurations[species][description] for easy downstream use

configurations = {
    species : {}
    for species in ('saccharomyces_cerevisiae', 'schizosaccharomyces_pombe', 'arabidopsis_thaliana')
}

for _, cfg in trained_configs.iterrows():

    ## species and description become the lookup keys; every other column is the stored payload
    settings = cfg.drop(['species', 'description']).to_dict()
    configurations[cfg['species']][cfg['description']] = settings

config_pickle = os.path.join(OUTDIR, 'polyaclassifier_configurations.pickle')

with open(config_pickle, mode = 'wb') as handle:
    pickle.dump(configurations, handle)


## Motif importance by position

#### Load motif disruption results into comprehensive data frame

In [9]:
## Read the motif disruption summary of every trained configuration and stack them
## into one data frame tagged with species / modeltype / config.

motif_summaries = []

for i,config in trained_configs.iterrows():

    motif_dir  = os.path.join(PROJECT, config['species'], "analysis", "motif_analysis", "motif_disruption")
    motif_file = os.path.join(motif_dir, f"motif_disruption.{config['description']}.summary.txt")

    ## Only the read itself may fail. A missing summary is tolerated (best effort),
    ## but it is reported so that an incomplete run cannot go unnoticed.
    try:
        motif_summary = pd.read_csv(motif_file, sep = "\t")
    except FileNotFoundError:
        print(f"{i}: WARNING - skipping {config['species']} {config['description']}: summary file not found ({motif_file})")
        continue

    motif_summary['species'] = config['species']
    motif_summary['modeltype'] = config['modeltype']
    motif_summary['config'] = config['description']

    motif_summaries.append(motif_summary)
    print(f"{i}: {config['species']:25} {config['modeltype']} {config['description']} {motif_summary.shape}")

## pd.concat on an empty list fails with an unhelpful message; state the actual cause instead
if not motif_summaries:
    raise FileNotFoundError(f"No motif disruption summary files were found under {PROJECT}")

motif_summaries = pd.concat(motif_summaries, sort = False, ignore_index = True)

## An 'xs' string of two characters or fewer (e.g. "[]") holds no positions.
## The mask is computed once so the reported count matches the rows actually dropped
## (rows with a missing 'xs' fail the test too and are counted).
has_profile = motif_summaries['xs'].str.len() > 2
print(f"Removing {(~has_profile).sum()} motifs due to incorrect xs length...")

motif_summaries = motif_summaries.loc[has_profile].copy()
print(f"Motif summaries shape: {motif_summaries.shape}")


## Load motif importance profiles to numeric vectors

def _parse_profile(text):
    """Turn a stringified list such as "[0.5, 1.0]" into a float array."""
    return np.asarray([float(token) for token in text.strip("][").split(", ")])


motif_summaries['kmer'] = motif_summaries['testMotif'].str.len()

## Positions are stored as float-formatted strings; keep them as a plain list of ints
motif_summaries['xs'] = motif_summaries['xs'].apply(lambda text : [int(float(token)) for token in text.strip("][").split(", ")])

## Profiles loaded with their recorded sign
for column in ('freqs', 'stds', 'stdWeights'):
    motif_summaries[column] = motif_summaries[column].apply(_parse_profile)

## Profiles whose sign is flipped on load
## (presumably so that larger positive values mean greater importance - confirm)
for column in ('means', 'medians', 'meanWeights', 'medianWeights'):
    motif_summaries[column] = motif_summaries[column].apply(lambda text : -_parse_profile(text))

## Standard errors derived from each standard deviation profile and the per-position frequencies
for ste_column, std_column in (('stes', 'stds'), ('steWeights', 'stdWeights')):
    motif_summaries[ste_column] = motif_summaries.apply(lambda row : helpers.std_to_ste(row[std_column], row['freqs']), axis = 1)


## Calculate peak motif importance scores
##
## For both the per-site and the weighted profiles this adds:
##   peak_*_score        maximum of the profile
##   auc_*_score         sum of the profile
##   extreme_*_position  first element returned by motifs.peak_importance_window
##   extreme_*_score     second element returned by motifs.peak_importance_window
##   extreme_*_log1p     sign-preserving log10(1 + |extreme score|)

measure = "mean"

def _signed_log10_1p(scores):
    """Return sign(x) * log10(1 + |x|).

    The previous form, sign(x) * log10(|x|), did not match the 'log1p' column name:
    it flipped the sign of scores with |x| < 1 and produced NaN (0 * -inf) at x == 0.
    Adding 1 inside the logarithm keeps the transform monotonic and sign-preserving.
    Base 10 is retained so large scores stay close to their previous values.
    """
    return np.sign(scores) * np.log10(1.0 + np.abs(scores))


## (output label, profile column): per-site profiles first, then weighted profiles,
## so the new columns are appended in the same order as before
for label, profile_col in (("persite", f"{measure}s"), ("weights", f"{measure}Weights")):

    motif_summaries[f'peak_{label}_score'] = motif_summaries[profile_col].apply(lambda x : np.max(x))
    motif_summaries[f'auc_{label}_score']  = motif_summaries[profile_col].apply(lambda x : np.sum(x))

    ## Kept as a local Series instead of a temporary data frame column that is dropped afterwards
    extreme_results = motif_summaries.progress_apply(lambda row: motifs.peak_importance_window(row['xs'], row[profile_col], region_size = 500, window_size = 20, direction = 'positive'), axis = 1)

    motif_summaries[f'extreme_{label}_position'] = extreme_results.apply(lambda x : x[0])
    motif_summaries[f'extreme_{label}_score']    = extreme_results.apply(lambda x : x[1])
    motif_summaries[f'extreme_{label}_log1p']    = _signed_log10_1p(motif_summaries[f'extreme_{label}_score'])


## Record results: persist the annotated motif summary table

profiles_pickle = os.path.join(OUTDIR, "polyaclassifier_motifs.disruption_profiles.pickle")

with open(profiles_pickle, mode = 'wb') as handle:
    pickle.dump(motif_summaries, handle)
    

0: arabidopsis_thaliana      polyaclassifier polyaclassifier_bagging3_kmers-4 (256, 12)
1: arabidopsis_thaliana      polyaclassifier polyaclassifier_bagging3_kmers-5 (1024, 12)
2: arabidopsis_thaliana      polyaclassifier polyaclassifier_bagging3_kmers-6 (4096, 12)
3: saccharomyces_cerevisiae  polyaclassifier polyaclassifier_bagging3_kmers-4 (256, 12)
4: saccharomyces_cerevisiae  polyaclassifier polyaclassifier_bagging3_kmers-5 (1024, 12)
5: saccharomyces_cerevisiae  polyaclassifier polyaclassifier_bagging3_kmers-6 (4096, 12)
6: schizosaccharomyces_pombe polyaclassifier polyaclassifier_bagging3_kmers-4 (256, 12)
7: schizosaccharomyces_pombe polyaclassifier polyaclassifier_bagging3_kmers-5 (1024, 12)
8: schizosaccharomyces_pombe polyaclassifier polyaclassifier_bagging3_kmers-6 (4096, 12)
Removing 0 motifs due to incorrect xs length...
Motif summaries shape: (16128, 12)


  ste_vec[i] = std_vec[i] / np.sqrt(freq_vec[i])
100%|██████████| 16128/16128 [00:13<00:00, 1228.51it/s]
100%|██████████| 16128/16128 [00:12<00:00, 1286.09it/s]


In [10]:
# Sanity check: number of motif rows per species / model type / motif length.
motif_summaries.groupby(['species','modeltype','kmer']).size()

species                    modeltype        kmer
arabidopsis_thaliana       polyaclassifier  4        256
                                            5       1024
                                            6       4096
saccharomyces_cerevisiae   polyaclassifier  4        256
                                            5       1024
                                            6       4096
schizosaccharomyces_pombe  polyaclassifier  4        256
                                            5       1024
                                            6       4096
dtype: int64

#### Compile motif importance profiles into matrices

In [11]:
## Matrices built with weighted = True, nested as hmdfs_sum[species][modeltype][config]
hmdfs_sum = {}

config_keys = motif_summaries[['species','modeltype','config']].drop_duplicates()

for species, modeltype, config in config_keys.itertuples(index = False, name = None):
    print("\t".join([species, modeltype, config]))

    ## Create the nested levels on first sight of a species / model type
    per_config = hmdfs_sum.setdefault(species, {}).setdefault(modeltype, {})

    in_config = (
        (motif_summaries['species'] == species)
        & (motif_summaries['modeltype'] == modeltype)
        & (motif_summaries['config'] == config)
    )

    per_config[config] = motifs.make_hmdf(motif_summaries.loc[in_config], weighted = True, xwindow = (-250,250))

sum_pickle = os.path.join(OUTDIR, "polyaclassifier_motifs.hmdfs.sum.pickle")

with open(sum_pickle, mode = 'wb') as handle:
    pickle.dump(hmdfs_sum, handle)
    

arabidopsis_thaliana	polyaclassifier	polyaclassifier_bagging3_kmers-4


256it [00:00, 904.38it/s]


arabidopsis_thaliana	polyaclassifier	polyaclassifier_bagging3_kmers-5


1024it [00:01, 860.64it/s]


arabidopsis_thaliana	polyaclassifier	polyaclassifier_bagging3_kmers-6


4096it [00:05, 771.64it/s]


saccharomyces_cerevisiae	polyaclassifier	polyaclassifier_bagging3_kmers-4


256it [00:00, 903.37it/s]


saccharomyces_cerevisiae	polyaclassifier	polyaclassifier_bagging3_kmers-5


1024it [00:01, 859.05it/s]


saccharomyces_cerevisiae	polyaclassifier	polyaclassifier_bagging3_kmers-6


4096it [00:05, 767.68it/s]


schizosaccharomyces_pombe	polyaclassifier	polyaclassifier_bagging3_kmers-4


256it [00:00, 896.60it/s]


schizosaccharomyces_pombe	polyaclassifier	polyaclassifier_bagging3_kmers-5


1024it [00:01, 866.53it/s]


schizosaccharomyces_pombe	polyaclassifier	polyaclassifier_bagging3_kmers-6


4096it [00:05, 762.62it/s]


In [12]:
## Matrices built with weighted = False, nested as hmdfs_per[species][modeltype][config].
## A plain dict is used, as for hmdfs_sum: the nested levels are created explicitly below,
## so a defaultdict added nothing and would only make the unpickled object silently
## create entries on a mistyped species lookup.
hmdfs_per = {}

config_keys = motif_summaries[['species','modeltype','config']].drop_duplicates()

for species, modeltype, config in config_keys.itertuples(index = False, name = None):
    print("\t".join([species, modeltype, config]))

    ## Create the nested levels on first sight of a species / model type
    per_config = hmdfs_per.setdefault(species, {}).setdefault(modeltype, {})

    in_config = (
        (motif_summaries['species'] == species)
        & (motif_summaries['modeltype'] == modeltype)
        & (motif_summaries['config'] == config)
    )

    per_config[config] = motifs.make_hmdf(motif_summaries.loc[in_config], weighted = False, xwindow = (-250,250))

with open(os.path.join(OUTDIR, "polyaclassifier_motifs.hmdfs.per_site.pickle"), mode = 'wb') as handle:
    pickle.dump(hmdfs_per, handle)
    

arabidopsis_thaliana	polyaclassifier	polyaclassifier_bagging3_kmers-4


256it [00:00, 891.58it/s]


arabidopsis_thaliana	polyaclassifier	polyaclassifier_bagging3_kmers-5


1024it [00:01, 856.39it/s]


arabidopsis_thaliana	polyaclassifier	polyaclassifier_bagging3_kmers-6


4096it [00:05, 767.98it/s]


saccharomyces_cerevisiae	polyaclassifier	polyaclassifier_bagging3_kmers-4


256it [00:00, 911.70it/s]


saccharomyces_cerevisiae	polyaclassifier	polyaclassifier_bagging3_kmers-5


1024it [00:01, 864.03it/s]


saccharomyces_cerevisiae	polyaclassifier	polyaclassifier_bagging3_kmers-6


4096it [00:05, 769.32it/s]


schizosaccharomyces_pombe	polyaclassifier	polyaclassifier_bagging3_kmers-4


256it [00:00, 909.42it/s]


schizosaccharomyces_pombe	polyaclassifier	polyaclassifier_bagging3_kmers-5


1024it [00:01, 872.55it/s]


schizosaccharomyces_pombe	polyaclassifier	polyaclassifier_bagging3_kmers-6


4096it [00:05, 766.29it/s]


#### Identify motifs significant over background

In [13]:
## First filtering step: result of motifs.filter_by_importance for every configuration,
## nested as background_motifs[species][modeltype][config].
background_motifs = {}

config_keys = motif_summaries[['species','modeltype','config']].drop_duplicates()

for species, modeltype, config in config_keys.itertuples(index = False, name = None):
    ## Concatenate rather than pass two print arguments: print() puts a separator
    ## between arguments, which left a stray leading space before the species name.
    print("\n\n" + "\t".join([species, modeltype, config]))

    per_config = background_motifs.setdefault(species, {}).setdefault(modeltype, {})

    in_config = (
        (motif_summaries['species'] == species)
        & (motif_summaries['modeltype'] == modeltype)
        & (motif_summaries['config'] == config)
    )

    per_config[config] = motifs.filter_by_importance(motif_summaries.loc[in_config],
                                                     hmdfs_sum[species][modeltype][config],
                                                     OUTDIR, modeltype, config,
                                                     center_idx = 250,
                                                     exclude = (-80,30),
                                                     sliding = 20,
                                                     niter = 1000,
                                                     alpha = 0.001, ## Equivalent to 99.9%ile
                                                     direction = 'positive'
                                                    )

with open(os.path.join(OUTDIR, "polyaclassifier_motifs.background_motifs.pickle"), mode = 'wb') as handle:
    pickle.dump(background_motifs, handle)




 arabidopsis_thaliana	polyaclassifier	polyaclassifier_bagging3_kmers-4


100%|██████████| 1000/1000 [00:01<00:00, 724.23it/s]


High Values:	alpha=0.001, percentile=99.900: cutoff_value=456.225
Low Values :	alpha=0.001, percentile= 0.100: cutoff_value=-57.911
Number of motifs above background threshold: 166 out of 256


 arabidopsis_thaliana	polyaclassifier	polyaclassifier_bagging3_kmers-5


100%|██████████| 1000/1000 [00:06<00:00, 163.09it/s]


High Values:	alpha=0.001, percentile=99.900: cutoff_value=173.352
Low Values :	alpha=0.001, percentile= 0.100: cutoff_value=-19.454
Number of motifs above background threshold: 580 out of 1024


 arabidopsis_thaliana	polyaclassifier	polyaclassifier_bagging3_kmers-6


100%|██████████| 1000/1000 [00:34<00:00, 29.03it/s]


High Values:	alpha=0.001, percentile=99.900: cutoff_value=56.570
Low Values :	alpha=0.001, percentile= 0.100: cutoff_value=-7.702
Number of motifs above background threshold: 2114 out of 4096


 saccharomyces_cerevisiae	polyaclassifier	polyaclassifier_bagging3_kmers-4


100%|██████████| 1000/1000 [00:01<00:00, 719.58it/s]


High Values:	alpha=0.001, percentile=99.900: cutoff_value=452.132
Low Values :	alpha=0.001, percentile= 0.100: cutoff_value=-28.734
Number of motifs above background threshold: 148 out of 256


 saccharomyces_cerevisiae	polyaclassifier	polyaclassifier_bagging3_kmers-5


100%|██████████| 1000/1000 [00:06<00:00, 162.87it/s]


High Values:	alpha=0.001, percentile=99.900: cutoff_value=142.833
Low Values :	alpha=0.001, percentile= 0.100: cutoff_value=-18.393
Number of motifs above background threshold: 561 out of 1024


 saccharomyces_cerevisiae	polyaclassifier	polyaclassifier_bagging3_kmers-6


100%|██████████| 1000/1000 [00:34<00:00, 28.97it/s]


High Values:	alpha=0.001, percentile=99.900: cutoff_value=66.273
Low Values :	alpha=0.001, percentile= 0.100: cutoff_value=-6.477
Number of motifs above background threshold: 1672 out of 4096


 schizosaccharomyces_pombe	polyaclassifier	polyaclassifier_bagging3_kmers-4


100%|██████████| 1000/1000 [00:01<00:00, 706.72it/s]


High Values:	alpha=0.001, percentile=99.900: cutoff_value=203.124
Low Values :	alpha=0.001, percentile= 0.100: cutoff_value=-54.855
Number of motifs above background threshold: 179 out of 256


 schizosaccharomyces_pombe	polyaclassifier	polyaclassifier_bagging3_kmers-5


100%|██████████| 1000/1000 [00:06<00:00, 163.83it/s]


High Values:	alpha=0.001, percentile=99.900: cutoff_value=111.352
Low Values :	alpha=0.001, percentile= 0.100: cutoff_value=-40.082
Number of motifs above background threshold: 566 out of 1024


 schizosaccharomyces_pombe	polyaclassifier	polyaclassifier_bagging3_kmers-6


100%|██████████| 1000/1000 [00:36<00:00, 27.78it/s]


High Values:	alpha=0.001, percentile=99.900: cutoff_value=55.866
Low Values :	alpha=0.001, percentile= 0.100: cutoff_value=-16.863
Number of motifs above background threshold: 1775 out of 4096


#### Identify regionally-significant motifs

In [14]:
## Second filtering step: result of motifs.filter_by_region, applied only to the motifs
## that passed the background filter; nested as region_motifs[species][modeltype][config].
## A plain dict is used, as for background_motifs: the nested levels are created explicitly,
## so the defaultdict was redundant.
region_motifs = {}

config_keys = motif_summaries[['species','modeltype','config']].drop_duplicates()

for species, modeltype, config in config_keys.itertuples(index = False, name = None):
    print("\n\n" + "\t".join([species, modeltype, config]))

    per_config = region_motifs.setdefault(species, {}).setdefault(modeltype, {})

    ## Restrict the weighted matrix to the rows of motifs significant over background
    background_hmdf = hmdfs_sum[species][modeltype][config].loc[background_motifs[species][modeltype][config]]

    ## NOTE(review): the positional arguments here are (OUTDIR, modeltype, species, config),
    ## whereas filter_by_importance is called with (OUTDIR, modeltype, config) - confirm
    ## this order against the filter_by_region signature.
    per_config[config] = motifs.filter_by_region(background_hmdf,
                                                 OUTDIR, modeltype, species, config,
                                                 regions = [(-250,-120),(-120,-80),(-80,-40),(-40,0),(0,40),(40,80),(80,120),(120,250)],
                                                 sliding = 20,
                                                 alpha = 0.01,
                                                 overedges = True,
                                                 verbose = False
                                                )

    ## Union of the motif keys over all regions
    regmots = sorted({motif for _, passing in per_config[config].items() for motif in passing.keys()})
    print("Number of regionally-significant motifs:", len(regmots))

with open(os.path.join(OUTDIR, "polyaclassifier_motifs.region_motifs.pickle"), mode = 'wb') as handle:
    pickle.dump(region_motifs, handle)




arabidopsis_thaliana	polyaclassifier	polyaclassifier_bagging3_kmers-4
Region=(-250, -120): alpha=0.010, percentile=99.000, cutoff_value=  1730.219, n_passing=  8
Region=(-120, -80) : alpha=0.010, percentile=99.000, cutoff_value=  4141.129, n_passing=  2
Region=(-80, -40)  : alpha=0.010, percentile=99.000, cutoff_value=  4600.922, n_passing=  2
Region=(-40, 0)    : alpha=0.010, percentile=99.000, cutoff_value=  7041.055, n_passing=  8
Region=(0, 40)     : alpha=0.010, percentile=99.000, cutoff_value=  4799.077, n_passing= 13
Region=(40, 80)    : alpha=0.010, percentile=99.000, cutoff_value=   509.686, n_passing=  9
Region=(80, 120)   : alpha=0.010, percentile=99.000, cutoff_value=   483.971, n_passing=  7
Region=(120, 250)  : alpha=0.010, percentile=99.000, cutoff_value=   435.487, n_passing=  8
Number of regionally-significant motifs: 22


arabidopsis_thaliana	polyaclassifier	polyaclassifier_bagging3_kmers-5
Region=(-250, -120): alpha=0.010, percentile=99.000, cutoff_value=   609.082

#### Organize significance filtering results - expect that all regionally-significant motifs are significant over background

In [15]:
## Final motif lists: regionally-significant motifs that are also significant over background,
## nested as significant_motifs[species][modeltype][config].
## A plain dict is used, as for background_motifs: the nested levels are created explicitly,
## so the defaultdict was redundant.
significant_motifs = {}

config_keys = motif_summaries[['species','modeltype','config']].drop_duplicates()

for species, modeltype, config in config_keys.itertuples(index = False, name = None):

    per_config = significant_motifs.setdefault(species, {}).setdefault(modeltype, {})

    bgdmots = background_motifs[species][modeltype][config]
    ## Build the lookup once so each membership test below is a hash lookup
    bgd_lookup = set(bgdmots)

    ## Union of the motif keys over all regions, sorted and de-duplicated
    regmots = sorted({motif for _, passing in region_motifs[species][modeltype][config].items() for motif in passing.keys()})

    ## regmots is already sorted and unique, so filtering it preserves both properties
    sigmots = [motif for motif in regmots if motif in bgd_lookup]
    print(f"{species:25} {config}:\tbgd={len(bgdmots):4d}\treg={len(regmots):4d}\tsig={len(sigmots):4d}")

    per_config[config] = sigmots

with open(os.path.join(OUTDIR, "polyaclassifier_motifs.significant_motifs.pickle"), mode = 'wb') as handle:
    pickle.dump(significant_motifs, handle)


arabidopsis_thaliana      polyaclassifier_bagging3_kmers-4:	bgd= 166	reg=  22	sig=  22
arabidopsis_thaliana      polyaclassifier_bagging3_kmers-5:	bgd= 580	reg=  72	sig=  72
arabidopsis_thaliana      polyaclassifier_bagging3_kmers-6:	bgd=2114	reg= 255	sig= 255
saccharomyces_cerevisiae  polyaclassifier_bagging3_kmers-4:	bgd= 148	reg=  15	sig=  15
saccharomyces_cerevisiae  polyaclassifier_bagging3_kmers-5:	bgd= 561	reg=  51	sig=  51
saccharomyces_cerevisiae  polyaclassifier_bagging3_kmers-6:	bgd=1672	reg= 137	sig= 137
schizosaccharomyces_pombe polyaclassifier_bagging3_kmers-4:	bgd= 179	reg=  24	sig=  24
schizosaccharomyces_pombe polyaclassifier_bagging3_kmers-5:	bgd= 566	reg=  72	sig=  72
schizosaccharomyces_pombe polyaclassifier_bagging3_kmers-6:	bgd=1775	reg= 230	sig= 230


In [16]:
## List the significant motifs of every configuration as one comma-separated line
config_keys = motif_summaries[['species','modeltype','config']].drop_duplicates()

for species, modeltype, config in config_keys.itertuples(index = False, name = None):
    motif_list = ','.join(significant_motifs[species][modeltype][config])
    print(f"{species:25} {config}:\n\t{motif_list}")


arabidopsis_thaliana      polyaclassifier_bagging3_kmers-4:
	AAAA,AAAC,AAAT,AACA,ACAA,AGAA,ATTT,CAAA,CTTT,GAAA,GTTT,TATT,TCTT,TGTA,TGTT,TTAT,TTCT,TTGT,TTTA,TTTC,TTTG,TTTT
arabidopsis_thaliana      polyaclassifier_bagging3_kmers-5:
	AAAAA,AAAAC,AAAAG,AAAAT,AAACA,AAAGA,AACAA,AAGAA,AATAA,AATTT,ACAAA,AGAAA,AGAAG,AGTTT,ATAAA,ATGAA,ATGTA,ATTTC,ATTTG,ATTTT,CAAAA,CATTT,CTTCT,CTTGT,CTTTG,CTTTT,GAAAA,GAAGA,GATTT,GGTTT,GTGTT,GTTAT,GTTGT,GTTTA,GTTTC,GTTTG,GTTTT,TATAT,TATTT,TCATT,TCTCT,TCTGT,TCTTC,TCTTT,TGAAA,TGGTT,TGTAA,TGTAT,TGTGT,TGTTA,TGTTG,TGTTT,TTATG,TTATT,TTCAT,TTCTC,TTCTT,TTGGT,TTGTA,TTGTG,TTGTT,TTTAT,TTTCA,TTTCT,TTTGA,TTTGC,TTTGG,TTTGT,TTTTA,TTTTC,TTTTG,TTTTT
arabidopsis_thaliana      polyaclassifier_bagging3_kmers-6:
	AAAAAA,AAAAAC,AAAAAG,AAAAAT,AAAACA,AAAACT,AAAAGA,AAAATA,AAAATC,AAAATG,AAAATT,AAACAA,AAACAG,AAACAT,AAACCA,AAAGAA,AAAGAG,AAATAA,AAATCA,AAATGA,AAATGT,AAATTG,AACAAA,AACCAA,AAGAAA,AAGAAG,AAGAGA,AAGTTT,AATAAA,AATAAT,AATCAA,AATGAA,AATGTA,AATTTG,AATTTT,ACAAAA,ACAAAC,ACAAAG,ACAAAT,AC