In [1]:
import os
from Bio import SeqIO

In [2]:
def _read_island_file(path, genome_sequence):
    """Parse one island coordinate file into ``(sequence, (start, end))`` tuples.

    Each non-blank line must begin with two whitespace-separated integers.
    They are used directly as Python slice bounds on ``genome_sequence.seq``.

    NOTE(review): the coordinates are applied as 0-based, end-exclusive slice
    indices; confirm this matches the convention of the source files.

    Args:
        path: Path of the ``.pai`` / ``.cpai`` / ``.npai`` file.
        genome_sequence: Record whose ``.seq`` attribute is sliced.

    Returns:
        List of ``(island_sequence, (start_index, end_index))`` in file order.
    """
    islands = []

    with open(path, 'r') as file:
        for line in file:
            fields = line.split()

            # Blank or whitespace-only lines (e.g. a trailing empty line)
            # carry no coordinates; indexing them used to raise IndexError.
            if not fields:
                continue

            start_index = int(fields[0])
            end_index = int(fields[1])
            island_sequence = genome_sequence.seq[start_index:end_index]
            islands.append((island_sequence, (start_index, end_index)))

    return islands


def read_data():
    """Load every genome under ``./h_pylori_paidb//`` together with its islands.

    For each sub-directory ``<name>`` the files ``<name>.fasta``,
    ``<name>.pai``, ``<name>.cpai`` and ``<name>.npai`` are read. Only the
    first record of the FASTA file is used.

    Returns:
        Dict mapping genome name to a dict with the keys
        ``'genome_sequence'``, ``'pai_sequences'``, ``'cpai_sequences'`` and
        ``'npai_sequences'``; the three island entries are lists of
        ``(sequence, (start, end))`` tuples.
    """
    base_directory = './h_pylori_paidb//'
    data = {}

    for name in os.listdir(base_directory):
        dir_path = os.path.join(base_directory, name)

        if not os.path.isdir(dir_path):
            continue

        fasta_file = os.path.join(dir_path, f"{name}.fasta")
        genome_sequence = next(SeqIO.parse(fasta_file, 'fasta'))

        entry = {'genome_sequence': genome_sequence}

        # The three island classes share one file format, so one parser serves all.
        for kind in ('pai', 'cpai', 'npai'):
            island_file = os.path.join(dir_path, f"{name}.{kind}")
            entry[f'{kind}_sequences'] = _read_island_file(island_file, genome_sequence)

        data[name] = entry

    return data

In [3]:
# Load every genome and its island slices once; the analysis cells below
# iterate over this dict (genome name -> sequences and island lists).
h_pylori_data = read_data()

### TF-IDF

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf(sequence, k):
    """Score the k-mers of one sequence with a freshly fitted TF-IDF model.

    The sequence is turned into a single space-separated "document" of
    overlapping k-mers, and a new ``TfidfVectorizer`` is fitted on that one
    document.

    NOTE(review): the corpus holds exactly one document, so the IDF part is
    presumably identical for every k-mer and the ranking reduces to term
    frequency; confirm this is intended.

    Args:
        sequence: Sequence to analyse; converted with ``str()``.
        k: k-mer length.

    Returns:
        Tuple ``(dense_matrix, fitted_vectorizer)``.
    """
    document = get_kmers(str(sequence), k)
    model = TfidfVectorizer()
    weights = model.fit_transform([document]).toarray()
    return weights, model

def get_kmers(sequence, k):
    """Return all overlapping k-mers of ``sequence`` joined by single spaces.

    Windows are taken at every offset from 0 to ``len(sequence) - k``; when
    the sequence is shorter than ``k`` the result is the empty string.
    """
    window_count = len(sequence) - k + 1
    return ' '.join(sequence[offset:offset + k] for offset in range(window_count))

def get_kmers_map(tfidf_matrix, vectorizer, sequence, top_n=20):
    """Map the ``top_n`` best-scoring k-mers to their score and count.

    Args:
        tfidf_matrix: Score matrix; it is flattened before ranking.
        vectorizer: Fitted vectorizer providing ``get_feature_names_out()``.
        sequence: Sequence in which each upper-cased k-mer is counted with
            ``sequence.count``.
        top_n: Number of highest-scoring features to keep.

    Returns:
        Dict ``{KMER: {'score': ..., 'count': ...}}`` ordered from the
        highest score downwards, with k-mers upper-cased.
    """
    vocabulary = vectorizer.get_feature_names_out()
    scores = tfidf_matrix.flatten()

    # argsort is ascending: take the last top_n entries and flip them.
    best_first = scores.argsort()[-top_n:][::-1]

    mapped_kmers = {}
    for feature_index in best_first:
        kmer = vocabulary[feature_index].upper()
        mapped_kmers[kmer] = {
            'score': scores[feature_index],
            'count': sequence.count(kmer),
        }

    return mapped_kmers

def find_unique_kmers(full_sequence, start_index, end_index, significant_kmers, unique_kmers):
    """Append k-mers that never occur outside ``[start_index, end_index)``.

    Each k-mer is counted in the part of ``full_sequence`` before
    ``start_index`` and in the part from ``end_index`` onwards. K-mers with
    no hit in either flank are appended to ``unique_kmers`` as
    ``(kmer, details['count'])``. The list is modified in place and nothing
    is returned.
    """
    upstream = full_sequence[:start_index]
    downstream = full_sequence[end_index:]

    for kmer, details in significant_kmers.items():
        outside_hits = upstream.count(kmer) + downstream.count(kmer)
        if outside_hits != 0:
            continue
        unique_kmers.append((kmer, details['count']))

In [5]:
def get_unique_patterns(full_sequence, island_sequence, start_index, end_index, k_max):
    """Collect island-only k-mers for every length from 4 up to ``k_max - 1``.

    For each length the island's top-scoring k-mers are obtained via
    ``tfidf`` and ``get_kmers_map``, then filtered by ``find_unique_kmers``
    against the genome outside ``[start_index, end_index)``.

    Returns:
        List of ``(kmer, count)`` tuples accumulated over all lengths.
    """
    collected = []

    for length in range(4, k_max):
        weights, model = tfidf(island_sequence, length)
        candidates = get_kmers_map(weights, model, island_sequence)
        find_unique_kmers(full_sequence, start_index, end_index, candidates, collected)

    return collected

In [6]:
import logging
import os

# The file handler cannot create missing directories itself.
os.makedirs('./logs', exist_ok=True)

console = logging.StreamHandler()
console.setLevel(logging.INFO)
console.setFormatter(logging.Formatter('%(message)s'))

# force=True drops handlers that are already attached to the root logger, so
# re-running this cell (or running it after another logging cell) neither
# duplicates console output nor keeps writing to a stale log file.
logging.basicConfig(level=logging.INFO,
                    format='%(message)s',
                    handlers=[logging.FileHandler('./logs/tfidf_all_islands.log'), console],
                    force=True)

In [6]:
from Bio import Align
from Bio.Seq import Seq

In [7]:
def count_matches(sequence, pattern):
    """Return how often ``pattern`` occurs in ``sequence`` (exact matching).

    The count comes from the sequence's own ``count`` method.

    An approximate variant was tried earlier and is currently disabled: a
    local ``Align.PairwiseAligner`` (match 2, mismatch -2, gap open -2, gap
    extend -1) counting alignments that score at least 85% of
    ``2 * len(pattern)``.
    """
    exact_hits = sequence.count(pattern)
    return exact_hits

In [8]:
def find_islands_kmers(full_sequence,
                       pai_islands,
                       cpai_islands,
                       npai_islands,
                       significant_kmers,
                       all_islands_kmers):
    """Record k-mers that occur only inside islands, with per-island counts.

    A k-mer is accepted when ``count_matches`` finds no occurrence in any
    stretch of ``full_sequence`` that lies outside every island. Accepted
    k-mers are stored in ``all_islands_kmers`` as
    ``kmer -> ['PAI <n>: <count>', 'CPAI <n>: <count>', ...]``, listing only
    islands with at least one occurrence. K-mers already present in
    ``all_islands_kmers`` are skipped.

    Args:
        full_sequence: Genome; must support slicing.
        pai_islands, cpai_islands, npai_islands: Lists of ``(start, end)``
            index pairs used as slice bounds.
        significant_kmers: Mapping whose keys are the candidate k-mers.
        all_islands_kmers: Result dict, updated in place.
    """
    all_islands = sorted(pai_islands + cpai_islands + npai_islands,
                         key=lambda island: island[0])

    # Work out the stretches between islands once; they do not depend on the k-mer.
    gaps = []
    covered_until = 0
    for island in all_islands:
        if island[0] > covered_until:
            gaps.append((covered_until, island[0]))
        # Islands may overlap or be nested; never let the covered prefix
        # shrink, otherwise the tail of an enclosing island would later be
        # scanned as if it were outside every island.
        covered_until = max(covered_until, island[1])

    labelled_islands = (('PAI', pai_islands),
                        ('CPAI', cpai_islands),
                        ('NPAI', npai_islands))

    for kmer in significant_kmers:

        if kmer in all_islands_kmers:
            continue

        # Reject k-mers that also exist outside of the islands.
        if any(count_matches(full_sequence[gap_start:gap_end], kmer) > 0
               for gap_start, gap_end in gaps):
            continue

        if count_matches(full_sequence[covered_until:], kmer) > 0:
            continue

        # Note which islands contain the k-mer and how often.
        islands_containing_kmer = []

        for label, islands in labelled_islands:
            for num, island in enumerate(islands, start=1):
                count = count_matches(full_sequence[island[0]:island[1]], kmer)

                if count > 0:
                    islands_containing_kmer.append(f'{label} {num}: {count}')

        all_islands_kmers[kmer] = islands_containing_kmer

In [9]:
def get_islands_kmers(full_sequence,
                      island_sequence,
                      pai_islands,
                      cpai_islands,
                      npai_islands,
                      all_islands_kmers,
                      k_max):
    """Run the island-only k-mer search for lengths 4 up to ``k_max - 1``.

    For each length the top-scoring k-mers of ``island_sequence`` are
    computed with ``tfidf`` / ``get_kmers_map`` and handed to
    ``find_islands_kmers``, which updates ``all_islands_kmers`` in place.
    Nothing is returned.
    """
    length = 4
    while length < k_max:
        weights, model = tfidf(island_sequence, length)
        candidates = get_kmers_map(weights, model, island_sequence)
        find_islands_kmers(full_sequence,
                           pai_islands,
                           cpai_islands,
                           npai_islands,
                           candidates,
                           all_islands_kmers)
        length += 1

In [11]:
# Search every genome for k-mers that occur only inside its islands, log the
# number of new patterns per island, and write one report file per genome.
k_max = 40
logging.info(f'Max pattern length: {k_max}')
logging.info('--------------------------')

unique_patterns = {}
genome_total = len(h_pylori_data)

for genome_number, (name, data) in enumerate(h_pylori_data.items(), start=1):
    logging.info(f'Genome {genome_number}/{genome_total}: {name}')

    genome_sequence = data['genome_sequence']
    all_islands_kmers = {}

    # (start, end) pairs per island class; every search below gets all three.
    island_ranges = {
        label: [island[1] for island in data[f'{label.lower()}_sequences']]
        for label in ('PAI', 'CPAI', 'NPAI')
    }

    for label in ('PAI', 'CPAI', 'NPAI'):
        islands = data[f'{label.lower()}_sequences']

        if not islands:
            logging.info(f'No {label} sequences')
            continue

        for island_number, (island_sequence, _) in enumerate(islands, start=1):
            known_before = len(all_islands_kmers)

            get_islands_kmers(genome_sequence,
                              island_sequence,
                              island_ranges['PAI'],
                              island_ranges['CPAI'],
                              island_ranges['NPAI'],
                              all_islands_kmers,
                              k_max)

            new_patterns_found = len(all_islands_kmers) - known_before
            logging.info(f"{label} {island_number}/{len(islands)}: {new_patterns_found} patterns found")

    logging.info('--------------------------')

    unique_patterns[name] = all_islands_kmers

    output_dir = f'h_pylori_paidb/{name}/patterns/tfidf/all_islands/'
    os.makedirs(output_dir, exist_ok=True)

    # One entry per k-mer: the k-mer, its per-island counts, then a separator.
    with open(f'{output_dir}tfidf.txt', 'w') as file:
        for kmer, islands_hit in all_islands_kmers.items():
            file.write(f'{kmer}\n')
            for island in islands_hit:
                file.write(f'{island}\n')
            file.write('----------------\n')

Max pattern length: 40
--------------------------
Genome 1/53: NC_017372
No PAI sequences
CPAI 1/1: 538 patterns found
NPAI 1/1: 555 patterns found
--------------------------
Genome 2/53: NC_011333
No PAI sequences
No CPAI sequences
NPAI 1/2: 550 patterns found
NPAI 2/2: 533 patterns found
--------------------------
Genome 3/53: NC_008086
No PAI sequences
CPAI 1/1: 542 patterns found
No NPAI sequences
--------------------------
Genome 4/53: NC_021882
No PAI sequences
CPAI 1/1: 524 patterns found
No NPAI sequences
--------------------------
Genome 5/53: NC_010698
No PAI sequences
CPAI 1/1: 524 patterns found
NPAI 1/1: 550 patterns found
--------------------------
Genome 6/53: NC_017359
No PAI sequences
No CPAI sequences
NPAI 1/1: 552 patterns found
--------------------------
Genome 7/53: NC_017355
No PAI sequences
No CPAI sequences
NPAI 1/2: 550 patterns found
NPAI 2/2: 564 patterns found
--------------------------
Genome 8/53: NC_017375
No PAI sequences
CPAI 1/1: 557 patterns found
No 

In [13]:
import json

In [14]:
# Create the output directory first, as the other writer cells do, so the
# dump does not fail when ./json/tfidf does not exist yet.
os.makedirs('./json/tfidf', exist_ok=True)

with open('./json/tfidf/all_islands.json', 'w') as file:
    json.dump(unique_patterns, file)

#### Patterns that partially belong in the islands

In [10]:
import logging
import os

# The file handler cannot create missing directories itself.
os.makedirs('./logs', exist_ok=True)

console = logging.StreamHandler()
console.setLevel(logging.INFO)
console.setFormatter(logging.Formatter('%(message)s'))

# basicConfig does nothing when the root logger already has handlers, which
# is the case once any earlier logging cell has run. force=True replaces
# those handlers, so this section really logs to tfidf_partial.log and the
# console output is not duplicated.
logging.basicConfig(level=logging.INFO,
                    format='%(message)s',
                    handlers=[logging.FileHandler('./logs/tfidf_partial.log'), console],
                    force=True)

In [11]:
def find_unique_kmers_partial(full_sequence, island_start_index, island_end_index, significant_kmers, unique_kmers):
    """Append k-mers that straddle an island boundary and never lie outside it.

    The island is the half-open range ``[island_start_index,
    island_end_index)`` of ``full_sequence.seq``. Every occurrence of a k-mer
    (overlapping ones included) is classified as:

    * outside: it ends at or before the island start, or starts at or after
      the island end; the k-mer is rejected;
    * partial: it overlaps the island but extends past one of its boundaries;
    * inside: it lies completely within the island.

    A k-mer is appended to ``unique_kmers`` as ``(kmer, details['count'])``
    when it has at least one partial occurrence and no outside occurrence.
    The list is modified in place; nothing is returned.
    """
    genome = full_sequence.seq

    for kmer, details in significant_kmers.items():
        kmer_length = len(kmer)
        found_outside = False
        crosses_boundary = False

        position = genome.find(kmer)
        while position != -1:
            occurrence_end = position + kmer_length  # exclusive, like a slice bound

            # Occurrences that merely touch a boundary are entirely outside.
            if occurrence_end <= island_start_index or position >= island_end_index:
                found_outside = True
                break

            if position < island_start_index or occurrence_end > island_end_index:
                crosses_boundary = True

            # Advance by one base, not by the k-mer length, so occurrences
            # overlapping the current one are examined as well.
            position = genome.find(kmer, position + 1)

        # With every occurrence checked above, no separate recount of the
        # flanking regions is needed.
        if crosses_boundary and not found_outside:
            unique_kmers.append((kmer, details['count']))
            
def get_unique_patterns_partial(full_sequence, 
                                island_start_index,
                                island_end_index,
                                k_max):
    """Collect boundary-straddling, island-specific k-mers for lengths 4..k_max-1.

    For each length ``k`` the island is widened by ``k - 1`` bases on both
    sides (clipped to the genome), so that every k-mer overlapping the island
    by at least one base is part of the analysed window. The window's
    top-scoring k-mers are then filtered by ``find_unique_kmers_partial``.

    Args:
        full_sequence: Record whose ``.seq`` holds the genome.
        island_start_index: Island start (slice bound).
        island_end_index: Island end (slice bound, exclusive).
        k_max: Exclusive upper limit for the k-mer length.

    Returns:
        List of ``(kmer, count)`` tuples accumulated over all lengths.
    """
    unique_kmers = []
    genome_length = len(full_sequence.seq)

    for k in range(4, k_max):

        window_start = max(island_start_index - k + 1, 0)
        # The slice end is exclusive, so the largest useful value is the
        # genome length itself; clamping to length - 1 dropped the last base.
        window_end = min(island_end_index + k - 1, genome_length)

        island_sequence = full_sequence.seq[window_start:window_end]

        tfidf_matrix, vectorizer = tfidf(island_sequence, k)
        kmers = get_kmers_map(tfidf_matrix, vectorizer, island_sequence)
        find_unique_kmers_partial(full_sequence, island_start_index, island_end_index, kmers, unique_kmers)

    return unique_kmers

In [12]:
# For every island of every genome, find k-mers that straddle the island
# boundary without occurring elsewhere, keep them per island class, and write
# one pattern file per island.
unique_pai_patterns  = {}
unique_cpai_patterns = {}
unique_npai_patterns = {}

k_max = 40
logging.info(f'Max pattern length: {k_max}')
logging.info('--------------------------')

# Island class -> result dict filled for that class.
partial_stores = (
    ('pai', unique_pai_patterns),
    ('cpai', unique_cpai_patterns),
    ('npai', unique_npai_patterns),
)

for i, (name, data) in enumerate(h_pylori_data.items()):
    logging.info(f'Genome {i+1}/{len(h_pylori_data)}: {name}')

    for kind, store in partial_stores:
        label = kind.upper()
        islands = data[f'{kind}_sequences']

        if not islands:
            logging.info(f'No {label} sequences')
            continue

        store[name] = {}
        genome_sequence = data['genome_sequence']
        output_dir = f'h_pylori_paidb/{name}/patterns/tfidf/partial/{kind}/'

        for j, (island_sequence, indices) in enumerate(islands):
            unique_patterns = get_unique_patterns_partial(genome_sequence, indices[0], indices[1], k_max)

            store[name][f'{kind}_sequence_{j+1}'] = island_sequence
            store[name][f'indices_{kind}_{j+1}'] = indices
            store[name][f'patterns_{kind}_{j+1}'] = unique_patterns

            logging.info(f"{label} {j+1}/{len(islands)}: {len(unique_patterns)} unique patterns")

            os.makedirs(output_dir, exist_ok=True)

            with open(f'{output_dir}{kind}_{j+1}_patterns.txt', 'w') as file:
                file.writelines(f'{unique_pattern}: {frequency}\n'
                                for unique_pattern, frequency in unique_patterns)

    logging.info('--------------------------')

Max pattern length: 40
--------------------------
Genome 1/53: NC_017372
No PAI sequences
CPAI 1/1: 0 unique patterns
NPAI 1/1: 0 unique patterns
--------------------------
Genome 2/53: NC_011333
No PAI sequences
No CPAI sequences
NPAI 1/2: 0 unique patterns
NPAI 2/2: 0 unique patterns
--------------------------
Genome 3/53: NC_008086
No PAI sequences
CPAI 1/1: 0 unique patterns
No NPAI sequences
--------------------------
Genome 4/53: NC_021882
No PAI sequences
CPAI 1/1: 0 unique patterns
No NPAI sequences
--------------------------
Genome 5/53: NC_010698
No PAI sequences
CPAI 1/1: 0 unique patterns
NPAI 1/1: 0 unique patterns
--------------------------
Genome 6/53: NC_017359
No PAI sequences
No CPAI sequences
NPAI 1/1: 0 unique patterns
--------------------------
Genome 7/53: NC_017355
No PAI sequences
No CPAI sequences
NPAI 1/2: 0 unique patterns
NPAI 2/2: 2 unique patterns
--------------------------
Genome 8/53: NC_017375
No PAI sequences
CPAI 1/1: 0 unique patterns
No NPAI sequenc

### NOSEP

In [4]:
from spmf import Spmf
import os
from Bio import SeqIO

In [5]:
def to_spmf_format(genome_sequence):
    """Encode a nucleotide sequence as one SPMF sequence line.

    ``A``, ``C``, ``G`` and ``T`` become the items ``1`` to ``4``. Items are
    separated by `` -1 `` and the line is terminated with `` -2``, e.g.
    ``'ACGT' -> '1 -1 2 -1 3 -1 4 -2'``.

    Any other symbol (lower-case letters, ambiguity codes such as ``Y``) is
    passed through unchanged; the NOSEP run captured in this notebook's
    output fails with a ``NumberFormatException`` on such input.

    Args:
        genome_sequence: Sequence to encode; converted with ``str()``.

    Returns:
        The encoded line as a string (`` -2`` alone for an empty sequence).
    """
    codes = {'A': '1', 'C': '2', 'G': '3', 'T': '4'}
    items = [codes.get(symbol, symbol) for symbol in str(genome_sequence)]

    # Join instead of appending a separator after every item and stripping
    # the last one: rstrip(' -1 ') removes any trailing run of the characters
    # ' ', '-' and '1', which also deleted trailing '1' items (A bases).
    return ' -1 '.join(items) + ' -2'

In [6]:
def from_spmf_format(filename):
    """Read an SPMF result file and decode it into ``(pattern, support)`` pairs.

    Each line is split at ``#``. In the first field the ``-1`` separators
    and all spaces are removed and the items ``1`` to ``4`` are mapped back
    to ``A``, ``C``, ``G``, ``T``. The second field is expected to look like
    ``SUP: <int>``.

    Returns:
        List of ``(pattern, support)`` tuples in file order.
    """
    # Items to nucleotides; spaces are dropped in the same pass.
    decode = str.maketrans({'1': 'A', '2': 'C', '3': 'G', '4': 'T', ' ': None})
    patterns = []

    with open(filename, 'r') as file:
        for line in file:
            fields = line.split('#')
            items, support_field = fields[0], fields[1]

            # Remove the separators first so their '1' is not decoded as 'A'.
            pattern = items.replace('-1', '').translate(decode)
            support = int(support_field.replace('SUP: ', ''))

            patterns.append((pattern, support))

    return patterns

In [7]:
import logging
import os

# The file handler cannot create missing directories itself.
os.makedirs('logs', exist_ok=True)

console = logging.StreamHandler()
console.setLevel(logging.INFO)
console.setFormatter(logging.Formatter('%(message)s'))

# basicConfig does nothing when the root logger already has handlers, which
# is the case once any earlier logging cell has run. force=True replaces
# those handlers, so this section really logs to nosep.log and the console
# output is not duplicated.
logging.basicConfig(level=logging.INFO,
                    format='%(message)s',
                    handlers=[logging.FileHandler('logs/nosep.log'), console],
                    force=True)

In [8]:
def run_nosep(full_sequence, island_sequence, start_index, end_index, k_max):
    """Mine an island with SPMF's NOSEP and keep patterns absent elsewhere.

    The island is written to ``input.txt`` in SPMF format, NOSEP is run with
    the arguments ``[4, k_max, 0, 0, 2]``, and the patterns are read back
    from ``output.txt``. Both scratch files are removed afterwards, also
    when SPMF or the parsing step fails.

    NOTE(review): the argument list is presumably (min length, max length,
    min gap, max gap, min support); confirm against the SPMF documentation.

    Args:
        full_sequence: Genome; must support slicing and ``count``.
        island_sequence: Island to mine.
        start_index: Island start within ``full_sequence`` (slice bound).
        end_index: Island end within ``full_sequence`` (slice bound).
        k_max: Second NOSEP argument.

    Returns:
        List of ``(pattern, support)`` tuples for patterns that occur
        neither before ``start_index`` nor from ``end_index`` onwards.
    """
    input_filename = 'input.txt'
    output_filename = 'output.txt'

    try:
        with open(input_filename, 'w') as input_file:
            input_file.write(to_spmf_format(island_sequence))

        spmf = Spmf("NOSEP", input_filename=input_filename, output_filename=output_filename, arguments=[4, k_max, 0, 0, 2])
        spmf.run()

        patterns = from_spmf_format(output_filename)
    finally:
        # Always clean up so a failed run leaves no stale scratch files
        # behind for the next island.
        for scratch_file in (input_filename, output_filename):
            if os.path.exists(scratch_file):
                os.remove(scratch_file)

    # The flanking regions are the same for every pattern: slice them once
    # instead of copying the genome for each of the mined patterns.
    upstream = full_sequence[:start_index]
    downstream = full_sequence[end_index:]

    return [
        (pattern, support)
        for pattern, support in patterns
        if upstream.count(pattern) + downstream.count(pattern) == 0
    ]

In [9]:
# Mine every sufficiently small island of every genome with NOSEP, keep the
# patterns that do not occur outside the island, and write one pattern file
# per island.
unique_pai_patterns  = {}
unique_cpai_patterns = {}
unique_npai_patterns = {}

k_max = 40
# Islands spanning more than this many bases are skipped instead of mined.
max_island_length = 40000

logging.info(f'Max pattern length: {k_max}')
logging.info('--------------------------')

# Island class -> result dict filled for that class.
nosep_stores = (
    ('pai', unique_pai_patterns),
    ('cpai', unique_cpai_patterns),
    ('npai', unique_npai_patterns),
)

for i, (name, data) in enumerate(h_pylori_data.items()):
    logging.info(f'Genome {i+1}/{len(h_pylori_data)}: {name}')

    for kind, store in nosep_stores:
        label = kind.upper()
        islands = data[f'{kind}_sequences']

        if len(islands) == 0:
            logging.info(f'No {label} sequences')
            continue

        store[name] = {}
        genome_sequence = data['genome_sequence']

        for j, (island_sequence, indices) in enumerate(islands):

            if indices[1] - indices[0] > max_island_length:
                logging.info(f"{label} {j+1}/{len(islands)}: island too large")
                continue

            unique_patterns = run_nosep(genome_sequence, island_sequence, indices[0], indices[1], k_max)

            # Per-island keys (same scheme as the TF-IDF partial section) so
            # genomes with several islands keep the result of each one.
            store[name][f'{kind}_sequence_{j+1}'] = island_sequence
            store[name][f'indices_{kind}_{j+1}'] = indices
            store[name][f'patterns_{kind}_{j+1}'] = unique_patterns

            # Un-numbered keys kept for existing readers; as before, they
            # hold the most recently processed island only.
            store[name][f'{kind}_sequence'] = island_sequence
            store[name]['indices'] = indices
            store[name]['patterns'] = unique_patterns

            logging.info(f"{label} {j+1}/{len(islands)}: {len(unique_patterns)} unique patterns")

            os.makedirs(f'h_pylori_paidb/{name}/patterns/nosep/{kind}/', exist_ok=True)

            # File names keep the zero-based island index, unlike the
            # one-based numbers in the log, so existing output paths stay valid.
            with open(f'h_pylori_paidb/{name}/patterns/nosep/{kind}/{kind}_{j}_patterns.txt', 'w') as file:
                for unique_pattern, frequency in unique_patterns:
                    file.write(f'{unique_pattern}: {frequency}\n')

    logging.info('--------------------------')

Max pattern length: 40
--------------------------
Genome 1/53: NC_017372
No PAI sequences


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 13971
 Total time ~ 34880 ms
 Maximum memory usage : 294.0252990722656 mb
 Calculation count 27737



CPAI 1/1: 314 unique patterns
NPAI 1/1: island too large
--------------------------
Genome 2/53: NC_011333
No PAI sequences
No CPAI sequences
NPAI 1/2: island too large


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 29246
 Total time ~ 176465 ms
 Maximum memory usage : 42.65605163574219 mb
 Calculation count 58527



NPAI 2/2: 594 unique patterns
--------------------------
Genome 3/53: NC_008086
No PAI sequences
CPAI 1/1: island too large
No NPAI sequences
--------------------------
Genome 4/53: NC_021882
No PAI sequences
CPAI 1/1: island too large
No NPAI sequences
--------------------------
Genome 5/53: NC_010698
No PAI sequences


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 28954
 Total time ~ 189008 ms
 Maximum memory usage : 42.34050750732422 mb
 Calculation count 58091



CPAI 1/1: 568 unique patterns
NPAI 1/1: island too large
--------------------------
Genome 6/53: NC_017359
No PAI sequences
No CPAI sequences


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 46308
 Total time ~ 252636 ms
 Maximum memory usage : 391.41576385498047 mb
 Calculation count 74590



NPAI 1/1: 18858 unique patterns
--------------------------
Genome 7/53: NC_017355
No PAI sequences
No CPAI sequences


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 39876
 Total time ~ 167029 ms
 Maximum memory usage : 326.35790252685547 mb
 Calculation count 61935



NPAI 1/2: 18216 unique patterns


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 106609
 Total time ~ 378097 ms
 Maximum memory usage : 292.7956771850586 mb
 Calculation count 124299



NPAI 2/2: 81965 unique patterns
--------------------------
Genome 8/53: NC_017375
No PAI sequences
CPAI 1/1: island too large
No NPAI sequences
--------------------------
Genome 9/53: NC_017739
No PAI sequences


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 13198
 Total time ~ 39231 ms
 Maximum memory usage : 237.73016357421875 mb
 Calculation count 26213



CPAI 1/1: 33 unique patterns
NPAI 1/2: island too large


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 29056
 Total time ~ 250640 ms
 Maximum memory usage : 131.96405792236328 mb
 Calculation count 58279



NPAI 2/2: 406 unique patterns
--------------------------
Genome 10/53: NC_020508
No PAI sequences


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 12799
 Total time ~ 35149 ms
 Maximum memory usage : 213.4578628540039 mb
 Calculation count 25661



CPAI 1/2: 176 unique patterns
CPAI 2/2: island too large
No NPAI sequences
--------------------------
Genome 11/53: NC_017919
No PAI sequences
No CPAI sequences


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 15120
 Total time ~ 47002 ms
 Maximum memory usage : 166.27306365966797 mb
 Calculation count 30281



NPAI 1/1: 10269 unique patterns
--------------------------
Genome 12/53: NC_017382
No PAI sequences
No CPAI sequences
NPAI 1/2: island too large


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 14192
 Total time ~ 45104 ms
 Maximum memory usage : 120.14141845703125 mb
 Calculation count 28314



NPAI 2/2: 172 unique patterns
--------------------------
Genome 13/53: NC_019563
No PAI sequences
No CPAI sequences


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 7285
 Total time ~ 11327 ms
 Maximum memory usage : 60.97782897949219 mb
 Calculation count 14973



NPAI 1/1: 58 unique patterns
--------------------------
Genome 14/53: NC_017378
No PAI sequences
No CPAI sequences
NPAI 1/1: island too large
--------------------------
Genome 15/53: NC_012973
No PAI sequences
No CPAI sequences


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 9340
 Total time ~ 17689 ms
 Maximum memory usage : 184.47708892822266 mb
 Calculation count 19272



NPAI 1/1: 115 unique patterns
--------------------------
Genome 16/53: NC_021216
No PAI sequences
CPAI 1/1: island too large
No NPAI sequences
--------------------------
Genome 17/53: NC_014560
No PAI sequences


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 14651
 Total time ~ 45997 ms
 Maximum memory usage : 265.0581588745117 mb
 Calculation count 29401



CPAI 1/1: 144 unique patterns
NPAI 1/1: island too large
--------------------------
Genome 18/53: NC_021218
No PAI sequences
No CPAI sequences


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 28853
 Total time ~ 199299 ms
 Maximum memory usage : 39.043426513671875 mb
 Calculation count 58065



NPAI 1/2: 548 unique patterns
NPAI 2/2: island too large
--------------------------
Genome 19/53: NC_017368


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 51506
 Total time ~ 296749 ms
 Maximum memory usage : 285.5534133911133 mb
 Calculation count 81123



PAI 1/1: 22999 unique patterns
CPAI 1/1: island too large
No NPAI sequences
--------------------------
Genome 20/53: NC_014256
PAI 1/1: island too large


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 17635
 Total time ~ 60832 ms
 Maximum memory usage : 78.42208099365234 mb
 Calculation count 35360



CPAI 1/1: 189 unique patterns
NPAI 1/1: island too large
--------------------------
Genome 21/53: NC_020509
No PAI sequences
No CPAI sequences
NPAI 1/1: island too large
--------------------------
Genome 22/53: NC_022130
No PAI sequences
No CPAI sequences


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 14929
 Total time ~ 33492 ms
 Maximum memory usage : 89.73841857910156 mb
 Calculation count 27178



NPAI 1/2: 3097 unique patterns


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 29229
 Total time ~ 178529 ms
 Maximum memory usage : 368.30967712402344 mb
 Calculation count 58498



NPAI 2/2: 708 unique patterns
--------------------------
Genome 23/53: NC_018939
No PAI sequences
java.lang.NumberFormatException: For input string: "Y"
	at java.base/java.lang.NumberFormatException.forInputString(NumberFormatException.java:67)
	at java.base/java.lang.Integer.parseInt(Integer.java:662)
	at java.base/java.lang.Integer.parseInt(Integer.java:778)
	at ca.pfv.spmf.algorithms.sequentialpatterns.nosep.AlgoNOSEP.readInputFile(AlgoNOSEP.java:809)
	at ca.pfv.spmf.algorithms.sequentialpatterns.nosep.AlgoNOSEP.runAlgorithm(AlgoNOSEP.java:878)
	at ca.pfv.spmf.algorithmmanager.descriptions.DescriptionAlgoNOSEP.runAlgorithm(DescriptionAlgoNOSEP.java:52)
	at ca.pfv.spmf.gui.CommandProcessor.runAlgorithm(CommandProcessor.java:265)
	at ca.pfv.spmf.gui.Main.processCommandLineArguments(Main.java:151)
	at ca.pfv.spmf.gui.Main.main(Main.java:64)
CPAI 1/2: 0 unique patterns
CPAI 2/2: island too large
No NPAI sequences
--------------------------
Genome 24/53: NC_017741
No PAI sequences


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 0
 Total time ~ 9 ms
 Maximum memory usage : 10.952568054199219 mb
 Calculation count 0

>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 13572
 Total time ~ 34770 ms
 Maximum memory usage : 33.820091247558594 mb
 Calculation count 26862



CPAI 1/3: 51 unique patterns
CPAI 2/3: island too large


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 28584
 Total time ~ 169700 ms
 Maximum memory usage : 176.6141128540039 mb
 Calculation count 57265



CPAI 3/3: 367 unique patterns
No NPAI sequences
--------------------------
Genome 25/53: NC_018937
No PAI sequences
java.lang.NumberFormatException: For input string: "Y"
	at java.base/java.lang.NumberFormatException.forInputString(NumberFormatException.java:67)
	at java.base/java.lang.Integer.parseInt(Integer.java:662)
	at java.base/java.lang.Integer.parseInt(Integer.java:778)
	at ca.pfv.spmf.algorithms.sequentialpatterns.nosep.AlgoNOSEP.readInputFile(AlgoNOSEP.java:809)
	at ca.pfv.spmf.algorithms.sequentialpatterns.nosep.AlgoNOSEP.runAlgorithm(AlgoNOSEP.java:878)
	at ca.pfv.spmf.algorithmmanager.descriptions.DescriptionAlgoNOSEP.runAlgorithm(DescriptionAlgoNOSEP.java:52)
	at ca.pfv.spmf.gui.CommandProcessor.runAlgorithm(CommandProcessor.java:265)
	at ca.pfv.spmf.gui.Main.processCommandLineArguments(Main.java:151)
	at ca.pfv.spmf.gui.Main.main(Main.java:64)
CPAI 1/2: 0 unique patterns
CPAI 2/2: island too large
No NPAI sequences
--------------------------
Genome 26/53: NC_000921


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 0
 Total time ~ 9 ms
 Maximum memory usage : 10.952568054199219 mb
 Calculation count 0

>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 47737
 Total time ~ 13352082 ms
 Maximum memory usage : 196.75147247314453 mb
 Calculation count 78630



PAI 1/1: 17978 unique patterns
No CPAI sequences
NPAI 1/1: island too large
--------------------------
Genome 27/53: NC_017371
No PAI sequences


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 25457
 Total time ~ 129126 ms
 Maximum memory usage : 348.8876419067383 mb
 Calculation count 51172



CPAI 1/1: 707 unique patterns
NPAI 1/1: island too large
--------------------------
Genome 28/53: NC_019560
No PAI sequences


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 12561
 Total time ~ 30257 ms
 Maximum memory usage : 199.0332489013672 mb
 Calculation count 24973



CPAI 1/1: 45 unique patterns


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 6553
 Total time ~ 8153 ms
 Maximum memory usage : 40.442726135253906 mb
 Calculation count 13485



NPAI 1/2: 34 unique patterns


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 14595
 Total time ~ 40415 ms
 Maximum memory usage : 268.6669464111328 mb
 Calculation count 29057



NPAI 2/2: 171 unique patterns
--------------------------
Genome 29/53: NC_017358
No PAI sequences
No CPAI sequences


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 28591
 Total time ~ 167196 ms
 Maximum memory usage : 247.0511703491211 mb
 Calculation count 57280



NPAI 1/2: 537 unique patterns
NPAI 2/2: island too large
--------------------------
Genome 30/53: NC_021217
No PAI sequences
No CPAI sequences
NPAI 1/1: island too large
--------------------------
Genome 31/53: NC_017733
No PAI sequences
CPAI 1/1: island too large
No NPAI sequences
--------------------------
Genome 32/53: NC_017926
No PAI sequences


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 23689
 Total time ~ 276966 ms
 Maximum memory usage : 240.13111114501953 mb
 Calculation count 47607



CPAI 1/2: 278 unique patterns
CPAI 2/2: island too large
No NPAI sequences
--------------------------
Genome 33/53: NC_018938
No PAI sequences
java.lang.NumberFormatException: For input string: "Y"
	at java.base/java.lang.NumberFormatException.forInputString(NumberFormatException.java:67)
	at java.base/java.lang.Integer.parseInt(Integer.java:662)
	at java.base/java.lang.Integer.parseInt(Integer.java:778)
	at ca.pfv.spmf.algorithms.sequentialpatterns.nosep.AlgoNOSEP.readInputFile(AlgoNOSEP.java:809)
	at ca.pfv.spmf.algorithms.sequentialpatterns.nosep.AlgoNOSEP.runAlgorithm(AlgoNOSEP.java:878)
	at ca.pfv.spmf.algorithmmanager.descriptions.DescriptionAlgoNOSEP.runAlgorithm(DescriptionAlgoNOSEP.java:52)
	at ca.pfv.spmf.gui.CommandProcessor.runAlgorithm(CommandProcessor.java:265)
	at ca.pfv.spmf.gui.Main.processCommandLineArguments(Main.java:151)
	at ca.pfv.spmf.gui.Main.main(Main.java:64)
CPAI 1/2: 0 unique patterns
CPAI 2/2: island too large
No NPAI sequences
--------------------------
Ge

>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 0
 Total time ~ 33 ms
 Maximum memory usage : 10.952568054199219 mb
 Calculation count 0

>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 58318
 Total time ~ 958458 ms
 Maximum memory usage : 175.55028533935547 mb
 Calculation count 87687



PAI 1/1: 29601 unique patterns
CPAI 1/1: island too large
No NPAI sequences
--------------------------
Genome 36/53: NC_017740
No PAI sequences


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 28993
 Total time ~ 173417 ms
 Maximum memory usage : 243.3825225830078 mb
 Calculation count 58136



CPAI 1/1: 574 unique patterns
NPAI 1/1: island too large
--------------------------
Genome 37/53: NC_000915
java.lang.NumberFormatException: For input string: "K"
	at java.base/java.lang.NumberFormatException.forInputString(NumberFormatException.java:67)
	at java.base/java.lang.Integer.parseInt(Integer.java:662)
	at java.base/java.lang.Integer.parseInt(Integer.java:778)
	at ca.pfv.spmf.algorithms.sequentialpatterns.nosep.AlgoNOSEP.readInputFile(AlgoNOSEP.java:809)
	at ca.pfv.spmf.algorithms.sequentialpatterns.nosep.AlgoNOSEP.runAlgorithm(AlgoNOSEP.java:878)
	at ca.pfv.spmf.algorithmmanager.descriptions.DescriptionAlgoNOSEP.runAlgorithm(DescriptionAlgoNOSEP.java:52)
	at ca.pfv.spmf.gui.CommandProcessor.runAlgorithm(CommandProcessor.java:265)
	at ca.pfv.spmf.gui.Main.processCommandLineArguments(Main.java:151)
	at ca.pfv.spmf.gui.Main.main(Main.java:64)
PAI 1/1: 0 unique patterns
java.lang.NumberFormatException: For input string: "Y"
	at java.base/java.lang.NumberFormatException.forInputS

>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 0
 Total time ~ 25 ms
 Maximum memory usage : 14.599899291992188 mb
 Calculation count 0

>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 0
 Total time ~ 9 ms
 Maximum memory usage : 10.952568054199219 mb
 Calculation count 0

>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 22144
 Total time ~ 97453 ms
 Maximum memory usage : 173.3035888671875 mb
 Calculation count 44501



CPAI 1/1: 307 unique patterns
NPAI 1/1: island too large
--------------------------
Genome 40/53: NC_017367


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 47917
 Total time ~ 287666 ms
 Maximum memory usage : 55.83085632324219 mb
 Calculation count 77450



PAI 1/1: 19282 unique patterns


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 22377
 Total time ~ 116151 ms
 Maximum memory usage : 191.81851959228516 mb
 Calculation count 44887



CPAI 1/2: 317 unique patterns
CPAI 2/2: island too large
No NPAI sequences
--------------------------
Genome 41/53: NC_014555
No PAI sequences
CPAI 1/2: island too large
CPAI 2/2: island too large
No NPAI sequences
--------------------------
Genome 42/53: NC_017379
No PAI sequences
No CPAI sequences
NPAI 1/2: island too large


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 20214
 Total time ~ 90437 ms
 Maximum memory usage : 189.5379867553711 mb
 Calculation count 39945



NPAI 2/2: 469 unique patterns
--------------------------
Genome 43/53: NC_017363
No PAI sequences
No CPAI sequences


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 9298
 Total time ~ 20304 ms
 Maximum memory usage : 71.9657974243164 mb
 Calculation count 18817



NPAI 1/1: 5071 unique patterns
--------------------------
Genome 44/53: NC_011498
PAI 1/1: island too large
No CPAI sequences
NPAI 1/2: island too large
NPAI 2/2: island too large
--------------------------
Genome 45/53: NC_017357
No PAI sequences
No CPAI sequences
NPAI 1/1: island too large
--------------------------
Genome 46/53: NC_017742
No PAI sequences
No CPAI sequences
NPAI 1/1: island too large
--------------------------
Genome 47/53: NC_017361
No PAI sequences


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 28956
 Total time ~ 196069 ms
 Maximum memory usage : 144.9538116455078 mb
 Calculation count 58126



CPAI 1/2: 494 unique patterns
CPAI 2/2: island too large


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 9094
 Total time ~ 17542 ms
 Maximum memory usage : 214.18203735351562 mb
 Calculation count 18565



NPAI 1/1: 66 unique patterns
--------------------------
Genome 48/53: NC_017381
No PAI sequences
No CPAI sequences
NPAI 1/1: island too large
--------------------------
Genome 49/53: NC_017362
No PAI sequences
No CPAI sequences
NPAI 1/1: island too large
--------------------------
Genome 50/53: NC_021215
No PAI sequences
CPAI 1/1: island too large
No NPAI sequences
--------------------------
Genome 51/53: NC_017360
No PAI sequences
CPAI 1/1: island too large
No NPAI sequences
--------------------------
Genome 52/53: NC_017365


>/home/maja/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 58019
 Total time ~ 349504 ms
 Maximum memory usage : 159.26554107666016 mb
 Calculation count 88015



PAI 1/1: 28634 unique patterns
CPAI 1/1: island too large
No NPAI sequences
--------------------------
Genome 53/53: NC_017354
No PAI sequences
No CPAI sequences
NPAI 1/1: island too large
--------------------------


In [18]:
# Persist the three pattern collections gathered above (PAI, CPAI and NPAI)
# as JSON files named <kind>_patterns.json inside ./json/nosep/.
nosep_json_dir = './json/nosep'

# open(..., 'w') does not create missing parent directories; create them up
# front so a fresh checkout does not fail with FileNotFoundError at this point.
os.makedirs(nosep_json_dir, exist_ok=True)

for nosep_kind, nosep_patterns in (
    ('pai', unique_pai_patterns),
    ('cpai', unique_cpai_patterns),
    ('npai', unique_npai_patterns),
):
    with open(f'{nosep_json_dir}/{nosep_kind}_patterns.json', 'w') as file:
        json.dump(nosep_patterns, file)