In [1]:
import os
from Bio import SeqIO

In [2]:
def read_data():
    base_directory = './e_coli_paidb/'
    data = {}
    
    for name in os.listdir(base_directory):
        dir_path = os.path.join(base_directory, name)

        if os.path.isdir(dir_path):
            fasta_file = os.path.join(dir_path, f"{name}.fasta")
            pai_file   = os.path.join(dir_path, f"{name}.pai")
            cpai_file  = os.path.join(dir_path, f"{name}.cpai")
            npai_file  = os.path.join(dir_path, f"{name}.npai")

            data[name] = {}
            genome_sequence = next(SeqIO.parse(fasta_file, 'fasta'))
            data[name]['genome_sequence'] = genome_sequence

            with open(pai_file, 'r') as file:
                data[name]['pai_sequences'] = []

                while True:
                    line = file.readline()
                    if not line:
                        break

                    line = line.split()
                    pai_start_index = int(line[0])
                    pai_end_index = int(line[1])

                    pai_sequence = genome_sequence.seq[pai_start_index:pai_end_index]

                    data[name]['pai_sequences'].append((pai_sequence, (pai_start_index, pai_end_index)))

            with open(cpai_file, 'r') as file:
                data[name]['cpai_sequences'] = []

                while True:
                    line = file.readline()
                    if not line:
                        break

                    line = line.split()
                    cpai_start_index = int(line[0])
                    cpai_end_index = int(line[1])

                    cpai_sequence = genome_sequence.seq[cpai_start_index:cpai_end_index]

                    data[name]['cpai_sequences'].append((cpai_sequence, (cpai_start_index, cpai_end_index)))

            with open(npai_file, 'r') as file:
                data[name]['npai_sequences'] = []

                while True:
                    line = file.readline()
                    if not line:
                        break

                    line = line.split()
                    npai_start_index = int(line[0])
                    npai_end_index = int(line[1])

                    npai_sequence = genome_sequence.seq[npai_start_index:npai_end_index]

                    data[name]['npai_sequences'].append((npai_sequence, (npai_start_index, npai_end_index)))

    return data

### TF-IDF

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf(sequence, k):
    kmers = get_kmers(str(sequence), k)
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform([kmers])
    tfidf_matrix = X.toarray()
    return tfidf_matrix, vectorizer

def get_kmers(sequence, k):
    kmers = [sequence[i:i+k] for i in range(len(sequence) - k + 1)]
    return ' '.join(kmers)

def get_kmers_map(tfidf_matrix, vectorizer, sequence, top_n=20):
    feature_names = vectorizer.get_feature_names_out()
    feature_scores = tfidf_matrix.flatten()
    
    top_kmers_indices = feature_scores.argsort()[-top_n:][::-1]
    top_kmers = [(feature_names[i], feature_scores[i]) for i in top_kmers_indices]

    mapped_kmers = {}
    for kmer, score in top_kmers:
        kmer = kmer.upper()
        count = sequence.count(kmer)
        mapped_kmers[kmer] = {'score': score, 'count': count}
        
    return mapped_kmers

def find_unique_kmers(full_sequence, start_index, end_index, significant_kmers, unique_kmers):
    for kmer, details in significant_kmers.items():
        left_occurrences = full_sequence[:start_index].count(kmer)
        right_occurrences = full_sequence[end_index:].count(kmer)
        occurrences = left_occurrences + right_occurrences
        if occurrences == 0:
            unique_kmers.append((kmer, details['count']))

In [3]:
def get_unique_patterns(full_sequence, island_sequence, start_index, end_index, k_max):
    
    unique_kmers = []
    
    for k in range(4, k_max):
        tfidf_matrix, vectorizer = tfidf(island_sequence, k)
        kmers = get_kmers_map(tfidf_matrix, vectorizer, island_sequence)
        find_unique_kmers(full_sequence, start_index, end_index, kmers, unique_kmers)
        
    return unique_kmers

In [39]:
e_coli_data = read_data()

In [5]:
import logging

logging.basicConfig(filename='tfidf.log', level=logging.INFO,
                    format='%(message)s')

console = logging.StreamHandler()
console.setLevel(logging.INFO)
console.setFormatter(logging.Formatter('%(message)s'))

logging.getLogger('').addHandler(console)

In [6]:
unique_pai_patterns  = {}
unique_cpai_patterns = {}
unique_npai_patterns = {}

k_max = 100
logging.info(f'Max pattern length: {k_max}')
logging.info('--------------------------')

for i, (name, data) in enumerate(e_coli_data.items()):
    logging.info(f'Genome {i+1}/{len(e_coli_data)}: {name}')
    
    if len(data['pai_sequences']) != 0:
        unique_pai_patterns[name] = {}
        genome_sequence = data['genome_sequence']
        for j, (pai_sequence, indices) in enumerate(data['pai_sequences']):
            unique_patterns = get_unique_patterns(genome_sequence, pai_sequence, indices[0], indices[1], k_max)

            unique_pai_patterns[name]['pai_sequence'] = pai_sequence
            unique_pai_patterns[name]['indices'] = indices
            unique_pai_patterns[name]['patterns'] = unique_patterns

            logging.info(f"PAI {j+1}/{len(data['pai_sequences'])}: {len(unique_patterns)} unique patterns")
            
            os.makedirs(f'e_coli_paidb/{name}/patterns/pai/', exist_ok=True)
            
            with open(f'e_coli_paidb/{name}/patterns/pai/pai_{j}_patterns.txt', 'w') as file:
                for unique_pattern, frequency in unique_patterns:
                    file.write(f'{unique_pattern}: {frequency}\n')
                        
    else:
        logging.info('No PAI sequences')
    
    if len(data['cpai_sequences']) != 0:    
        unique_cpai_patterns[name] = {}

        genome_sequence = data['genome_sequence']
        for j, (cpai_sequence, indices) in enumerate(data['cpai_sequences']):
            unique_patterns = get_unique_patterns(genome_sequence, cpai_sequence, indices[0], indices[1], k_max)

            unique_cpai_patterns[name]['cpai_sequence'] = cpai_sequence
            unique_cpai_patterns[name]['indices'] = indices
            unique_cpai_patterns[name]['patterns'] = unique_patterns
            
            logging.info(f"CPAI {j+1}/{len(data['cpai_sequences'])}: {len(unique_patterns)} unique patterns")
            
            os.makedirs(f'e_coli_paidb/{name}/patterns/cpai/', exist_ok=True)
            
            with open(f'e_coli_paidb/{name}/patterns/cpai/cpai_{j}_patterns.txt', 'w') as file:
                for unique_pattern, frequency in unique_patterns:
                    file.write(f'{unique_pattern}: {frequency}\n')
            
    else:
        logging.info('No CPAI sequences')
        
    if len(data['npai_sequences']) != 0:
        unique_npai_patterns[name] = {}
        genome_sequence = data['genome_sequence']
        for j, (npai_sequence, indices) in enumerate(data['npai_sequences']):
            unique_patterns = get_unique_patterns(genome_sequence, npai_sequence, indices[0], indices[1], k_max)

            unique_npai_patterns[name]['npai_sequence'] = npai_sequence
            unique_npai_patterns[name]['indices'] = indices
            unique_npai_patterns[name]['patterns'] = unique_patterns

            logging.info(f"NPAI {j+1}/{len(data['npai_sequences'])}: {len(unique_patterns)} unique patterns")
            
            os.makedirs(f'e_coli_paidb/{name}/patterns/npai/', exist_ok=True)
            
            with open(f'e_coli_paidb/{name}/patterns/npai/npai_{j}_patterns.txt', 'w') as file:
                for unique_pattern, frequency in unique_patterns:
                    file.write(f'{unique_pattern}: {frequency}\n')
    
    else:
        logging.info('No NPAI sequences')
    logging.info('--------------------------')

Max pattern length: 100
--------------------------
Genome 1/90: NC_010468
No PAI sequences
CPAI 1/3: 1742 unique patterns
CPAI 2/3: 1735 unique patterns
CPAI 3/3: 631 unique patterns
NPAI 1/2: 1743 unique patterns
NPAI 2/2: 1746 unique patterns
--------------------------
Genome 2/90: NC_011748
No PAI sequences
CPAI 1/10: 641 unique patterns
CPAI 2/10: 1760 unique patterns
CPAI 3/10: 1751 unique patterns
CPAI 4/10: 1749 unique patterns
CPAI 5/10: 1619 unique patterns
CPAI 6/10: 581 unique patterns
CPAI 7/10: 36 unique patterns
CPAI 8/10: 1638 unique patterns
CPAI 9/10: 1648 unique patterns
CPAI 10/10: 1066 unique patterns
NPAI 1/1: 1740 unique patterns
--------------------------
Genome 3/90: NC_011751
PAI 1/1: 1454 unique patterns
CPAI 1/7: 657 unique patterns
CPAI 2/7: 1754 unique patterns
CPAI 3/7: 1738 unique patterns
CPAI 4/7: 1745 unique patterns
CPAI 5/7: 1753 unique patterns
CPAI 6/7: 1748 unique patterns
CPAI 7/7: 1399 unique patterns
NPAI 1/2: 1748 unique patterns
NPAI 2/2: 173

CPAI 3/8: 1752 unique patterns
CPAI 4/8: 1749 unique patterns
CPAI 5/8: 1744 unique patterns
CPAI 6/8: 1736 unique patterns
CPAI 7/8: 1743 unique patterns
CPAI 8/8: 1746 unique patterns
NPAI 1/3: 1749 unique patterns
NPAI 2/3: 1749 unique patterns
NPAI 3/3: 1750 unique patterns
--------------------------
Genome 34/90: NC_009800
No PAI sequences
CPAI 1/4: 654 unique patterns
CPAI 2/4: 1768 unique patterns
CPAI 3/4: 1407 unique patterns
CPAI 4/4: 1736 unique patterns
NPAI 1/1: 1688 unique patterns
--------------------------
Genome 35/90: NC_004431
PAI 1/2: 14 unique patterns
PAI 2/2: 4 unique patterns
CPAI 1/12: 664 unique patterns
CPAI 2/12: 1716 unique patterns
CPAI 3/12: 1744 unique patterns
CPAI 4/12: 1753 unique patterns
CPAI 5/12: 10 unique patterns
CPAI 6/12: 1742 unique patterns
CPAI 7/12: 1678 unique patterns
CPAI 8/12: 1626 unique patterns
CPAI 9/12: 1753 unique patterns
CPAI 10/12: 1748 unique patterns
CPAI 11/12: 1633 unique patterns
CPAI 12/12: 6 unique patterns
NPAI 1/2: 17

--------------------------
Genome 66/90: NC_012947
No PAI sequences
CPAI 1/5: 1756 unique patterns
CPAI 2/5: 1767 unique patterns
CPAI 3/5: 1742 unique patterns
CPAI 4/5: 1743 unique patterns
CPAI 5/5: 1753 unique patterns
NPAI 1/2: 0 unique patterns
NPAI 2/2: 1684 unique patterns
--------------------------
Genome 67/90: NC_011747
No PAI sequences
CPAI 1/3: 1803 unique patterns
CPAI 2/3: 1759 unique patterns
CPAI 3/3: 1803 unique patterns
NPAI 1/1: 1813 unique patterns
--------------------------
Genome 68/90: NC_013654
No PAI sequences
CPAI 1/4: 1746 unique patterns
CPAI 2/4: 1413 unique patterns
CPAI 3/4: 1737 unique patterns
CPAI 4/4: 1394 unique patterns
NPAI 1/2: 1745 unique patterns
NPAI 2/2: 1743 unique patterns
--------------------------
Genome 69/90: NC_011745
No PAI sequences
CPAI 1/11: 659 unique patterns
CPAI 2/11: 1561 unique patterns
CPAI 3/11: 1739 unique patterns
CPAI 4/11: 1528 unique patterns
CPAI 5/11: 1410 unique patterns
CPAI 6/11: 1751 unique patterns
CPAI 7/11: 13

In [7]:
unique_pai_patterns

{'NC_011751': {'pai_sequence': Seq('CCCGGATAGCTCAGTCGGTAGAGCAGGGGATTGAAAATCCCCGTGTCCTTGGTT...CAC'),
  'indices': (3445924, 3503263),
  'patterns': [('ACATTGGCATT', 3),
   ('GCATCAGAATA', 3),
   ('CAGATTCCCCCC', 2),
   ('TTCCCTGCCCGG', 2),
   ('AGCGGCACGGAT', 2),
   ('AGTGTCCTGCTG', 2),
   ('ATAATCCCTGTT', 2),
   ('GCCAAGGCCGGC', 2),
   ('TTCAAGAATTTTT', 2),
   ('TTTCCAGCATCAT', 2),
   ('TTCCATCAATAAC', 2),
   ('CCAGCGGCACGGA', 2),
   ('GCCAAGGCCGGCG', 2),
   ('ATTTCAGAATATA', 2),
   ('TCCACATTCTGTA', 2),
   ('GGGCTTCAGCCTG', 2),
   ('TCCACATCATCCG', 2),
   ('CAGGCTGATCAAA', 2),
   ('TTTCCGCTGTTTAC', 2),
   ('AGGGCTTCAGCCTG', 2),
   ('AATAATCCCTGTTG', 2),
   ('CAGGCTGATCAAATG', 2),
   ('TAAGAATAACATTTC', 2),
   ('TTTCTGCTCATCCAG', 2),
   ('AATAATCCCTGTTGA', 2),
   ('TGGCATCAGAATACTC', 2),
   ('CATGCTCATTGAAAAAATACT', 1),
   ('CATGGAACATCGTTCAGTGATT', 1),
   ('CATGCGTCACACCTCGTCCTTT', 1),
   ('CATGGACTCATTAACACCCTTT', 1),
   ('CATGATGTGACATTAAGATCAGC', 1),
   ('CATGCTGCCCGGAATGTGGAGGC', 

In [23]:
import json

with open('pai_patterns.json', 'w') as file:
    json.dump(unique_pai_patterns, file)
    
with open('cpai_patterns.json', 'w') as file:
    json.dump(unique_cpai_patterns, file)
    
with open('npai_patterns.json', 'w') as file:
    json.dump(unique_npai_patterns, file)

### NOSEP

In [3]:
from spmf import Spmf
import os
from Bio import SeqIO

In [4]:
def to_spmf_format(genome_sequence):
    seq = str(genome_sequence)
    
    seq = seq.replace('A', '1')
    seq = seq.replace('C', '2')
    seq = seq.replace('G', '3')
    seq = seq.replace('T', '4')

    seq = ''.join(c + ' -1 ' for c in seq).rstrip(' -1 ') + ' -2'
    
    return seq

In [5]:
def from_spmf_format(filename):
    patterns = []
    
    with open(filename, 'r') as file:
        for line in file:
            lst = line.split('#')
            
            pattern = lst[0]
            support = lst[1]
            
            pattern = pattern.replace('-1', '')
            pattern = pattern.replace('1', 'A')
            pattern = pattern.replace('2', 'C')
            pattern = pattern.replace('3', 'G')
            pattern = pattern.replace('4', 'T')
            pattern = pattern.replace(' ', '')
            
            support = int(support.replace('SUP: ', ''))
            patterns.append((pattern, support))
            
    return patterns

In [6]:
import logging

logging.basicConfig(filename='logs/nosep.log', level=logging.INFO,
                    format='%(message)s')

console = logging.StreamHandler()
console.setLevel(logging.INFO)
console.setFormatter(logging.Formatter('%(message)s'))

logging.getLogger('').addHandler(console)

In [7]:
e_coli_data = read_data()

In [8]:
def run_nosep(full_sequence, island_sequence, start_index, end_index, k_max):
    
    unique_patterns = []
    
    input_filename = 'input.txt'
    output_filename = 'output.txt'
    
    with open(input_filename, 'w') as input_file:
        input_file.write(to_spmf_format(island_sequence))
    
    spmf = Spmf("NOSEP", input_filename=input_filename, output_filename=output_filename, arguments=[4, k_max, 0, 0, 2])
    spmf.run()
    
    patterns = from_spmf_format(output_filename)
    
    os.remove(input_filename)
    os.remove(output_filename)
    
    for pattern, support in patterns:
        left_occurrences = full_sequence[:start_index].count(pattern)
        right_occurrences = full_sequence[end_index:].count(pattern)
        occurrences = left_occurrences + right_occurrences
        if occurrences == 0:
            unique_patterns.append((pattern, support))
            
    return unique_patterns

In [9]:
unique_pai_patterns  = {}
unique_cpai_patterns = {}
unique_npai_patterns = {}

k_max = 100
logging.info(f'Max pattern length: {k_max}')
logging.info('--------------------------')

for i, (name, data) in enumerate(e_coli_data.items()):
    logging.info(f'Genome {i+1}/{len(e_coli_data)}: {name}')
    
    if len(data['pai_sequences']) != 0:
        unique_pai_patterns[name] = {}
        genome_sequence = data['genome_sequence']
        for j, (pai_sequence, indices) in enumerate(data['pai_sequences']):
            unique_patterns = run_nosep(genome_sequence, pai_sequence, indices[0], indices[1], k_max)

            unique_pai_patterns[name]['pai_sequence'] = pai_sequence
            unique_pai_patterns[name]['indices'] = indices
            unique_pai_patterns[name]['patterns'] = unique_patterns

            logging.info(f"PAI {j+1}/{len(data['pai_sequences'])}: {len(unique_patterns)} unique patterns")
            
            os.makedirs(f'e_coli_paidb/{name}/patterns/nosep/pai/', exist_ok=True)
            
            with open(f'e_coli_paidb/{name}/patterns/nosep/pai/pai_{j}_patterns.txt', 'w') as file:
                for unique_pattern, frequency in unique_patterns:
                    file.write(f'{unique_pattern}: {frequency}\n')
                        
    else:
        logging.info('No PAI sequences')
    
    if len(data['cpai_sequences']) != 0:    
        unique_cpai_patterns[name] = {}

        genome_sequence = data['genome_sequence']
        for j, (cpai_sequence, indices) in enumerate(data['cpai_sequences']):
            unique_patterns = run_nosep(genome_sequence, cpai_sequence, indices[0], indices[1], k_max)

            unique_cpai_patterns[name]['cpai_sequence'] = cpai_sequence
            unique_cpai_patterns[name]['indices'] = indices
            unique_cpai_patterns[name]['patterns'] = unique_patterns
            
            logging.info(f"CPAI {j+1}/{len(data['cpai_sequences'])}: {len(unique_patterns)} unique patterns")
            
            os.makedirs(f'e_coli_paidb/{name}/patterns/nosep/cpai/', exist_ok=True)
            
            with open(f'e_coli_paidb/{name}/patterns/nosep/cpai/cpai_{j}_patterns.txt', 'w') as file:
                for unique_pattern, frequency in unique_patterns:
                    file.write(f'{unique_pattern}: {frequency}\n')
            
    else:
        logging.info('No CPAI sequences')
        
    if len(data['npai_sequences']) != 0:
        unique_npai_patterns[name] = {}
        genome_sequence = data['genome_sequence']
        for j, (npai_sequence, indices) in enumerate(data['npai_sequences']):
            unique_patterns = run_nosep(genome_sequence, npai_sequence, indices[0], indices[1], k_max)

            unique_npai_patterns[name]['npai_sequence'] = npai_sequence
            unique_npai_patterns[name]['indices'] = indices
            unique_npai_patterns[name]['patterns'] = unique_patterns

            logging.info(f"NPAI {j+1}/{len(data['npai_sequences'])}: {len(unique_patterns)} unique patterns")
            
            os.makedirs(f'e_coli_paidb/{name}/patterns/nosep/npai/', exist_ok=True)
            
            with open(f'e_coli_paidb/{name}/patterns/nosep/npai/npai_{j}_patterns.txt', 'w') as file:
                for unique_pattern, frequency in unique_patterns:
                    file.write(f'{unique_pattern}: {frequency}\n')
    
    else:
        logging.info('No NPAI sequences')
    logging.info('--------------------------')

Max pattern length: 100
--------------------------
Genome 1/90: NC_010468
No PAI sequences


>/home/veljko/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 6452
 Total time ~ 11791 ms
 Maximum memory usage : 10.53265380859375 mb
 Calculation count 13811



CPAI 1/3: 4 unique patterns


>/home/veljko/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 14619
 Total time ~ 64137 ms
 Maximum memory usage : 143.85284423828125 mb
 Calculation count 31685



CPAI 2/3: 70 unique patterns


>/home/veljko/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 15750
 Total time ~ 58746 ms
 Maximum memory usage : 53.34174346923828 mb
 Calculation count 31715



CPAI 3/3: 96 unique patterns


>/home/veljko/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 8534
 Total time ~ 22462 ms
 Maximum memory usage : 91.68647766113281 mb
 Calculation count 18567



NPAI 1/2: 15 unique patterns


>/home/veljko/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 11613
 Total time ~ 40709 ms
 Maximum memory usage : 138.09127807617188 mb
 Calculation count 25389



NPAI 2/2: 17 unique patterns
--------------------------
Genome 2/90: NC_011748
No PAI sequences


>/home/veljko/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 29268
 Total time ~ 256172 ms
 Maximum memory usage : 132.1083984375 mb
 Calculation count 60595



CPAI 1/10: 126 unique patterns


>/home/veljko/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 62412
 Total time ~ 1477344 ms
 Maximum memory usage : 85.3110580444336 mb
 Calculation count 132017



CPAI 2/10: 4084 unique patterns


>/home/veljko/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 11665
 Total time ~ 42953 ms
 Maximum memory usage : 163.51680755615234 mb
 Calculation count 25137



CPAI 3/10: 61 unique patterns


>/home/veljko/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 6309
 Total time ~ 13261 ms
 Maximum memory usage : 103.59986114501953 mb
 Calculation count 13748



CPAI 4/10: 10 unique patterns


>/home/veljko/.local/lib/python3.10/site-packages/spmf/spmf.jar
 Number of patterns found: 39970
 Total time ~ 534894 ms
 Maximum memory usage : 206.57483673095703 mb
 Calculation count 86387



CPAI 5/10: 198 unique patterns


KeyboardInterrupt: 