### Compute the Number of Times a Pattern Appears in a Text

https://rosalind.info/problems/ba1a/

In [7]:
def PatternCount(text: str, pattern: str) -> int:
    """
    Count the occurrences of a pattern in a text, overlapping ones included.
    Args:
        text (str): The text that is scanned.
        pattern (str): The substring whose occurrences are counted.
    Returns:
        int: How many start positions in text begin a copy of pattern.
    """
    width = len(pattern)
    # Last index at which a full-width slice still fits inside text.
    last_start = len(text) - width

    return sum(
        text[start:start + width] == pattern
        for start in range(last_start + 1)
    )

In [17]:
# The first line of the dataset is read as the text, the second as the pattern.
file_path = "data/rosalind_ba1a.txt"
with open(file_path, 'r') as file:
    text, pattern = file.readline().strip(), file.readline().strip()
# Both inputs are already in memory, so the count is printed after the
# file has been closed.
print(PatternCount(text, pattern))

29


### Find the Most Frequent Words in a String

https://rosalind.info/problems/ba1b/

In [14]:
def FrequentWords(Text: str, k: int) -> set:
    """
    Find the most frequent k-mers in a given text.
    Args:
        Text (str): The text to search within.
        k (int): The length of the k-mers to find.
    Returns:
        set: A set of the most frequent k-mers. The set is empty when Text
            contains no k-mer at all (k is longer than Text).
    """
    from collections import Counter

    # One entry per distinct k-mer, counting every (overlapping) window.
    freq_map = Counter(Text[i:i + k] for i in range(len(Text) - k + 1))

    # default=0 keeps max() from raising ValueError when there was nothing
    # to count; the comprehension below then yields an empty set.
    max_count = max(freq_map.values(), default=0)

    return {pattern for pattern, count in freq_map.items() if count == max_count}

In [18]:
# The first line of the dataset is read as the text, the second as the integer k.
file_path = "data/rosalind_ba1b.txt"
with open(file_path, 'r') as file:
    text = file.readline().strip()
    k = int(file.readline().strip())
# The file is no longer needed once both inputs are parsed.
result = FrequentWords(text, k)
print(" ".join(result))

CCGGCTTTCAAACT GCCGGCTTTCAAAC CGGCTTTCAAACTT


### Find the Reverse Complement of a String

https://rosalind.info/problems/ba1c/

In [18]:
def ReverseComplement(pattern: str) -> str:
    """
    Build the reverse complement of a DNA string.
    Args:
        pattern (str): DNA string made of the characters A, C, G and T.
    Returns:
        str: The input read backwards with every base swapped for its
            partner (A<->T, C<->G).
    """
    pairing = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}

    strand = []
    # Walk the input from its last base to its first; any character that
    # is not a key of the pairing table raises KeyError.
    for base in pattern[::-1]:
        strand.append(pairing[base])

    return ''.join(strand)

In [21]:
# Only the first line of the dataset is read; it is used as the DNA string.
file_path = "data/rosalind_ba1c.txt"
with open(file_path, 'r') as file:
    pattern = file.readline().strip()
# Printed after the file has been closed.
print(ReverseComplement(pattern))

ACCTCCCGGTATTCAGGGGCTGGCTTTTCACTTTCTACACTGCGTTGGATGAAGCTCGTATTGCTACGGATCGGCACGAGTTGTCACGTAATTTGCCCAATATTGCCGTTGCAAGATTTCACGCTAAGGCGTTGTAAGTAAGGCTACGTAGATGATAGGATTATTAATGCATTTGTTAGGCGGAGAAGGCCGCTGGTCCAACGAATGCCCAGCAGTTGGCCGAGACCTGGACTGGAAGATAACAAGAGAAGGAGGGGGCTCGTGTCGGCAAGGCTGTAATGGCGAGGCCCGTTACATCTAGCGGGCAAACCTGTGGAAAATCTCGCGGACAGAATCGTATTTCTCAGGGGCACTGTGCAACGCTCGTATTAGCGCTTTTAGCGGAACCCATGATTTGGTGTTGGAGCTCTATCGATACATACAGACAGCATTCCTCCGCAGGCGAAATTGAGCAAGTTGTAGTAATCGAACATAACTCGAGGAGTAGTATACTGTCGCGGGCCACATTGTCGATGGGACTGGTCGTAGCCCGTATCAGCTCTTCCCGCGCGAAAGTTAATACTGTGCCTATCCCATGATCAACAAAAATACCGTTACACCTGAATATTTTTTACCATCGCATCTCAAGTTTCCCAACCATCAGATGAGCACGTCCCATTTCTCGCGACAGCGTAGAATGTCATGTTCGATGATGTGTACGTCCACGTGAGGGTCGGACCCATCCTAAATGGGATAACCATTCTGCAAGAGCGGTCGCGTAGAATTGCGGTTGCCTGTTGCGGGTGCCACTGAGGTTATCATGTATACGGATAACAGCCACTAATTGATATTACGTTCTCAGCTCCTTTTCGTAGTCGGGGCGTTACAGACCACTCTTTCCTTGGCCGTATTCGTGCGTAACTTATTTGACTATCGGTGTGCTTGCCATACTCGACAAAGTAAGGCAATGGGCGGTGTGGCGTACCACAATGCTTTAGCAAATCGCTGTCATACTTCCATG

### Find All Occurrences of a Pattern in a String

https://rosalind.info/problems/ba1d/

In [23]:
def PatternMatching(pattern: str, genome: str) -> list:
    """
    Locate every occurrence of a pattern inside a genome.
    Args:
        pattern (str): The substring to look for.
        genome (str): The string that is scanned.
    Returns:
        list: 0-based start indices of all occurrences (overlaps included),
            in increasing order.
    """
    width = len(pattern)
    # Number of start positions at which a full-width slice fits.
    candidate_starts = range(len(genome) - width + 1)

    return [
        start
        for start in candidate_starts
        if genome[start:start + width] == pattern
    ]

In [25]:
# The first line of the dataset is read as the pattern, the second as the genome.
file_path = "data/rosalind_ba1d.txt"
with open(file_path, 'r') as file:
    pattern, genome = file.readline().strip(), file.readline().strip()
result = PatternMatching(pattern, genome)
# Positions are integers, so each is converted to text before joining.
print(" ".join(str(position) for position in result))

17 115 127 165 207 214 303 334 404 433 450 473 535 577 584 605 762 788 804 823 830 846 861 876 1018 1037 1044 1051 1094 1101 1154 1235 1242 1296 1307 1323 1330 1391 1434 1576 1600 1666 1685 1729 1768 1786 1802 1846 1884 1926 2037 2044 2069 2090 2097 2200 2267 2310 2382 2464 2471 2523 2530 2556 2594 2638 2655 2673 2706 2790 2812 2840 2864 2931 2958 3007 3037 3052 3059 3090 3115 3122 3134 3168 3239 3246 3276 3295 3318 3354 3406 3496 3534 3584 3616 3633 3717 3829 3836 3843 3873 3880 3887 3912 3919 3934 3970 4026 4033 4079 4086 4095 4141 4186 4203 4220 4274 4291 4343 4375 4497 4567 4597 4682 4689 4839 4864 4880 4907 4914 4941 4960 4967 5021 5082 5099 5106 5123 5166 5195 5240 5313 5363 5383 5398 5405 5467 5506 5569 5609 5616 5633 5656 5682 5730 5737 5775 5795 5815 5827 5842 5857 5923 5930 5954 6003 6024 6031 6051 6058 6079 6117 6136 6187 6238 6290 6323 6330 6439 6446 6453 6495 6502 6541 6584 6626 6633 6686 6711 6765 6804 6828 6835 6897 6904 6965 6997 7004 7043 7050 7057 7106 7147 7169 7217 

### Find Patterns Forming Clumps in a String

https://rosalind.info/problems/ba1e/

In [27]:
def ClumpFinding(genome: str, k: int, L: int, t: int) -> set:
    """
    Find all k-mers forming (L, t)-clumps in a genome.

    A k-mer forms an (L, t)-clump when some window of length L inside the
    genome contains at least t (possibly overlapping) occurrences of it.
    Instead of rebuilding a frequency table for every window, the start
    positions of each k-mer are collected once; a k-mer clumps exactly when
    t consecutive occurrences fit inside L characters.

    Args:
        genome (str): The genome to search within.
        k (int): The length of the k-mers to find (expected to be positive).
        L (int): The length of the window to search within.
        t (int): The minimum number of occurrences for a k-mer to be considered a clump.
    Returns:
        set: A set of k-mers forming (L, t)-clumps in the genome. Empty when
            the genome is shorter than L, since no window exists then.
    """
    n = len(genome)
    if L > n:
        # No window of length L fits in the genome.
        return set()

    # Any k-mer that occurs in a window occurs at least once, so a
    # threshold below 1 selects the same k-mers as a threshold of 1.
    required = max(t, 1)

    # Start positions of every k-mer, in increasing order.
    positions = {}
    for start in range(n - k + 1):
        positions.setdefault(genome[start:start + k], []).append(start)

    clumps = set()
    for pattern, starts in positions.items():
        # Pair each occurrence with the one `required - 1` places later:
        # those occurrences share a window iff the span from the first
        # start to the end of the last occurrence is at most L.
        if any(
            last + k - first <= L
            for first, last in zip(starts, starts[required - 1:])
        ):
            clumps.add(pattern)

    return clumps

In [29]:
# The first line of the dataset is read as the genome; the second line
# holds three whitespace-separated integers, unpacked as k, L and t.
file_path = "data/rosalind_ba1e.txt"
with open(file_path, 'r') as file:
    genome = file.readline().strip()
    k, L, t = (int(token) for token in file.readline().strip().split())
result = ClumpFinding(genome, k, L, t)
print(" ".join(result))

ACTAGTTACT TAAACTTGGC ACAGACTAGC AAGCAACCCA CATATCGAAT AGCTACGTTC CATGGGTTGA CTAATTTCAT TATCCAGGAG


### Find a Position in a Genome Minimizing the Skew

https://rosalind.info/problems/ba1f/

In [1]:
def MinimumSkew(genome: str) -> list:
    """
    Report where the running G-minus-C skew of a genome is lowest.
    Args:
        genome (str): The genome to analyze.
    Returns:
        list: Prefix lengths (0 for the empty prefix, up to len(genome)) at
            which the skew reaches its minimum, in increasing order.
    """
    # Contribution of each base to the skew; every other character counts 0.
    step = {'G': 1, 'C': -1}

    # skews[p] is the skew after the first p characters.
    skews = [0]
    running = 0
    for base in genome:
        running += step.get(base, 0)
        skews.append(running)

    lowest = min(skews)
    return [prefix for prefix, value in enumerate(skews) if value == lowest]

In [4]:
# Only the first line of the dataset is read; it is used as the genome.
file_path = "data/rosalind_ba1f.txt"
with open(file_path, 'r') as file:
    genome = file.readline().strip()
result = MinimumSkew(genome)
# Positions are integers, so each is converted to text before joining.
print(" ".join(str(position) for position in result))

73356 73357


### Compute the Hamming Distance Between Two Strings

https://rosalind.info/problems/ba1g/

In [5]:
def HammingDistance(p: str, q: str) -> int:
    """
    Count the positions at which two equal-length strings differ.
    Args:
        p (str): The first string.
        q (str): The second string.
    Returns:
        int: The number of indices i with p[i] != q[i].
    Raises:
        ValueError: If p and q have different lengths.
    """
    if len(p) != len(q):
        raise ValueError("Strings must be of equal length")

    mismatches = 0
    for left, right in zip(p, q):
        if left != right:
            mismatches += 1
    return mismatches

In [9]:
# The first two lines of the dataset are read as the strings to compare.
file_path = "data/rosalind_ba1g.txt"
with open(file_path, 'r') as file:
    p, q = file.readline().strip(), file.readline().strip()
# Printed after the file has been closed.
print(HammingDistance(p, q))

915


### Find All Approximate Occurrences of a Pattern in a String

https://rosalind.info/problems/ba1h/

In [10]:
def ApproximatePatternMatching(pattern: str, text: str, d: int) -> list:
    """
    Locate every approximate occurrence of a pattern inside a text.
    Args:
        pattern (str): The pattern to look for.
        text (str): The string that is scanned.
        d (int): The largest number of mismatching positions still accepted.
    Returns:
        list: 0-based start indices, in increasing order, of the windows of
            text that differ from pattern in at most d positions.
    """
    width = len(pattern)

    def differing(start: int) -> int:
        # Mismatch count between pattern and the window beginning at start.
        return sum(a != b for a, b in zip(pattern, text[start:start + width]))

    return [
        start
        for start in range(len(text) - width + 1)
        if differing(start) <= d
    ]

In [11]:
# The dataset is read as three lines: the pattern, the text, and the integer d.
file_path = "data/rosalind_ba1h.txt"
with open(file_path, 'r') as file:
    pattern, text = file.readline().strip(), file.readline().strip()
    d = int(file.readline().strip())
result = ApproximatePatternMatching(pattern, text, d)
# Positions are integers, so each is converted to text before joining.
print(" ".join(str(position) for position in result))

0 4 9 13 16 17 21 22 29 32 42 47 58 61 62 68 69 75 85 86 87 90 92 93 101 108 116 124 125 129 130 144 152 158 159 170 176 185 187 197 206 210 211 228 238 239 245 250 258 262 267 275 285 289 292 300 311 316 321 331 342 345 348 357 372 373 377 401 404 406 409 425 430 447 448 456 467 473 480 481 490 498 499 514 515 519 530 533 537 546 555 556 560 573 578 580 587 590 591 605 606 617 621 622 633 634 647 652 655 663 664 668 674 675 705 715 716 720 724 725 729 734 745 760 763 765 767 768 782 783 787 799 812 813 826 829 840 844 848 849 858 863 872 874 884 893 894 898 912 917 920 925 934 935 941 948 960 979 983 984 986 994 1001 1002 1006 1012 1015 1019 1020 1026 1030 1031 1044 1063 1064 1068 1073 1074 1085 1086 1093 1098 1102 1103 1112 1123 1125 1138 1139 1144 1155 1170 1183 1184 1189 1194 1198 1199 1205 1215 1218 1225 1235 1246 1247 1259 1268 1286 1297 1301 1311 1315 1332 1345 1361 1374 1375 1379 1384 1386 1391 1403 1410 1415 1420 1429 1432 1440 1448 1454 1455 1464 1465 1478 1479 1482 1487 1495

### Find the Most Frequent Words with Mismatches in a String

https://rosalind.info/problems/ba1i/

In [12]:
def FrequentWordsWithMismatches(Text: str, k: int, d: int) -> set:
    """
    Find the most frequent k-mers with up to d mismatches in a given text.

    Every k-mer window of Text votes once for each string in its
    d-neighborhood; the strings that collect the most votes are returned.
    They need not occur in Text themselves.

    Args:
        Text (str): The text to search within.
        k (int): The length of the k-mers to find.
        d (int): The maximum number of mismatches allowed.
    Returns:
        set: A set of the most frequent k-mers with up to d mismatches.
            Empty when Text contains no k-mer at all (k is longer than Text).
    """
    from collections import Counter

    def neighbors(pattern: str, d: int) -> set:
        # All strings over ACGT within Hamming distance d of pattern,
        # built recursively from the neighborhood of its suffix.
        if d == 0:
            return {pattern}
        if len(pattern) == 0:
            return {""}

        suffix = pattern[1:]
        neighborhood = set()

        for text in neighbors(suffix, d):
            if HammingDistance(suffix, text) < d:
                # The suffix has mismatch budget to spare, so the first
                # symbol may be any nucleotide.
                for nucleotide in "ACGT":
                    neighborhood.add(nucleotide + text)
            else:
                # Budget exhausted: the first symbol has to match.
                neighborhood.add(pattern[0] + text)

        return neighborhood

    freq_map = Counter()
    for i in range(len(Text) - k + 1):
        # Counter.update adds one vote per element of the neighborhood.
        freq_map.update(neighbors(Text[i:i + k], d))

    # default=0 keeps max() from raising ValueError when there was nothing
    # to count; the comprehension below then yields an empty set.
    max_count = max(freq_map.values(), default=0)

    return {pattern for pattern, count in freq_map.items() if count == max_count}

In [13]:
# The first line of the dataset is read as the text; the second line holds
# two whitespace-separated integers, unpacked as k and d.
file_path = "data/rosalind_ba1i.txt"
with open(file_path, 'r') as file:
    text = file.readline().strip()
    k, d = (int(token) for token in file.readline().strip().split())
result = FrequentWordsWithMismatches(text, k, d)
print(" ".join(result))

CCCAAAC


Below is a cleaner version of the same solution, with `neighbors` defined as a standalone function:

In [14]:
from itertools import product

def neighbors(pattern: str, d: int) -> set:
    """
    Build the d-neighborhood of a pattern: every string over A, C, G, T of
    the same length that differs from it in at most d positions.
    Args:
        pattern (str): The string whose neighborhood is generated.
        d (int): The mismatch budget.
    Returns:
        set: All strings in the d-neighborhood of the pattern.
    """
    if d == 0:
        return {pattern}
    if not pattern:
        return {""}

    head, tail = pattern[0], pattern[1:]
    alphabet = ['A', 'C', 'G', 'T']
    result = set()

    # Extend each neighbor of the tail by one leading symbol.
    for candidate in neighbors(tail, d):
        if HammingDistance(tail, candidate) < d:
            # Budget left over: the leading symbol is free.
            leading = alphabet
        else:
            # Budget used up: the leading symbol must equal the original.
            leading = [head]
        result.update(symbol + candidate for symbol in leading)

    return result


def FrequentWordsWithMismatches(Text: str, k: int, d: int) -> set:
    """
    Find the most frequent k-mers with up to d mismatches in a given text.

    Every k-mer window of Text votes once for each string returned by
    neighbors(window, d); the strings that collect the most votes are
    returned.

    Args:
        Text (str): The text to search within.
        k (int): The length of the k-mers to find.
        d (int): The maximum number of mismatches allowed.
    Returns:
        set: A set of the most frequent k-mers with up to d mismatches.
            Empty when Text contains no k-mer at all (k is longer than Text).
    """
    from collections import Counter

    freq_map = Counter()
    for i in range(len(Text) - k + 1):
        # Counter.update adds one vote per element of the neighborhood.
        freq_map.update(neighbors(Text[i:i + k], d))

    # default=0 keeps max() from raising ValueError when there was nothing
    # to count; the comprehension below then yields an empty set.
    max_count = max(freq_map.values(), default=0)

    return {pattern for pattern, count in freq_map.items() if count == max_count}

In [15]:
# The first line of the dataset is read as the text; the second line holds
# two whitespace-separated integers, unpacked as k and d.
file_path = "data/rosalind_ba1i2.txt"
with open(file_path, 'r') as file:
    text = file.readline().strip()
    k, d = (int(token) for token in file.readline().strip().split())
result = FrequentWordsWithMismatches(text, k, d)
print(" ".join(result))

TTTTT


### Find Frequent Words with Mismatches and Reverse Complements

https://rosalind.info/problems/ba1j/

In [19]:
def FrequentWordsWithMismatchesAndReverseComplements(Text: str, k: int, d: int) -> set:
    """
    Find the k-mers that maximize Count_d(Text, Pattern) +
    Count_d(Text, reverse complement of Pattern).

    Each k-mer window of Text votes for every string within d mismatches of
    the window itself and, separately, for every string within d mismatches
    of the window's reverse complement.

    Args:
        Text (str): The text to search within.
        k (int): The length of the k-mers to find.
        d (int): The maximum number of mismatches allowed.
    Returns:
        set: A set of the most frequent k-mers with up to d mismatches and
            their reverse complements. Empty when Text contains no k-mer at
            all (k is longer than Text).
    """
    from collections import Counter

    def neighbors(pattern: str, d: int) -> set:
        # All strings over ACGT within Hamming distance d of pattern,
        # built recursively from the neighborhood of its suffix.
        if d == 0:
            return {pattern}
        if len(pattern) == 0:
            return {""}

        suffix = pattern[1:]
        neighborhood = set()

        for text in neighbors(suffix, d):
            if HammingDistance(suffix, text) < d:
                # The suffix has mismatch budget to spare, so the first
                # symbol may be any nucleotide.
                for nucleotide in "ACGT":
                    neighborhood.add(nucleotide + text)
            else:
                # Budget exhausted: the first symbol has to match.
                neighborhood.add(pattern[0] + text)

        return neighborhood

    freq_map = Counter()
    for i in range(len(Text) - k + 1):
        pattern = Text[i:i + k]
        # The two neighborhoods are tallied one after the other rather than
        # merged into a single set: a candidate within d of both the window
        # and its reverse complement contributes to both terms of the sum
        # and has to be credited twice (a reverse-palindromic candidate
        # always is).
        freq_map.update(neighbors(pattern, d))
        freq_map.update(neighbors(ReverseComplement(pattern), d))

    # default=0 keeps max() from raising ValueError when there was nothing
    # to count; the comprehension below then yields an empty set.
    max_count = max(freq_map.values(), default=0)

    return {pattern for pattern, count in freq_map.items() if count == max_count}

In [20]:
# The first line of the dataset is read as the text; the second line holds
# two whitespace-separated integers, unpacked as k and d.
file_path = "data/rosalind_ba1j.txt"
with open(file_path, 'r') as file:
    text = file.readline().strip()
    k, d = (int(token) for token in file.readline().strip().split())
result = FrequentWordsWithMismatchesAndReverseComplements(text, k, d)
print(" ".join(result))

CGCGCGC GCGCGCG


Below is a cleaner version of the same solution, with `neighbors` defined as a standalone function:

In [26]:
from itertools import product

def neighbors(pattern: str, d: int) -> set:
    """
    Build the d-neighborhood of a pattern: every string over A, C, G, T of
    the same length that differs from it in at most d positions.
    Args:
        pattern (str): The string whose neighborhood is generated.
        d (int): The mismatch budget.
    Returns:
        set: All strings in the d-neighborhood of the pattern.
    """
    if d == 0:
        return {pattern}
    if not pattern:
        return {""}

    head, tail = pattern[0], pattern[1:]
    alphabet = ['A', 'C', 'G', 'T']
    result = set()

    # Extend each neighbor of the tail by one leading symbol.
    for candidate in neighbors(tail, d):
        if HammingDistance(tail, candidate) < d:
            # Budget left over: the leading symbol is free.
            leading = alphabet
        else:
            # Budget used up: the leading symbol must equal the original.
            leading = [head]
        result.update(symbol + candidate for symbol in leading)

    return result


def FrequentWordsWithMismatchesAndReverseComplements(Text: str, k: int, d: int) -> set:
    """
    Find the k-mers that maximize Count_d(Text, Pattern) +
    Count_d(Text, reverse complement of Pattern).

    Each k-mer window of Text votes for every string returned by
    neighbors(window, d) and, separately, for every string returned by
    neighbors(ReverseComplement(window), d).

    Args:
        Text (str): The text to search within.
        k (int): The length of the k-mers to find.
        d (int): The maximum number of mismatches allowed.
    Returns:
        set: A set of the most frequent k-mers with up to d mismatches and
            their reverse complements. Empty when Text contains no k-mer at
            all (k is longer than Text).
    """
    from collections import Counter

    freq_map = Counter()
    for i in range(len(Text) - k + 1):
        pattern = Text[i:i + k]
        # The two neighborhoods are tallied one after the other rather than
        # merged into a single set: a candidate within d of both the window
        # and its reverse complement contributes to both terms of the sum
        # and has to be credited twice (a reverse-palindromic candidate
        # always is).
        freq_map.update(neighbors(pattern, d))
        freq_map.update(neighbors(ReverseComplement(pattern), d))

    # default=0 keeps max() from raising ValueError when there was nothing
    # to count; the comprehension below then yields an empty set.
    max_count = max(freq_map.values(), default=0)

    return {pattern for pattern, count in freq_map.items() if count == max_count}

In [27]:
# The first line of the dataset is read as the text; the second line holds
# two whitespace-separated integers, unpacked as k and d.
file_path = "data/rosalind_ba1j2.txt"
with open(file_path, 'r') as file:
    text = file.readline().strip()
    k, d = (int(token) for token in file.readline().strip().split())
result = FrequentWordsWithMismatchesAndReverseComplements(text, k, d)
print(" ".join(result))

CCCCCCC GGGGGGG
