# BA1N Generate the d-Neighborhood of a String

In [3]:
from itertools import product

def allKmer(k):
    """Return every string of length *k* over the alphabet 'AGCT'.

    The strings are produced in the order ``itertools.product`` yields them,
    i.e. with 'A' < 'G' < 'C' < 'T' and the last position varying fastest.
    """
    letter_tuples = product('AGCT', repeat=k)
    return list(map(''.join, letter_tuples))

def hammingDist(genome, kmer):
    """Return the smallest Hamming distance between *kmer* and any window of *genome*.

    Every substring of *genome* with the same length as *kmer* is compared
    position by position against *kmer*; the lowest mismatch count wins.
    When both strings have equal length this is the plain Hamming distance.
    If *genome* is shorter than *kmer* there are no windows and ``min``
    raises ``ValueError``.
    """
    width = len(kmer)
    last_start = len(genome) - width
    mismatch_counts = [
        sum(a != b for a, b in zip(genome[start:start + width], kmer))
        for start in range(last_start + 1)
    ]
    return min(mismatch_counts)

def neighbors(pattern, d):
    """Return the d-neighborhood of *pattern* by brute force.

    All k-mers with ``k == len(pattern)`` are generated via ``allKmer`` and
    those whose distance to *pattern* (per ``hammingDist``) is at most *d*
    are kept, in generation order.
    """
    return [
        candidate
        for candidate in allKmer(len(pattern))
        if hammingDist(candidate, pattern) <= d
    ]

def BA1N(filename):
    """Solve BA1N for the dataset stored in *filename*.

    The first line of the file is read as the pattern and the second line as
    the integer distance d. Each neighbor found by ``neighbors`` is printed
    on its own line, and the full list is returned.
    """
    with open(filename, 'r') as handle:
        pattern = handle.readline().rstrip()
        max_mismatches = int(handle.readline().rstrip())
    found = neighbors(pattern, max_mismatches)
    for kmer in found:
        print(kmer)
    return found

In [9]:
def fastNeighbors(pattern, d):
    """Return the d-neighborhood of *pattern* as a set of strings.

    The neighborhood is every string over 'ACGT' of the same length as
    *pattern* that differs from it in at most *d* positions. It is built
    from the last character towards the first: a neighbor of a suffix that
    still has mismatches to spare may be prefixed with any base, while one
    that already uses all *d* mismatches must be prefixed with the matching
    character of *pattern*.

    Args:
        pattern: The string whose neighborhood is wanted.
        d: Maximum number of mismatches allowed.

    Returns:
        A set of strings. The result is always a set: ``{pattern}`` when
        ``d == 0`` or *pattern* is empty, and an empty set when ``d < 0``.
    """
    if d < 0:
        # No string can be at a negative distance.
        return set()
    if d == 0 or not pattern:
        # Only the pattern itself qualifies; an empty pattern also lands here
        # so that there is always a last character to seed from below.
        return {pattern}

    alphabet = 'ACGT'
    # Map each neighbor of the current suffix to its mismatch count so the
    # distance never has to be recomputed from scratch.
    suffix_dist = {base: int(base != pattern[-1]) for base in alphabet}
    for first in reversed(pattern[:-1]):
        extended = {}
        for neighbor, dist in suffix_dist.items():
            if dist < d:
                # Budget left: any leading base keeps the total within d.
                for base in alphabet:
                    extended[base + neighbor] = dist + (base != first)
            else:
                # Budget exhausted: the leading base must match the pattern.
                extended[first + neighbor] = dist
        suffix_dist = extended
    return set(suffix_dist)

## Test

In [4]:
# Run the brute-force solver on BA1N-test.txt; it prints each neighbor and returns the list.
BA1N('BA1N-test.txt')

AAG
AGG
ACA
ACG
ACC
ACT
ATG
GCG
CCG
TCG


['AAG', 'AGG', 'ACA', 'ACG', 'ACC', 'ACT', 'ATG', 'GCG', 'CCG', 'TCG']

In [10]:
# Compute the neighborhood of 'ACG' with at most 1 mismatch using fastNeighbors.
fastNeighbors('ACG', 1)

{'AAG', 'ACA', 'ACC', 'ACG', 'ACT', 'AGG', 'ATG', 'CCG', 'GCG', 'TCG'}

In [5]:
# Run the brute-force solver on BA1N-test2.txt; it prints each neighbor and returns the list.
BA1N('BA1N-test2.txt')

AAACCAGAG
AAGCCAGAG
AACACAGAG
AACGCAGAG
AACCAAGAG
AACCGAGAG
AACCCAAAG
AACCCAGAA
AACCCAGAG
AACCCAGAC
AACCCAGAT
AACCCAGGG
AACCCAGCG
AACCCAGTG
AACCCACAG
AACCCATAG
AACCCGGAG
AACCCCGAG
AACCCTGAG
AACCTAGAG
AACTCAGAG
AATCCAGAG
AGAACAGAG
AGAGCAGAG
AGACAAGAG
AGACGAGAG
AGACCAAAG
AGACCAGAA
AGACCAGAG
AGACCAGAC
AGACCAGAT
AGACCAGGG
AGACCAGCG
AGACCAGTG
AGACCACAG
AGACCATAG
AGACCGGAG
AGACCCGAG
AGACCTGAG
AGACTAGAG
AGATCAGAG
AGGACAGAG
AGGGCAGAG
AGGCAAGAG
AGGCGAGAG
AGGCCAAAG
AGGCCAGAA
AGGCCAGAG
AGGCCAGAC
AGGCCAGAT
AGGCCAGGG
AGGCCAGCG
AGGCCAGTG
AGGCCACAG
AGGCCATAG
AGGCCGGAG
AGGCCCGAG
AGGCCTGAG
AGGCTAGAG
AGGTCAGAG
AGCAAAGAG
AGCAGAGAG
AGCACAAAG
AGCACAGAA
AGCACAGAG
AGCACAGAC
AGCACAGAT
AGCACAGGG
AGCACAGCG
AGCACAGTG
AGCACACAG
AGCACATAG
AGCACGGAG
AGCACCGAG
AGCACTGAG
AGCATAGAG
AGCGAAGAG
AGCGGAGAG
AGCGCAAAG
AGCGCAGAA
AGCGCAGAG
AGCGCAGAC
AGCGCAGAT
AGCGCAGGG
AGCGCAGCG
AGCGCAGTG
AGCGCACAG
AGCGCATAG
AGCGCGGAG
AGCGCCGAG
AGCGCTGAG
AGCGTAGAG
AGCCAAAAG
AGCCAAGAA
AGCCAAGAG
AGCCAAGAC
AGCCAAGAT
AGCCAAGGG
AGCCAAGCG
AGCCAAGTG


GTCCAACAG
GTCCAATAG
GTCCAGGAG
GTCCACGAG
GTCCATGAG
GTCCGAAAG
GTCCGAGAA
GTCCGAGAG
GTCCGAGAC
GTCCGAGAT
GTCCGAGGG
GTCCGAGCG
GTCCGAGTG
GTCCGACAG
GTCCGATAG
GTCCGGGAG
GTCCGCGAG
GTCCGTGAG
GTCCCAAAA
GTCCCAAAG
GTCCCAAAC
GTCCCAAAT
GTCCCAAGG
GTCCCAACG
GTCCCAATG
GTCCCAGAA
GTCCCAGAG
GTCCCAGAC
GTCCCAGAT
GTCCCAGGA
GTCCCAGGG
GTCCCAGGC
GTCCCAGGT
GTCCCAGCA
GTCCCAGCG
GTCCCAGCC
GTCCCAGCT
GTCCCAGTA
GTCCCAGTG
GTCCCAGTC
GTCCCAGTT
GTCCCACAA
GTCCCACAG
GTCCCACAC
GTCCCACAT
GTCCCACGG
GTCCCACCG
GTCCCACTG
GTCCCATAA
GTCCCATAG
GTCCCATAC
GTCCCATAT
GTCCCATGG
GTCCCATCG
GTCCCATTG
GTCCCGAAG
GTCCCGGAA
GTCCCGGAG
GTCCCGGAC
GTCCCGGAT
GTCCCGGGG
GTCCCGGCG
GTCCCGGTG
GTCCCGCAG
GTCCCGTAG
GTCCCCAAG
GTCCCCGAA
GTCCCCGAG
GTCCCCGAC
GTCCCCGAT
GTCCCCGGG
GTCCCCGCG
GTCCCCGTG
GTCCCCCAG
GTCCCCTAG
GTCCCTAAG
GTCCCTGAA
GTCCCTGAG
GTCCCTGAC
GTCCCTGAT
GTCCCTGGG
GTCCCTGCG
GTCCCTGTG
GTCCCTCAG
GTCCCTTAG
GTCCTAAAG
GTCCTAGAA
GTCCTAGAG
GTCCTAGAC
GTCCTAGAT
GTCCTAGGG
GTCCTAGCG
GTCCTAGTG
GTCCTACAG
GTCCTATAG
GTCCTGGAG
GTCCTCGAG
GTCCTTGAG
GTCTAAGAG
GTCTGAGAG


['AAACCAGAG',
 'AAGCCAGAG',
 'AACACAGAG',
 'AACGCAGAG',
 'AACCAAGAG',
 'AACCGAGAG',
 'AACCCAAAG',
 'AACCCAGAA',
 'AACCCAGAG',
 'AACCCAGAC',
 'AACCCAGAT',
 'AACCCAGGG',
 'AACCCAGCG',
 'AACCCAGTG',
 'AACCCACAG',
 'AACCCATAG',
 'AACCCGGAG',
 'AACCCCGAG',
 'AACCCTGAG',
 'AACCTAGAG',
 'AACTCAGAG',
 'AATCCAGAG',
 'AGAACAGAG',
 'AGAGCAGAG',
 'AGACAAGAG',
 'AGACGAGAG',
 'AGACCAAAG',
 'AGACCAGAA',
 'AGACCAGAG',
 'AGACCAGAC',
 'AGACCAGAT',
 'AGACCAGGG',
 'AGACCAGCG',
 'AGACCAGTG',
 'AGACCACAG',
 'AGACCATAG',
 'AGACCGGAG',
 'AGACCCGAG',
 'AGACCTGAG',
 'AGACTAGAG',
 'AGATCAGAG',
 'AGGACAGAG',
 'AGGGCAGAG',
 'AGGCAAGAG',
 'AGGCGAGAG',
 'AGGCCAAAG',
 'AGGCCAGAA',
 'AGGCCAGAG',
 'AGGCCAGAC',
 'AGGCCAGAT',
 'AGGCCAGGG',
 'AGGCCAGCG',
 'AGGCCAGTG',
 'AGGCCACAG',
 'AGGCCATAG',
 'AGGCCGGAG',
 'AGGCCCGAG',
 'AGGCCTGAG',
 'AGGCTAGAG',
 'AGGTCAGAG',
 'AGCAAAGAG',
 'AGCAGAGAG',
 'AGCACAAAG',
 'AGCACAGAA',
 'AGCACAGAG',
 'AGCACAGAC',
 'AGCACAGAT',
 'AGCACAGGG',
 'AGCACAGCG',
 'AGCACAGTG',
 'AGCACACAG',
 'AGCA

In [11]:
# Compute the neighborhood of 'GGCCCAGAG' with at most 3 mismatches using fastNeighbors.
fastNeighbors('GGCCCAGAG', 3)

{'GGCCCATTT',
 'TCCCCGGAG',
 'AAACCAGAG',
 'GGCGCAGTC',
 'TGCCAAGAT',
 'GCTGCAGAG',
 'TGCCCCGTG',
 'GGACCCAAG',
 'GAAGCAGAG',
 'GTCCCTGGG',
 'GACCCAGTG',
 'GCCGCAGAT',
 'GGAGAAGAG',
 'GGAAGAGAG',
 'GGCGCTGAT',
 'GGCTGAGCG',
 'GTCCCAGAC',
 'GACGTAGAG',
 'AGACAAGAG',
 'GGCTCAGGT',
 'GAACCACAG',
 'AGGCCACAG',
 'GCCCCATGG',
 'GGGCGACAG',
 'GGCGCATAT',
 'GACGCACAG',
 'GGTCCAGAT',
 'GCCAGAGAG',
 'GGAACAGAT',
 'GGCCCTTTG',
 'GGACCCGAA',
 'CGCCCAGTC',
 'GGTTAAGAG',
 'GTCCTAGAT',
 'GACCCCGAG',
 'GAGCCCGAG',
 'GGCCTAAAC',
 'CACACAGAG',
 'GTACCAGAT',
 'GGCCCTCAG',
 'AGCCCAGTC',
 'GGACGAGAT',
 'GGCCGCGAT',
 'AGCCCGGAT',
 'AGCCGAGAT',
 'GGACAAGAC',
 'GGCCCCGCA',
 'CGCCCGCAG',
 'CACCCAGAG',
 'GGCTGAAAG',
 'AATCCAGAG',
 'GGTCCAGTC',
 'GGCCCACAA',
 'CGCCCACTG',
 'GTACCGGAG',
 'GGGCAAGGG',
 'GGACGCGAG',
 'GGCCCAACA',
 'GGTCGAGCG',
 'GGCAAGGAG',
 'GGACCAGAT',
 'CGCCCGAAG',
 'CGAGCAGAG',
 'GGCCCGTAG',
 'TCCCGAGAG',
 'GCCCCGGAT',
 'AGCCCTAAG',
 'GGCCCTTGG',
 'GGAACCGAG',
 'GGCACATAC',
 'GGTCCAGCC',
 'GTCC

## Quiz

In [6]:
# Run the brute-force solver on rosalind_ba1n.txt; it prints each neighbor and returns the list.
BA1N('rosalind_ba1n.txt')

AAGGCGGA
AGGGCGGA
ACAGCGGA
ACGACGGA
ACGGAGGA
ACGGGGGA
ACGGCAGA
ACGGCGAA
ACGGCGGA
ACGGCGGG
ACGGCGGC
ACGGCGGT
ACGGCGCA
ACGGCGTA
ACGGCCGA
ACGGCTGA
ACGGTGGA
ACGCCGGA
ACGTCGGA
ACCGCGGA
ACTGCGGA
ATGGCGGA
GAGGCGGA
GGGGCGGA
GCAGCGGA
GCGACGGA
GCGGAGGA
GCGGGGGA
GCGGCAGA
GCGGCGAA
GCGGCGGA
GCGGCGGG
GCGGCGGC
GCGGCGGT
GCGGCGCA
GCGGCGTA
GCGGCCGA
GCGGCTGA
GCGGTGGA
GCGCCGGA
GCGTCGGA
GCCGCGGA
GCTGCGGA
GTGGCGGA
CAGGCGGA
CGGGCGGA
CCAGCGGA
CCGACGGA
CCGGAGGA
CCGGGGGA
CCGGCAGA
CCGGCGAA
CCGGCGGA
CCGGCGGG
CCGGCGGC
CCGGCGGT
CCGGCGCA
CCGGCGTA
CCGGCCGA
CCGGCTGA
CCGGTGGA
CCGCCGGA
CCGTCGGA
CCCGCGGA
CCTGCGGA
CTGGCGGA
TAAGCGGA
TAGACGGA
TAGGAGGA
TAGGGGGA
TAGGCAGA
TAGGCGAA
TAGGCGGA
TAGGCGGG
TAGGCGGC
TAGGCGGT
TAGGCGCA
TAGGCGTA
TAGGCCGA
TAGGCTGA
TAGGTGGA
TAGCCGGA
TAGTCGGA
TACGCGGA
TATGCGGA
TGAGCGGA
TGGACGGA
TGGGAGGA
TGGGGGGA
TGGGCAGA
TGGGCGAA
TGGGCGGA
TGGGCGGG
TGGGCGGC
TGGGCGGT
TGGGCGCA
TGGGCGTA
TGGGCCGA
TGGGCTGA
TGGGTGGA
TGGCCGGA
TGGTCGGA
TGCGCGGA
TGTGCGGA
TCAACGGA
TCAGAGGA
TCAGGGGA
TCAGCAGA
TCAGCGAA
TCAGCGGA
TCAGCGGG
T

['AAGGCGGA',
 'AGGGCGGA',
 'ACAGCGGA',
 'ACGACGGA',
 'ACGGAGGA',
 'ACGGGGGA',
 'ACGGCAGA',
 'ACGGCGAA',
 'ACGGCGGA',
 'ACGGCGGG',
 'ACGGCGGC',
 'ACGGCGGT',
 'ACGGCGCA',
 'ACGGCGTA',
 'ACGGCCGA',
 'ACGGCTGA',
 'ACGGTGGA',
 'ACGCCGGA',
 'ACGTCGGA',
 'ACCGCGGA',
 'ACTGCGGA',
 'ATGGCGGA',
 'GAGGCGGA',
 'GGGGCGGA',
 'GCAGCGGA',
 'GCGACGGA',
 'GCGGAGGA',
 'GCGGGGGA',
 'GCGGCAGA',
 'GCGGCGAA',
 'GCGGCGGA',
 'GCGGCGGG',
 'GCGGCGGC',
 'GCGGCGGT',
 'GCGGCGCA',
 'GCGGCGTA',
 'GCGGCCGA',
 'GCGGCTGA',
 'GCGGTGGA',
 'GCGCCGGA',
 'GCGTCGGA',
 'GCCGCGGA',
 'GCTGCGGA',
 'GTGGCGGA',
 'CAGGCGGA',
 'CGGGCGGA',
 'CCAGCGGA',
 'CCGACGGA',
 'CCGGAGGA',
 'CCGGGGGA',
 'CCGGCAGA',
 'CCGGCGAA',
 'CCGGCGGA',
 'CCGGCGGG',
 'CCGGCGGC',
 'CCGGCGGT',
 'CCGGCGCA',
 'CCGGCGTA',
 'CCGGCCGA',
 'CCGGCTGA',
 'CCGGTGGA',
 'CCGCCGGA',
 'CCGTCGGA',
 'CCCGCGGA',
 'CCTGCGGA',
 'CTGGCGGA',
 'TAAGCGGA',
 'TAGACGGA',
 'TAGGAGGA',
 'TAGGGGGA',
 'TAGGCAGA',
 'TAGGCGAA',
 'TAGGCGGA',
 'TAGGCGGG',
 'TAGGCGGC',
 'TAGGCGGT',
 'TAGGCGCA',