In [1]:
import numpy as np
# Get k-mers from a sequence
def get_kmers(sequence, k):
    return [sequence[x:x+k].lower() for x in range(len(sequence) - k + 1)]

# Get the frequency of each k-mer in a sequence
def freq_kmers(seq, k):
    kmers = get_kmers(seq, k)
    freq = {}
    for kmer in kmers:
        if kmer not in freq: 
            freq[kmer] = 1
        else: 
            freq[kmer] += 1
    return freq


In [43]:
# Implement Spectrum Kernel
def spectrum_kernel(seq1, seq2, k):
	freq1 = freq_kmers(seq1, k)
	freq2 = freq_kmers(seq2, k)
	for key in freq1:
		if key not in freq2:
			freq2[key] = 0
	for key in freq2:
		if key not in freq1:
			freq1[key] = 0
	#freq1 = dict( sorted(freq1.items(), key=lambda x: x[0]) )
	#freq2 = dict( sorted(freq2.items(), key=lambda x: x[0]) )
	#freq1 = {k: v for k, v in sorted(freq1.items(), key=lambda item: item[1], reverse=True)}
	#freq2 = {k: v for k, v in sorted(freq2.items(), key=lambda item: item[1], reverse=True)}
	return np.dot([freq1[key] for key in freq1], [freq2[key] for key in freq1])


In [3]:
# change fasta file into a list of sequences
# fasta reader
def read_fasta(file):
    with open(file, 'r') as f:
        lines = f.readlines()
    return lines

# fasta parser


def parse_fasta(lines):
    seq_list = []
    seq_name = [line for line in lines if line.startswith('>')]
    seq = ''
    for line, index in zip(lines, range(len(lines))):
        if index == len(lines) - 1:
            seq += line.strip()
            seq_list.append(seq)
        if line.startswith('>'):
            seq_list.append(seq)
            seq = ''
            continue
        else:
            seq += line.strip()
    for i in seq_list:
        if i == '':
            seq_list.remove(i)
    return seq_list, seq_name


In [49]:
seq, seq_name = parse_fasta(read_fasta('./kmeans/kmeans.fasta'))
#print(spectrum_kernel(seq1, seq2, 3))
print(seq_name)

['>intergenic1 /class=intergenic\n', '>intergenic1 /class=intergenic\n', '>intergenic2 /class=intergenic\n', '>intergenic2 /class=intergenic\n', '>intergenic3 /class=intergenic\n', '>intergenic3 /class=intergenic\n', '>intergenic4 /class=intergenic\n', '>intergenic4 /class=intergenic\n', '>intergenic5 /class=intergenic\n', '>intergenic5 /class=intergenic\n', '>intergenic6 /class=intergenic\n', '>intergenic6 /class=intergenic\n', '>intergenic7 /class=intergenic\n', '>intergenic7 /class=intergenic\n', '>intergenic8 /class=intergenic\n', '>intergenic8 /class=intergenic\n', '>intergenic9 /class=intergenic\n', '>intergenic9 /class=intergenic\n', '>intergenic10 /class=intergenic\n', '>intergenic10 /class=intergenic\n', '>intergenic11 /class=intergenic\n', '>intergenic11 /class=intergenic\n', '>intergenic12 /class=intergenic\n', '>intergenic12 /class=intergenic\n', '>intergenic13 /class=intergenic\n', '>intergenic13 /class=intergenic\n', '>intergenic14 /class=intergenic\n', '>intergenic14 /cl

In [44]:
seq1 = "AATCCG"
seq2 = "AATGCC"
print(spectrum_kernel(seq1, seq2, 2))


3


In [31]:
freq1 = freq_kmers(seq1, 2)
freq2 = freq_kmers(seq2, 2)
for key in freq1:
	if key not in freq2:
		freq2[key] = 0
for key in freq2:
	if key not in freq1:
		freq1[key] = 0

In [40]:
freq1 = dict( sorted(freq1.items(), key=lambda x: x[0]) )
freq2 = dict( sorted(freq2.items(), key=lambda x: x[0]) )
freq1

{'aa': 1, 'at': 1, 'cc': 1, 'cg': 1, 'gc': 0, 'tc': 1, 'tg': 0}

In [42]:
freq2

{'aa': 1, 'at': 1, 'cc': 1, 'cg': 0, 'gc': 1, 'tc': 0, 'tg': 1}

In [5]:
#KNN
def dist(seq1, seq2, k):
    return np.sqrt(spectrum_kernel(seq1, seq1, k)-2*spectrum_kernel(seq1, seq2, k)+spectrum_kernel(seq2, seq2, k))

def KNN(train, test, knn_num, kmer_size):
    seq_train = [] 
    seq_name_train = []
    for path in train:
        seq, seq_name = parse_fasta(read_fasta(path))
        seq_train += seq
        seq_name_train += seq_name
    seq_test, seq_name_test = parse_fasta(read_fasta(test))
    train_class_list = ["exons" if "exon" in seq_name_train[i] else "introns" for i in range(len(seq_name_train))]
    test_true_class_list = ["exons" if "exon" in seq_name_test[i] else "introns" for i in range(len(seq_name_test))]
    test_pred_class_list = []
    for i in range(len(seq_test)):
        dist_list = []
        for j in range(len(seq_train)):
            dist_list.append(dist(seq_test[i], seq_train[j], kmer_size))
        dist_list = np.array(dist_list)
        indexes = np.argsort(dist_list)
        exons_num = 0
        introns_num = 0
        for index in indexes[:knn_num]:
            if train_class_list[index] == "exons":
                exons_num += 1
            else:
                introns_num += 1
        if exons_num > introns_num:
            test_pred_class_list.append("exons")
        else:
            test_pred_class_list.append("introns")
    correct = 0
    for i, j in zip( test_true_class_list, test_pred_class_list):
        if i == j:
            correct += 1
    
    accuracy = correct/len(test_true_class_list)

    return accuracy

In [6]:
KNN(train = ["train-exons100.fasta", "train-introns100.fasta"], test = "test.fasta", knn_num = 5, kmer_size = 2)

0.97375

In [8]:
train = ["train-exons100.fasta", "train-exons100.fasta", "train-exons30.fasta", "train-introns10.fasta", "train-introns30.fasta", "train-introns100.fasta"]
test = "test.fasta"

In [20]:
knn_neighbors = [7]
kmer_sizes = [6,8]
#knn_results_list = []
for knn_num in knn_neighbors:
    for kmer_size in kmer_sizes:
        result = {}
        accuracy = KNN(train, test, knn_num, kmer_size)
        result['knn neighbors num'] = knn_num
        result['kmer size'] = kmer_size
        result['accuracy'] = accuracy
        knn_results_list.append(result)
        

In [21]:
knn_results_list

[{'knn neighbors num': 1, 'kmer size': 2, 'accuracy': 0.9625},
 {'knn neighbors num': 1, 'kmer size': 4, 'accuracy': 0.9625},
 {'knn neighbors num': 1, 'kmer size': 6, 'accuracy': 0.915},
 {'knn neighbors num': 1, 'kmer size': 8, 'accuracy': 0.82375},
 {'knn neighbors num': 3, 'kmer size': 2, 'accuracy': 0.9675},
 {'knn neighbors num': 3, 'kmer size': 4, 'accuracy': 0.965},
 {'knn neighbors num': 3, 'kmer size': 6, 'accuracy': 0.93125},
 {'knn neighbors num': 3, 'kmer size': 8, 'accuracy': 0.84625},
 {'knn neighbors num': 5, 'kmer size': 2, 'accuracy': 0.97},
 {'knn neighbors num': 5, 'kmer size': 4, 'accuracy': 0.965},
 {'knn neighbors num': 5, 'kmer size': 6, 'accuracy': 0.9475},
 {'knn neighbors num': 5, 'kmer size': 8, 'accuracy': 0.86375},
 {'knn neighbors num': 7, 'kmer size': 2, 'accuracy': 0.9725},
 {'knn neighbors num': 7, 'kmer size': 4, 'accuracy': 0.975},
 {'knn neighbors num': 7, 'kmer size': 6, 'accuracy': 0.96125},
 {'knn neighbors num': 7, 'kmer size': 8, 'accuracy': 0.

In [22]:
d = {'knn neighbors num':[],'kmer size':[],'accuracy':[]}
for i in range(len(knn_results_list)):
    d['knn neighbors num'].append(knn_results_list[i]['knn neighbors num'])
    d['kmer size'].append(knn_results_list[i]['kmer size'])
    d['accuracy'].append(knn_results_list[i]['accuracy'])


In [24]:
df = pd.DataFrame(data=d)
df.to_csv('KNNresults.csv', index=False)

In [26]:
df['accuracy'] = round(df['accuracy'] * 100,2)

In [28]:
df.to_csv('KNNresults.csv', index=False)

In [29]:
# change fasta file into a list of sequences
# fasta reader
def read_fasta(file):
    with open(file, 'r') as f:
        lines = f.readlines()
    return lines

# fasta parser


def parse_fasta(lines):
    seq_list = []
    seq = ''
    seq_name = []
    for line, index in zip(lines, range(len(lines))):
        if index == len(lines) - 1:
            seq += line.strip()
            seq_list.append(seq)
        if line.startswith('>'):
            seq_list.append(seq)
            seq = ''
            name = line.strip()
            seq_name.append(name.split(" /")[0][1:])
            continue
        else:
            seq += line.strip()
    for i in seq_list:
        if i == '':
            seq_list.remove(i)
    return seq_list,seq_name

In [30]:
def diff(key1, key2):
    list1 = list(key1)
    list2 = list(key2)
    dif = 0
    for i in range(len(list1)):
        if list1[i] != list2[i]:
            dif += 1
    return dif


In [123]:
def neighborhood(seq, key, k):
    kmers = get_kmers(seq, k)
    neighbors = []
    for i in range(len(kmers)):
        if diff(kmers[i], key) == 1:
            neighbors.append(kmers[i])
    return neighbors

def mismatch_freq_kmers(seq, k):
    kmers = get_kmers(seq, k)
    freq = {}
    for kmer in kmers:
        if kmer not in freq:
            freq[kmer] = 1
        else:
            freq[kmer] += 1
    for key in freq:
          freq[key] += neighborhood(seq, key, k).count(key)
    return freq




In [139]:
def mismatch_spectrum_kernel(seq1, seq2, k):
	freq1 = mismatch_freq_kmers(seq1, k)
	freq2 = mismatch_freq_kmers(seq2, k)
	for key in freq1:
		if key not in freq2:
			freq2[key] = 0
	for key in freq2:
		if key not in freq1:
			freq1[key] = 0
	similarity = np.dot([freq1[key] for key in freq1], [freq2[key] for key in freq1])
	#normalization = np.dot(math.sqrt(np.dot([freq1[key] for key in freq1], [freq1[key] for key in freq1])),math.sqrt(np.dot([freq2[key] for key in freq1], [freq2[key] for key in freq1])))
	#similarity = similarity/normalization 
    #return similarity, [freq1[key] for key in freq1], [freq2[key] for key in freq1]
    #return similarity


IndentationError: unindent does not match any outer indentation level (<tokenize>, line 14)

In [147]:
def mismatch_spectrum_kernel(seq1,seq2,k):
    freq1 = mismatch_freq_kmers(seq1, k)
    freq2 = mismatch_freq_kmers(seq2, k)
    for key in freq1:
	    if key not in freq2:
	        freq2[key] = 0
    for key in freq2:
		if key not in freq1:
			freq1[key] = 0
    similarity = np.dot([freq1[key] for key in freq1], [freq2[key] for key in freq1])


TabError: inconsistent use of tabs and spaces in indentation (1879938857.py, line 8)

In [149]:
def mismatch_spectrum_kernel(seq1,seq2,k):
    freq1 = mismatch_freq_kmers(seq1, k)
    freq2 = mismatch_freq_kmers(seq2, k)
    for key in freq1:
        if key not in freq2:
            freq2[key] = 0
    for key in freq2:
        if key not in freq1:
            freq1[key] = 0
    similarity = np.dot([freq1[key] for key in freq1], [freq2[key] for key in freq1])
    normalization = np.dot(math.sqrt(np.dot([freq1[key] for key in freq1], [freq1[key] for key in freq1])),math.sqrt(np.dot([freq2[key] for key in freq1], [freq2[key] for key in freq1])))
    similarity = similarity/normalization 
    return similarity, [freq1[key] for key in freq1], [freq2[key] for key in freq1]

In [150]:
def kmeans(seqpath, ncluster, kmer_size, maxniter, distfunc):
	seq_list, seq_names = parse_fasta(read_fasta(seqpath))
	centroids = np.random.randint(0, len(seq_list), ncluster)
	# initialize class_list
	class_list = {}
	seq_kernel_list = []
	for seq in seq_list:
		similarity_list = []
		for centroid in centroids:
			similarity, seq_spectrum, centroid_spectrum = distfunc(
                			seq, seq_list[centroid], kmer_size)
			seq_kernel_list.append(seq_spectrum)
			similarity_list.append(similarity)

		class_list[seq] = np.argmax(similarity_list)
	# update centroids 1st time
	centroids = []
	for i in range(ncluster):
		tmp_list = []
		for seq, seq_kernel in zip(seq_list, seq_kernel_list):
			if class_list[seq] == i:
				tmp_list.append(seq_kernel)
		centroids.append(np.mean(tmp_list, axis=0))
	centroids_new = centroids.copy()
	old_class_list = class_list.copy()
	for i in range(maxniter):
		# update class_list
		for seq,seq_kernel in zip(seq_list, seq_kernel_list):
			similarity_list = []
			for centroid in centroids_new:
				similarity = np.dot(seq_kernel, centroid)
				similarity_list.append(similarity)
			class_list[seq] = np.argmax(similarity_list)
		if class_list == old_class_list:
			break
		old_class_list = class_list.copy()
		# update centroids
		centroids_new = []
		for i in range(ncluster):
			tmp_list = []
			for seq, seq_kernel in zip(seq_list, seq_kernel_list):
				if class_list[seq] == i:
					tmp_list.append(seq_kernel)
			centroids_new.append(np.mean(tmp_list, axis=0))
		


	return centroids_new, class_list


In [151]:
seqspath = "kmeans.fasta"
centroids, class_list = kmeans(seqspath, 3, 2, 3, mismatch_spectrum_kernel)
#centroids

In [50]:
seqpath = "kmeans.fasta"
ncluster = 3
kmer_size = 2
maxniter = 3

In [64]:
centroids

array([ 581, 1329, 1444])

In [65]:
class_list

[2,
 2,
 0,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 2,
 2,
 0,
 2,
 0,
 1,
 2,
 1,
 0,
 1,
 2,
 2,
 1,
 0,
 0,
 2,
 2,
 0,
 0,
 2,
 2,
 2,
 0,
 2,
 2,
 2,
 2,
 0,
 2,
 2,
 2,
 1,
 0,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 2,
 2,
 2,
 2,
 1,
 2,
 2,
 0,
 1,
 2,
 0,
 2,
 1,
 2,
 2,
 2,
 2,
 2,
 1,
 2,
 1,
 1,
 2,
 2,
 1,
 0,
 2,
 2,
 2,
 1,
 1,
 2,
 0,
 1,
 2,
 2,
 2,
 1,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 2,
 2,
 2,
 0,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 2,
 0,
 2,
 0,
 0,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 0,
 2,
 2,
 2,
 0,
 2,
 0,
 2,
 2,
 2,
 1,
 1,
 2,
 2,
 2,
 2,
 0,
 2,
 2,
 1,
 2,
 2,
 1,
 0,
 2,
 2,
 2,
 0,
 2,
 2,
 2,
 1,
 2,
 2,
 2,
 0,
 2,
 1,
 2,
 0,
 2,
 0,
 2,
 0,
 2,
 2,
 2,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 2,
 2,
 0,
 2,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 0,
 0,
 2,
 2,
 2,
 2,
 1,
 0,
 2,
 0,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 1,
 2,
 2,
 1,
 2,
 1,
 1,
 0,
 2,
 1,
 2,
 2,
 2,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 2,
 2,
 0,
 2,
 2,
 0,
 2,
 2,
 2,
 2,
 0,
 2,
 1,


In [52]:
seq_list, seq_names = parse_fasta(read_fasta(seqpath))
centroids = np.random.randint(0, len(seq_list), ncluster)
	# initialize class_list
class_list = {}
seq_kernel_list = []
for seq in seq_list:
	similarity_list = []
	for centroid in centroids:
		similarity, seq_spectrum, centroid_spectrum = mismatch_spectrum_kernel(
                			seq, seq_list[centroid], kmer_size)
		seq_kernel_list.append(seq_spectrum)
		similarity_list.append(similarity)

	class_list[seq] = np.argmax(similarity_list)

In [53]:
centroids = []
for i in range(ncluster):
	tmp_list = []
	for seq, seq_kernel in zip(seq_list, seq_kernel_list):
		if class_list[seq] == i:
			tmp_list.append(seq_kernel)
	centroids.append(np.mean(tmp_list, axis=0))
centroids_new = centroids.copy()
old_class_list = class_list.copy()

In [54]:
centroids

[array([24.67346939, 21.72108844, 24.67346939, 23.34013605, 22.49659864,
        20.68027211, 19.65986395, 20.70068027, 20.60544218, 17.70748299,
        18.23129252, 16.34013605, 14.30612245, 14.75510204, 11.24489796,
         7.86394558]),
 array([24.76612127, 23.71703561, 23.94898941, 22.3358999 , 23.74590953,
        22.30221367, 20.1973051 , 19.52743022, 20.51973051, 18.07122233,
        17.6294514 , 16.4080847 , 14.9894129 , 11.75264678, 10.92396535,
         8.16458133]),
 array([25.62101911, 22.0477707 , 25.29617834, 23.22292994, 21.17834395,
        22.17197452, 20.20382166, 20.24203822, 18.7611465 , 18.14968153,
        17.47452229, 17.47133758, 14.09872611, 12.99044586, 11.6910828 ,
         8.37898089])]

In [56]:
for seq,seq_kernel in zip(seq_list, seq_kernel_list):
	similarity_list = []
	for centroid in centroids_new:
		similarity = np.dot(seq_kernel, centroid)
		similarity_list.append(similarity)
	class_list[seq] = np.argmax(similarity_list)

old_class_list = class_list.copy()
		# update centroids
centroids_new = []
for i in range(ncluster):
	tmp_list = []
	for seq, seq_kernel in zip(seq_list, seq_kernel_list):
		if class_list[seq] == i:
			tmp_list.append(seq_kernel)
	centroids_new.append(np.mean(tmp_list, axis=0))

In [57]:
centroids_new

[array([20.34736842, 17.2       , 23.4       , 24.15789474, 21.09473684,
        15.36842105, 17.27368421, 24.68421053, 24.18947368, 17.44210526,
        20.97894737, 15.35789474, 16.05263158, 20.03157895, 12.41052632,
         9.01052632]),
 array([23.99619772, 25.76425856, 21.77186312, 20.53231939, 27.73764259,
        25.0418251 , 20.89353612, 18.35361217, 21.94296578, 17.57794677,
        16.69201521, 15.64638783, 15.36882129,  9.85931559,  9.95057034,
         7.87072243]),
 array([29.74647887, 22.36619718, 29.5915493 , 25.45774648, 15.8028169 ,
        21.21126761, 20.68309859, 19.18309859, 14.16197183, 19.33802817,
        17.21830986, 19.28169014, 12.68309859, 11.66901408, 12.4084507 ,
         8.1971831 ])]

In [154]:
seqspath = "kmeans.fasta"
ncluster = 3
kmer_size = 3
maxniter = 100
centroids, class_list = kmeans(seqpath, ncluster, kmer_size, maxniter, mismatch_spectrum_kernel)

  arr = asanyarray(a)


ValueError: shapes (59,) and (63932,) not aligned: 59 (dim 0) != 63932 (dim 0)

In [152]:
def printtest(seq_list,seq_names,class_list,kmer_size,ncluster,title):
    print('\n', title, ' (KMER =', kmer_size, ", CLUSTERS =", ncluster, "):")
    for i in range(ncluster):
        exon = 0
        intron = 0
        intergenic = 0
        print('\n', 'CLUSTER ',i + 1,":")
        for seq, annotation in zip(seq_list,seq_names):
            if class_list[seq] == i:
                if "exon" in annotation:
                    exon += 1
                if "intron" in annotation:
                    intron += 1
                if "intergenic" in annotation:
                    intergenic += 1
        sum = intergenic + exon + intron
        print('\n','    intergenic =', round(intergenic/sum,2),' (', intergenic, ')')
        print('\n','    intron =', round(intron/sum,2),' (', intron, ')')
        print('\n','    exon =', round(exon/sum,2),' (', exon, ')')

In [153]:
printtest(seq_list,seq_names,class_list,kmer_size,ncluster,"MISMATCH")


 MISMATCH  (KMER = 2 , CLUSTERS = 3 ):

 CLUSTER  1 :

     intergenic = 0.26  ( 116 )

     intron = 0.43  ( 191 )

     exon = 0.31  ( 140 )

 CLUSTER  2 :

     intergenic = 0.37  ( 222 )

     intron = 0.29  ( 174 )

     exon = 0.34  ( 207 )

 CLUSTER  3 :

     intergenic = 0.36  ( 162 )

     intron = 0.3  ( 135 )

     exon = 0.34  ( 153 )


In [73]:
zip(seq_list,seq_names)

<zip at 0x7fdc7dcc7080>

In [80]:
import math
math.sqrt(4)

2.0

TypeError: only size-1 arrays can be converted to Python scalars