In [1]:
import pandas as pd
import vcf
import numpy as np
from scipy.stats import mode
import sklearn.metrics
from sklearn import cluster

In [2]:
def GetResultsArray(svnCalls, results):
    """Build a 0/1 truth vector aligned with the rows of ``svnCalls``.

    Parameters
    ----------
    svnCalls : table exposing ``len()`` and an ``'END'`` column
        One entry of the output is produced per value of ``svnCalls['END']``.
    results : iterable of records with a ``POS`` attribute
        Iterated exactly once to collect the set of truth positions.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(svnCalls)``; an entry is 1 when the
        row's ``END`` value equals the ``POS`` of at least one record,
        otherwise 0.
    """
    # NOTE(review): only the position is compared; the chromosome is not
    # consulted, so equal positions on different chromosomes would also
    # count as matches -- confirm this is acceptable for the data set.
    truthPositions = {record.POS for record in results}

    truthColumn = np.zeros(len(svnCalls))
    matchingRows = [
        row for row, end in enumerate(svnCalls['END']) if end in truthPositions
    ]
    truthColumn[matchingRows] = 1

    return truthColumn

In [4]:
# Load the tab-separated call table, the truth VCF and the feature table.
# CHROM is read as text instead of letting pandas infer a dtype.  The builtin
# `str` replaces `pd.np.str`: the `pandas.np` alias and the `np.str` alias are
# both gone from current releases, and `str` is the same dtype everywhere.
svnCalls = pd.read_csv('SNVCalls_IS1.txt', delimiter='\t', dtype={'CHROM': str})
# NOTE(review): the file handle is owned by the reader and is never closed
# explicitly; the reader is consumed later by GetResultsArray.
results = vcf.Reader(open('synthetic.challenge.set1.tumor.all.truth.vcf', 'rb'))
genomicFeatures = pd.read_csv('GenomicFeatures_IS1.txt', delimiter='\t', dtype={'CHROM': str})

In [5]:
# Ground-truth 0/1 vector, one entry per row of svnCalls.
# NOTE(review): GetResultsArray iterates `results`; if the VCF reader is a
# one-shot iterator, re-running this cell without re-running the loading
# cell would find no records -- confirm.
resultsArray = GetResultsArray(svnCalls=svnCalls, results=results)

In [6]:
# Row-wise majority vote over every column from the fourth up to, but not
# including, the last one.
# NOTE(review): assumes those columns hold the per-caller 0/1 calls -- confirm
# against the file header.
mostCommonPredictions = mode(svnCalls[svnCalls.columns[3:-1]], axis=1)[0]
# f1_score takes (y_true, y_pred).  np.ravel flattens the prediction so that
# both an (n, 1) and an (n,) mode result are accepted.
sklearn.metrics.f1_score(resultsArray, np.ravel(mostCommonPredictions))

0.96532793990176247

In [7]:
def ValuesMatch(prediction, result):
    """Return 1 when ``prediction`` equals ``result``, otherwise 0."""
    if prediction == result:
        return 1
    return 0

# Element-wise version of ValuesMatch: 1 where the majority vote agrees with
# the truth vector, 0 where it does not.
valuesMatchFunc = np.vectorize(ValuesMatch)
# np.ravel instead of `[:, 0]`: the column slice raises IndexError when
# scipy's mode returns a 1-D array (its default without keepdims in newer
# releases); ravel yields the same flat vector for both shapes.
correctPredictions = valuesMatchFunc(np.ravel(mostCommonPredictions), resultsArray)

In [None]:
num_clusters = 6
# A fixed random_state makes the clustering, and therefore the cluster label
# numbering that later cells index by, reproducible across kernel restarts.
k_means = cluster.KMeans(n_clusters=num_clusters, random_state=0)
# The selected columns are transposed, so each column of svnCalls becomes one
# sample and each row becomes one feature.
k_means.fit(svnCalls[svnCalls.columns[3:-1]].T)

In [None]:
def TransformEuclideanPointsToPredictions(point):
    """Threshold a coordinate at 0.5: values >= 0.5 map to 1, the rest to 0."""
    if point >= 0.5:
        return 1
    return 0

# Round every cluster centre to a 0/1 vector (one entry per row of svnCalls,
# because the model was fitted on the transposed table) and score it against
# the truth vector.
vfunc = np.vectorize(TransformEuclideanPointsToPredictions)
for i in range(num_clusters):
    clusterPredictions = vfunc(k_means.cluster_centers_[i])
    # print(...) with one argument behaves the same on Python 2 and 3;
    # f1_score takes (y_true, y_pred).
    print(sklearn.metrics.f1_score(resultsArray, clusterPredictions))

In [None]:
# k_means was fitted on the transposed table, so labels_ has one entry per
# selected column.  Pick the columns by *label* through a boolean mask:
# indexing a DataFrame with an integer array via [] looks the integers up as
# column labels and only worked through an old positional fallback.
# NOTE(review): cluster 0 is hard-coded as the "best" cluster -- confirm it is
# the one with the highest F1 printed by the per-cluster scoring loop.
callerColumns = svnCalls.columns[3:-1]
bestCluster = svnCalls[callerColumns[k_means.labels_ == 0]]
# Row-wise majority vote restricted to the columns of that cluster.
clusterPredictions = mode(bestCluster, axis=1)[0]

In [None]:
# Score against the 0/1 truth vector (`resultsArray`), not the VCF reader
# object (`results`).  f1_score takes (y_true, y_pred); np.ravel flattens the
# mode output whether it is (n, 1) or (n,).
sklearn.metrics.f1_score(resultsArray, np.ravel(clusterPredictions))

In [None]:
# Preview only the first rows of the selected columns instead of rendering
# the whole table.
svnCalls[svnCalls.columns[3:-1]].head()