## Raw parquet file reading

In [ ]:
val vcf = "/tmp/6-sample.vcf"
val localParquet = vcf+"6"+".parquet"

vcf: String = /tmp/6-sample.vcf
localParquet: String = /tmp/6-sample.vcf6.parquet


In [ ]:
val rawDF = sparkSession.read.parquet(localParquet)

rawDF: org.apache.spark.sql.DataFrame = [sampleId: string, variantId: string ... 1 more field]


In [ ]:
// populations to select
val pops = Set("GBR", "ASW", "CHB")

pops: scala.collection.immutable.Set[String] = Set(GBR, ASW, CHB)


In [ ]:
import sys.process._

val panelFile = "/tmp/ALL.panel"

s"wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/integrated_call_samples_v3.20130502.ALL.panel -O ${panelFile}"!!

by making the implicit value scala.language.postfixOps visible.
This can be achieved by adding the import clause 'import scala.language.postfixOps'
or by setting the compiler option -language:postfixOps.
See the Scaladoc for value scala.language.postfixOps for a discussion
why the feature should be explicitly enabled.
       s"wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/integrated_call_samples_v3.20130502.ALL.panel -O ${panelFile}"!!
                                                                                                                                      ^
import sys.process._
panelFile: String = /tmp/ALL.panel
res4: String = ""


In [ ]:
import scala.io.Source
def extract(filter: (String, String) => Boolean= (s, t) => true) = Source.fromFile(panelFile).getLines().map( line => {
  val toks = line.split("\t").toList
  toks(0) -> toks(1)
}).toMap.filter( tup => filter(tup._1, tup._2) )
  
// panel extract from file, filtering by the 2 populations
val panel: Map[String,String] = 
  extract((sampleID: String, pop: String) => pops.contains(pop)) 
  
val panelSamples = panel.keys.toList
// broadcast the panel 
//val bPanel = sparkContext.broadcast(panel)

import scala.io.Source
extract: (filter: (String, String) => Boolean)scala.collection.immutable.Map[String,String]
panel: Map[String,String] = Map(HG00139 -> GBR, NA18550 -> CHB, HG00254 -> GBR, NA18630 -> CHB, NA18532 -> CHB, NA18641 -> CHB, HG00128 -> GBR, NA18631 -> CHB, NA19920 -> ASW, NA18636 -> CHB, HG00232 -> GBR, NA20318 -> ASW, HG01789 -> GBR, NA18542 -> CHB, HG00109 -> GBR, NA20355 -> ASW, NA18614 -> CHB, HG00119 -> GBR, NA18625 -> CHB, HG00258 -> GBR, NA19819 -> ASW, NA18564 -> CHB, NA18595 -> CHB, HG00243 -> GBR, HG00159 -> GBR, NA19917 -> ASW, NA18553 -> CHB, NA18573 -> CHB, HG00265 -> GBR, HG00105 -> GBR, HG00117 -> GBR, NA20320 -> ASW, NA18647 -> CHB, NA18620 -> CHB, HG00127 -> GBR, NA18539 -> CHB, NA20359 -> ASW, NA20340 -> ASW, NA19834 -> ASW, NA18579 -> CHB, NA18591 ->...

In [ ]:
panelSamples

res7: List[String] = List(HG00139, NA18550, HG00254, NA18630, NA18532, NA18641, HG00128, NA18631, NA19920, NA18636, HG00232, NA20318, HG01789, NA18542, HG00109, NA20355, NA18614, HG00119, NA18625, HG00258, NA19819, NA18564, NA18595, HG00243, HG00159, NA19917, NA18553, NA18573, HG00265, HG00105, HG00117, NA20320, NA18647, NA18620, HG00127, NA18539, NA20359, NA20340, NA19834, NA18579, NA18591, HG00116, HG00106, HG00146, NA18557, NA18597, NA18562, HG00251, NA18609, HG00262, NA18603, NA18643, NA20281, NA19982, NA19711, NA18613, HG00240, NA18536, HG00151, NA20334, NA19700, NA20348, HG00099, HG00138, NA18546, NA18610, NA18531, NA18637, NA18525, NA18545, NA18621, NA18745, NA18626, HG00242, HG00150, NA19901, NA19712, NA20362, HG00239, NA19701, HG00149, HG00123, HG00097, HG00112, NA20342, HG0026...

In [ ]:
val finalDF = rawDF.filter($"sampleId" isin(panelSamples:_*))

finalDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [sampleId: string, variantId: string ... 1 more field]


In [ ]:
finalDF

res10: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [sampleId: string, variantId: string ... 1 more field]


In [ ]:
val sampleCount = finalDF.select($"sampleId").distinct.count
s"#Samples: $sampleCount"

sampleCount: Long = 255
res10: String = #Samples: 255


In [ ]:
val selectedVariants = finalDF.groupBy($"variantId").count.filter($"count" === sampleCount).select("variantId")

selectedVariants: org.apache.spark.sql.DataFrame = [variantId: string]


In [ ]:
val completeDF = finalDF.join(selectedVariants, finalDF("variantId") === selectedVariants("variantId"))
                        .select($"sampleId",finalDF("variantId"),$"genotype")

completeDF: org.apache.spark.sql.DataFrame = [sampleId: string, variantId: string ... 1 more field]


In [ ]:
val genoTable = completeDF.groupBy($"sampleId").pivot("variantId").agg(max("genotype"))

genoTable: org.apache.spark.sql.DataFrame = [sampleId: string, 6:1000012:1000013: double ... 8238 more fields]


In [ ]:
val featuresCols = genoTable.columns.drop(1)

featuresCols: Array[String] = Array(6:1000012:1000013, 6:1000110:1000111, 6:1000153:1000154, 6:1000166:1000167, 6:1000169:1000170, 6:1000174:1000175, 6:1000222:1000223, 6:1000279:1000280, 6:1000310:1000311, 6:1000350:1000351, 6:1000407:1000408, 6:1000458:1000459, 6:1000459:1000460, 6:1000475:1000476, 6:1000490:1000491, 6:1000524:1000525, 6:1000536:1000537, 6:1000542:1000543, 6:1000603:1000604, 6:1000716:1000717, 6:1000731:1000732, 6:1000755:1000761, 6:1000816:1000817, 6:1000878:1000879, 6:1000882:1000883, 6:1000883:1000884, 6:1000902:1000905, 6:1000904:1020635, 6:1000908:1000909, 6:1000909:1000910, 6:1001029:1001030, 6:1001082:1001083, 6:1001168:1001169, 6:1001206:1001207, 6:1001209:1001210, 6:1001210:1001211, 6:1001229:1001230, 6:1001247:1001248, 6:1001260:1001261, 6:1001277:1001278, 6...

In [ ]:
import org.apache.spark.ml.feature.VectorAssembler

val assembler = new VectorAssembler()
  .setInputCols(featuresCols)
  .setOutputCol("features")

import org.apache.spark.ml.feature.VectorAssembler
assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_f4d25fcc18ab


In [ ]:
val trainingSet = assembler.transform(genoTable).select("sampleId", "features")
trainingSet.cache()
trainingSet.write.parquet("/tmp/genome.training.parquet")


In [ ]:
import org.apache.spark.ml.clustering.KMeans
//import org.apache.spark.ml.evaluation.ClusteringEvaluator


import org.apache.spark.ml.clustering.KMeans


In [ ]:
val kmeans = new KMeans().setK(3).setSeed(1L)
val model = kmeans.fit(trainingSet)

// Make predictions
val predictions = model.transform(trainingSet)

In [ ]:
val confMat = predictions.collect.toMap.values
    .groupBy(_._2).mapValues(_.map(_._1))
    .mapValues(xs => (1 to 3).map( i => xs.count(_ == i-1)).toList)

In [ ]:
<table>
<tr><td></td><td>#0</td><td>#1</td><td>#2</td></tr>
{ for (popu <- confMat) yield
  <tr><td>{popu._1}</td> { for (cnt <- popu._2) yield 
    <td>{cnt}</td>
  }
  </tr>
}
</table>