From 1e1b2a1d873e1270a1295d92fda56faa8c3f9774 Mon Sep 17 00:00:00 2001
From: Pjotr Prins <pjotr.public01@thebird.nl>
Date: Mon, 11 Apr 2016 15:48:53 +0200
Subject: [PATCH] Mongo example with python

---
 doc/Using_Mongo.md | 324 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 324 insertions(+)
 create mode 100644 doc/Using_Mongo.md

diff --git a/doc/Using_Mongo.md b/doc/Using_Mongo.md
new file mode 100644
index 0000000..6a62556
--- /dev/null
+++ b/doc/Using_Mongo.md
@@ -0,0 +1,324 @@
+# Using bio-vcf with MongoDB
+
+bio-vcf can output many types of formats. In this exercise we will load
+Mongo with VCF data and do some queries on that.
+
+## Install Mongo (Debian)
+
+With su (password 'bioinformatics')
+
+```sh
+su
+apt-get install mongodb
+```
+
+## Install Mongo in $HOME
+
+Mongo comes with many distributions. Here we installed with guix. Check
+
+```sh
+guix package -A mongodb
+  mongodb 3.3.3   out     gn/packages/mongodb.scm:31:2
+```
+
+Create a directory for the database
+
+```sh
+mkdir -p ~/opt/var/mongodb
+mkdir -p ~/opt/etc
+```
+
+And create a configuration file ~/opt/etc/mongo.conf
+
+```
+verbose = true
+port = 27017
+dbpath = /home/user/opt/var/mongodb/
+noauth = true
+maxConns = 5
+rest = true
+```
+
+and run Mongo
+
+```sh
+env LC_ALL=C mongod --config ~/opt/etc/mongo.conf
+```
+
+```ruby
+use admin
+db.createUser({user:"admin", pwd:"admin123", roles:[{role:"root", db:"admin"}]})
+```
+
+## Use client
+
+mongo
+use admin
+db.createUser({user:"admin", pwd:"admin123", roles:[{role:"root", db:"admin"}]})
+
+## Tutorial Mongo
+
+Using the example on MongoDB's [website](https://docs.mongodb.org/getting-started/shell/import-data/)
+
+### Load data
+
+Records look like:
+
+```javascript
+{"address": {"building": "2780", "coord": [-73.98241999999999, 40.579505], "street": "Stillwell Avenue", "zipcode": "11224"}, "borough": "Brooklyn", "cuisine": "American ", "grades": [{"date": {"$date": 1402358400000}, "grade": "A", "score": 5}, {"date": {"$date": 1370390400000}, "grade": "A", "score": 7}, {"date": {"$date": 1334275200000}, "grade": "A", "score": 12}, {"date": {"$date": 1318377600000}, "grade": "A", "score": 12}], "name": "Riviera Caterer", "restaurant_id": "40356018"}
+{"address": {"building": "351", "coord": [-73.98513559999999, 40.7676919], "street": "West   57 Street", "zipcode": "10019"}, "borough": "Manhattan", "cuisine": "Irish", "grades": [{"date": {"$date": 1409961600000}, "grade": "A", "score": 2}, {"date": {"$date": 1374451200000}, "grade": "A", "score": 11}, {"date": {"$date": 1343692800000}, "grade": "A", "score": 12}, {"date": {"$date": 1325116800000}, "grade": "A", "score": 12}], "name": "Dj Reynolds Pub And Restaurant", "restaurant_id": "30191841"}
+```
+
+Note there are no specific identifiers. Or are there?
+
+```sh
+wget https://raw.githubusercontent.com/mongodb/docs-assets/primer-dataset/primer-dataset.json
+mongoimport --db test --collection restaurants --drop --file primer-dataset.json
+Mon Apr 11 00:24:50.963 dropping: test.restaurants
+Mon Apr 11 00:24:52.375 check 9 25359
+Mon Apr 11 00:24:52.448 imported 25359 objects
+```
+
+### Use the shell
+
+Run the mongo shell with
+
+```sh
+mongo
+```
+
+```ruby
+use test
+db.restaurants.find()
+db.restaurants.find( { "borough": "Manhattan" } )
+db.restaurants.find( { "grades.score": { $gt: 30 } } )
+... AND ...
+db.restaurants.find( { "cuisine": "Italian", "address.zipcode": "10075" ,"grades.score": { $gt: 30 }} )
+... OR ...
+db.restaurants.find(
+   { $or: [ { "cuisine": "Italian" }, { "address.zipcode": "10075" } ] }
+)
+... SORT ...
+db.restaurants.find().sort( { "borough": 1, "address.zipcode": 1 } )
+... Count ...
+db.restaurants.aggregate(
+   [
+     { $group: { "_id": "$borough", "count": { $sum: 1 } } }
+   ]
+   );
+
+db.restaurants.aggregate(
+   [
+     { $match: { "borough": "Queens", "cuisine": "Brazilian" } },
+     { $group: { "_id": "$address.zipcode" , "count": { $sum: 1 } } }
+   ]
+   );
+... Index ...
+db.restaurants.createIndex( { "cuisine": 1, "address.zipcode": -1 } )
+```
+
+### Prepare template with bio-vcf
+
+```sh
+wget https://github.com/pjotrp/bioruby-vcf/raw/master/test/data/input/gatk_exome.vcf
+cat gatk_exome.vcf |bio-vcf --eval '[r.chr,r.pos]'
+```
+
+Let's create a template named gatk_template.json
+
+```ruby
+
+{
+            "chr": "<%= rec.chrom %>",
+            "pos": <%= rec.pos %>,
+            "ref": "<%= rec.ref %>",
+            "alt": "<%= rec.alt[0] %>",
+            "dp":  <%= rec.info.dp %>
+}
+```
+
+And run it
+
+```sh
+cat gatk_exome.vcf |bio-vcf --template gatk_template.json |less
+cat gatk_exome.vcf |bio-vcf --template gatk_template.json > gatk_exome.json
+```
+
+Looks like
+
+```
+{
+  "rec": {
+            "chr": "X",
+            "pos": 134713855,
+            "ref": "G",
+            "alt": "A",
+            "dp":  4
+   }
+}
+```
+
+Import into mongo
+
+```sh
+mongoimport --db gatk --collection vcf --drop --file gatk_exome.json
+mongoimport --db gatk --collection vcf --drop --file gatk_exome.json --jsonArray
+```
+
+```ruby
+use gatk
+db.vcf.find()
+db.vcf.find( { "rec.chr": "X" } )
+db.vcf.find( { "rec.chr": "X" } ).count()
+3
+db.vcf.find( { "rec.dp": { $gt: 5 }}  )
+db.vcf.find( { "rec.dp": { $gt: 5 }}  ).count()
+25
+```
+
+Comparable bio-vcf statements
+
+```
+cat gatk_exome.vcf |bio-vcf --eval '[r.chr,r.pos,r.ref,r.alt,r.info.dp]' --filter "r.chr=='X'"|grep -v '#' |wc -l
+=>"[r.chr,r.pos,r.ref,r.alt,r.info.dp]", :filter=>"r.chr=='X'"}
+3
+cat gatk_exome.vcf |bio-vcf --eval '[r.chr,r.pos,r.ref,r.alt,r.info.dp]' --filter "r.info.dp>5"|grep -v '#' |wc -l
+=>"[r.chr,r.pos,r.ref,r.alt,r.info.dp]", :filter=>"r.info.dp>5"}
+25
+```
+
+Exercise 1.
+
+With bio-vcf take the field "Variant Confidence/Quality by Depth" and
+filter on QD>12.0. How many matches? Answer 112 out of 175
+
+Exercise 2.
+
+Do the same with MongoDB. So you can do
+
+```ruby
+db.vcf.find( { "rec.qd": { $gt: 12.0 }}  ).count()
+112
+```
+
+## Now for some real data
+
+Let's use our PIK3CA data in two 
+
+```
+cat gene_PIK3CA.vcf |bio-vcf --samples 2,3  --seval s.dp
+cat gene_PIK3CA.vcf |bio-vcf --sfilter-samples 2,3  --seval s.dp --sfilter "s.dp>7"
+cat gene_PIK3CA.vcf |bio-vcf --sfilter-samples 0,3 --sfilter 's.dp>20' --seval s.dp
+3       178916645       24      39
+3       178916651       30      31
+3       178921407       32      43
+3       178936082       24      24
+3       178936091       27      32
+3       178947904       23      33
+3       178952072       38      45
+3       178952085       35      45
+3       178952088       34      45
+```
+
+Looking at annotations
+
+```
+cat gene_PIK3CA.vcf |bio-vcf --eval [r.chr,r.pos,r.info.ann] |grep ENST00000263967|wc -l
+30
+```
+
+alternative
+
+```
+cat gene_PIK3CA.vcf |bio-vcf --eval '[r.chr,r.pos,r.info.ann]' --filter 'r.info.ann =~ /ENST00000263967/' --seval 's.dp'
+3       178921407       T|synonymous_variant|LOW|PIK3CA|ENSG00000121879|transcript|ENST00000263967|protein_coding|5/21|c.889C>T|p.Leu297Leu|1046/9093|889/3207|297/1068||    32      32      38      43      27      34      30      37      32      36      44      37  25       27      43      30      11      23      19      37      28      17      13 ...
+```
+
+Let's try and do the same with Mongo
+
+```
+{
+  "rec": {
+            "chr": "<%= rec.chrom %>",
+            "pos": <%= rec.pos %>,
+            "ref": "<%= rec.ref %>",
+            "alt": "<%= rec.alt[0] %>",
+            "dp":  <%= rec.info.dp %>,
+            "ann":  "<%= rec.info.ann %>"
+   }
+}
+```
+
+```sh
+mongoimport --db PIK3CA --collection vcf --drop --file PIK3CA.json --jsonArray
+```
+
+```ruby
+db.vcf.find({"rec.ann": /ENST00000263967/i }).count()
+30
+```
+
+## Load results into Python
+
+```sh
+guix package -i python2-pip
+export PYTHONPATH="/home/user/.guix-profile/lib/python2.7/site-packages"
+pip install pymongo
+pip install --install-option="--prefix=$HOME/opt/python" pymongo
+export PYTHONPATH="/home/user/.guix-profile/lib/python2.7/site-packages:$HOME/opt/python/lib/python2.7/site-packages"
+```
+
+```python
+from pymongo import MongoClient
+
+client = MongoClient()
+db = client.test
+# cursor = db.restaurants.find()
+cursor = db.restaurants.find({"borough": "Manhattan"})
+for document in cursor:
+    print(document)
+    print(document["cuisine"])
+    print(document["grades"][0]["score"]>10)
+
+```
+
+## Exercise 1
+
+Write a Python script which queries the PIK3CA VCF file for the annotation as in
+
+
+```ruby
+db.vcf.find({"rec.ann": /ENST00000263967/i }).count()
+30
+```
+
+## Exercise 2
+
+Write a Python mongo script which queries the PIK3CA file for something
+similar to
+
+```sh
+cat gene_PIK3CA.vcf |bio-vcf --sfilter-samples 2,3  --seval s.dp --sfilter "s.dp>7"
+```
+
+when the bio-vcf template is
+
+```ruby
+
+{
+  "rec": {
+            "chr": "<%= rec.chrom %>",
+            "pos": <%= rec.pos %>,
+            "ref": "<%= rec.ref %>",
+            "alt": "<%= rec.alt[0] %>",
+            "dp":  <%= rec.info.dp %>,
+            "samples": [
+              <%=
+              a = []
+              rec.each_sample { |s| a.push s.dp }
+              a.join(',')
+              %>
+   ]
+  }
+}
+```