### Calculating PRS using VCF files


In [3]:
from __future__ import division
from pyspark import SparkConf, SparkContext
from operator import add
import re
import glob, os
import csv
from collections import Counter
import ntpath
import functools
from functools import reduce
from math import log
import itertools
import PRS_VCF_utils
APP_NAME="MAVANvcfPRS"  # NOTE(review): not referenced in the visible cells; the Spark setup cell defines its own app_name


# PRS calculation on pruned NFP data, 50, 5, VIF=2
# NOTE(review): the line above mentions NFP data, but the paths below point at MAVAN files -- confirm it is still accurate

#**ATTN: Python indices start at 0, so to specify the second column, use 1
#**ATTN: please remove the header of the GWAS file if there is one
#        (the load cell also drops every GWAS line that contains the text "snpid")

# Column indices for the fields of the GWAS file

gwas_id=0    # column of SNP ID
gwas_p=7     # column of P value
gwas_or=5    # column of odds ratio
gwas_a1=3    # column of a1 in the GWAS; the column right after it (gwas_a1+1) is read as the second allele
gwas_maf= 10 # column index of maf in the GWAS

# Column indices for the fields of the genotype (VCF) files
geno_id=2  # column number with rsID
geno_start=9 # column number of the 1st genotype, in the raw vcf files, after being split by the delimiter of choice
geno_a1 = 3  # column number that contains the reference allele; geno_a1+1 is read as the other allele

# List of P-value thresholds; the largest one is processed first, then each remaining one
thresholds=[0.5, 0.3, 0.2, 0.1, 0.05, 0.01, 0.001, 0.0001]

# Field delimiters used to split each line of the GWAS and genotype files:
GWAS_delim="\t"
GENO_delim="\t"

# File locations:
home="/home/nyao111/MAVAN_imputed_161121/MOMS-vcf-filtered/"  # folder holding the genotype files; also the prefix of the output path

gwasFiles="/home/nyao111/PRS_imputed/pgc.mdd.clump.withAF.txt"       # path of the GWAS file


def getFileFromPattern(*pattern):
    """Return the paths matched by one or more glob patterns.

    Parameters
    ----------
    *pattern : str or list/tuple/set of str
        Glob patterns. Patterns may be passed as separate arguments
        (``getFileFromPattern("a*", "b*")``) or grouped in a collection
        (``getFileFromPattern(["a*", "b*"])``).

    Returns
    -------
    list of str
        Every matching path, pattern by pattern. A pattern that matches
        nothing contributes nothing; with no patterns the result is ``[]``.
        The order of paths within one pattern is whatever ``glob.glob``
        yields.
    """
    files=[]
    for pathpattern in pattern:
        # Accept a collection of patterns as a single argument as well.
        if isinstance(pathpattern, (list, tuple, set)):
            subpatterns=pathpattern
        else:
            subpatterns=[pathpattern]
        for subpattern in subpatterns:
            # Glob the pattern itself (not the accumulator) and keep earlier matches.
            files.extend(glob.glob(subpattern))
    return files

genoFileNamePattern=home+"*info03_maf01.vcf"   # glob pattern for the genotype files; this pattern is what gets loaded into Spark

genoFileNames=glob.glob(genoFileNamePattern)  # expanded list of matching files; in the visible cells it is only printed
# Alternatively, directly specify filename:
#genoFileName=[home+"fcgene_out_chr21comb.bierut1M_plus_filtered_chr21_c1_EA_COGA.gen",
              #home+"fcgene_out_chr21comb.bierut1M_plus_filtered_chr21_c1_EA_COGEND.gen",
              #home+"fcgene_out_chr22comb.bierut1M_plus_filtered_chr22_c1_EA_COGA.gen",
              #home+"fcgene_out_chr22comb.bierut1M_plus_filtered_chr22_c1_EA_COGEND.gen"]

genoExtension=".vcf"  # NOTE(review): not referenced in the visible cells


# programme parameters
log_or=True  # specify whether you want to log your odds ratios (passed as logOdds to PRS_VCF_utils.filterGWASByP)
check_ref=True # set True if you know there are mismatches between the top strand in the genotypes and that of the GWAS. Not checking the reference allele will improve the speed

# sample file path and name
# NOTE(review): none of the sample* settings below are referenced in the visible cells -- confirm they are still needed
sampleFilePath="NFPimputed_pruned.sample" # include the full/relative path and name of the sample file
sampleFileDelim=" "  # sample file delimiter
sampleFileID=0   # which column in the sample file has the ID
sample_skip=2  # how many lines to skip so that the sample names can be matched to the genotypes 1-to-1
## output file information

outputPath=home+"MAVAN_MOMS_mdd.csv"  # only used by the commented-out writePRS call in the last cells



In [2]:
import pyspark
from pyspark.sql import SQLContext

# We can give a name to our app (to find it in Spark WebUI) and configure execution mode

app_name = "PRS"

# Build the configuration and start the SparkContext that every later cell uses as `sc`.
# NOTE(review): re-running this cell in a live kernel creates a second context -- presumably that fails; confirm.
conf = pyspark.SparkConf().setAppName(app_name)
sc = pyspark.SparkContext(conf=conf)
print(sc)
sc.setLogLevel("WARN")
# Obtain a log4j logger through the context's JVM gateway.
log4jLogger = sc._jvm.org.apache.log4j
LOGGER = log4jLogger.LogManager.getLogger(__name__)
# NOTE(review): the level was set to WARN just above, so this INFO message is presumably suppressed -- confirm.
LOGGER.info("Start Reading Files")
#def main(gwasFile, genoFileList, thresholdList):
print("=" * 40)
print("Using these genoytpe files: ")

# List at most the first 20 genotype files; when there are more, say so once and stop.
for position, filename in enumerate(genoFileNames):
    if position >= 20:
        print("and more....")
        break
    print(filename)

<pyspark.context.SparkContext object at 0x2ba65237ecd0>
Using these genoytpe files: 
/home/nyao111/MAVAN_imputed_161121/MOMS-vcf-filtered/8_info03_maf01.vcf
/home/nyao111/MAVAN_imputed_161121/MOMS-vcf-filtered/17_info03_maf01.vcf
/home/nyao111/MAVAN_imputed_161121/MOMS-vcf-filtered/19_info03_maf01.vcf
/home/nyao111/MAVAN_imputed_161121/MOMS-vcf-filtered/12_info03_maf01.vcf
/home/nyao111/MAVAN_imputed_161121/MOMS-vcf-filtered/21_info03_maf01.vcf
/home/nyao111/MAVAN_imputed_161121/MOMS-vcf-filtered/16_info03_maf01.vcf
/home/nyao111/MAVAN_imputed_161121/MOMS-vcf-filtered/22_info03_maf01.vcf
/home/nyao111/MAVAN_imputed_161121/MOMS-vcf-filtered/15_info03_maf01.vcf
/home/nyao111/MAVAN_imputed_161121/MOMS-vcf-filtered/10_info03_maf01.vcf
/home/nyao111/MAVAN_imputed_161121/MOMS-vcf-filtered/14_info03_maf01.vcf
/home/nyao111/MAVAN_imputed_161121/MOMS-vcf-filtered/18_info03_maf01.vcf
/home/nyao111/MAVAN_imputed_161121/MOMS-vcf-filtered/4_info03_maf01.vcf
/home/nyao111/MAVAN_imputed_161121/MOMS-v

### 1. Load files 

In [4]:
# Genotypes: one RDD of text lines over every file matching the glob pattern.
genodata=sc.textFile(genoFileNamePattern)
# GWAS: one RDD of text lines.
gwasfile=sc.textFile(gwasFiles)
print("Using the GWAS file: {}".format(ntpath.basename(gwasFiles)))
# Drop every line containing "snpid" (presumably the header row -- confirm), then split each line into fields.
gwastable=gwasfile.filter(lambda line: "snpid" not in line).map(lambda line: line.split(GWAS_delim))
# Cached because the table is reused for each threshold and for the allele check.
gwastableCA=gwastable.cache()


Using the GWAS file: pgc.mdd.clump.withAF.txt


### 1.1 Filter GWAS and prepare odds ratio


In [5]:
# Start from the most permissive threshold; the genotype rows kept for it are filtered further for each smaller threshold.
maxThreshold=max(thresholds)
# filterGWASByP lives in PRS_VCF_utils (not shown here). Later cells use its result with `in` and `[snp_id]`,
# so it is presumably a mapping from SNP id to (log) odds ratio for SNPs passing pHigh -- TODO confirm.
gwasOddsMapMax=PRS_VCF_utils.filterGWASByP(GWASRdd=gwastableCA, pcolumn=gwas_p, idcolumn=gwas_id, oddscolumn=gwas_or, pHigh=maxThreshold, logOdds=log_or)
# NOTE(review): .value is read immediately on the driver, so gwasOddsMapMaxCA is the plain object and not the
# Broadcast handle; lambdas that reference it capture the object itself -- confirm this is intended.
gwasOddsMapMaxCA=sc.broadcast(gwasOddsMapMax).value

Taking the log of odds-ratios


### 2. Initial processing 

In [8]:
# At this step, the genotype is already filtered to keep only the SNPs present in the max-threshold odds map.
# Steps of the chain below, in order:
#   1. drop every line that contains "#" (header/meta lines)
#      NOTE(review): this is a substring test, so a data line with "#" anywhere would also be dropped -- confirm
#   2. split each line into fields on GENO_delim
#   3. keep only rows whose ID field (geno_id) is a key of gwasOddsMapMaxCA
#   4. keep the first 5 fields and, for each field from geno_start on, the 4th ":"-separated sub-field
#      (presumably the per-sample genotype probabilities -- TODO confirm against the FORMAT column)
#   5. split each of those sub-fields on "," into a list of strings
genointermediate=genodata.filter(lambda line: ("#" not in line))\
.map(lambda line: line.split(GENO_delim))\
.filter(lambda line: line[geno_id] in gwasOddsMapMaxCA)\
.map(lambda line: line[0:5]+[chunk.split(":")[3] for chunk in line[geno_start::]])\
.map(lambda line: line[0:5]+[triplet.split(",") for triplet in line[5::]])

# (SNP id, (allele in column geno_a1, allele in the next column))
genoAlleles=genointermediate.map(lambda line: (line[geno_id], (line[geno_a1], line[geno_a1+1])))
# (SNP id, flat list of floats): the per-sample lists are concatenated in sample order and converted to float
genotable=genointermediate.map(lambda line: (line[geno_id], list(itertools.chain.from_iterable(line[5::]))))\
.mapValues(lambda geno: [float(x) for x in geno])


In [9]:
# Number of genotype rows that survived the GWAS-id filter (this triggers a Spark job).
# Written as a function call like the other print(...) calls in this notebook.
print(genoAlleles.count())

81009


### 2.1 Calculate and store MAF

In [10]:
# Pick up any edits made to the helper module since it was imported.
reload(PRS_VCF_utils)
# Build (SNP id, (allele at geno_a1, allele at geno_a1+1), flat float list), then replace the float list
# with PRS_VCF_utils.getMaf(list). getMaf is defined outside this notebook.
# NOTE(review): the sample output below (0.9576...) exceeds 0.5, so the value is presumably the frequency of
# one specific allele rather than strictly the minor allele -- TODO confirm.
genoa1f=genointermediate.map(lambda line: (line[geno_id], (line[geno_a1], line[geno_a1+1]), [float(x) for x in list(itertools.chain.from_iterable(line[5::]))]))\
.map(lambda line: (line[0], line[1],PRS_VCF_utils.getMaf(line[2])))

# Optional export of the per-SNP frequencies (disabled):
#genoa1f.map(lambda line:"\t".join([line[0], "\t".join(line[1]), str(line[2])])).saveAsTextFile("../MOMS_info03_maf")
# Show one record as a sanity check.
genoa1f.first()

(u'rs2240379', (u'G', u'A'), 0.9576415094339623)

### 3. Determine whether each SNP needs to be flipped

In [11]:
# Join genotype-side and GWAS-side allele information on SNP id (only ids present on both sides are kept).
# Each value is ((genotype alleles, genotype frequency), ((GWAS a1, GWAS column a1+1), GWAS maf)).
# The GWAS maf is still the raw string field at this point; no numeric conversion happens here.
checktable=genoa1f.map(lambda line: (line[0], (line[1], line[2])))\
.join(gwastableCA.map(lambda line:(line[gwas_id], ((line[gwas_a1], line[gwas_a1+1]), line[gwas_maf]))))

In [12]:

# Pick up any edits made to the helper module since it was imported.
reload(PRS_VCF_utils)
# Apply PRS_VCF_utils.checkAlignment (defined outside this notebook) to every joined record and collect the
# results on the driver as a dict keyed by SNP id.
flagMap=checktable.mapValues(lambda line: PRS_VCF_utils.checkAlignment(line)).collectAsMap()

In [13]:
# Convert the raw genotype table into per-SNP genotype calls.
# Both helpers live in PRS_VCF_utils (not shown here); reload picks up edits made to that module.
reload(PRS_VCF_utils)
if check_ref:
    print("Calculating genotype dosage while taking into account strand alignment differences")
    # Use the per-SNP flags computed from the genotype/GWAS allele comparison.
    genotypeMax=genotable.map(lambda line: PRS_VCF_utils.makeGenotypeCheckRef(line, checkMap=flagMap))
else:
    # This branch used to reference `gwasOddsMapCA`, a name that is never defined in this notebook
    # (NameError as soon as check_ref is False); the max-threshold odds map is the one available here.
    genotypeMax=genotable.map(lambda line: PRS_VCF_utils.makeGenotype(line, gwasOddsMapMaxCA))

Calculating genotype dosage while taking into account strand alignment differences


In [14]:
# Calculate the PRS at the maximum threshold from the genotype calls.
# Pick up any edits made to the helper module since it was imported.
reload(PRS_VCF_utils)
def calcPRSFromGeno(genotypeRDD, oddsMap):
    """Sum odds-weighted genotype calls over all SNPs, per sample.

    Parameters
    ----------
    genotypeRDD : RDD-like of (snp_id, [call, ...])
        One record per SNP; the list holds one call per sample and all
        lists are expected to have the same length.
    oddsMap : mapping
        snp_id -> weight. Every snp_id in ``genotypeRDD`` must be a key.

    Returns
    -------
    (int, list)
        The number of SNP records and the per-sample weighted sums. The sums
        are not divided by the count; it is returned so callers can normalise.

    Raises
    ------
    KeyError
        If a SNP id is missing from ``oddsMap``.
    ValueError
        Raised by Spark's ``reduce`` when ``genotypeRDD`` is empty.
    """
    totalcount=genotypeRDD.count()
    # Weight every sample's call for a SNP by that SNP's weight.
    weighted=genotypeRDD.map(lambda line: [call * oddsMap[line[0]] for call in line[1]])
    # Element-wise sum across SNPs. list(...) keeps the result a real list on
    # Python 3 as well, where map() alone returns a one-shot iterator.
    PRS=weighted.reduce(lambda a, b: list(map(add, a, b)))
    return (totalcount, PRS)

# PRS at the largest threshold: a (SNP count, per-sample score list) tuple.
prsMax=calcPRSFromGeno(genotypeMax, gwasOddsMapMaxCA)
# Results keyed by P-value threshold; the remaining thresholds are added by the next cell.
prsDict={}
prsDict[maxThreshold]=prsMax
# Calculate PRS for the rest of the thresholds

In [15]:
def calcNoMax(genotypeRDD, thresholdlist, prsMap):
    """Add the PRS for every threshold below the largest one to ``prsMap``.

    Thresholds are processed from largest to smallest, and the genotype RDD
    is narrowed at each step, so each threshold only scans the SNPs that
    survived the previous, more permissive one.

    Parameters
    ----------
    genotypeRDD : RDD of (snp_id, [call, ...])
        Genotype calls already restricted to the largest threshold.
    thresholdlist : list of float
        All P-value thresholds, including the largest one (which is skipped
        here; its entry is expected to be in ``prsMap`` already).
    prsMap : dict
        threshold -> result of ``calcPRSFromGeno``. Updated in place.

    Returns
    -------
    dict
        The same ``prsMap`` object. A threshold for which no SNP remains
        gets no entry.

    Notes
    -----
    Reads the notebook globals ``sc``, ``gwastableCA``, ``gwas_p``,
    ``gwas_id``, ``gwas_or`` and ``log_or``.
    """
    if len(thresholdlist)>1:
        # Use the thresholds that were passed in rather than the notebook-level
        # `thresholds` / `maxThreshold` globals.
        largest=max(thresholdlist)
        thresholdNoMaxSorted=sorted((x for x in thresholdlist if x != largest), reverse=True)

        for threshold in thresholdNoMaxSorted:
            gwasFiltered=PRS_VCF_utils.filterGWASByP(GWASRdd=gwastableCA, pcolumn=gwas_p, idcolumn=gwas_id, oddscolumn=gwas_or, pHigh=threshold, logOdds=log_or)
            gwasFilteredBC=sc.broadcast(gwasFiltered)
            # Bind this iteration's broadcast as a default argument: the filters are lazy
            # and chained, so a plain closure would see whatever the loop variable holds
            # when the job finally runs.
            genotypeRDD=genotypeRDD.filter(lambda line, keep=gwasFilteredBC: line[0] in keep.value)
            if not genotypeRDD.isEmpty():
                prsOther=calcPRSFromGeno(genotypeRDD, gwasFilteredBC.value)
                prsMap[threshold]=prsOther
                print("finished calculating PRS at threshold of "+str(threshold))

    return prsMap

# Fill in the remaining thresholds. calcNoMax updates and returns the dict it is given,
# so finalresult and prsDict refer to the same object.
finalresult=calcNoMax(genotypeMax, thresholds, prsDict)

Taking the log of odds-ratios
finished calculating PRS at threshold of 0.3
Taking the log of odds-ratios
finished calculating PRS at threshold of 0.2
Taking the log of odds-ratios
finished calculating PRS at threshold of 0.1
Taking the log of odds-ratios
finished calculating PRS at threshold of 0.05
Taking the log of odds-ratios
finished calculating PRS at threshold of 0.01
Taking the log of odds-ratios
finished calculating PRS at threshold of 0.001
Taking the log of odds-ratios
finished calculating PRS at threshold of 0.0001


In [234]:
# Sample IDs are the fields from geno_start onward on the "#CHROM" header line.
# Slicing from geno_start (instead of a hard-coded count of the last 265 fields) keeps this
# correct for a cohort of any size. first() fetches one header line rather than collecting
# the header of every file matched by the glob.
# NOTE(review): assumes every genotype file lists the same samples in the same order -- confirm.
subjNames=genodata.filter(lambda line: "#CHROM" in line).first().split(GENO_delim)[geno_start:]
#pvalues, scores=labelPRS(prsDict, subjNames)
#writePRS(prsTable=scores, outputFile=outputPath, pvalues=pvalues)

In [235]:
# Sanity check: number of sample IDs taken from the header line.
len(subjNames)

265