src/hg/makeDb/doc/evaSnp3.txt

# Track for EVA snp release 3, which was released 2/24/2022 - https://www.ebi.ac.uk/eva/?RS-Release&releaseVersion=3
# Tracks built by Lou on 4/19/2022

# Assemblies were the following: mm39, danRer11, bosTau9, dm6, mm10, galGal6, susScr11, rn6, 
# rn7, danRer10, equCab3, rheMac10, macFas5, oviAri4, felCat9, galGal5

# The GCA accession on the eva release by accession list (https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/)
# were compared to each of our dbs genome accessions with at least 100 hits/month to create that list

# All assemblies were passed by the python pipeline described below. 3 assemblies required some 
# variants to be removed (danRer11, dm6, oviAri4) due to issues with hgVai, see RM #29262.
# The removed variants were done so after initial data download and noted in the code comment

#####################################
#### Beginning of python3 code ######
#####################################

import subprocess,sys,argparse,os

#All databases with monthly hit counts of at least 100

#mm39 - Completes
# url = "http://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_species/mus_musculus/GCA_000001635.9/GCA_000001635.9_current_ids.vcf.gz"
# dbs='mm39'

#danRer11 - Completes if I remove rs499587024 
#url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000002035.4/GCA_000002035.4_current_ids.vcf.gz"
#dbs='danRer11'

#bosTau9 - completes
#url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_002263795.2/GCA_002263795.2_current_ids.vcf.gz"
#dbs="bosTau9"

#dm6 - Completes if I remove /hive/data/outside/eva/dm6/problematicRSIDs.txt 
# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000001215.4/GCA_000001215.4_current_ids.vcf.gz"
# dbs = "dm6"

#mm10 - Completes
# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000001635.6/GCA_000001635.6_current_ids.vcf.gz"
# dbs = "mm10"

# #galGal6 - Completes
# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000002315.5/GCA_000002315.5_current_ids.vcf.gz"
# dbs = "galGal6"

# #susScr11 - Completes
# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000003025.6/GCA_000003025.6_current_ids.vcf.gz"
# dbs = "susScr11"

# rn6 - Completes
# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000001895.4/GCA_000001895.4_current_ids.vcf.gz"
# dbs = "rn6"

# rn7 - Completes
# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_015227675.2/GCA_015227675.2_current_ids.vcf.gz"
# dbs = "rn7

# danRer10 - Completes
# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000002035.3/GCA_000002035.3_current_ids.vcf.gz"
# dbs = "danRer10"

# equCab3 - Completes
# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_002863925.1/GCA_002863925.1_current_ids.vcf.gz"
# dbs = "equCab3"

# rheMac10 - Completes
# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_003339765.3/GCA_003339765.3_current_ids.vcf.gz"
# dbs = "rheMac10"

# macFas5 - Completes
# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000364345.1/GCA_000364345.1_current_ids.vcf.gz"
# dbs = "macFas5"

# oviAri4 - Completes if removed rs1089233242
# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000298735.2/GCA_000298735.2_current_ids.vcf.gz"
# dbs = "oviAri4"

# felCat9 - Completes
# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000181335.4/GCA_000181335.4_current_ids.vcf.gz"
# dbs = "felCat9"

# galGal5 - Completes
# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000002315.3/GCA_000002315.3_current_ids.vcf.gz"
# dbs = "galGal5"

workDir = "/hive/data/outside/eva/"+dbs
if workDir[-1] != "/":
    workDir=workDir+"/"

def checkDirMakeDir(workDir):
    if not os.path.isdir(workDir):
        os.mkdir(workDir)

def bash(cmd):
    """Run the cmd in bash subprocess"""
    try:
        rawBashOutput = subprocess.run(cmd, check=True, shell=True,\
                                       stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
        bashStdoutt = rawBashOutput.stdout
    except subprocess.CalledProcessError as e:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
    return(bashStdoutt)

def wgetFile(url, workDir):
    bash("wget %s -P %s" % (url, workDir))
    fileName = workDir+url.split("/")[-1]
    return(fileName)
    
def checkDupsAndUnzip(fileName,workDir):
    """Check for duplicate lines (common in EVA), and unzip file"""
    lineCount = bash("zcat %s | wc -l" % (fileName)).rstrip()
    uniqCount = bash("zcat %s | uniq | wc -l" % (fileName)).rstrip()
    currentFileName = workDir+"evaSnps.vcf"
    if lineCount != uniqCount:
        print("There are duplicated entries.\nOriginal file:%s\nUniq file:%s\nCreating new file." % (lineCount, uniqCount))
        bash("zcat %s | uniq > %s" % (fileName,currentFileName))
    else:
        bash("zcat %s > %s" % (fileName,currentFileName))
    return(currentFileName)

def sanitizeChromNames(currentFileName):
    """Swaps the chrom IDs with the accession names which are indexed in chromAlias"""
    inputFile = open(currentFileName,'r')
    outputFile = open(currentFileName+".fixed","w")
    IDtoAccessionDic = {}
    
    for line in inputFile:
        if line.startswith("##contig"):
            ID = line.split('=')[2].split(',')[0]
            accession = line.split('=')[3].split('"')[1]
            IDtoAccessionDic[ID] = accession
            outputFile.write(line)
        elif line.startswith("#"):
            outputFile.write(line)
        else:
            line = line.split("\t")
            line[0] = IDtoAccessionDic[line[0]]
            line = "\t".join(line)
            outputFile.write(line)
    inputFile.close()
    outputFile.close()
    #Replacing the same name because chromAlias should be updated soon and this function will be removed
    bash("mv %s.fixed %s" % (currentFileName,currentFileName))

def convertChromToUcsc(currentFileName,dbs,workDir):
    """Convert to UCSC-style chroms"""
    bash("chromToUcsc --get %s" % (dbs))
    bash("chromToUcsc -i %s -o %sevaSnps.ucscChroms.vcf -a %s" % (currentFileName,workDir,dbs+".chromAlias.tsv"))
    bash("rm %s" % (dbs+".chromAlias.tsv"))
    currentFileName = workDir+"evaSnps.ucscChroms.vcf"
    return(currentFileName)
                               
def convertVcfToBed(currentFileName,workDir):
    """Convert to bed keeping only VC and SID fields"""
    bash("bgzip %s" % (currentFileName))
    bash("tabix -p vcf %s" % (currentFileName+".gz"))
    bash("vcfToBed %s %sevaSnp -fields=VC,SID" % (currentFileName+".gz",workDir))
    currentFileName = workDir+"evaSnp.bed"
    return(currentFileName)

def splitChromsAndRunHgVai(workDir,dbs):
    """Split all the chroms in tenths in order to be able to run hgVai without running out of memory"""
    chromSizes = bash("fetchChromSizes %s" % (dbs)).split("\n")
    inputFile = workDir+"evaSnps.ucscChroms.vcf.gz"
    outputFile = workDir+"evaSnps"+dbs+"VaiResults.vep"
    for chrom in chromSizes[1:-1]:
        chrom = chrom.split("\t")
        chromName = chrom[0]
        print("Running "+chromName)
        if int(chrom[1]) < 1000000: #Don't worry about splitting if chrom is less than 1M bp
            if dbs == 'mm39':
                bash("vai.pl --variantLimit=200000000 --position=%s:0-%s --geneTrack=ncbiRefSeqSelect --hgvsG=off --hgvsCN=off \
            --hgvsP=on %s %s >> %s" % (chromName,chrom[1],dbs,inputFile,outputFile))
            else:
                bash("vai.pl --variantLimit=200000000 --position=%s:0-%s --geneTrack=ncbiRefSeqCurated --hgvsG=off --hgvsCN=off \
            --hgvsP=on %s %s >> %s" % (chromName,chrom[1],dbs,inputFile,outputFile))
        else:
            breakPoints = [.1,.2,.3,.4,.5,.6,.7,.8,.9,1]
            for part in breakPoints:
                startCoord = round(int(chrom[1])*(part-.1))+1
                endCoord = round(int(chrom[1])*part)
                if part == breakPoints[0]: #Special exception for start
                    if dbs == 'mm39':
                        bash("vai.pl --variantLimit=200000000 --position=%s:0-%s --geneTrack=ncbiRefSeqSelect --hgvsG=off --hgvsCN=off \
                    --hgvsP=on %s %s >> %s" % (chromName,endCoord,dbs,inputFile,outputFile))
                    else:
                        bash("vai.pl --variantLimit=200000000 --position=%s:0-%s --geneTrack=ncbiRefSeqCurated --hgvsG=off --hgvsCN=off \
                    --hgvsP=on %s %s >> %s" % (chromName,endCoord,dbs,inputFile,outputFile))
                else:
                    if dbs == 'mm39':
                        bash("vai.pl --variantLimit=200000000 --position=%s:%s-%s --geneTrack=ncbiRefSeqSelect --hgvsG=off --hgvsCN=off \
                    --hgvsP=on %s %s >> %s" % (chromName,startCoord,endCoord,dbs,inputFile,outputFile))  
                    else:
                        bash("vai.pl --variantLimit=200000000 --position=%s:%s-%s --geneTrack=ncbiRefSeqCurated --hgvsG=off --hgvsCN=off \
                    --hgvsP=on %s %s >> %s" % (chromName,startCoord,endCoord,dbs,inputFile,outputFile))                    
    return(outputFile)

def splitVepFileByChrom(hgVaiResultsFile,workDir):
    """Split the vep file to one file per chrom in order to index entire chrom into a dic and quickly lookup the rsIDs"""
    splitChromDir = workDir+"splitChroms/"
    checkDirMakeDir(splitChromDir)
    vepFile = open(hgVaiResultsFile,"r")
    for line in vepFile:
        if line.startswith("#") or line.startswith("Uploaded"):
            continue
        else:
            #Need to compute itemRGB, aaChange, and ucscClass
            lineContents = line.rstrip().split("\t")
            chrom = lineContents[1].split(":")[0]
            outFile = open(splitChromDir+"evaSnpVaiResults."+chrom+".vep","a")
            outFile.write(line)
            outFile.close()
    vepFile.close()

def fetchUcscClassAndAaChange(name,chrom,currentChrom,currentVepDic):             
    """Build dictionary out of vep results with rsID as index, populate aaChange and ucscClassList"""
    prevName = name
    #Check if chrom has changed and index current chrom
    if chrom != currentChrom:
        currentVepFile = open(workDir+"splitChroms/evaSnpVaiResults."+chrom+".vep","r")
        currentVepDic = {}
        for entry in currentVepFile:
            entry = entry.rstrip().split("\t")
            rsID = entry[0]
            if rsID not in currentVepDic.keys():
                currentVepDic[rsID] = {}
                currentVepDic[rsID]['ucscClassList'] = []
                currentVepDic[rsID]['ucscClassList'].append(entry[6])
                currentVepDic[rsID]['aaChange'] = []
                if entry[13] != '-' and 'p' in entry[13] and '?' not in entry[13]:
                    currentVepDic[rsID]['aaChange'].append(entry[13].split(";")[0].split("=")[1].split(":")[1])
            else:
                if entry[6] not in currentVepDic[rsID]['ucscClassList']:
                    currentVepDic[rsID]['ucscClassList'].append(entry[6])
                if entry[13] != '-' and 'p' in entry[13] and '?' not in entry[13]:
                    if entry[13].split(";")[0].split("=")[1] not in currentVepDic[rsID]['aaChange']:
                        currentVepDic[rsID]['aaChange'].append(entry[13].split(";")[0].split("=")[1].split(":")[1])
        currentChrom=chrom
        currentVepFile.close()
                
    ucscClass = ",".join(currentVepDic[name]['ucscClassList'])
    aaChange = " ".join(currentVepDic[name]['aaChange'])
    
    return(prevName,currentChrom,currentVepDic,ucscClass,aaChange) 

def assignRGBcolorByConsequence(currentVepDic,name):
    """Assign color based on most serious hgVai predicted consequence"""

    redVars = ['exon_loss_variant','frameshift_variant','inframe_deletion','inframe_insertion','initiator_codon_variant',
               'missense_variant','splice_acceptor_variant','splice_donor_variant','splice_region_variant','stop_gained',
               'stop_lost','coding_sequence_variant','transcript_ablation']
    greenVars = ['synonymous_variant','stop_retained_variant']
    blueVars = ['5_prime_UTR_variant','3_prime_UTR_variant','complex_transcript_variant',
                'non_coding_transcript_exon_variant']
    blackVars = ['upstream_gene_variant','downstream_gene_variant','intron_variant','intergenic_variant',
                 'NMD_transcript_variant','no_sequence_alteration']
    
    if any(item in currentVepDic[name]['ucscClassList'] for item in redVars):
        itemRgb = '255,0,0'
    elif any(item in currentVepDic[name]['ucscClassList'] for item in greenVars):
        itemRgb = '0,128,0'
    elif any(item in currentVepDic[name]['ucscClassList'] for item in blueVars):
        itemRgb = '0,0,255'
    elif any(item in currentVepDic[name]['ucscClassList'] for item in blackVars):
        itemRgb = '0,0,0'
    else:
        sys.exit("One of the following consequences not found in color list, please add: "+str(currentVepDic[name]['ucscClassList']))
        
    return(itemRgb)

def convertSOnumberToName(varClass):
    soDic={"SO:0001483": 'substitution', "SO:0000667": 'insertion', "SO:0000159": 'deletion', 
           "SO:0001059": 'sequence alteration',"SO:1000032": 'delins',"SO:0002007": 'MNV'}
    varClass = soDic[varClass]
    return(varClass)

def createFinalBedWithVepAnnotations(currentFileName,workDir):
    """Parse bed file and output final file without FILTER field, and with SO name, item RGB, and aaChange/ucscClass from hgVai"""

    inputFile = open(currentFileName,"r")
    outputFile = open(workDir+"evaSnp.final.bed","w")
    prevName = ""
    currentChrom = ""
    currentVepDic = {}
    n=0

    for line in inputFile:
        n+=1
        if n%5000000 == 0:
            print("Current line count: "+str(n))
        if line.startswith("#") or line.startswith("Uploaded"):
            continue
        else:
        #Desired schema chrom chromStart chromEnd name score strand thickStart thickEnd itemRgb ref alt varClass submitters ucscClass aaChange
        #Need to compute itemRGB, aaChange, and ucscClass. Also need to change SO terms
        
            lineContents = line.rstrip().split("\t")
            chrom = lineContents[0]
            chromStart = lineContents[1]
            chromEnd = lineContents[2]
            name = lineContents[3]
            ref = lineContents[9]
            alt = lineContents[10]
            varClass = lineContents[12]
            submitters = lineContents[13]
            
            #Convert from SO term to name. e.g. SO:0001483 to substitution
            varClass = convertSOnumberToName(varClass)
        
            #Check if is the same rsID, they are repeated in ~5% of file
            if prevName == name:
                outputFile.write(chrom+"\t"+chromStart+"\t"+chromEnd+"\t"+name+"\t0\t.\t"+chromStart+"\t"+chromEnd+"\t"+
                                 itemRgb+"\t"+ref+"\t"+alt+"\t"+varClass+"\t"+submitters+"\t"+ucscClass+"\t"+aaChange+"\n")
            else:        
                #Find the aaChange and ucscClass                
                prevName,currentChrom,currentVepDic,ucscClass,aaChange = fetchUcscClassAndAaChange(name,chrom,currentChrom,currentVepDic)
                itemRgb = assignRGBcolorByConsequence(currentVepDic,name)
                outputFile.write(chrom+"\t"+chromStart+"\t"+chromEnd+"\t"+name+"\t0\t.\t"+chromStart+"\t"+chromEnd+"\t"+
                                 itemRgb+"\t"+ref+"\t"+alt+"\t"+varClass+"\t"+submitters+"\t"+ucscClass+"\t"+aaChange+"\n")

    inputFile.close()
    outputFile.close()

def validateFinalFile(workDir,url):
    """Compare and make sure that the final bed rsID# matches the input file"""
    originalDownloadFileName = workDir+url.split("/")[-1]
    bedFile = workDir+"evaSnp.final.bed"
    originalRsIDCount = bash("zcat %s | grep -v '^#' | cut -f3 | sort | uniq | wc -l" % (originalDownloadFileName))
    finalBedRsIDCount = bash("cut -f4 %s | sort | uniq | wc -l" % (bedFile))
    if originalRsIDCount != finalBedRsIDCount:
        sys.exit("There was an error in the pipeline, the initial numer of uniq RS IDs does not match the final bed file.\n\n"\
    "%s - %s%s - %s" % (originalDownloadFileName,str(originalRsIDCount),bedFile,str(finalBedRsIDCount)))
    
def createBigBed(workDir,dbs):
    bedFile = workDir+"evaSnp.final.bed"
    bash("bedSort %s %s" % (bedFile,bedFile))
    bash("bedToBigBed -tab -as=$HOME/kent/src/hg/lib/evaSnp.as -type=bed9+6 -extraIndex=name \
    %s http://hgdownload.soe.ucsc.edu/goldenPath/%s/bigZips/%s.chrom.sizes %sevaSnp.bb" % (bedFile,dbs,dbs,workDir))
    
checkDirMakeDir(workDir)
fileName = wgetFile(url, workDir)
fileName = workDir+url.split("/")[-1]
print("Got file")
currentFileName = checkDupsAndUnzip(fileName,workDir)
print("File unzipped")
try:
    currentFileName = convertChromToUcsc(currentFileName,dbs,workDir)
    print("Chroms converted")
except:
    sanitizeChromNames(currentFileName)
    print("Chroms sanitized")
    currentFileName = convertChromToUcsc(currentFileName,dbs,workDir)
    print("Chroms converted")
currentFileName = convertVcfToBed(currentFileName,workDir)    
print("File converted to bed")
hgVaiResultsFile = splitChromsAndRunHgVai(workDir,dbs)
print("hgVai done")
splitVepFileByChrom(hgVaiResultsFile,workDir)
print("hgVai file split")
currentFileName = workDir+"evaSnp.bed"
createFinalBedWithVepAnnotations(currentFileName,workDir)
print("Validating file")
validateFinalFile(workDir,url)
print("Created final file")
createBigBed(workDir,dbs)
print("bigBed made")

#####################################
######### End of python3 code ######
#####################################

# Lastly links were made with a file containing the 16 assemblies:

for dbs in $(cat $1);
do
ln -s /hive/data/outside/eva/$dbs/evaSnp.bb /gbdb/$dbs/bbi/;
done

#######################

# On 5/3 the tracks were rebuilt to add better logic handling for multi-allele variants (rsIDs 
# that come up multiple times in the file). These IDs now get matched with their unique 
# hgVai alts. See #29031 for more info. Entire new updated pipeline included below.
# canFam3 was also added to the pipeline.

#####################################
#########  Start of python3 #########
#####################################

import subprocess,sys,argparse,os

#All databases with monthly hit counts of at least 100

# mm39 - Completes
# url = "http://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_species/mus_musculus/GCA_000001635.9/GCA_000001635.9_current_ids.vcf.gz"
# dbs='mm39'

#danRer11 - Completes if I remove rs499587024 
# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000002035.4/GCA_000002035.4_current_ids.vcf.gz"
# dbs='danRer11'

#bosTau9 - completes
# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_002263795.2/GCA_002263795.2_current_ids.vcf.gz"
# dbs="bosTau9"

#dm6 - Completes if I remove /hive/data/outside/eva/dm6/problematicRSIDs.txt 
# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000001215.4/GCA_000001215.4_current_ids.vcf.gz"
# dbs = "dm6"

#mm10 - Completes
# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000001635.6/GCA_000001635.6_current_ids.vcf.gz"
# dbs = "mm10"

# #galGal6 - Completes
# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000002315.5/GCA_000002315.5_current_ids.vcf.gz"
# dbs = "galGal6"

# #susScr11 - Completes
# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000003025.6/GCA_000003025.6_current_ids.vcf.gz"
# dbs = "susScr11"

# rn6 - Completes
# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000001895.4/GCA_000001895.4_current_ids.vcf.gz"
# dbs = "rn6"

# rn7 - Completes
# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_015227675.2/GCA_015227675.2_current_ids.vcf.gz"
# dbs = "rn7

# danRer10 - Completes
# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000002035.3/GCA_000002035.3_current_ids.vcf.gz"
# dbs = "danRer10"

# equCab3 - Completes
# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_002863925.1/GCA_002863925.1_current_ids.vcf.gz"
# dbs = "equCab3"

# rheMac10 - Completes
# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_003339765.3/GCA_003339765.3_current_ids.vcf.gz"
# dbs = "rheMac10"

# macFas5 - Completes
# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000364345.1/GCA_000364345.1_current_ids.vcf.gz"
# dbs = "macFas5"

# oviAri4 - Completes if removed rs1089233242
# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000298735.2/GCA_000298735.2_current_ids.vcf.gz"
# dbs = "oviAri4"

# felCat9 - Completes
# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000181335.4/GCA_000181335.4_current_ids.vcf.gz"
# dbs = "felCat9"

# galGal5 - Completes
# url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_assembly/GCA_000002315.3/GCA_000002315.3_current_ids.vcf.gz"
# dbs = "galGal5"

# canFam3
url = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_species/canis_lupus_familiaris/GCA_000002285.2/GCA_000002285.2_current_ids.vcf.gz"
dbs = "canFam3"

workDir = "/hive/data/outside/eva/"+dbs
if workDir[-1] != "/":
    workDir=workDir+"/"

def checkDirMakeDir(workDir):
    if not os.path.isdir(workDir):
        os.mkdir(workDir)

def bash(cmd):
    """Run the cmd in bash subprocess"""
    try:
        rawBashOutput = subprocess.run(cmd, check=True, shell=True,\
                                       stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
        bashStdoutt = rawBashOutput.stdout
    except subprocess.CalledProcessError as e:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
    return(bashStdoutt)

def wgetFile(url, workDir):
    bash("wget %s -P %s" % (url, workDir))
    fileName = workDir+url.split("/")[-1]
    return(fileName)
    
def checkDupsAndUnzip(fileName,workDir):
    """Check for duplicate lines (common in EVA), and unzip file"""
    lineCount = bash("zcat %s | wc -l" % (fileName)).rstrip()
    uniqCount = bash("zcat %s | uniq | wc -l" % (fileName)).rstrip()
    currentFileName = workDir+"evaSnps.vcf"
    if lineCount != uniqCount:
        print("There are duplicated entries.\nOriginal file:%s\nUniq file:%s\nCreating new file." % (lineCount, uniqCount))
        bash("zcat %s | uniq > %s" % (fileName,currentFileName))
    else:
        bash("zcat %s > %s" % (fileName,currentFileName))
    return(currentFileName)

def sanitizeChromNames(currentFileName):
    """Swaps the chrom IDs with the accession names which are indexed in chromAlias"""
    inputFile = open(currentFileName,'r')
    outputFile = open(currentFileName+".fixed","w")
    IDtoAccessionDic = {}
    
    for line in inputFile:
        if line.startswith("##contig"):
            ID = line.split('=')[2].split(',')[0]
            accession = line.split('=')[3].split('"')[1]
            IDtoAccessionDic[ID] = accession
            outputFile.write(line)
        elif line.startswith("#"):
            outputFile.write(line)
        else:
            line = line.split("\t")
            line[0] = IDtoAccessionDic[line[0]]
            line = "\t".join(line)
            outputFile.write(line)
    inputFile.close()
    outputFile.close()
    #Replacing the same name because chromAlias should be updated soon and this function will be removed
    bash("mv %s.fixed %s" % (currentFileName,currentFileName))

def convertChromToUcsc(currentFileName,dbs,workDir):
    """Convert to UCSC-style chroms"""
    bash("chromToUcsc --get %s" % (dbs))
    bash("chromToUcsc -i %s -o %sevaSnps.ucscChroms.vcf -a %s" % (currentFileName,workDir,dbs+".chromAlias.tsv"))
    bash("rm %s" % (dbs+".chromAlias.tsv"))
    currentFileName = workDir+"evaSnps.ucscChroms.vcf"
    return(currentFileName)
                               
def convertVcfToBed(currentFileName,workDir):
    """Convert to bed keeping only VC and SID fields"""
    bash("bgzip %s" % (currentFileName))
    bash("tabix -p vcf %s" % (currentFileName+".gz"))
    bash("vcfToBed %s %sevaSnp -fields=VC,SID" % (currentFileName+".gz",workDir))
    currentFileName = workDir+"evaSnp.bed"
    return(currentFileName)

def splitChromsAndRunHgVai(workDir,dbs):
    """Split all the chroms in tenths in order to be able to run hgVai without running out of memory"""
    chromSizes = bash("fetchChromSizes %s" % (dbs)).split("\n")
    inputFile = workDir+"evaSnps.ucscChroms.vcf.gz"
    outputFile = workDir+"evaSnps"+dbs+"VaiResults.vep"
    for chrom in chromSizes[1:-1]:
        chrom = chrom.split("\t")
        chromName = chrom[0]
        print("Running "+chromName)
        if int(chrom[1]) < 1000000: #Don't worry about splitting if chrom is less than 1M bp
            if dbs == 'mm39':
                bash("vai.pl --variantLimit=200000000 --position=%s:0-%s --geneTrack=ncbiRefSeqSelect --hgvsG=off --hgvsCN=off \
            --hgvsP=on %s %s >> %s" % (chromName,chrom[1],dbs,inputFile,outputFile))
            else:
                bash("vai.pl --variantLimit=200000000 --position=%s:0-%s --geneTrack=ncbiRefSeqCurated --hgvsG=off --hgvsCN=off \
            --hgvsP=on %s %s >> %s" % (chromName,chrom[1],dbs,inputFile,outputFile))
        else:
            breakPoints = [.1,.2,.3,.4,.5,.6,.7,.8,.9,1]
            for part in breakPoints:
                startCoord = round(int(chrom[1])*(part-.1))+1
                endCoord = round(int(chrom[1])*part)
                if part == breakPoints[0]: #Special exception for start
                    if dbs == 'mm39':
                        bash("vai.pl --variantLimit=200000000 --position=%s:0-%s --geneTrack=ncbiRefSeqSelect --hgvsG=off --hgvsCN=off \
                    --hgvsP=on %s %s >> %s" % (chromName,endCoord,dbs,inputFile,outputFile))
                    else:
                        bash("vai.pl --variantLimit=200000000 --position=%s:0-%s --geneTrack=ncbiRefSeqCurated --hgvsG=off --hgvsCN=off \
                    --hgvsP=on %s %s >> %s" % (chromName,endCoord,dbs,inputFile,outputFile))
                else:
                    if dbs == 'mm39':
                        bash("vai.pl --variantLimit=200000000 --position=%s:%s-%s --geneTrack=ncbiRefSeqSelect --hgvsG=off --hgvsCN=off \
                    --hgvsP=on %s %s >> %s" % (chromName,startCoord,endCoord,dbs,inputFile,outputFile))  
                    else:
                        bash("vai.pl --variantLimit=200000000 --position=%s:%s-%s --geneTrack=ncbiRefSeqCurated --hgvsG=off --hgvsCN=off \
                    --hgvsP=on %s %s >> %s" % (chromName,startCoord,endCoord,dbs,inputFile,outputFile))                    
    return(outputFile)

def splitVepFileByChrom(hgVaiResultsFile,workDir):
    """Split the vep file to one file per chrom in order to index entire chrom into a dic and quickly lookup the rsIDs"""
    splitChromDir = workDir+"splitChroms/"
    checkDirMakeDir(splitChromDir)
    vepFile = open(hgVaiResultsFile,"r")
    for line in vepFile:
        if line.startswith("#") or line.startswith("Uploaded"):
            continue
        else:
            #Need to compute itemRGB, aaChange, and ucscClass
            lineContents = line.rstrip().split("\t")
            chrom = lineContents[1].split(":")[0]
            outFile = open(splitChromDir+"evaSnpVaiResults."+chrom+".vep","a")
            outFile.write(line)
            outFile.close()
    vepFile.close()

def fetchUcscClassAndAaChange(name,alt,nameAlt,chrom,currentChrom,currentVepDic,tryAttempts):             
    """Build dictionary out of vep results with rsID as index, populate aaChange and ucscClassList"""
    prevName = nameAlt
    #Check if chrom has changed and index current chrom
    if chrom != currentChrom:
        currentVepFile = open(workDir+"splitChroms/evaSnpVaiResults."+chrom+".vep","r")
        currentVepDic = {}
        for entry in currentVepFile:
            entry = entry.rstrip().split("\t")
            rsID = entry[0]
            rsIDAlt = entry[0]+entry[2]
            #print(rsID)
            #altAllele = entry[2]

            if rsIDAlt not in currentVepDic.keys():
                currentVepDic[rsIDAlt] = {}
                currentVepDic[rsIDAlt]['ucscClassList'] = []
                currentVepDic[rsIDAlt]['ucscClassList'].append(entry[6])
                currentVepDic[rsIDAlt]['aaChange'] = []
                if entry[13] != '-' and 'p' in entry[13] and '?' not in entry[13]:
                    currentVepDic[rsIDAlt]['aaChange'].append(entry[13].split(";")[0].split("=")[1].split(":")[1])
            else:
                if entry[6] not in currentVepDic[rsIDAlt]['ucscClassList']:
                    currentVepDic[rsIDAlt]['ucscClassList'].append(entry[6])
                if entry[13] != '-' and 'p' in entry[13] and '?' not in entry[13]:
                    if entry[13].split(";")[0].split("=")[1] not in currentVepDic[rsIDAlt]['aaChange']:
                        currentVepDic[rsIDAlt]['aaChange'].append(entry[13].split(";")[0].split("=")[1].split(":")[1])

            if rsID not in currentVepDic.keys():
                currentVepDic[rsID] = {}
                currentVepDic[rsID]['ucscClassList'] = []
                currentVepDic[rsID]['ucscClassList'].append(entry[6])
                currentVepDic[rsID]['aaChange'] = []
                if entry[13] != '-' and 'p' in entry[13] and '?' not in entry[13]:
                    currentVepDic[rsID]['aaChange'].append(entry[13].split(";")[0].split("=")[1].split(":")[1])
            else:
                if entry[6] not in currentVepDic[rsID]['ucscClassList']:
                    currentVepDic[rsID]['ucscClassList'].append(entry[6])
                if entry[13] != '-' and 'p' in entry[13] and '?' not in entry[13]:
                    if entry[13].split(";")[0].split("=")[1] not in currentVepDic[rsID]['aaChange']:
                        currentVepDic[rsID]['aaChange'].append(entry[13].split(";")[0].split("=")[1].split(":")[1])
                        
        currentChrom=chrom
        currentVepFile.close()
        
    try:
        ucscClass = ",".join(currentVepDic[nameAlt]['ucscClassList'])
        aaChange = " ".join(currentVepDic[nameAlt]['aaChange'])
    except:
        tryAttempts+=1
        ucscClass = ",".join(currentVepDic[name]['ucscClassList'])
        aaChange = " ".join(currentVepDic[name]['aaChange'])
       
    return(prevName,currentChrom,currentVepDic,ucscClass,aaChange,tryAttempts) 

def assignRGBcolorByConsequence(currentVepDic,name,alt,nameAlt):
    """Assign color based on most serious hgVai predicted consequence"""

    redVars = ['exon_loss_variant','frameshift_variant','inframe_deletion','inframe_insertion','initiator_codon_variant',
               'missense_variant','splice_acceptor_variant','splice_donor_variant','splice_region_variant','stop_gained',
               'stop_lost','coding_sequence_variant','transcript_ablation']
    greenVars = ['synonymous_variant','stop_retained_variant']
    blueVars = ['5_prime_UTR_variant','3_prime_UTR_variant','complex_transcript_variant',
                'non_coding_transcript_exon_variant']
    blackVars = ['upstream_gene_variant','downstream_gene_variant','intron_variant','intergenic_variant',
                 'NMD_transcript_variant','no_sequence_alteration']
    
    try:
        if any(item in currentVepDic[nameAlt]['ucscClassList'] for item in redVars):
            itemRgb = '255,0,0'
        elif any(item in currentVepDic[nameAlt]['ucscClassList'] for item in greenVars):
            itemRgb = '0,128,0'
        elif any(item in currentVepDic[nameAlt]['ucscClassList'] for item in blueVars):
            itemRgb = '0,0,255'
        elif any(item in currentVepDic[nameAlt]['ucscClassList'] for item in blackVars):
            itemRgb = '0,0,0'
        else:
            sys.exit("One of the following consequences not found in color list, please add: "+str(currentVepDic[name]['ucscClassList']))
    except:
        if any(item in currentVepDic[name]['ucscClassList'] for item in redVars):
            itemRgb = '255,0,0'
        elif any(item in currentVepDic[name]['ucscClassList'] for item in greenVars):
            itemRgb = '0,128,0'
        elif any(item in currentVepDic[name]['ucscClassList'] for item in blueVars):
            itemRgb = '0,0,255'
        elif any(item in currentVepDic[name]['ucscClassList'] for item in blackVars):
            itemRgb = '0,0,0'
        else:
            sys.exit("One of the following consequences not found in color list, please add: "+str(currentVepDic[name]['ucscClassList']))
        
            
    return(itemRgb)

def convertSOnumberToName(varClass):
    soDic={"SO:0001483": 'substitution', "SO:0000667": 'insertion', "SO:0000159": 'deletion', 
           "SO:0001059": 'sequence alteration',"SO:1000032": 'delins',"SO:0002007": 'MNV', "SO:0000705": 'tandem_repeat'}
    varClass = soDic[varClass]
    return(varClass)

def buildDuplicateRsIDset(currentFileName):
    """Look for multi-allelic variants to then try and more accurately match those hgVai results"""
    duplicateSet = set()
    inputFile = open(currentFileName,"r")
    prevName = False
    for line in inputFile:
        if line.startswith("#") or line.startswith("Uploaded"):
            continue
        else:
            lineContents = line.rstrip().split("\t")
            name = lineContents[3]
            if prevName == name:
                duplicateSet.add(name)
            prevName = name
    print("Total number of multi-allelic variants: "+str(len(duplicateSet)))
    return(duplicateSet)
        

def createFinalBedWithVepAnnotations(currentFileName,workDir):
    """Parse bed file and output final file without FILTER field, and with SO name, item RGB, and aaChange/ucscClass from hgVai"""

    inputFile = open(currentFileName,"r")
    outputFile = open(workDir+"evaSnp.final.bed","w")
    prevName = ""
    currentChrom = ""
    currentVepDic = {}
    n=0
    tryAttempts=0
    duplicateSet = buildDuplicateRsIDset(currentFileName)
    
    for line in inputFile:
        n+=1
        if n%5000000 == 0:
            print("Current line count: "+str(n))
        if line.startswith("#") or line.startswith("Uploaded"):
            continue
        else:
        #Desired schema chrom chromStart chromEnd name score strand thickStart thickEnd itemRgb ref alt varClass submitters ucscClass aaChange
        #Need to compute itemRGB, aaChange, and ucscClass. Also need to change SO terms
        
            lineContents = line.rstrip().split("\t")
            chrom = lineContents[0]
            chromStart = lineContents[1]
            chromEnd = lineContents[2]
            name = lineContents[3]
            ref = lineContents[9]
            alt = lineContents[10]
            varClass = lineContents[12]
            submitters = lineContents[13]
            
            #Convert from SO term to name. e.g. SO:0001483 to substitution
            varClass = convertSOnumberToName(varClass)
        
            #Check if is the same rsID, they are repeated in ~5% of file
            if prevName == name:
                print("Repeated prevName: "+prevName)
                outputFile.write(chrom+"\t"+chromStart+"\t"+chromEnd+"\t"+name+"\t0\t.\t"+chromStart+"\t"+chromEnd+"\t"+
                                 itemRgb+"\t"+ref+"\t"+alt+"\t"+varClass+"\t"+submitters+"\t"+ucscClass+"\t"+aaChange+"\n")
            elif name in duplicateSet:        
                #Find the aaChange and ucscClass
                #Try and match the allele with what hgVai outputs
                if varClass == "insertion":
                    if ',' in alt:
                        nameAlt = name+alt.split(",")[0][1:]
                    elif len(alt)-1 >= 59:
                        nameAlt = name+alt[len(ref):25]+"..."+alt[len(alt)-24:]+"("+str(len(alt)-1)+" nt)"
                    else:
                        nameAlt = name+alt[1:]
                elif varClass == "deletion":
                    nameAlt = name+"-"
                elif varClass == "tandem_repeat":
                    if len(alt) > len(ref):
                        nameAlt = name+alt[len(ref):]
                    else:
                        nameAlt = name+"-"
                else:
                    nameAlt = name+alt
          
                prevName,currentChrom,currentVepDic,ucscClass,aaChange,tryAttempts = fetchUcscClassAndAaChange(name,alt,nameAlt,chrom,currentChrom,currentVepDic,tryAttempts)
                itemRgb = assignRGBcolorByConsequence(currentVepDic,name,alt,nameAlt)
                outputFile.write(chrom+"\t"+chromStart+"\t"+chromEnd+"\t"+name+"\t0\t.\t"+chromStart+"\t"+chromEnd+"\t"+
                                 itemRgb+"\t"+ref+"\t"+alt+"\t"+varClass+"\t"+submitters+"\t"+ucscClass+"\t"+aaChange+"\n")
            else:
                nameAlt = name
                prevName,currentChrom,currentVepDic,ucscClass,aaChange,tryAttempts = fetchUcscClassAndAaChange(name,alt,nameAlt,chrom,currentChrom,currentVepDic,tryAttempts)
                itemRgb = assignRGBcolorByConsequence(currentVepDic,name,alt,nameAlt)
                outputFile.write(chrom+"\t"+chromStart+"\t"+chromEnd+"\t"+name+"\t0\t.\t"+chromStart+"\t"+chromEnd+"\t"+
                                 itemRgb+"\t"+ref+"\t"+alt+"\t"+varClass+"\t"+submitters+"\t"+ucscClass+"\t"+aaChange+"\n")

    print("Total number of try attempts = "+str(tryAttempts))
    print("Total number of lines in output file = "+str(n))
    inputFile.close()
    outputFile.close()

def validateFinalFile(workDir,url):
    """Compare and make sure that the final bed rsID# matches the input file"""
    originalDownloadFileName = workDir+url.split("/")[-1]
    bedFile = workDir+"evaSnp.final.bed"
    originalRsIDCount = bash("zcat %s | grep -v '^#' | cut -f3 | sort | uniq | wc -l" % (originalDownloadFileName))
    finalBedRsIDCount = bash("cut -f4 %s | sort | uniq | wc -l" % (bedFile))
    if originalRsIDCount != finalBedRsIDCount:
        sys.exit("There was an error in the pipeline, the initial numer of uniq RS IDs does not match the final bed file.\n\n"\
    "%s - %s%s - %s" % (originalDownloadFileName,str(originalRsIDCount),bedFile,str(finalBedRsIDCount)))
    
def createBigBed(workDir,dbs):
    bedFile = workDir+"evaSnp.final.bed"
    bash("bedSort %s %s" % (bedFile,bedFile))
    bash("bedToBigBed -tab -as=$HOME/kent/src/hg/lib/evaSnp.as -type=bed9+6 -extraIndex=name \
    %s http://hgdownload.soe.ucsc.edu/goldenPath/%s/bigZips/%s.chrom.sizes %sevaSnp.bb" % (bedFile,dbs,dbs,workDir))
    
checkDirMakeDir(workDir)
fileName = wgetFile(url, workDir)
fileName = workDir+url.split("/")[-1]
print("Got file")
currentFileName = checkDupsAndUnzip(fileName,workDir)
print("File unzipped")
try:
    currentFileName = convertChromToUcsc(currentFileName,dbs,workDir)
    print("Chroms converted")
except:
    sanitizeChromNames(currentFileName)
    print("Chroms sanitized")
    currentFileName = convertChromToUcsc(currentFileName,dbs,workDir)
    print("Chroms converted")
currentFileName = convertVcfToBed(currentFileName,workDir)    
print("File converted to bed")
hgVaiResultsFile = splitChromsAndRunHgVai(workDir,dbs)
print("hgVai done")
splitVepFileByChrom(hgVaiResultsFile,workDir)
print("hgVai file split")
currentFileName = workDir+"evaSnp.bed"
createFinalBedWithVepAnnotations(currentFileName,workDir)
print("Validating file")
validateFinalFile(workDir,url)
print("Created final file")
createBigBed(workDir,dbs)
print("bigBed made")

#####################################
#### End of python3 code ############
#####################################