src/hg/makeDb/doc/evaSnp4.txt

# Track for EVA snp release 4  - https://www.ebi.ac.uk/eva/?RS-Release&releaseVersion=4
# Tracks built by Lou on 3/2/2023

# Track was built for the following 36 assemblies: panTro5, melGal5, equCab3, rn5, canFam3, oreNil2, 
# calJac3, bosTau7, taeGut2, bosTau6, galGal4, rn7, danRer10, danRer11, gorGor3, oviAri3, ponAbe3, 
# rheMac10, galGal6, danRer7, bosTau9, galGal5, oviAri4, mm39, mm10, anoGam3, rheMac8, gorGor4, felCat9, 
# macFas5, susScr3, chlSab2, ci3, dm6, susScr11, felCat3

# The GCA accessions in the EVA release 4 by-assembly list (https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_4/by_assembly/)
# were compared to all native assemblies we have. In total there are 894617246 variants.

# All assemblies were processed by the python pipeline described below. There were 3 assemblies
# that required post-processing: for bosTau9 and galGal5 I had to remove their chrMT as we had none,
# and for rheMac8 I had to change the name of the chrMT to one that matches ours. See RM #30320

#####################################
#### Beginning of python3 code ######
#####################################

#NEW

import re, os, subprocess, sys

def findAllAccFromDescPages():
    """Search all gbdb description pages and pull out GCA/GCF accessions.

    Lists /gbdb/*/html/description.html, greps every page for lines that
    mention GCA or GCF, and keeps the first accession-shaped string found.

    Returns:
        dict keyed by UCSC database name (taken from the page path). Each value
        is a dict holding either a 'GCA' or a 'GCF' entry; it stays empty when
        the page had a grep hit but nothing matched the accession format.
    """
    assemblyAlias = {}
    allDescPages = bash("ls /gbdb/*/html/description.html").rstrip().split("\n")
    regex = re.compile(r'GC[AF]_[0-9]*\.[0-9]*')
    for page in allDescPages:
        try:
            hits = bash("cat "+page+" | grep -i 'GCA\\|GCF'").rstrip().split("\n")
        except Exception:
            #grep exits non-zero when nothing matches, which bash() reports as an error
            hits = []
        if not hits:
            continue
        ucscName = page.split("/")[2]
        #Search the printable form of the whole hit list in one go
        hits = str(hits)
        assemblyAlias[ucscName] = {}
        matches = regex.findall(hits)
        if not matches:
            print("The following page could not be matched with the accession format",page,hits)
            continue
        accession = matches[0]
        if accession.startswith("GCA"):
            assemblyAlias[ucscName]['GCA'] = accession
        elif accession.startswith("GCF"):
            assemblyAlias[ucscName]['GCF'] = accession
    return(assemblyAlias)

def buildGCFdicFromGenbankFile():
    """Build a GCF -> GCA lookup from the local copy of assembly_summary_genbank.txt.

    Only lines containing "GCF_" are used; for each one the first GCA and the
    first GCF accession found on the line are paired.

    Returns:
        dict mapping GCF accession to GCA accession.

    Raises:
        IndexError: if a line contains "GCF_" but one of the two accession
            patterns does not match.
    """
    genbankSummaryDic = {}
    regexGenbankGCA = re.compile(r'GCA_[0-9]*\.[0-9]*')
    regexGenbankGCF = re.compile(r'GCF_[0-9]*\.[0-9]*')
    #Context manager so the handle is released even if a line fails to parse
    with open('/hive/users/lrnassar/evaSnp4/assembly_summary_genbank.txt','r') as summaryFile:
        for line in summaryFile:
            if "GCF_" in line:
                GCAacc = regexGenbankGCA.findall(line)[0]
                GCFacc = regexGenbankGCF.findall(line)[0]
                genbankSummaryDic[GCFacc] = GCAacc
    return(genbankSummaryDic)

def findGCAaccForUCSCgcfDbs(assemblyAlias,genbankSummaryDic):
    """Try to find matching GCA accession to UCSC dbs based on genbank file or curling web page.

    Args:
        assemblyAlias: dict of UCSC db name -> {'GCA': acc} or {'GCF': acc}; it is
            updated in place.
        genbankSummaryDic: dict of GCF accession -> GCA accession.

    Returns:
        The same assemblyAlias object, with a 'GCA' entry added for every GCF
        database whose GenBank accession could be found.
    """
    n=0
    regexGenbankGCA = re.compile(r'GCA_[0-9]*\.[0-9]*')
    for key, value in assemblyAlias.items():
        if 'GCF' not in value:
            continue
        ncbiGCF = value['GCF']
        if ncbiGCF in genbankSummaryDic:
            value['GCA'] = genbankSummaryDic[ncbiGCF]
            n+=1
            continue
        #Not in the summary file, try to find it online at NCBI
        ncbiWgetPage = bash("curl https://www.ncbi.nlm.nih.gov/assembly/"+ncbiGCF+"/").rstrip().split("\n")
        for line in ncbiWgetPage:
            if "GenBank assembly accession" in line and "GCA_" in line:
                for subLine in line.split("/dd"):
                    if "GenBank assembly accession" in subLine:
                        #The accession may sit in a different fragment than the label, so do not assume a match
                        matches = regexGenbankGCA.findall(subLine)
                        if matches:
                            value['GCA'] = matches[0]
                            n+=1
    print("Total number of UCSC GCFs matched with GCAs: ",n)
    return(assemblyAlias)
    
def sanitizeListToOnlyUCSCdbsWithGCA(assemblyAlias):
    """Reduce the alias dict to UCSC db name -> GCA accession.

    Databases without a 'GCA' entry are dropped, as are those whose accession
    is the placeholder 'GCA_000'. Both tallies are printed.
    """
    withGCA = 0
    withoutGCA = 0
    ucscGCAlist = {}
    for dbName, accessions in assemblyAlias.items():
        if 'GCA' not in accessions:
            withoutGCA += 1
            continue
        #Placeholder accessions are still counted, just not kept
        withGCA += 1
        gcaAcc = accessions['GCA']
        if gcaAcc != 'GCA_000':
            ucscGCAlist[dbName] = gcaAcc
    print("Total number of UCSC databases already with a GCA:",withGCA)
    print("Total number of UCSC databases with GCF that will need GCA lookup:",withoutGCA)
    return ucscGCAlist

def buildFinalDicOfUCSCdbsWithEvaSnp4Data(ucscGCAlist):
    """Keep only the active UCSC databases whose GCA accession is listed in EVA release 4.

    Args:
        ucscGCAlist: dict of UCSC db name -> GCA accession.

    Returns:
        dict of UCSC db name -> the GCA accession as written on the EVA release 4
        by_assembly listing. mm10 is always added by hand.
    """
    ucscEvaSnp4Dbs = {}
    n=0
    regexEvaSnp4GCA = re.compile(r'GCA_[0-9]*\.[0-9]*')
    evaRelease4page = bash("curl https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_4/by_assembly/").rstrip().split("\n")
    #Split into whole names: a substring test on the raw text would let e.g. "ce1" pass because "ce10" is active
    activeRRassemblies = set(bash("""hgsql -h genome-centdb -e "select name from dbDb where active='1'" hgcentral""").split())
    for key in ucscGCAlist:
        if key not in activeRRassemblies:
            continue
        for line in evaRelease4page:
            if ucscGCAlist[key] in line:
                fullGCA = regexEvaSnp4GCA.findall(line)[0]
                ucscEvaSnp4Dbs[key] = fullGCA
                n+=1
    ucscEvaSnp4Dbs['mm10']='GCA_000001635.6' #Manually add mm10 which is weird due to patches
    print("Total number of assemblies with EvaSnp4 data: ",n)
    return(ucscEvaSnp4Dbs)

def checkDirMakeDir(workDir):
    """Create workDir if it does not exist yet, including any missing parent directories."""
    #exist_ok avoids the check-then-create race of isdir() followed by mkdir()
    os.makedirs(workDir, exist_ok=True)

def bash(cmd):
    """Run cmd through a shell subprocess and return everything it printed.

    stderr is folded into stdout, so the returned text contains both streams.
    A non-zero exit status is reported as a RuntimeError carrying the command,
    its exit code and its output.
    """
    try:
        completed = subprocess.run(
            cmd,
            shell=True,
            check=True,
            universal_newlines=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
    except subprocess.CalledProcessError as err:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(err.cmd, err.returncode, err.output))
    return completed.stdout

def wgetFile(url, workDir):
    """Download url into workDir with `wget -N` and return workDir plus the file's base name."""
    downloadCmd = "wget -N {} -P {}".format(url, workDir)
    bash(downloadCmd)
    baseName = url.split("/")[-1]
    return workDir+baseName

def checkDupsAndUnzip(fileName,workDir):
    """Unzip the downloaded VCF to workDir/evaSnps.vcf, collapsing adjacent duplicate lines.

    EVA files commonly contain repeated lines, so the line count is compared
    before and after `uniq`; the de-duplicated stream is only written when the
    two counts differ. Returns the path of the unzipped file.
    """
    totalLines = bash("zcat %s | wc -l" % (fileName)).rstrip()
    distinctLines = bash("zcat %s | uniq | wc -l" % (fileName)).rstrip()
    unzippedVcf = workDir+"evaSnps.vcf"
    if totalLines == distinctLines:
        bash("zcat %s > %s" % (fileName,unzippedVcf))
        return unzippedVcf
    print("There are duplicated entries.\nOriginal file:%s\nUniq file:%s\nCreating new file." % (totalLines, distinctLines))
    bash("zcat %s | uniq > %s" % (fileName,unzippedVcf))
    return unzippedVcf

def sanitizeChromNames(currentFileName,dbs,workDir):
    """Swaps the chrom IDs with the accession names which are indexed in chromAlias.

    The ID -> accession mapping is read from the VCF's own ##contig header
    lines (assumes both values sit at the '='-separated positions parsed below -
    TODO confirm against the EVA header format). The VCF is rewritten in place
    and then passed through convertChromToUcsc; nothing is returned.

    Raises:
        KeyError: if a data line uses a chrom ID that had no ##contig line.
    """
    IDtoAccessionDic = {}
    #Context managers so both handles are released when an unknown chrom ID aborts the rewrite
    with open(currentFileName,'r') as inputFile, open(currentFileName+".fixed","w") as outputFile:
        for line in inputFile:
            if line.startswith("##contig"):
                headerParts = line.split('=')
                ID = headerParts[2].split(',')[0]
                accession = headerParts[3].split('"')[1]
                IDtoAccessionDic[ID] = accession
            elif not line.startswith("#"):
                columns = line.split("\t")
                columns[0] = IDtoAccessionDic[columns[0]]
                line = "\t".join(columns)
            #Header lines are copied through unchanged
            outputFile.write(line)
    #Replacing the same name because chromAlias should be updated soon and this function will be removed
    bash("mv %s.fixed %s" % (currentFileName,currentFileName))
    currentFileName = convertChromToUcsc(currentFileName,dbs,workDir)
    print("Chroms converted")
    
def addChrToChromNumbers(fileName,currentFileName,workDir):
    """Last-resort chrom conversion: prefix every data line with "chr".

    The gzipped download is unzipped again (adjacent duplicates collapsed) over
    currentFileName, then copied to workDir/evaSnps.ucscChroms.vcf with header
    lines untouched. Returns the path of the new file.
    """
    bash("zcat %s | uniq > %s" % (fileName,currentFileName))
    ucscVcfName = workDir+"evaSnps.ucscChroms.vcf"
    with open(ucscVcfName,'w') as newChromVcf, open(currentFileName,"r") as oldChromVcf:
        for vcfLine in oldChromVcf:
            if vcfLine.startswith("#"):
                converted = vcfLine
            elif vcfLine.startswith("AC024175.3"): #Exception for zebrafish
                converted = vcfLine.replace("AC024175.3","chrM")
            else:
                converted = "chr"+vcfLine
            newChromVcf.write(converted)
    return ucscVcfName

def convertChromToUcsc(currentFileName,dbs,workDir):
    """Rewrite the VCF with UCSC-style chrom names using chromToUcsc.

    Fetches the chromAlias table for dbs into the current directory, converts
    currentFileName into workDir/evaSnps.ucscChroms.vcf, removes the alias table
    again and returns the path of the converted file.
    """
    aliasTable = dbs+".chromAlias.tsv"
    ucscVcfName = workDir+"evaSnps.ucscChroms.vcf"
    bash("chromToUcsc --get %s" % (dbs))
    bash("chromToUcsc -i %s -o %s -a %s" % (currentFileName,ucscVcfName,aliasTable))
    bash("rm %s" % (aliasTable))
    return ucscVcfName
                               
def convertVcfToBed(currentFileName,workDir):
    """Compress and index the VCF, then convert it to workDir/evaSnp.bed keeping only the VC and SID fields."""
    gzippedVcf = currentFileName+".gz"
    conversionSteps = (
        "bgzip %s" % (currentFileName),
        "tabix -p vcf %s" % (gzippedVcf),
        "vcfToBed %s %sevaSnp -fields=VC,SID" % (gzippedVcf,workDir),
    )
    for step in conversionSteps:
        bash(step)
    return workDir+"evaSnp.bed"

def buildChromSizesDic(workDir,dbs):
    """Return a dict of chrom name -> size (as a string) from fetchChromSizes output.

    The first and the last element of the newline-split output are skipped
    (presumably a message line captured from stderr and the empty string after
    the final newline - verify). workDir is accepted but not used.
    """
    rawLines = bash("fetchChromSizes %s" % (dbs)).split("\n")
    sizeRecords = (record.split("\t") for record in rawLines[1:-1])
    return {fields[0]: fields[1] for fields in sizeRecords}
    
def splitChromsAndRunHgVai(workDir,dbs):
    """Run hgVai (through vai.pl) over the UCSC-chrom VCF in windows, appending all output to one .vep file.

    Chroms shorter than 1M bp are annotated in a single call; longer ones are
    cut into five equal windows in order to be able to run hgVai without running
    out of memory. Only chroms that actually occur in the VCF are processed.

    Args:
        workDir: directory (with trailing slash) holding evaSnps.ucscChroms.vcf.gz.
        dbs: UCSC database name.

    Returns:
        Path of the .vep file the vai.pl output was appended to.

    Raises:
        SystemExit: if none of the known gene tables exists for dbs.
        KeyError: if the VCF contains a chrom missing from the chrom sizes.
    """
    inputFile = workDir+"evaSnps.ucscChroms.vcf.gz"
    outputFile = workDir+"evaSnps"+dbs+"VaiResults.vep"
    allTables = bash("\\hgsql -e \"show tables\" "+dbs)
    if dbs in ['galGal6','oviAri4','ponAbe3']: #Special exception to use refGene because ncbiRefSeq has incorrect protein sequence in Link table see #29262
        geneTableToUse = "refGene"
    else:
        #Tables in order of preference. ncbiRefSeqSelect is left out as its annotations are too sparse
        for candidateTable in ("ncbiRefSeqCurated","ensGene","refGene","ncbiGene"):
            if candidateTable in allTables:
                geneTableToUse = candidateTable
                break
        else:
            print(dbs)
            sys.exit("Could not find any tables to use for the following database: "+dbs)

    chromDic = buildChromSizesDic(workDir,dbs)
    #For function below, only bother with the chromosomes in the VCF
    chromsInVcf = bash("zcat "+inputFile+" | grep -v '^#' | cut -f1 | uniq").rstrip().split("\n")

    vaiCmd = ("vai.pl --variantLimit=200000000 --hgVai=/usr/local/apache/cgi-bin/hgVai --position=%s:%s-%s "
              "--geneTrack=%s --hgvsG=off --hgvsCN=off --hgvsP=on %s %s >> %s")
    breakPoints = [.2,.4,.6,.8,1]

    for n, chromName in enumerate(chromsInVcf, 1):
        chromSize = chromDic[chromName]
        if (n % 100) == 0:
            print("Have currently run "+str(n)+"/"+str(len(chromsInVcf))+" chroms. Currently running "+chromName)
        if int(chromSize) < 1000000: #Don't worry about splitting if chrom is less than 1M bp
            windows = [(0,chromSize)]
        else:
            windows = []
            startCoord = 0
            for part in breakPoints:
                endCoord = round(int(chromSize)*part)
                windows.append((startCoord,endCoord))
                #Reuse the rounded end as the next start so consecutive windows share the exact same boundary
                startCoord = endCoord
        for startCoord, endCoord in windows:
            bash(vaiCmd % (chromName,startCoord,endCoord,geneTableToUse,dbs,inputFile,outputFile))
    return(outputFile)

def splitVepFileByChrom(hgVaiResultsFile,workDir):
    """Split the vep file to one file per chrom in order to index entire chrom into a dic and quickly lookup the rsIDs.

    Lines starting with "#" or "Uploaded" are skipped. Every other line is
    appended to workDir/splitChroms/evaSnpVaiResults.<chrom>.vep, where chrom
    is the part of the second column before the first ":".
    """
    splitChromDir = workDir+"splitChroms/"
    os.makedirs(splitChromDir, exist_ok=True)
    openChrom = None
    outFile = None
    try:
        with open(hgVaiResultsFile,"r") as vepFile:
            for line in vepFile:
                if line.startswith(("#","Uploaded")):
                    continue
                chrom = line.rstrip().split("\t")[1].split(":")[0]
                #Keep the handle open while the chrom stays the same instead of reopening per line
                if chrom != openChrom:
                    if outFile is not None:
                        outFile.close()
                    outFile = open(splitChromDir+"evaSnpVaiResults."+chrom+".vep","a")
                    openChrom = chrom
                outFile.write(line)
    finally:
        if outFile is not None:
            outFile.close()

def _recordVepEntry(vepDic,key,consequence,extra):
    """Add one vep row's consequence and (if present) protein change under key, without duplicates"""
    record = vepDic.setdefault(key, {'ucscClassList': [], 'aaChange': []})
    if consequence not in record['ucscClassList']:
        record['ucscClassList'].append(consequence)
    if extra != '-' and 'p' in extra and '?' not in extra:
        #First ';' item, right of '=', right of ':' (presumably the HGVS p. notation without the protein accession - verify)
        change = extra.split(";")[0].split("=")[1].split(":")[1]
        #Compare the same form that is stored, otherwise repeats are never detected
        if change not in record['aaChange']:
            record['aaChange'].append(change)

def fetchUcscClassAndAaChange(name,alt,nameAlt,chrom,currentChrom,currentVepDic,tryAttempts,workDir):
    """Build dictionary out of vep results with rsID as index, populate aaChange and ucscClassList.

    When chrom differs from currentChrom, workDir/splitChroms/evaSnpVaiResults.<chrom>.vep
    is read and indexed twice per row: under rsID+allele (columns 0 and 2) and
    under the rsID alone. The lookup first tries nameAlt and falls back to name,
    counting each fallback in tryAttempts. alt is accepted but not used.

    Returns:
        Tuple (prevName, currentChrom, currentVepDic, ucscClass, aaChange, tryAttempts)
        where prevName is nameAlt, ucscClass is a comma-joined string and aaChange
        a space-joined string.

    Raises:
        KeyError: if neither nameAlt nor name is present in the index.
    """
    prevName = nameAlt
    #Check if chrom has changed and index current chrom
    if chrom != currentChrom:
        currentVepDic = {}
        with open(workDir+"splitChroms/evaSnpVaiResults."+chrom+".vep","r") as currentVepFile:
            for entry in currentVepFile:
                entry = entry.rstrip().split("\t")
                rsID = entry[0]
                rsIDAlt = entry[0]+entry[2]
                _recordVepEntry(currentVepDic,rsIDAlt,entry[6],entry[13])
                _recordVepEntry(currentVepDic,rsID,entry[6],entry[13])
        currentChrom=chrom

    try:
        matched = currentVepDic[nameAlt]
    except KeyError:
        #Allele-specific key not found, use everything known for the rsID
        tryAttempts+=1
        matched = currentVepDic[name]
    ucscClass = ",".join(matched['ucscClassList'])
    aaChange = " ".join(matched['aaChange'])

    return(prevName,currentChrom,currentVepDic,ucscClass,aaChange,tryAttempts)

def assignRGBcolorByConsequence(currentVepDic,name,alt,nameAlt):
    """Assign color based on most serious hgVai predicted consequence.

    The consequence list stored under nameAlt is used when that key exists,
    otherwise the list stored under name. Colors are tried from most to least
    serious: red, green, blue, black. alt is accepted but not used.

    Returns:
        The itemRgb string, e.g. '255,0,0'.

    Raises:
        SystemExit: if none of the consequences is in any color list.
        KeyError: if neither nameAlt nor name is in currentVepDic.
    """
    colorByConsequence = (
        ('255,0,0', ('exon_loss_variant','frameshift_variant','inframe_deletion','inframe_insertion','initiator_codon_variant',
                     'missense_variant','splice_acceptor_variant','splice_donor_variant','splice_region_variant','stop_gained',
                     'stop_lost','coding_sequence_variant','transcript_ablation')),
        ('0,128,0', ('synonymous_variant','stop_retained_variant')),
        ('0,0,255', ('5_prime_UTR_variant','3_prime_UTR_variant','complex_transcript_variant',
                     'non_coding_transcript_exon_variant')),
        ('0,0,0', ('upstream_gene_variant','downstream_gene_variant','intron_variant','intergenic_variant',
                   'NMD_transcript_variant','no_sequence_alteration')),
    )

    #Only a missing key triggers the fallback; the exit below must not be swallowed on the way
    try:
        ucscClassList = currentVepDic[nameAlt]['ucscClassList']
    except KeyError:
        ucscClassList = currentVepDic[name]['ucscClassList']

    for itemRgb, consequences in colorByConsequence:
        if any(item in ucscClassList for item in consequences):
            return(itemRgb)
    sys.exit("One of the following consequences not found in color list, please add: "+str(ucscClassList))

def convertSOnumberToName(varClass):
    """Translate a Sequence Ontology accession (e.g. SO:0001483) into its variant class name.

    Raises KeyError for an accession that is not in the table.
    """
    soNames = dict([
        ("SO:0001483", 'substitution'),
        ("SO:0000667", 'insertion'),
        ("SO:0000159", 'deletion'),
        ("SO:0001059", 'sequence alteration'),
        ("SO:1000032", 'delins'),
        ("SO:0002007", 'MNV'),
        ("SO:0000705", 'tandem_repeat'),
    ])
    return soNames[varClass]

def buildDuplicateRsIDset(currentFileName):
    """Look for multi-allelic variants to then try and more accurately match those hgVai results.

    A name (fourth column) counts as multi-allelic when it appears on two
    consecutive data lines. Lines starting with "#" or "Uploaded" are ignored.

    Returns:
        set of the names that were repeated on adjacent lines.
    """
    duplicateSet = set()
    prevName = False
    #Context manager so the handle is released even if a short line raises
    with open(currentFileName,"r") as inputFile:
        for line in inputFile:
            if line.startswith(("#","Uploaded")):
                continue
            name = line.rstrip().split("\t")[3]
            if prevName == name:
                duplicateSet.add(name)
            prevName = name
    print("Total number of multi-allelic variants: "+str(len(duplicateSet)))
    return(duplicateSet)
        

def _hgVaiAlleleKey(name,ref,alt,varClass):
    """Return the rsID followed by the alt allele written the way it is looked up in the hgVai index"""
    if varClass == "insertion":
        if ',' in alt:
            return name+alt.split(",")[0][1:]
        if len(alt)-1 >= 59:
            #Long insertions are looked up in an abbreviated head...tail(length) form
            return name+alt[len(ref):25]+"..."+alt[len(alt)-24:]+"("+str(len(alt)-1)+" nt)"
        return name+alt[1:]
    if varClass == "deletion":
        return name+"-"
    if varClass == "tandem_repeat":
        if len(alt) > len(ref):
            return name+alt[len(ref):]
        return name+"-"
    return name+alt

def createFinalBedWithVepAnnotations(currentFileName,workDir):
    """Parse bed file and output final file without FILTER field, and with SO name, item RGB, and aaChange/ucscClass from hgVai.

    Reads currentFileName and writes workDir/evaSnp.final.bed with the schema:
    chrom chromStart chromEnd name score strand thickStart thickEnd itemRgb ref alt
    varClass submitters ucscClass aaChange. The hgVai values are looked up per
    chrom from the files under workDir/splitChroms.
    """
    prevName = ""
    currentChrom = ""
    currentVepDic = {}
    n=0
    tryAttempts=0
    duplicateSet = buildDuplicateRsIDset(currentFileName)

    #Context managers so both handles are released when a lookup fails part-way through
    with open(currentFileName,"r") as inputFile, open(workDir+"evaSnp.final.bed","w") as outputFile:
        for line in inputFile:
            n+=1
            if n%5000000 == 0:
                print("Current line count: "+str(n))
            if line.startswith(("#","Uploaded")):
                continue

            lineContents = line.rstrip().split("\t")
            chrom = lineContents[0]
            chromStart = lineContents[1]
            chromEnd = lineContents[2]
            name = lineContents[3]
            ref = lineContents[9]
            alt = lineContents[10]
            submitters = lineContents[13]
            #Convert from SO term to name. e.g. SO:0001483 to substitution
            varClass = convertSOnumberToName(lineContents[12])

            if prevName == name:
                #Same rsID as the previous line: reuse the annotation computed for it
                print("Repeated prevName: "+prevName)
            else:
                if name in duplicateSet:
                    #Multi-allelic, try and match the allele with what hgVai outputs
                    nameAlt = _hgVaiAlleleKey(name,ref,alt,varClass)
                else:
                    nameAlt = name
                prevName,currentChrom,currentVepDic,ucscClass,aaChange,tryAttempts = fetchUcscClassAndAaChange(name,alt,nameAlt,chrom,currentChrom,currentVepDic,tryAttempts,workDir)
                itemRgb = assignRGBcolorByConsequence(currentVepDic,name,alt,nameAlt)

            outputFile.write(chrom+"\t"+chromStart+"\t"+chromEnd+"\t"+name+"\t0\t.\t"+chromStart+"\t"+chromEnd+"\t"+
                             itemRgb+"\t"+ref+"\t"+alt+"\t"+varClass+"\t"+submitters+"\t"+ucscClass+"\t"+aaChange+"\n")

    print("Total number of try attempts = "+str(tryAttempts))
    print("Total number of lines in output file = "+str(n))

def validateFinalFile(workDir,url):
    """Sanity-check evaSnp.final.bed against the originally downloaded VCF.

    Exits when the number of unique IDs (VCF column 3 vs bed column 4) differs,
    or when the bed file contains fully identical lines.
    """
    downloadedVcf = workDir+url.split("/")[-1]
    finalBed = workDir+"evaSnp.final.bed"

    vcfIdCount = bash("zcat %s | grep -v '^#' | cut -f3 | sort | uniq | wc -l" % (downloadedVcf))
    bedIdCount = bash("cut -f4 %s | sort | uniq | wc -l" % (finalBed))
    if vcfIdCount != bedIdCount:
        mismatchMsg = ("There was an error in the pipeline, the initial numer of uniq RS IDs does not match the final bed file.\n\n"
                       "%s - %s%s - %s")
        sys.exit(mismatchMsg % (downloadedVcf,str(vcfIdCount),finalBed,str(bedIdCount)))

    totalLines = bash("cat %s | wc -l" % (finalBed))
    distinctLines = bash("cat %s | sort | uniq | wc -l" % (finalBed))
    if totalLines != distinctLines:
        sys.exit("There are duplicate entries in the file: "+str(totalLines)+" vs "+str(distinctLines))
    
def createBigBed(workDir,dbs):
    """Sort evaSnp.final.bed in place and convert it to workDir/evaSnp4.bb (bed9+6, indexed on name)."""
    finalBed = workDir+"evaSnp.final.bed"
    bash("bedSort %s %s" % (finalBed,finalBed))
    bigBedCmd = ("bedToBigBed -tab -as=$HOME/kent/src/hg/lib/evaSnp.as -type=bed9+6 -extraIndex=name "
                 "    %s http://hgdownload.soe.ucsc.edu/goldenPath/%s/bigZips/%s.chrom.sizes %sevaSnp4.bb")
    bash(bigBedCmd % (finalBed,dbs,dbs,workDir))
    
def chechIfDbsDone(key):
    """Report whether evaSnp4.bb already exists for the database, cleaning up a half-finished run if not.

    Args:
        key: UCSC database name; its directory is /hive/data/outside/eva4/<key>.

    Returns:
        True when the bigBed is already there, False otherwise. In the False
        case, leftover evaSnps* files and the contents of splitChroms are
        removed when a previous attempt left a converted VCF behind.
    """
    dbDir = "/hive/data/outside/eva4/"+key
    if os.path.exists(dbDir+"/evaSnp4.bb"):
        print("DBS already completed: "+key)
        return(True)
    if os.path.exists(dbDir+"/evaSnps.ucscChroms.vcf") or os.path.exists(dbDir+"/evaSnps.ucscChroms.vcf.gz"):
        print("Cleaning up directory for previous attempt of "+key)
        bash("rm "+dbDir+"/evaSnps*")
        if os.path.exists(dbDir+"/splitChroms"):
            try:
                bash("rm "+dbDir+"/splitChroms/*")
            except RuntimeError:
                #bash() reports a failed rm (e.g. nothing to delete) as RuntimeError
                print("Directly cleaned")
    return(False)
    
def buildEvaSnp4Track(ucscDatabaseName,ucscEvaSnp4Dbs):
    """Build evaSnp4 track from start to finish.

    Downloads the EVA release 4 VCF for the database's GCA accession into
    /hive/data/outside/eva4/<db>/, converts the chrom names, converts to bed,
    runs hgVai, merges the annotations, validates the result and makes the bigBed.

    Args:
        ucscDatabaseName: UCSC database name to build.
        ucscEvaSnp4Dbs: dict of UCSC db name -> GCA accession.
    """
    dbs = ucscDatabaseName
    GCAacc = ucscEvaSnp4Dbs[dbs]
    workDir = "/hive/data/outside/eva4/"+dbs+"/"
    urlToFile = "https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_4/by_assembly/"+GCAacc+"/"+GCAacc+"_current_ids.vcf.gz"
    checkDirMakeDir(workDir)
    fileName = wgetFile(urlToFile, workDir)
    print("Got file: ",dbs)
    currentFileName = checkDupsAndUnzip(fileName,workDir)
    print("File unzipped")
    #Three chrom conversion strategies, each one only tried when the previous one failed.
    #Exception rather than a bare except so that Ctrl-C is not mistaken for a conversion failure
    try:
        currentFileName = convertChromToUcsc(currentFileName,dbs,workDir)
        print("Chroms converted")
    except Exception:
        try:
            sanitizeChromNames(currentFileName,dbs,workDir)
            print("Chroms sanitized")
            currentFileName = convertChromToUcsc(currentFileName,dbs,workDir)
            print("Chroms converted")
        except Exception:
            currentFileName = addChrToChromNumbers(fileName,currentFileName,workDir)
            print("Chroms completed after adding '1' to starts.")
    currentFileName = convertVcfToBed(currentFileName,workDir)
    print("File converted to bed")
    hgVaiResultsFile = splitChromsAndRunHgVai(workDir,dbs)
    print("hgVai done")
    splitVepFileByChrom(hgVaiResultsFile,workDir)
    print("hgVai file split")
    currentFileName = workDir+"evaSnp.bed"
    createFinalBedWithVepAnnotations(currentFileName,workDir)
    print("Validating file")
    validateFinalFile(workDir,urlToFile)
    print("Created final file")
    createBigBed(workDir,dbs)
    print("bigBed made")
    
def main(mode):
    """Find the UCSC assemblies that have EVA release 4 data and build the track.

    Args:
        mode: "all" to build every matching assembly that is not done yet
            (a failing assembly is reported and skipped), or a specific UCSC
            database name to build just that one.
    """
    #Find all the assemblies we have with matching evaSnp4 data
    assemblyAlias = findAllAccFromDescPages()
    genbankSummaryDic = buildGCFdicFromGenbankFile()
    assemblyAlias = findGCAaccForUCSCgcfDbs(assemblyAlias,genbankSummaryDic)
    ucscGCAlist = sanitizeListToOnlyUCSCdbsWithGCA(assemblyAlias)
    ucscEvaSnp4Dbs = buildFinalDicOfUCSCdbsWithEvaSnp4Data(ucscGCAlist)
    #Build evaSNP4 data for either one specific assembly, or all assemblies
    if mode == 'all':
        for key in ucscEvaSnp4Dbs:
            if chechIfDbsDone(key):
                continue
            try:
                buildEvaSnp4Track(key,ucscEvaSnp4Dbs)
            except (Exception, SystemExit) as err:
                #Keep going with the next assembly (pipeline steps may call sys.exit), but let Ctrl-C through
                print("Problem with DBS: "+key)
                print("Reason: "+repr(err))
    else:
        #Clean up any previous attempt of the requested assembly before building it
        chechIfDbsDone(mode)
        buildEvaSnp4Track(mode,ucscEvaSnp4Dbs)
    
#Build the track for every matching assembly that is not completed yet
main("all")

#####################################
######### End of python3 code ######
#####################################

# Lastly links were made with a file containing the 36 assemblies:

# For each database name listed in assemblyReleaseList.txt, symlink its evaSnp4.bb into /gbdb/<db>/bbi/
for dbs in $(cat /hive/data/outside/eva4/assemblyReleaseList.txt);
do
ln -s /hive/data/outside/eva4/$dbs/evaSnp4.bb /gbdb/$dbs/bbi/;
done