# Merging genotypes

In this script we will take the harmonized genotypes and merge them.
Note that UIUC2014 only contains duplicates of samples in the ADAPT file, so we will remove it from further analysis.

## Preliminaries

Let's import modules and set paths

In [1]:
import subprocess, os, glob
import pandas as pd
import numpy as np
from GenotypeQC import Split_chr

In [2]:
#Setting paths
# Project root: the resolved parent of the current working directory
projpath  = os.path.realpath(os.pardir)
# Folder under which all the numbered genotype stages (00_Reference, 03_Harmonized, ...) are looked up
pathgenos = os.path.join(projpath, *("DataBases", "Genotypes"))

## Concatenating each file by chromosome

Let's first merge the different files divided by chromosomes into a single one.
In the CV and Euro files we'll need to remove some problematic SNPs (the ones PLINK reports as unmergeable) before merging.

In [3]:
#Move directory
# Make the genotype root folder (pathgenos) the current working directory
os.chdir(pathgenos)

In [28]:
#Entering every directory in harmonize folder
# For each dataset folder inside 03_Harmonized, merge its *_harmonized filesets
# into a single 03_Harmonized/<dataset>_all_harmonized fileset.
harmonized_dir = os.path.join(pathgenos, "03_Harmonized")
filenames = os.listdir(harmonized_dir)

for filename in filenames: # loop through all the files and folders
    dataset_dir = os.path.join(harmonized_dir, filename)
    if not os.path.isdir(dataset_dir):
        continue
    # plink is handed bare file prefixes, so it is run from inside the dataset folder
    os.chdir(dataset_dir)
    merge_list = filename + ".txt"
    merged_out = os.path.join(harmonized_dir, filename + "_all_harmonized")
    #in CV and Euro we need to remove some SNPs before merging
    needs_snp_filter = "CV_" in filename or "Euro" in filename

    #Writing the prefixes of the filesets to use for merging
    with open(merge_list, "w") as f:
        for file in glob.glob("*_harmonized.bed"):
            # splitext drops only the final ".bed", so names with other dots are kept whole
            f.write(os.path.splitext(file)[0] + "\n")

    if needs_snp_filter:
        excluded_out = os.path.join(harmonized_dir, filename + "_excludedtemp")
        # Trial merge: the "<out>-merge.missnp" file it leaves behind is the list of SNPs to drop
        subprocess.run(["plink", "--merge-list", merge_list, "--allow-no-sex", "--make-bed", "--out",
                        excluded_out])
        for file in glob.glob("*_harmonized.bed"):
            prefix = os.path.splitext(file)[0]
            subprocess.run(["plink", "--bfile", prefix, "--exclude",
                            excluded_out + "-merge.missnp",
                            "--make-bed", "--out", prefix + "_temp"])

        #Rewriting the merge list so that it points to the filtered (_temp) filesets
        with open(merge_list, "w") as f:
            for file in glob.glob("*_temp.bed"):
                f.write(os.path.splitext(file)[0] + "\n")

    #Final merge of the dataset
    subprocess.run(["plink", "--merge-list", merge_list, "--allow-no-sex", "--make-bed", "--out",
                    merged_out])

    if needs_snp_filter:
        #Removing the filtered intermediates
        for file in glob.glob("*_temp.*"):
            os.remove(file)

## Merging and cleaning all files

Now we will merge the harmonized files into four different datasets as explained below.

In [72]:
#Move directory
# The following cells use file names relative to the 03_Harmonized folder
os.chdir(os.path.join(pathgenos, "03_Harmonized"))

In [75]:
#Creating initial merging files without UIUC2014
# Filesets whose name contains "CHP" are left out as well.
skipped_tags = ("CHP", "UIUC2014")
with open("merge_sparse.txt", "w") as f:
    for file in glob.glob("*harmonized.bed"):
        if any(tag in file for tag in skipped_tags):
            continue
        # One plink prefix per line; splitext strips only the trailing ".bed"
        f.write(os.path.splitext(file)[0] + "\n")

In [76]:
#Merging all listed files, create SNP list to exclude and exclude them from all files to merge
# Every merge*.txt in the current folder lists the filesets of one merged dataset.
# Results are written to 04_Merge/<setname>/.

#QC filters applied one after the other: (plink flag, threshold, suffix added to the output name)
#NOTE(review): an earlier comment mentioned --geno 0.08, but 0.05 is the value actually run.
#NOTE(review): the last suffix is "_mind01" although the --mind threshold is 0.05; the name is kept as is.
qc_steps = [("--geno", "0.05", "_geno"),
            ("--maf",  "0.05", "_maf"),
            ("--hwe",  "1e-50", "_hwe"),
            ("--mind", "0.05", "_mind01")]
#Files whose name contains one of these markers survive the final clean-up
keep_markers = ("mind01.", "split")

#For each file with the list of datasets create a file of excluded SNPs
for setfile in glob.glob("merge*.txt"):
    setname = os.path.splitext(setfile)[0]
    outdir  = os.path.join(pathgenos, "04_Merge", setname)
    #making sure the output folder exists before plink writes into it
    os.makedirs(outdir, exist_ok = True)

    #reading file to get the database names
    filenames = pd.read_csv(setfile, header = None)
    #Trial merge: its "ExcludeSnps-merge.missnp" output is the SNP list excluded below
    subprocess.run(["plink", "--merge-list", setfile, "--make-bed", "--out", os.path.join(outdir, "ExcludeSnps")])

    for dataset in filenames.iloc[:, 0]:
        #excluding SNPs from each database on the list
        subprocess.run(["plink", "--bfile", dataset, "--exclude", os.path.join(outdir, "ExcludeSnps-merge.missnp"),
                        "--make-bed", "--out", os.path.join(outdir, dataset + "_excludedtemp")])

    #creating a new mergefile with the databases with excluded SNPs
    #(only the *_excludedtemp filesets, so ExcludeSnps.bed or outputs of a previous run are never merged in)
    mergefile = os.path.join(outdir, "mergefile.txt")
    with open(mergefile, "w") as f:
        for bedfile in glob.glob(os.path.join(outdir, "*_excludedtemp.bed")):
            #splitext strips only ".bed"; splitting on the first dot would cut absolute paths containing dots
            f.write(os.path.splitext(bedfile)[0] + "\n")

    #Second merging for all phenos file and QC process
    current = os.path.join(outdir, setname)
    subprocess.run(["plink", "--merge-list", mergefile, "--make-bed", "--out", current])

    #Each filter reads the previous output and appends its own suffix
    for flag, threshold, suffix in qc_steps:
        subprocess.run(["plink", "--bfile", current, flag, threshold, "--make-bed", "--out", current + suffix])
        current += suffix

    for file in glob.glob(os.path.join(outdir, "*")): #Removing intermediary files
        #markers are matched against the file name only, not the folders leading to it
        name = os.path.basename(file)
        if os.path.isfile(file) and not any(marker in name for marker in keep_markers):
            os.remove(file)

Now let's see how many SNPs and samples per file there are.
There are no duplicated IIDs left.

In [77]:
#Printing the lines of each plink log that mention "people" or "variants"
# NOTE: glob is called without recursive=True, so "**" matches a single folder level below 04_Merge
for file in glob.glob(os.path.join(pathgenos, "04_Merge", "**", "*.log") ):
    with open(file) as myfile:
        #splitext strips only ".log", so dots elsewhere in the path do not truncate the name
        print("In file: " + os.path.splitext(file)[0])
        for line in myfile:
            if "people" in line or "variants" in line:
                print(line, end='')
        print("Finished file... \n")

In file: /home/tomas/Documents/Research/PopStruct/DataBases/Genotypes/04_Merge/merge_sparse/merge_sparse_geno_maf_hwe_mind01
126937 variants loaded from .bim file.
2730 people (942 males, 1788 females) loaded from .fam.
126937 variants and 2729 people pass filters and QC.
Finished file... 



Let's split the database in chromosomes to use fineStructure

In [78]:
# Run Split_chr() with 04_Merge/merge_sparse as the working directory.
# NOTE(review): Split_chr is imported from the project-local GenotypeQC module; presumably it
# splits the fileset(s) found in the current directory by chromosome — confirm against its definition.
os.chdir(os.path.join(pathgenos, "04_Merge", "merge_sparse"))
Split_chr()

## Merge with reference samples and LD prune
Now we will merge the merged dataset with the reference samples, run a basic QC, and LD-prune the result.

In [83]:
# Merge every fileset found in the 04_Merge sub-folders with the reference panel,
# run basic QC into 05_Ref and write an LD-pruned copy into 06_Pruned.
merge_dir = os.path.join(pathgenos, "04_Merge")
os.chdir(merge_dir)
reference_geno = glob.glob(os.path.join(pathgenos, "00_Reference", "*.bed")) #Location of reference genomes
if not reference_geno:
    raise FileNotFoundError("No reference .bed file found in " + os.path.join(pathgenos, "00_Reference"))
#splitext strips only ".bed"; splitting on the first dot would cut absolute paths containing dots
ref_file       = os.path.splitext(reference_geno[0])[0]
ref_slice      = ref_file + "_slice"

#Making sure the output folders exist before plink writes into them
for outfolder in ("05_Ref", "06_Pruned"):
    os.makedirs(os.path.join(pathgenos, outfolder), exist_ok = True)

#Entering every directory in merge folder
filenames = os.listdir(merge_dir)

for filename in filenames:
    #If filename is a folder enter in it
    dataset_dir = os.path.join(merge_dir, filename)
    if not os.path.isdir(dataset_dir):
        continue
    os.chdir(dataset_dir)

    #Writing the SNP IDs (second .bim column) to snplist.txt.
    #If several .bim files are present the list is overwritten each time, so the last one wins.
    for bimfile in glob.glob("*.bim"):
        snps = pd.read_csv(bimfile, sep = "\t", header = None)
        #header = False keeps the column label ("1") from being written as if it were a SNP ID
        snps.iloc[:,1].to_csv("snplist.txt", index = False, header = False)

    #For each bed file, extract the listed SNPs from the reference, and then merge them
    for bedfile in glob.glob("*.bed"):
        file_pref = os.path.splitext(bedfile)[0]
        subprocess.run(["plink", "--bfile", ref_file, "--extract", "snplist.txt", "--make-bed", "--out", ref_slice])
        #First merging will create a list of problematic SNPs
        subprocess.run(["plink", "--bfile", file_pref, "--bmerge", ref_slice, "--make-bed", "--out", file_pref + "_ref"])

        #Removing problematic SNPs from datasets and try a merging again.
        #The list is looked up by the exact name the trial merge gives it; when the trial merge
        #reported no problematic SNPs there is no such file and nothing is excluded.
        removesnp    = file_pref + "_ref-merge.missnp"
        exclude_args = ["--exclude", removesnp] if os.path.exists(removesnp) else []
        subprocess.run(["plink", "--bfile", file_pref] + exclude_args + ["--make-bed", "--out", file_pref + "_temp"])
        subprocess.run(["plink", "--bfile", ref_slice] + exclude_args + ["--make-bed", "--out", ref_slice + "_temp"])

        #Merging again
        ref_output     = os.path.join(pathgenos, "05_Ref", file_pref + "_ref") #Output location
        subprocess.run(["plink", "--bfile", file_pref + "_temp", "--bmerge", ref_slice + "_temp", "--make-bed", "--out", ref_output ])

        #Basic QC (each step overwrites the same fileset)
        subprocess.run(["plink", "--bfile", ref_output, "--geno", "0.1", "--make-bed", "--out", ref_output ])
        subprocess.run(["plink", "--bfile", ref_output, "--maf", "0.05", "--make-bed", "--out", ref_output ])
        subprocess.run(["plink", "--bfile", ref_output, "--mind", "0.1", "--make-bed", "--out", ref_output ])

        #LD prune: the first call writes <ld_output>.prune.out, the second one drops those SNPs
        ld_output      = os.path.join(pathgenos, "06_Pruned", file_pref + "_ref" + "_pruned" ) #Output location
        subprocess.run(["plink", "--bfile", ref_output, "--indep-pairwise", "50", "5", "0.5", "--make-bed", "--out", ld_output])
        subprocess.run(["plink", "--bfile", ref_output, "--exclude", ld_output + ".prune.out", "--make-bed", "--out", ld_output])

        #Remove intermediate files (patterns are globbed one after the other, after the previous removals)
        for pattern in ("*_temp*", "*_ref*", os.path.join(pathgenos, "00_Reference", "*_slice*")):
            for leftover in glob.glob(pattern):
                os.remove(leftover)

Let's see how many SNPs remain after merging with the reference samples.

In [84]:
#Printing the lines of each plink log in 05_Ref that mention "people" or "variants"
for file in glob.glob(os.path.join(pathgenos, "05_Ref", "*.log") ):
    with open(file) as myfile:
        #splitext strips only ".log", so dots elsewhere in the path do not truncate the name
        print("In file: " + os.path.splitext(file)[0])
        for line in myfile:
            if "people" in line or "variants" in line:
                print(line, end='')
        print("Finished file... \n")

In file: /home/tomas/Documents/Research/PopStruct/DataBases/Genotypes/05_Ref/merge_sparse_geno_maf_hwe_mind01_ref
96780 variants loaded from .bim file.
6173 people (2793 males, 3380 females) loaded from .fam.
0 people removed due to missing genotype data (--mind).
96780 variants and 6173 people pass filters and QC.
Finished file... 



And after LD prune

In [85]:
#Printing the lines of each plink log in 06_Pruned that mention "people" or "variants"
for file in glob.glob(os.path.join(pathgenos, "06_Pruned", "*.log") ):
    with open(file) as myfile:
        #splitext strips only ".log", so dots elsewhere in the path do not truncate the name
        print("In file: " + os.path.splitext(file)[0])
        for line in myfile:
            if "people" in line or "variants" in line:
                print(line, end='')
        print("Finished file... \n")

In file: /home/tomas/Documents/Research/PopStruct/DataBases/Genotypes/06_Pruned/merge_sparse_geno_maf_hwe_mind01_ref_pruned
96780 variants loaded from .bim file.
6173 people (2793 males, 3380 females) loaded from .fam.
--exclude: 85536 variants remaining.
85536 variants and 6173 people pass filters and QC.
Finished file... 



Split by chromosome

In [86]:
# Run Split_chr() with 05_Ref as the working directory.
# NOTE(review): Split_chr is imported from the project-local GenotypeQC module; presumably it
# splits the fileset(s) found in the current directory by chromosome — confirm against its definition.
os.chdir(os.path.join(pathgenos, "05_Ref"))
Split_chr()

Split the reference from our samples (to generate the CV estimation with admixture first)

In [8]:
#Removing the reference samples from every pruned fileset (output: <fileset>_splitadapt)
#The reference .fam file is passed to --remove as the list of samples to drop
reference_fams = glob.glob(os.path.join(pathgenos, "00_Reference", "*.fam") )
for file in glob.glob(os.path.join(pathgenos, "06_Pruned", "*.bed") ):
    if file.endswith("_splitadapt.bed"):
        continue #output of a previous run of this cell, not an input
    if not reference_fams:
        raise FileNotFoundError("No reference .fam file found in " + os.path.join(pathgenos, "00_Reference"))
    #splitext strips only ".bed"; splitting on the first dot would cut absolute paths containing dots
    filename = os.path.splitext(file)[0]
    refpops  = reference_fams[0]
    subprocess.run(["plink", "--bfile", filename, "--remove", refpops, "--make-bed", "--out", filename + "_splitadapt"])

## PCA from pruned data

In [87]:
#PCA on every "*pruned.bed" fileset of 06_Pruned; results go to <project>/Results/PCA
pathpca = os.path.join(projpath, "Results", "PCA")
os.chdir(os.path.join(pathgenos, "06_Pruned") )
for file in glob.glob("*pruned.bed"):
    #plink prefix: everything before the first dot of the file name
    filename = file.partition(".")[0]
    outname  = os.path.join(pathpca, "".join([filename, "_PCA"]))
    pca_cmd  = ["plink", "--bfile", filename, "--pca", "50", "--out", outname]
    subprocess.run(pca_cmd)