# WS_ch06A.ipynb
# WESmith 11/18/22
## WS created this notebook to follow along chap 6 code from book
## 'Bioinformatics with Python Cookbook' by Tiago Antao
## Each recipe will have its own notebook, suffixed by A, B, etc.
## also see book code in Chapter06/Data_Formats.py

# MANAGING DATASETS WITH PLINK

# NOTE: This recipe generates 3.6G of data in the 'generated' folder.
# The recipe only takes a few minutes to run to regenerate this data, so the generated data is not kept.
# Keep the large source file hapmap3_r3_b36_fwd.consensus.qc.poly.ped.gz compressed (it is 1.2GB compressed and 8GB uncompressed).
# It must be uncompressed before it is read in this recipe. The code below decompresses it, and recompresses it after it has been used.

In [None]:
import os
from collections import defaultdict

In [None]:
# three files to retrieve

# this is just 39k
#!wget https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3/relationships_w_pops_041510.txt

# this uncompressed file is just 35MB
#!wget https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3/plink_format/hapmap3_r3_b36_fwd.consensus.qc.poly.map.gz
#!gzip -d hapmap3_r3_b36_fwd.consensus.qc.poly.map.gz

# this .gz file is 1.2GB and it uncompresses to 8GB! 
# it needs to be uncompressed to read, then just keep the compressed file
#!wget https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3/plink_format/hapmap3_r3_b36_fwd.consensus.qc.poly.ped.gz
#!gzip -d hapmap3_r3_b36_fwd.consensus.qc.poly.ped.gz

In [None]:
# Input file names (the big files live under Downloads so they are not backed up).
# base_file is the shared prefix of the .map / .ped(.gz) genotype files.
base_file = 'hapmap3_r3_b36_fwd.consensus.qc.poly'
meta_data = 'relationships_w_pops_041510.txt'

# Directory holding the downloaded inputs, and the sub-directory for generated output.
remote_data_dir = '/home/smithw/Downloads/bioinformatics/ch06_data'
gen_dir = os.path.join(remote_data_dir, 'generated')

### 1) GET THE METADATA

In [None]:
# get family and individual IDs, and offspring data
#   pop_ind:   last column of each row (population label) -> list of (family ID, individual ID)
#   offspring: (family ID, individual ID) of every row whose mom or dad column is not '0'
pop_ind = defaultdict(list)
offspring = []
# 'with' guarantees the file is closed even if a row fails to parse
with open(os.path.join(remote_data_dir, meta_data)) as f:
    f.readline()  # header
    for line in f:
        if not line.strip():
            continue  # skip blank lines instead of failing on the missing columns
        toks = line.rstrip().split('\t')
        fam_id = toks[0]
        ind_id = toks[1]  # WS individual ID
        mom = toks[2]
        dad = toks[3]
        # a row with at least one recorded parent is treated as offspring
        if mom != '0' or dad != '0':
            offspring.append((fam_id, ind_id))
        pop = toks[-1]
        pop_ind[pop].append((fam_id, ind_id))

In [None]:
# Display the keys of pop_ind (the last-column values collected from the
# metadata file) as the cell output.
pop_ind.keys()

### 2) SUBSAMPLE THE DATA

In [None]:
# decompress the big file (-d flag): takes about 80 seconds
# comp_file is the full path of the compressed .ped.gz genotype file.
comp_file = os.path.join(remote_data_dir, base_file + '.ped.gz')
# IPython shell escape: {comp_file} is expanded from the Python variable above.
!gzip -d {comp_file}

In [None]:
# WS 1% and 10% cases: output file prefix -> value passed to plink's --thin
# NOTE(review): per the PLINK docs, --thin keeps each variant with the given
# probability and --geno 0.1 drops variants with more than 10% missing calls;
# confirm against the plink2 build in use.
val = {'hapmap1': '0.01', 'hapmap10': '0.1'}

# plink writes its outputs under gen_dir; create it in case it was removed
# together with the (discardable) generated data.
os.makedirs(gen_dir, exist_ok=True)

for out_prefix, thin in val.items():
    cmd = '../plink2 --pedmap {} --out {} --thin {} --geno 0.1 --export ped'.format(
        os.path.join(remote_data_dir, base_file),
        os.path.join(gen_dir, out_prefix),
        thin)
    print(cmd); print()
    status = os.system(cmd)  # each case takes about a minute
    if status != 0:
        # report the failure rather than silently moving on to the next case
        print('WARNING: command returned non-zero status {}'.format(status))

In [None]:
# recompress the big file: this takes around 10 minutes
# the large file is automatically removed
# uncomp_file is the full path of the uncompressed .ped genotype file.
uncomp_file = os.path.join(remote_data_dir, base_file + '.ped')
# IPython shell escape: {uncomp_file} is expanded from the Python variable above.
!gzip {uncomp_file}

### 3), 4), 5) GENERATE SUBSETS WITH JUST AUTOSOMES

In [None]:
def get_non_auto_SNPs(map_file, exclude_file):
    """Write the IDs of SNPs whose chromosome field is not an integer.

    Each line of *map_file* is split on tabs; the first field is taken as the
    chromosome and the second as the SNP ID. When the chromosome cannot be
    parsed as an integer (e.g. 'X'), the SNP ID is written to *exclude_file*,
    one ID per line. Lines with fewer than two fields (such as blank lines)
    are skipped.

    Args:
        map_file: path of the tab-separated map file to read.
        exclude_file: path of the text file to create or overwrite.
    """
    # one 'with' closes both handles, including the input file the original leaked
    with open(map_file) as f, open(exclude_file, 'w') as w:
        for line in f:
            toks = line.rstrip().split('\t')
            if len(toks) < 2:
                continue  # no chromosome/ID pair on this line
            try:
                int(toks[0])
            except ValueError:
                # non-numeric chromosome label: record the SNP ID for exclusion
                w.write('%s\n' % toks[1])

In [None]:
# For the 1% and 10% subsets, run get_non_auto_SNPs on the subset's .map file
# and write the resulting SNP IDs to the matching exclude<N>.txt file.
for pct in ('1', '10'):
    map_path = os.path.join(gen_dir, 'hapmap' + pct + '.map')
    exclude_path = os.path.join(gen_dir, 'exclude' + pct + '.txt')
    get_non_auto_SNPs(map_path, exclude_path)

In [None]:
# this just takes seconds
# For each subset, drop the SNPs listed in exclude<N>.txt and export the
# result under the hapmap<N>_auto prefix.
for pct in ('1', '10'):
    in_prefix = os.path.join(gen_dir, f'hapmap{pct}')
    cmd = (f'../plink2 --pedmap {in_prefix}'
           f' --out {gen_dir}/hapmap{pct}_auto'
           f' --exclude {gen_dir}/exclude{pct}.txt --export ped')
    os.system(cmd)
    print(cmd)

### 6) GENERATE SUBSETS WITHOUT OFFSPRING

In [None]:
# this takes seconds
# For each autosome-only subset, run plink with --filter-founders and export
# the result under the hapmap<N>_auto_noofs prefix.
for pct in ('1', '10'):
    src_prefix = os.path.join(gen_dir, 'hapmap%s_auto' % pct)
    dst_prefix = gen_dir + '/hapmap%s_auto_noofs' % pct
    cmd = ' '.join([
        '../plink2',
        '--pedmap', src_prefix,
        '--filter-founders',
        '--out', dst_prefix,
        '--export', 'ped',
    ])
    os.system(cmd)
    print(cmd)

### 7) LD PRUNING

In [None]:
# just doing for 10% data, as per book;
# first step: generate a list of markers to be kept if the set is LD-pruned;
# this uses a sliding window of 50 SNPs, advancing by 10 SNPs at a time with a cut
# value of 0.1
prune_src = gen_dir + '/hapmap10_auto_noofs'
prune_out = gen_dir + '/keep'
cmd = '../plink2 --pedmap {} --indep-pairwise 50 10 0.1 --out {} --export ped'.format(
    prune_src, prune_out)
os.system(cmd)
print(cmd)

In [None]:
# now extract SNPs to be kept
# NOTE: the book included '--recode', which caused an error; removed it
# (--recode isn't in Data_Formats.py either)
cmd = (f'../plink2 --pedmap {gen_dir}/hapmap10_auto_noofs'
       f' --extract {gen_dir}/keep.prune.in'
       f' --out {gen_dir}/hapmap10_auto_noofs_LD --export ped')
os.system(cmd)
print(cmd)

### 8) RECODE INTO DIFFERENT FORMATS

In [None]:
# recode AGCT to another code that labels alleles with 1 and 2
# NOTE: don't follow book here, follow Data_Formats.py
ld_prefix = gen_dir + '/hapmap10_auto_noofs_LD'
cmd = ' '.join([
    '../plink2',
    '--pedmap', ld_prefix,
    '--out', ld_prefix + '_12',
    '--export', 'ped', '12',
])
print(cmd)
# kept as the last expression so the exit status shows as the cell output
os.system(cmd)

In [None]:
# recode file in binary format
# input and output share the same prefix
bed_prefix = gen_dir + '/hapmap10_auto_noofs_LD'
cmd = '../plink2 --make-bed --pedmap %s --out %s' % (bed_prefix, bed_prefix)
print(cmd)
# kept as the last expression so the exit status shows as the cell output
os.system(cmd)

### 9) EXTRACT A SINGLE CHROMOSOME

In [None]:
# Restrict the 10% founders-only autosome set to chromosome 2 (--chr 2) and
# export it under the hapmap10_auto_noofs_2 prefix.
cmd = (f'../plink2 --pedmap {gen_dir}/hapmap10_auto_noofs --chr 2'
       f' --out {gen_dir}/hapmap10_auto_noofs_2 --export ped')
print(cmd)
# kept as the last expression so the exit status shows as the cell output
os.system(cmd)