# WS_ch06A.ipynb
# WESmith 11/18/22
## WS created this notebook to follow along chap 6 code from book
## 'Bioinformatics with Python Cookbook' by Tiago Antao
## Each recipe will have its own notebook, suffixed by A, B, etc.
## also see book code in Chapter06/Data_Formats.py

# MANAGING DATASETS WITH PLINK

In [None]:
import os
from collections import defaultdict

In [None]:
# !wget https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3/plink_format/hapmap3_r3_b36_fwd.consensus.qc.poly.map.gz
# !wget https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3/plink_format/hapmap3_r3_b36_fwd.consensus.qc.poly.ped.gz
# !wget https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3/relationships_w_pops_041510.txt

# !gzip -d hapmap3_r3_b36_fwd.consensus.qc.poly.map.gz
# !gzip -d hapmap3_r3_b36_fwd.consensus.qc.poly.ped.gz

In [None]:
# location of the chapter 6 data files, and the name of the HapMap
# metadata file (downloaded via the wget commands above)
data_dir = 'data/ch06_data'
meta_data = 'relationships_w_pops_041510.txt'

### 1) GET THE METADATA

In [None]:
# get family and individual IDs, and offspring data
#
# pop_ind:   maps each value of the last column (stored as `pop`) to the list
#            of (family ID, individual ID) pairs seen with that value
# offspring: (family ID, individual ID) pairs for every row where the mom or
#            dad column is not '0' ('0' presumably marks an unrecorded parent)
pop_ind = defaultdict(list)
offspring = []
# the context manager closes the file even if a malformed row raises
with open(os.path.join(data_dir, meta_data)) as f:
    f.readline()  # header
    for line in f:
        toks = line.rstrip().split('\t')
        fam_id = toks[0]
        ind_id = toks[1]  # WS individual ID
        mom = toks[2]
        dad = toks[3]
        if mom != '0' or dad != '0':
            offspring.append((fam_id, ind_id))
        pop = toks[-1]
        pop_ind[pop].append((fam_id, ind_id))

In [None]:
# display the keys of pop_ind, i.e. the distinct last-column values that were
# read from the metadata file
pop_ind.keys()

### 2) SUBSAMPLE THE DATA

In [None]:
# Build the plink2 command that thins the full HapMap ped/map data set.
# Settings below keep 1% of the data (WS); for the 10% subsample use
#   out  = 'hapmap10'
#   thin = '0.1'
base_file = 'hapmap3_r3_b36_fwd.consensus.qc.poly'
out = 'hapmap1'
thin = '0.01'
cmd = ''.join([
    './plink2 --pedmap {}'.format(os.path.join(data_dir, base_file)),
    ' --out {}'.format(os.path.join(data_dir, out)),
    ' --thin {} --geno 0.1 --export ped'.format(thin),
])
cmd

In [None]:
# 1% subsample and 10% subsample each took about a minute
# don't rerun this
#os.system(cmd)

### 3), 4), 5) GENERATE SUBSETS WITH JUST AUTOSOMES

In [None]:
def get_non_auto_SNPs(map_file, exclude_file):
    """Write the IDs of SNPs on non-numeric chromosomes to *exclude_file*.

    Each line of *map_file* is split on tabs. When the first column
    (chromosome) does not parse as an integer, the second column (the SNP
    ID) is written to *exclude_file*, one ID per line. Rows whose first
    column is an integer are skipped.

    NOTE(review): any integer chromosome code counts as autosomal here, so
    numerically coded sex chromosomes would not be excluded -- confirm the
    input uses letter codes for them.

    Args:
        map_file: path of the tab-separated map file to read.
        exclude_file: path of the exclusion list to create (overwritten).

    Raises:
        IndexError: if a line with a non-integer first column has no
            second column (e.g. a blank line).
    """
    # both handles are closed on exit, including when a row raises
    with open(map_file) as f, open(exclude_file, 'w') as w:
        for line in f:
            toks = line.rstrip().split('\t')
            try:
                int(toks[0])  # only the parse outcome matters
            except ValueError:
                w.write('%s\n' % toks[1])

In [None]:
# write an exclusion list next to each subsampled map file
# (hapmap1.map -> exclude1.txt, hapmap10.map -> exclude10.txt)
for k in ('1', '10'):
    get_non_auto_SNPs(
        os.path.join(data_dir, 'hapmap{}.map'.format(k)),
        os.path.join(data_dir, 'exclude{}.txt'.format(k)),
    )

### 6) GENERATE SUBSETS WITHOUT OFFSPRING

### 7) LD PRUNING

In [None]:
# LD pruning, step 1 (10% data only, as per the book):
# produce the list of markers to keep once the set is LD-pruned.
# --indep-pairwise 50 10 0.1 = sliding window of 50 SNPs, advanced 10 SNPs
# at a time, with a cut value (r^2 threshold) of 0.1
cmd = (
    './plink2 --pedmap ./data/ch06_data/hapmap10_auto_noofs --indep-pairwise 50 10 0.1'
    ' --out ./data/ch06_data/keep --export ped'
)
#os.system(cmd)

In [None]:
# LD pruning, step 2: extract the SNPs listed in keep.prune.in
# NOTE: the book included '--recode', which caused an error, so it was removed
# (--recode isn't in Data_Formats.py either)
cmd = (
    './plink2 --pedmap ./data/ch06_data/hapmap10_auto_noofs --extract ./data/ch06_data/keep.prune.in'
    ' --out ./data/ch06_data/hapmap10_auto_noofs_LD --export ped'
)
#os.system(cmd)

### 8) RECODE INTO DIFFERENT FORMATS

In [None]:
# re-export the LD-pruned set with alleles labelled 1 and 2 instead of AGCT
# NOTE: this follows Data_Formats.py, not the book
cmd = (
    './plink2 --pedmap ./data/ch06_data/hapmap10_auto_noofs_LD'
    ' --out ./data/ch06_data/hapmap10_auto_noofs_LD_12 --export ped 12'
)
#cmd
#os.system(cmd)

In [None]:
# recode the LD-pruned set in binary format (--make-bed)
# The input prefix must point into ./data/ch06_data, which is where the
# hapmap10_auto_noofs_LD files are written and read by the other commands;
# the original path omitted the ch06_data directory.
cmd = './plink2 --make-bed --pedmap ./data/ch06_data/hapmap10_auto_noofs_LD' +\
      ' --out ./data/ch06_data/hapmap10_auto_noofs_LD'
#cmd
#os.system(cmd)

### 9) EXTRACT A SINGLE CHROMOSOME

In [None]:
# pull chromosome 2 out of the hapmap10_auto_noofs set and run plink2;
# the value shown for the cell is the exit status returned by os.system
cmd = (
    './plink2 --pedmap ./data/ch06_data/hapmap10_auto_noofs --chr 2'
    ' --out ./data/ch06_data/hapmap10_auto_noofs_2 --export ped'
)
#cmd
os.system(cmd)