# WS_ch10A.ipynb
# WESmith 11/23/22
## WS created this notebook to follow along chap 10 code from book
## 'Bioinformatics with Python Cookbook' by Tiago Antao
## Each recipe will have its own notebook, suffixed by A, B, etc.
## also see book code in Chapter10/PCA.py

In [None]:
import os
from sklearn.decomposition import PCA
import numpy as np
from genomics.popgen.pca import plot

In [None]:
# Input locations: the files live in the chapter 6 data directory.
data_dir = 'data/ch06_data'  # use data from chapter 6
# metadata file read in step 1 (family id, individual id, ..., population)
meta_data = 'relationships_w_pops_041510.txt'
# ped file read in steps 2 and 3
ped_data = 'hapmap10_auto_noofs_LD_12.ped'

### 1) LOAD METADATA: (FAMILY ID, INDIVIDUAL ID) FOR EACH POPULATION

In [None]:
# Build ind_pop: maps 'family_id/individual_id' to the value in the last
# tab-separated column of the metadata file (used as the population label).
ind_pop = {}
# the context manager closes the file even if a row fails to parse
with open(os.path.join(data_dir, meta_data)) as f:
    f.readline()  # header
    for row in f:
        if not row.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at EOF)
        toks = row.rstrip().split('\t')
        fam_id = toks[0]
        ind_id = toks[1]
        pop = toks[-1]
        ind_pop['/'.join([fam_id, ind_id])] = pop

### 2) GET THE ORDER OF INDIVIDUALS WITH THE NUMBER OF SNPS TO PROCESS

In [None]:
# Scan the ped file once to count individuals (one per line), record their
# 'family_id/individual_id' keys in file order, and derive the SNP count.
ninds = 0
ind_order = []
# the context manager closes the file even if a line fails to parse
with open(os.path.join(data_dir, ped_data)) as f:
    for line in f:
        ninds += 1
        # only the first two fields are needed here, so split just the first
        # 100 characters of the line -- for speed
        toks = line[:100].replace(' ', '\t').split('\t')
        fam_id = toks[0]
        ind_id = toks[1]
        # same key format as the ind_pop keys built from the metadata file
        ind_order.append('/'.join([fam_id, ind_id]))
if ninds == 0:
    # without this guard `line` would be undefined (or stale from an earlier
    # cell run) and nsnps would be meaningless
    raise ValueError('ped file %r contains no individuals' % ped_data)
# the last line read gives the field count: 6 leading fields, then two
# fields per SNP
nsnps = (len(line.replace(' ', '\t').split('\t')) - 6) // 2

In [None]:
# display: number of individuals, number of SNPs, SNPs per individual
(ninds, nsnps, nsnps / ninds)

### 3) CREATE THE ARRAY FOR PCA

In [None]:
# this took about 50 sec to run (timing noted for the original version)
# Genotype matrix for PCA: one row per individual, one column per SNP.
pca_array = np.empty((ninds, nsnps), dtype=int)
print(pca_array.shape)
# the context manager closes the file even if a value fails to parse
with open(os.path.join(data_dir, ped_data)) as f:
    for ind, line in enumerate(f):
        # drop the 6 leading fields; the remaining fields are allele values,
        # two consecutive fields per SNP
        snps = line.replace(' ', '\t').split('\t')[6:]
        for pos in range(len(snps) // 2):
            a1 = int(snps[2 * pos])      # first allele of the pair
            # second allele of the pair; the original read index 2 * pos
            # again, which made a2 identical to a1 and ignored this field
            a2 = int(snps[2 * pos + 1])
            # assuming alleles are coded 1/2 (as the file name and the WS
            # notes suggest -- TODO confirm), this yields 0, 1 or 2
            my_code = a1 + a2 - 2
            pca_array[ind, pos] = my_code

In [None]:
# display the first 5 rows and first 20 columns of pca_array
pca_array[:5, :20]

### 4) COMPUTE THE PCA WITH UP TO 8 COMPONENTS

In [None]:
# this took about 7 sec
# Fit an 8-component PCA on the genotype matrix, then project the same
# matrix to get the 8D coordinates for each sample (fit() returns the
# fitted estimator, so the two calls can be chained).
my_pca = PCA(n_components=8)
trans = my_pca.fit(pca_array).transform(pca_array)

### 5) PLOT THE PCA

In [None]:
# display the PCA coordinates of the first 5 rows of trans
trans[:5]

In [None]:
# Key each row of trans by the 'family_id/individual_id' at the same
# position in ind_order (both follow the ped file's line order).
sc_ind_comp = dict(zip(ind_order, trans))
# WS notes: plot.render_pca() just plots 2 PCA components;
# plot.render_pca_eight() only plots 8 PCA components
plot.render_pca_eight(sc_ind_comp, cluster=ind_pop)
plot.plt.show()

### WS - SEE HOW THE SNPS ARE REPRESENTED IN THE PED FILE

In [None]:
# get some example lines: the first `num` lines of the ped file
num = 5
# the context manager closes the file even if the read below raises
with open(os.path.join(data_dir, ped_data)) as f:
    # next(f) raises StopIteration if the file has fewer than `num` lines
    dd = [next(f) for _ in range(num)]

In [None]:
# Split each sample line into fields once (spaces treated like tabs), then
# keep two views per line:
#   snps  - every field after the 6 leading ones
#   snps2 - the first 20 fields, leading ones included
sample_fields = [row.replace(' ', '\t').split('\t') for row in dd]
snps = [fields[6:] for fields in sample_fields]
snps2 = [fields[:20] for fields in sample_fields]

In [None]:
# display the first 7 fields after the 6 leading ones, first sample line
snps[0][:7]

In [None]:
# display the first 13 raw fields (leading ones included), first sample line
snps2[0][:13]

In [None]:
# WS check array: for each sample line take every other field (even indices
# only), mapping value a to 2 * (a - 1), i.e. 1 -> 0 and 2 -> 2.
# Note it only has nsnps // 2 columns.
n_cols = nsnps // 2
dd_array = np.empty((num, n_cols), dtype=int)
for row, fields in enumerate(snps):
    for col in range(n_cols):
        dd_array[row, col] = 2 * (int(fields[2 * col]) - 1)

In [None]:
# display the first 20 columns of pca_array, row 0 (compare with dd_array)
pca_array[0, :20]

In [None]:
# display the first 20 columns of dd_array, row 0 (compare with pca_array)
dd_array[0, :20]

In [None]:
# display the first 20 columns of pca_array, row 2 (compare with dd_array)
pca_array[2, :20]

In [None]:
# display the first 20 columns of dd_array, row 2 (compare with pca_array)
dd_array[2, :20]

In [None]:
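# WS bottom line: the dd_array check above goes from the ped file to an array
# as follows: for each line in the ped file, take every other genotype value
# starting at index 0, change 1 to 0 and keep 2 as 2; that's it. Wherever
# dd_array matches pca_array, step 3 produced the same values.
# Open question: why index only the even positions, and why remap 1 to 0
# while keeping 2 as 2? The genotype fields appear to come in pairs per SNP
# (nsnps is computed as (number of fields - 6) // 2), so even-only indexing
# reads just the first value of each pair, whereas a1 + a2 - 2 over a full
# pair would give 0, 1 or 2 -- TODO confirm against the ped format docs.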