In [5]:
import numpy as np
from tqdm import tqdm as tqdm
import pandas as pd
import bz2
import subprocess
import pickle as pkl
from itertools import chain
from glob import glob

In [16]:
working_path = '/cluster/raid/home/f80878961/beestrong/'
count_path = '{}data/allele_freqs/sync_files/'.format(working_path)
queen_geno_path = '{}data/queen_genotypes/All/'.format(working_path)

tlf_path = '{}tlf/'.format(working_path)
pheno_df = pd.read_csv('{}BeeStrong_metadata.csv'.format(tlf_path))

script_path = "/cluster/raid/home/f80878961/scripts/tmp_scripts/"

In [4]:
# bs_ids = pheno_df['Colony'].to_list()
# snp_nr = 7023976 # will need sanity check here

## Settings

In [32]:
snp_nr = 7023976 # wc -l BS17_0071.count -1

## All
filtered_count_path = '{}data/allele_freqs/All/'.format(working_path)

bs_ids = sorted([x.split('/')[-1].split('.')[0] for x in glob('{}*.count.bz2'.format(count_path))])

chunk_size = 101 # 15 chunks
batch_file = '{}bsid_batch.txt'.format(filtered_count_path)

# bs_ids = pheno_df['Colony'].to_list()
print(len(set(bs_ids).intersection(pheno_df['Colony'].to_list())))
print(len(bs_ids))

# for SNP filtering
min_prev = 0.1 # prevalence threshold for common alleles (if >2 common allele, then real tri-allelic)
min_depth = 10 # both mean and median depth 
max_depth = 50

1442
1513


## Merge allele frequencies .sync files

### Write per-allele count matrices (SNP x colony for A, T, C, G)

In [23]:
def write_merge_sync_script(script_fn, array_str):
    runstr="""#!/bin/bash -l
#SBATCH --array=ARRAY_STRING
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --mem=20g
#SBATCH --time=4:00:00
#SBATCH --job-name=merge_sync
#SBATCH --output=%x_%A_%a.out
#SBATCH --error=%x_%A_%a.err

modulesld
ebld
module use /software/anaconda3/envs/eb/easybuild/modules/all
conda activate planb

# file listing batches of BeeStrong ids (bs_id) for parallel computing
batch_file=$1
snp_nr=$2
count_path=$3
out_path=$4

batch_id=$SLURM_ARRAY_TASK_ID

# getting bs_ids
bs_ids=$(sed -n ${batch_id}'{p;q}' ${batch_file})

python ~/scripts/plan-b-omics/bin/merge_sync.py --batch_id ${batch_id} --snp_nr ${snp_nr} --bs_ids ${bs_ids} --count_path ${count_path} --out_path ${out_path}

echo DONE""".replace("ARRAY_STRING", array_str)
    with open(script_fn, 'w') as outf:
        outf.write(runstr)

In [25]:
# create batch file for first time
with open(batch_file, "w") as outf:
    for i in range(0, len(bs_ids), chunk_size):
        chunk = bs_ids[i:i+chunk_size]
        outf.write(",".join(chunk) + "\n")

In [28]:
array_str = '1-15'
script_fn = '{}merge_sync.run'.format(script_path)
write_merge_sync_script(script_fn, array_str)

In [29]:
%%bash -s "$script_fn" "$batch_file" "$snp_nr" "$count_path" "$filtered_count_path"
cd $5
sbatch $1 $2 $3 $4 $5

Submitted batch job 1388883


### Merge into 3-D matrix

In [33]:
with open(batch_file, "r") as inf:
    batch_bs_ids = [x.rstrip().split(',') for x in inf.readlines()]
assert sum([x == bs_ids[i] for i, x in enumerate(list(chain(*batch_bs_ids)))]) == len(bs_ids)

In [34]:
%%time
allele2jj = {'A': 0, 'T': 1, 'C':2, 'G':3}
count_mat = np.zeros((snp_nr, len(bs_ids), len(allele2jj)), dtype=np.uint16)

for allele, jj in allele2jj.items():
    print(allele, jj)
    
    for j, b_bs_ids in enumerate(batch_bs_ids):
        batch_id = j + 1
        batch_size = len(b_bs_ids)
        with open('{}count_{}_{}.txt'.format(filtered_count_path, allele, batch_id), 'r') as inf:
            i = 0
            for l in tqdm(inf):                    
                count_mat[i, j * chunk_size : j * chunk_size + batch_size, jj] = np.array(l.split()[2:], dtype=np.int16)
                i += 1

A 0


7023976it [01:27, 79911.85it/s]
7023976it [01:12, 96871.43it/s]
7023976it [01:13, 95607.20it/s]
7023976it [01:12, 96395.80it/s]
7023976it [01:13, 95634.25it/s]
7023976it [01:13, 95841.95it/s]
7023976it [01:12, 96750.18it/s]
7023976it [01:13, 96216.97it/s]
7023976it [01:13, 96170.55it/s]
7023976it [01:13, 94955.64it/s]
7023976it [01:10, 99338.00it/s] 
7023976it [01:10, 100071.38it/s]
7023976it [01:10, 99098.12it/s] 
7023976it [01:20, 86892.53it/s]
7023976it [01:19, 88464.59it/s]


T 1


7023976it [01:20, 87638.04it/s]
7023976it [01:19, 88887.25it/s]
7023976it [01:20, 87229.69it/s]
7023976it [01:21, 86617.28it/s]
7023976it [01:21, 86346.12it/s]
7023976it [01:20, 87418.80it/s]
7023976it [01:20, 87219.67it/s]
7023976it [01:22, 85541.27it/s]
7023976it [01:22, 85528.61it/s]
7023976it [01:22, 85412.89it/s]
7023976it [01:21, 86602.88it/s]
7023976it [01:21, 86004.87it/s]
7023976it [01:20, 86980.82it/s]
7023976it [01:21, 86634.61it/s]
7023976it [01:19, 87840.42it/s]


C 2


7023976it [01:22, 85362.98it/s]
7023976it [01:21, 86695.99it/s]
7023976it [01:21, 86272.75it/s]
7023976it [01:21, 85767.50it/s]
7023976it [01:23, 83933.48it/s]
7023976it [01:21, 85708.65it/s]
7023976it [01:22, 85253.64it/s]
7023976it [01:22, 84884.01it/s]
7023976it [01:23, 84503.23it/s]
7023976it [01:23, 84575.33it/s]
7023976it [01:23, 84403.22it/s]
7023976it [01:22, 85003.84it/s]
7023976it [01:22, 84759.25it/s]
7023976it [01:23, 84321.06it/s]
7023976it [01:21, 86137.15it/s]


G 3


7023976it [01:21, 86597.14it/s]
7023976it [01:21, 86523.52it/s]
7023976it [01:23, 84421.79it/s]
7023976it [01:21, 85817.88it/s]
7023976it [01:22, 85413.62it/s]
7023976it [01:22, 85604.58it/s]
7023976it [01:22, 85174.04it/s]
7023976it [01:23, 84585.02it/s]
7023976it [01:24, 83101.66it/s]
7023976it [01:22, 84669.79it/s]
7023976it [01:22, 84762.85it/s]
7023976it [01:22, 85111.30it/s]
7023976it [01:21, 86124.85it/s]
7023976it [01:22, 85098.11it/s]
7023976it [01:20, 86877.37it/s]

CPU times: user 1h 9min 1s, sys: 10min 24s, total: 1h 19min 25s
Wall time: 1h 20min 4s





In [35]:
%%time
with open('{}count_mat.pkl'.format(filtered_count_path), 'wb') as outf:
    pkl.dump(count_mat, outf, protocol=pkl.HIGHEST_PROTOCOL)

CPU times: user 909 ms, sys: 2min 15s, total: 2min 16s
Wall time: 2min 33s


## Filter SNPs and recode alleles

filtering 
1. monoallelic filtering : 3/4 tables empty
2. mitochondrial SNP: contig NC_001566.1
3. tri-allelic SNP filtering: >2 allele present in min. 10% colonies, else delete variants from third and fourth alleles
4. min-max depth (10/50, on mean AND median) (after tri-allelic variants deletions, also not including N and deletions)

In [36]:
%%time
with open('{}count_mat.pkl'.format(filtered_count_path), 'rb') as inf:
    count_mat = pkl.load(inf)
print(count_mat.nbytes/1000000000)

with open(batch_file, "r") as inf:
    batch_bs_ids = [x.rstrip().split(',') for x in inf.readlines()]
assert sum([x == bs_ids[i] for i, x in enumerate(list(chain(*batch_bs_ids)))]) == len(bs_ids)

85.018205504
CPU times: user 72.6 ms, sys: 2min 34s, total: 2min 34s
Wall time: 3min 12s


In [37]:
batch_i = 0
batch_ii = len(bs_ids)

In [38]:
%%time
# counts the number of A, T, C, G allele per SNP # should take 7min
allele_prev = np.sum(count_mat[:, batch_i:batch_ii, :] > 0, axis=1)
allele_prev

CPU times: user 2min 54s, sys: 13.8 s, total: 3min 7s
Wall time: 3min 8s


array([[   1, 1513,    3,    9],
       [   4, 1513,   27,    2],
       [1513,   11,    2,   22],
       ...,
       [1437,  849,  118,  337],
       [ 397, 1364, 1058,  218],
       [ 335, 1504,  310,  170]])

In [39]:
%%time
# if count 3/4 alleles == 0, then monoallelic
mono_mask = np.sum(allele_prev > 0, axis=1) == 1
print(np.sum(mono_mask))

# tri-allelic stringent
tri_stringent_mask = np.sum(allele_prev > 0, axis=1) > 2
print(np.sum(tri_stringent_mask))

# tri-allelic relaxed (filter SNP only when >2 common alleles, i.e., an allele present in >= 10% of colonies)
tri_mask = np.sum(allele_prev >= ((batch_ii - batch_i) * min_prev), axis=1) > 2
print(np.sum(tri_mask))

3261
6963287
17207
CPU times: user 408 ms, sys: 63.9 ms, total: 472 ms
Wall time: 476 ms


In [10]:
# # SNP with ambiguous alternative alleles
# i = 134
# i = 157
# i = 159

In [11]:
# iterate through stringent tri-allelic SNPs: filter if ambigous else recode (delete 3rd and fourth)
ambi_mask = np.full(snp_nr, False)
for i in tqdm(np.argwhere(tri_stringent_mask).flatten()):

    # get number of alleles and sort by prevalence 
    atcg_prev = allele_prev[i, :]
    alleles_sorted_by_prevalence = np.argsort(atcg_prev)

    # if 2nd and 3rd alleles are equally common --> ambiguous --> filter
    if atcg_prev[alleles_sorted_by_prevalence[1]] == atcg_prev[alleles_sorted_by_prevalence[2]]:
        ambi_mask[i] = True

    # else delete 3rd and 4rth alleles
    else:
        count_mat[i, batch_i:batch_ii, alleles_sorted_by_prevalence[:2]] = 0

100%|██████████| 6955445/6955445 [01:08<00:00, 101246.59it/s]


In [12]:
np.sum(ambi_mask)

126577

In [13]:
%%time
# depth filtering --> if to, long I could consider iterating ...to have an overview of the time it takes # 8 min
depth  = np.sum(count_mat[:, batch_i:batch_ii, :], axis = 2)

CPU times: user 2min 41s, sys: 4min 21s, total: 7min 2s
Wall time: 7min 5s


In [14]:
%%time
# kernel died while computing these...
mean_depth = np.mean(depth, axis = 1)

CPU times: user 24.3 s, sys: 286 ms, total: 24.6 s
Wall time: 24.6 s


In [15]:
# median is the issue...

In [16]:
median_depth = np.full(snp_nr, -1)
for i in tqdm(range(snp_nr)):
    median_depth[i] =  np.median(depth[i])

100%|██████████| 7023976/7023976 [02:54<00:00, 40315.44it/s]


In [17]:
%%time
depth_mask = (mean_depth < min_depth) | (mean_depth > max_depth) | (median_depth < min_depth) | (median_depth > max_depth) 

CPU times: user 24.6 ms, sys: 9.71 ms, total: 34.3 ms
Wall time: 36.7 ms


In [18]:
np.sum(depth_mask)

120911

In [19]:
%%time
chr_ids = [''] * snp_nr
snp_pos = [''] * snp_nr
mito_mask = np.full(snp_nr, False)
with open('{}count_A_1.txt'.format(out_path), 'r') as inf:
    i = 0
    for l in tqdm(inf):
        ch = l.split()[0]
        pos = l.split()[1]
        chr_ids[i] = ch
        snp_pos[i] = pos
        if ch == 'NC_001566.1':
            mito_mask[i] = True
        i += 1

7023976it [00:25, 279103.26it/s]

CPU times: user 23.4 s, sys: 1.64 s, total: 25.1 s
Wall time: 25.2 s





In [20]:
np.sum(mito_mask)

287

In [21]:
# SNP filtering
mask = mono_mask + tri_mask + depth_mask + mito_mask + ambi_mask

In [22]:
print(np.sum(mask))

259604


In [23]:
print(snp_nr - np.sum(mask))

6764372


In [24]:
%%time
# getting the ref allele counts 5-8min...
count_ref  = np.max(count_mat[:, batch_i:batch_ii, :], axis = 2)

CPU times: user 4min 34s, sys: 1min 3s, total: 5min 38s
Wall time: 5min 39s


In [25]:
%%time
# creating reference allele count dataframe
count_ref_df = pd.DataFrame(count_ref[~mask])
count_ref_df.insert(0, 'SNP_position', np.array(snp_pos)[~mask])
count_ref_df.insert(0, 'Chromosome', np.array(chr_ids)[~mask])
count_ref_df.columns = ['Chromosome', 'SNP_position'] + bs_ids[batch_i: batch_ii]

CPU times: user 4.23 s, sys: 25.7 s, total: 29.9 s
Wall time: 30.1 s


In [26]:
%%time
# 22min
count_ref_df.to_csv('{}count_ref.txt'.format(out_path), header=False, index=False, sep='\t')

CPU times: user 21min 20s, sys: 33.5 s, total: 21min 53s
Wall time: 22min 11s


In [27]:
%%time
# creating depth dataframe # 30min...
depth_df = pd.DataFrame(depth[~mask])
depth_df.insert(0, 'SNP_position', np.array(snp_pos)[~mask])
depth_df.insert(0, 'Chromosome',  np.array(chr_ids)[~mask])
depth_df.columns = ['Chromosome', 'SNP_position'] + bs_ids[batch_i: batch_ii]

CPU times: user 11.3 s, sys: 31min 29s, total: 31min 41s
Wall time: 31min 49s


In [28]:
%%time
depth_df.to_csv('{}depth.txt'.format(out_path), header=False, index=False, sep='\t')

CPU times: user 21min 19s, sys: 48.2 s, total: 22min 7s
Wall time: 22min 41s


## Queen genotype reconstruction

In [14]:
out_path

'/cluster/raid/home/f80878961/beestrong/data/queen_genotypes/'

In [40]:
count_ref_df = pd.read_csv('{}count_ref.txt'.format(out_path), header=None, sep='\t', nrows=10)
depth_df = pd.read_csv('{}depth.txt'.format(out_path), header=None, sep='\t', nrows=10)

In [41]:
header = ['Chromosome', 'SNP_position'] + bs_ids
count_ref_df.columns = header
depth_df.columns = header

In [48]:
header_df = pd.DataFrame(columns=header)
header_df.to_csv('{}header.txt'.format(out_path), header=True, index=False, sep='\t')

In [47]:
header_df

Unnamed: 0,Chromosome,SNP_position,BS16_0001,BS16_0002,BS16_0003,BS16_0004,BS16_0005,BS16_0006,BS16_0007,BS16_0008,...,BS18_0670,BS18_0671,BS18_0672,BS18_0673,BS18_0674,BS18_0675,BS18_0681,BS18_0682,BS18_0684,BS18_0686


In [42]:
count_ref_df

Unnamed: 0,Chromosome,SNP_position,BS16_0001,BS16_0002,BS16_0003,BS16_0004,BS16_0005,BS16_0006,BS16_0007,BS16_0008,...,BS18_0670,BS18_0671,BS18_0672,BS18_0673,BS18_0674,BS18_0675,BS18_0681,BS18_0682,BS18_0684,BS18_0686
0,NC_037638.1,5671,18,19,72,31,47,44,18,34,...,37,36,24,31,31,46,33,27,39,38
1,NC_037638.1,5698,20,16,74,22,50,44,18,34,...,39,41,26,25,27,45,31,23,39,33
2,NC_037638.1,6621,48,30,113,23,50,39,40,36,...,71,61,41,53,34,73,65,28,47,48
3,NC_037638.1,6717,30,28,105,25,58,37,37,34,...,58,44,49,48,41,61,46,33,40,48
4,NC_037638.1,6823,39,29,61,21,40,39,31,24,...,50,45,27,38,30,46,38,38,27,32
5,NC_037638.1,6832,44,29,57,19,38,40,33,22,...,43,44,23,38,33,46,38,37,28,35
6,NC_037638.1,6835,45,29,64,21,39,38,36,22,...,44,47,25,39,33,49,38,38,27,36
7,NC_037638.1,6914,41,25,67,26,45,39,46,23,...,37,36,44,43,43,54,33,37,36,42
8,NC_037638.1,7034,21,13,65,19,29,33,18,24,...,35,30,29,33,26,47,29,25,27,32
9,NC_037638.1,7092,24,21,61,28,35,34,29,25,...,34,33,24,42,24,40,22,24,33,16


In [43]:
count_ref_df.to_csv('{}count_ref_test.txt'.format(out_path), header=True, index=False, sep='\t')
depth_df.to_csv('{}depth_test.txt'.format(out_path), header=True, index=False, sep='\t')

In [27]:
def write_run_beethoven_script(script_fn):
    runstr="""#!/bin/bash -l
#SBATCH --cpus-per-task=10
#SBATCH --mem=40g
#SBATCH --time=48:00:00
#SBATCH --job-name=beethoven
#SBATCH --output=%x_%j.out
#SBATCH --error=%x_%j.err

modulesld
ebld
module use /software/anaconda3/envs/eb/easybuild/modules/all
conda activate planb

dir_in=$1
depth_file=$2
count_ref_file=$3
prefix=$4

genoqueen_hom ${dir_in} ${depth_file} ${count_ref_file} 2 10 ${prefix} 1000

echo DONE"""
    with open(script_fn, 'w') as outf:
        outf.write(runstr)

In [8]:
script_fn = '{}beethoven.run'.format(script_path)
write_run_beethoven_script(script_fn)

In [9]:
dir_in = out_path
depth_file = 'depth_header.txt'
count_ref_file = 'count_ref_header.txt'

In [10]:
%%bash -s "$script_fn" "$dir_in" "$depth_file" "$count_ref_file"
cd $2
sbatch $1 $2 $3 $4

Submitted batch job 1369553


### by population

In [4]:
dir_in = allele_count_path
depth_file = 'depth_header.txt'
count_ref_file = 'count_ref_header.txt'

In [5]:
set(pheno_df['Genetic group'])

{'Caucasica', 'Ligustica_Carnica', 'Mellifera', 'hybrid'}

In [19]:
%%time
depth_df = pd.read_csv('{}{}'.format(dir_in, depth_file), sep='\t')

CPU times: user 6min 59s, sys: 5min 11s, total: 12min 11s
Wall time: 12min 15s


In [28]:
%%time
for gg in set(pheno_df['Genetic group']):
    print(gg)
    depth_df[['Chromosome', 'SNP_position'] + list(pheno_df[pheno_df['Genetic group'] == gg]['Colony'])].to_csv('{}depth_{}.txt'.format(allele_count_path, gg), header=True, index=False, sep='\t')

Caucasica
Mellifera
Ligustica_Carnica
hybrid


In [6]:
%%time
count_ref_df = pd.read_csv('{}{}'.format(dir_in, count_ref_file), sep='\t')

CPU times: user 7min 28s, sys: 3min 19s, total: 10min 48s
Wall time: 11min 18s


In [7]:
%%time
for gg in set(pheno_df['Genetic group']):
    print(gg)
    count_ref_df[['Chromosome', 'SNP_position'] + list(pheno_df[pheno_df['Genetic group'] == gg]['Colony'])].to_csv('{}count_ref_{}.txt'.format(allele_count_path, gg), header=True, index=False, sep='\t')

Mellifera
Caucasica
hybrid
Ligustica_Carnica
CPU times: user 20min 51s, sys: 1min 14s, total: 22min 5s
Wall time: 22min 12s


In [32]:
script_fn = '{}beethoven.run'.format(script_path)
write_run_beethoven_script(script_fn)

In [30]:
dir_in = '/cluster/raid/home/f80878961/beestrong/data/queen_genotypes/hom_pop/'

In [33]:
for gg in set(pheno_df['Genetic group']):
    command = ['sbatch', script_fn, dir_in, 'depth_{}.txt'.format(gg), 'count_ref_{}.txt'.format(gg), 'queen_{}'.format(gg)]
    result = subprocess.run(command, cwd=dir_in, capture_output=True, text=True)

## PLINK


### .bim_like

I need to create bim-like files but be careful they are overwritten with make-bed command

1. Chromosome code
2. Variant ID
3. Position in centimorgans (safe to use dummy value of '0')
4. Base-pair coordinate (1-based; limited to 231-2)
5. ALT ('A1' in PLINK 1.x) allele code
6. REF ('A2' in PLINK 1.x) allele code


In [111]:
%%time
with open('{}count_mat.pkl'.format(allele_count_path), 'rb') as inf:
    count_mat = pkl.load(inf)
print(count_mat.nbytes/1000000000)

81.028587136
CPU times: user 0 ns, sys: 1min 33s, total: 1min 33s
Wall time: 1min 52s


In [116]:
f_snp_nr = 6764372
snp_nr = 7023976
assert snp_nr == count_mat.shape[0]

In [117]:
%%time
# load original SNP ids
chr_ids = [''] * snp_nr
snp_pos = [''] * snp_nr
with open('{}data/gwas/BS17_0071.count'.format(working_path), 'r') as inf:
    print(inf.readline())
    i = 0
    for l in tqdm(inf):
        ch = l.split()[0]
        pos = l.split()[1]
        chr_ids[i] = ch
        snp_pos[i] = pos
        i += 1

CHROM POS REF ALT BS17-0071



7023976it [00:05, 1403874.48it/s]

CPU times: user 4.31 s, sys: 685 ms, total: 5 s
Wall time: 5.08 s





In [120]:
%%time
# load filtered SNPs
f_chr_ids = [''] * f_snp_nr
f_snp_pos = [''] * f_snp_nr
with open('{}filtered_snps.txt'.format(allele_count_path), 'r') as inf:
    inf.readline()
    i = 0
    for l in tqdm(inf):
        ch = l.split()[0]
        pos = l.split()[1]
        f_chr_ids[i] = ch
        f_snp_pos[i] = pos
        i += 1
assert f_snp_nr == i

6764372it [00:04, 1543827.37it/s]

CPU times: user 3.7 s, sys: 676 ms, total: 4.37 s
Wall time: 4.45 s





In [125]:
plink_path = '{}data/queen_genotypes/hom_pop/'.format(working_path)
bim = pd.read_csv('{}{}.bim'.format(plink_path, 'queen'), sep='\t', header=None)

In [132]:
# good
bim[3].to_list() == [int(x) for x in f_snp_pos]

True

In [133]:
# get back mask by comparing original snps and filtered snps. TO DO: if rerun filtering, include this part directly
snp_ids = ['{}_{}'.format(x, snp_pos[i]) for i, x in enumerate(chr_ids)]
f_snp_ids = ['{}_{}'.format(x, f_snp_pos[i]) for i, x in enumerate(f_chr_ids)]

assert len(snp_ids) == len(set(snp_ids))
assert len(f_snp_ids) == len(set(f_snp_ids))

snp_mask = ~np.isin(np.array(snp_ids), np.array(f_snp_ids))

assert np.sum(~snp_mask) == f_snp_nr

In [134]:
# find reference and alternative allele per SNP (REF: the one with the most counts. not sure if correct but probably REF and ALT are interchangeable) 
jj2allele = np.array(['A', 'T', 'C', 'G'])
ref_alt = np.full((snp_nr, 2), '.')
for i in tqdm(np.argwhere(~snp_mask).flatten()):
    ref_alt[i] = jj2allele[np.argsort(np.sum(count_mat[i, :, :], axis=0))[::-1][:2]]

100%|██████████| 6764372/6764372 [03:30<00:00, 32146.91it/s]


In [139]:
bim_like = pd.DataFrame({
    'CHR': f_chr_ids,
    'SNP_ID': f_snp_ids,
    'GEN_DIST': ['0'] * f_snp_nr,
    'BP_POS': f_snp_pos,
    'ALT': ref_alt[~snp_mask][:, 1],
    'REF': ref_alt[~snp_mask][:, 0]
})

In [142]:
# recode snp id
chr2int = dict(zip(list(dict.fromkeys(bim_like['CHR'])), range(1, 1 + len(set(bim_like['CHR'])))))
bim_like['CHR'] = bim_like['CHR'].replace(chr2int)

def recode_SNP_ids(old_id):
    chrom, pos = old_id.split('.1_')
    return '{}:{}'.format(chr2int['{}.1'.format(chrom)], pos)

bim_like['SNP_ID'] = bim_like['SNP_ID'].map(recode_SNP_ids)

  bim_like['CHR'] = bim_like['CHR'].replace(chr2int)


In [143]:
bim_like

Unnamed: 0,CHR,SNP_ID,GEN_DIST,BP_POS,ALT,REF
0,1,1:5671,0,5671,G,T
1,1,1:5698,0,5698,C,T
2,1,1:6621,0,6621,G,A
3,1,1:6717,0,6717,T,C
4,1,1:6823,0,6823,C,G
...,...,...,...,...,...,...
6764367,16,16:7225908,0,7225908,C,T
6764368,16,16:7225911,0,7225911,T,C
6764369,16,16:7226186,0,7226186,A,G
6764370,16,16:7226346,0,7226346,A,G


In [147]:
# good
bim[1].to_list() == bim_like['SNP_ID'].to_list()

True

In [149]:
%%time
bim_like.to_csv('{}queen.bim_like'.format(hom_pop_path), header=False, index=False, sep='\t')

CPU times: user 6.47 s, sys: 509 ms, total: 6.98 s
Wall time: 7.11 s


In [183]:
hom_pop_path

'/cluster/raid/home/f80878961/beestrong/data/queen_genotypes/hom_pop/'

### .ped and .bed

Family ID (FID)

Individual ID (IID)

Paternal ID

Maternal ID

Sex

Phenotype

genotypes 2 columns per SNP when diploid

In [43]:
batch_j = 3
chunk_bs_ids = batch_bs_ids[batch_j]

In [None]:
def write_ped(bim_file, chunk_bs_ids, snp_nr, bgs_file, batch_j, chunk_size, plink_path, prefix):

    # contains REF and ALT alleles
    bim_df = pd.read_csv(bim_file, sep='\t')
    
    ped_mat = np.full((len(chunk_bs_ids), 2 * snp_nr), '', dtype='U1')
    
    with open(bgs_file, 'r') as inf:
        
        # index of the first colony of the batch/chunk
        start_j = batch_j * chunk_size
    
        # check we have the same bs ids in header than in batch
        header_bsids = inf.readline()
        assert header_bsids.rstrip().split(',')[3:][start_j: start_j + len(chunk_bs_ids)] == chunk_bs_ids, 'bs_id mismatch'
        
        for i in tqdm(range(snp_nr)):
            # parse number of ref allele per SNP (0, 1, 2 as queen is diploid)
            ref_numbers = inf.readline().rstrip().split(',')[3:]
    
            for j in range(start_j, start_j + len(chunk_bs_ids)):
                ref_nr = ref_numbers[j]
                if ref_nr == '2':
                    ped_mat[j - start_j, 2 * i] = bim_df.iloc[i, 5]
                    ped_mat[j - start_j, 2 * i + 1] = bim_df.iloc[i, 5]
                elif ref_nr == '1': 
                    ped_mat[j - start_j, 2 * i] = bim_df.iloc[i, 5]
                    ped_mat[j - start_j, 2 * i + 1] = bim_df.iloc[i, 4]
                elif ref_nr == '0': 
                    ped_mat[j - start_j, 2 * i] = bim_df.iloc[i, 4]
                    ped_mat[j - start_j, 2 * i + 1] = bim_df.iloc[i, 4]
                else: 
                    print('ERROR')

    ped_df = pd.DataFrame({
        'FID': chunk_bs_ids, 
        'IID': chunk_bs_ids,
        'PID': ['0'] * len(chunk_bs_ids),
        'MID': ['0'] * len(chunk_bs_ids),
        'SEX': ['0'] * len(chunk_bs_ids), 
        'PHENOTYPE': ['-9'] * len(chunk_bs_ids)
    })
    ped_df = ped_df.join(pd.DataFrame(ped_mat))
    ped_df.to_csv('{}{}_{}.ped'.format(plink_path, prefix, batch_j), header=False, index=False, sep='\t')

In [5]:
def write_write_ped_script(script_fn, array_str):
    runstr="""#!/bin/bash -l
#SBATCH --array=ARRAY_STRING
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --mem=100g
#SBATCH --time=6:00:00
#SBATCH --job-name=write_ped
#SBATCH --output=%x_%A_%a.out
#SBATCH --error=%x_%A_%a.err

modulesld
ebld
module use /software/anaconda3/envs/eb/easybuild/modules/all
conda activate planb

# file listing batches of BeeStrong ids (bs_id) for parallel computing
bim_file=$1
snp_nr=$2
bgs_file=$3
chunk_size=$4
plink_path=$5
prefix=$6
batch_file=$7

batch_id=$SLURM_ARRAY_TASK_ID

# getting bs_ids
chunk_bs_ids=$(sed -n ${batch_id}'{p;q}' ${batch_file})

python ~/scripts/plan-b-omics/bin/write_ped.py --bim_file ${bim_file} --chunk_bs_ids ${chunk_bs_ids} --snp_nr ${snp_nr} --bgs_file ${bgs_file} --batch_id ${batch_id} --chunk_size ${chunk_size} --plink_path ${plink_path} --prefix ${prefix}

echo DONE""".replace("ARRAY_STRING", array_str)
    with open(script_fn, 'w') as outf:
        outf.write(runstr)

In [6]:
prefix = 'queen'
bim_file = '{}{}.bim'.format(allele_count_path, prefix)
snp_nr = 6764372
bgs_file = '{}{}.bgs'.format(allele_count_path, prefix)
chunk_size = 50
batch_file = '{}bsid_batch.txt'.format(plink_path)

In [7]:
# prod
bs_ids = pheno_df['Colony'].to_list()
batch_bs_ids = []
with open(batch_file, "w") as outf:
    for i in range(0, len(bs_ids), chunk_size):
        chunk = bs_ids[i:i+chunk_size]
        batch_bs_ids.append(chunk)
        outf.write(",".join(chunk) + "\n")
assert sum([x == bs_ids[i] for i, x in enumerate(list(chain(*batch_bs_ids)))]) == len(bs_ids)

In [8]:
array_str = '1-{}'.format(len(batch_bs_ids))
script_fn = '{}write_ped.run'.format(script_path)
write_write_ped_script(script_fn, array_str)

In [9]:
%%bash -s "$script_fn" "$bim_file" "$snp_nr" "$bgs_file" "$chunk_size" "$plink_path" "$prefix" "$batch_file" 
cd $6
sbatch $1 $2 $3 $4 $5 $6 $7 $8

Submitted batch job 1378632


a simple cat on all theose .ped files should work

### .ped with probs and by population

In [160]:
prefix = 'queen'
plink_path = '{}data/queen_genotypes/hom_pop/'.format(working_path)
populations = ['Caucasica', 'Ligustica_Carnica', 'Mellifera', 'hybrid']
chunk_size = 25
prob_threshold = 0.95

In [153]:
# need to split .prob files into 3: AA, AR, RR --> easier to read probability of these genotypes simulateously then...
def split_prob_file(
    plink_path, prefix, pop):

    prob_file = '{}{}_{}.prob'.format(plink_path, prefix, pop)
    aa_file = '{}{}_{}_AA.prob'.format(plink_path, prefix, pop)
    ar_file = '{}{}_{}_AR.prob'.format(plink_path, prefix, pop)
    rr_file = '{}{}_{}_RR.prob'.format(plink_path, prefix, pop)
    
    out_aa = open(aa_file, 'w')
    out_ar = open(ar_file, 'w')
    out_rr = open(rr_file, 'w')
    
    with open(prob_file, 'r') as inf:
        headers = inf.readline()
        out_aa.write(headers.rstrip(',geno\n') + '\n')
        out_ar.write(headers.rstrip(',geno\n') + '\n')
        out_rr.write(headers.rstrip(',geno\n') + '\n')

        for l in tqdm(inf):
            split_line = l.rstrip().split(',')
            geno = split_line[-1]
            if geno == 'AA':
                out_aa.write(','.join(split_line[:-1]) + '\n')
            elif geno == 'AR':
                out_ar.write(','.join(split_line[:-1]) + '\n')
            elif geno == 'RR':
                out_rr.write(','.join(split_line[:-1]) + '\n')
            else: 
                print('GENO: {}'.format(geno))
                      
    out_aa.close()
    out_ar.close()
    out_rr.close()

In [125]:
for pop in populations:
    split_prob_file(plink_path, prefix, pop)

20293116it [00:46, 437729.20it/s]
20293116it [24:53, 13589.27it/s]
20293116it [15:02, 22474.99it/s]
20293116it [14:04, 24026.43it/s]


20
645
405
372


#### debug

In [14]:
bim = pd.read_csv('{}{}.bim'.format(plink_path, prefix), sep='\t', header=None)

In [18]:
pop = 'Mellifera'
batch_file = '{}bsid_batch_{}.txt'.format(plink_path, pop)
with open(batch_file, "r") as inf:
    batch_bs_ids = [x.rstrip().split(',') for x in inf.readlines()]

    

In [20]:
batch_j = 0
chunk_bs_ids = batch_bs_ids[batch_j]

In [21]:
chunk_bs_ids[7]

'BS16_0136'

In [96]:
bim = pd.read_csv('{}{}.bim'.format(plink_path, prefix), sep='\t', header=None)
bgs_file = '{}{}_{}.bgs'.format(plink_path, prefix, pop)
aa_prob_file = '{}{}_{}_AA.prob'.format(plink_path, prefix, pop)
ar_prob_file = '{}{}_{}_AR.prob'.format(plink_path, prefix, pop)
rr_prob_file = '{}{}_{}_RR.prob'.format(plink_path, prefix, pop)
snp_nr = len(bim)

In [97]:
start_j = batch_j * chunk_size

In [98]:
inf = open(bgs_file, 'r')
inf_aa = open(aa_prob_file, 'r')
inf_ar = open(ar_prob_file, 'r')
inf_rr = open(rr_prob_file, 'r')

In [99]:
header_bsids = inf.readline()

In [100]:
assert header_bsids.rstrip().split(',')[3:][start_j: start_j + len(chunk_bs_ids)] == chunk_bs_ids, 'bs_id mismatch'
inf_aa.readline()
inf_ar.readline()
inf_rr.readline()

',Chromosome,SNP_position,BS16_0004,BS16_0032,BS16_0040,BS16_0044,BS16_0045,BS16_0047,BS16_0134,BS16_0136,BS16_0137,BS16_0141,BS16_0144,BS16_0145,BS16_0146,BS16_0147,BS16_0148,BS16_0149,BS16_0150,BS16_0151,BS16_0152,BS16_0154,BS16_0155,BS16_0171,BS16_0172,BS16_0174,BS16_0175,BS16_0176,BS16_0216,BS16_0218,BS16_0276,BS16_0277,BS16_0279,BS16_0280,BS16_0300,BS16_0326,BS16_0382,BS16_0404,BS16_0407,BS16_0408,BS16_0409,BS16_0410,BS16_0411,BS16_0412,BS16_0413,BS16_0414,BS16_0415,BS16_0416,BS16_0417,BS16_0418,BS16_0419,BS16_0420,BS16_0421,BS16_0422,BS16_0423,BS16_0424,BS16_0425,BS16_0426,BS16_0427,BS16_0428,BS16_0429,BS16_0437,BS16_0438,BS16_0439,BS16_0440,BS16_0449,BS16_0496,BS16_0500,BS16_0501,BS16_0502,BS16_0503,BS16_0504,BS16_0511,BS16_0513,BS16_0514,BS16_0515,BS16_0516,BS16_0517,BS16_0518,BS16_0520,BS16_0521,BS16_0540,BS16_0541,BS16_0542,BS16_0543,BS16_0544,BS16_0545,BS16_0546,BS16_0547,BS16_0549,BS16_0550,BS16_0551,BS16_0553,BS16_0554,BS16_0570,BS16_0571,BS16_0572,BS16_0573,BS16_0574,BS17

In [95]:
inf.close()
inf_aa.close()
inf_ar.close()
inf_rr.close()

In [101]:
for i in tqdm(range(snp_nr)):
    ref_numbers = inf.readline()
    aa_probs = inf_aa.readline()
    ar_probs = inf_ar.readline()
    rr_probs = inf_rr.readline()
    if i == 6240306: 
        break

 92%|█████████▏| 6240306/6764372 [02:52<00:14, 36224.92it/s]


In [None]:
bim_like

In [103]:
ref_numbers

'6240306,NC_037651.1,9770952,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2\n'

In [104]:
ar_probs

'6240306,NC_037651.1,9770952,1.1253998361201744e-07,1.9988935406767335e-06,8.44031190971643e-08,4.742949587530005e-07,2.675415631799144e-10,1.9988935406767335e-06,0.9975190960441253,6.318023592626571e-06,6.330093744971093e-08,2.675415631799144e-10,1.9988935406767335e-06,8.44031190971643e-08,1.1253998361201744e-07,3.5537405396740974e-06,1.4991376729384742e-06,3.5537405396740974e-06,8.432281095893794e-07,1.1232484713080014e-05,2.6652486915791254e-06,4.7384201416619685e-06,3.5537405396740974e-06,1.500566321253565e-07,8.432281095893794e-07,3.5537405396740974e-06,4.7474651455224204e-08,3.5537405396740974e-06,2.6652486915791254e-06,6.318023592626571e-06,6.336118011097954e-09,1.1296899377085736e-11,4.751983177356673e-09,3.570694667313889e-11,8.42420045159261e-06,2.6652486915791254e-06,8.456384362775845e-10,3.5639083851319007e-09,4.751983177356673e-09,1.4976922194352454e-05,4.751983177356673e-09,3.557133760522609e-07,4.7384201416619685e-06,6.336118011097954e-09,4.751983177356673e-09,1.49769221

In [65]:
i += 1

In [66]:
C

6240308

In [52]:
ref_numbers = ref_numbers.rstrip().split(',')[3:]
aa_probs = aa_probs.rstrip().split(',')[3:]
ar_probs = ar_probs.rstrip().split(',')[3:]
rr_probs = rr_probs.rstrip().split(',')[3:]

In [67]:
ref_numbers = inf.readline().rstrip().split(',')[3:]
aa_probs = inf_aa.readline().rstrip().split(',')[3:]
ar_probs = inf_ar.readline().rstrip().split(',')[3:]
rr_probs = inf_rr.readline().rstrip().split(',')[3:]

In [55]:
j = 6

In [57]:
header_bsids.rstrip().split(',')[3:][j]

'BS16_0134'

In [68]:
ref_nr = ref_numbers[j]
ref_nr

'2'

In [69]:
prob_threshold

0.95

In [71]:
(ref_nr == '2') and (float(rr_probs[j]) >= prob_threshold)

True

In [None]:
ar_probs

In [61]:
ar_probs[j]

'0.9895146463136785'

In [150]:
i

7023687

In [151]:
bim_like.iloc[6240306, ]

CHR                 14
SNP_ID      14:9770952
GEN_DIST             0
BP_POS         9770952
ALT                  T
REF                  C
Name: 6240306, dtype: object

In [72]:
bim.iloc[i, 5]

'C'

In [73]:
bim.iloc[i, 4]

'T'

In [74]:
ped_file = '{}{}_{}.ped'.format(plink_path, prefix, pop)

In [75]:
inf_ped = open(ped_file, 'r')


In [77]:
for l in inf_ped:
    if l.split()[1] == 'BS16_0134': break

In [79]:
l_split = l.split()

In [92]:
len(l_split[6:]) / 2

6764372.0

In [93]:
l_split[(6 + 2*6240306):]

['C',
 '0',
 'C',
 'T',
 'C',
 'C',
 'G',
 'G',
 'G',
 'G',
 'G',
 'G',
 'C',
 'C',
 'G',
 'G',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'G',
 'G',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'T',
 'T',
 'C',
 'C',
 'T',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'T',
 'G',
 'G',
 'G',
 'G',
 'G',
 'G',
 'C',
 'C',
 'G',
 'G',
 'G',
 'A',
 'G',
 'G',
 'G',
 'G',
 'C',
 'C',
 'A',
 'A',
 'G',
 'G',
 'C',
 'C',
 'G',
 'G',
 'G',
 'G',
 'G',
 'G',
 'C',
 'C',
 'G',
 'G',
 'G',
 'G',
 'G',
 'G',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'G',
 'G',
 'G',
 'G',
 'G',
 'G',
 'G',
 'A',
 'G',
 'G',
 'G',
 'G',
 'G',
 'G',
 'T',
 'T',
 'G',
 'G',
 'G',
 'G',
 'G',
 'G',
 'C',
 'C',
 'G',
 'A',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'T',
 'T',
 'C',
 'C',
 '0',
 '0',
 'A',
 'A',
 'A',
 'A',
 'G',
 'G',
 'A',
 'A',
 'G',
 'G',
 'C',
 'C',
 '0',
 '0',
 'C',
 'C',
 '0',
 '0',
 'G',
 'G',
 'C',
 'C',
 'C',
 'C',
 'C',
 'C',
 'T',
 'T',
 'C',
 'C',
 'C',
 'C',
 'G',
 'G',
 'C',
 'C',
 'C'

In [94]:
bim

Unnamed: 0,0,1,2,3,4,5
0,1,1:5671,0,5671,0,T
1,1,1:5698,0,5698,C,T
2,1,1:6621,0,6621,G,A
3,1,1:6717,0,6717,0,C
4,1,1:6823,0,6823,C,G
...,...,...,...,...,...,...
6764367,16,16:7225908,0,7225908,C,T
6764368,16,16:7225911,0,7225911,T,C
6764369,16,16:7226186,0,7226186,A,G
6764370,16,16:7226346,0,7226346,0,G


In [85]:
bim.loc[6240306]

0            14
1    14:9770952
2             0
3       9770952
4             0
5             C
Name: 6240306, dtype: object

In [120]:
def write_ped_pop(
    plink_path, prefix, pop, chunk_bs_ids, batch_j, chunk_size):

    # bim like because not output of --make-bed (which filters some alternative alleles)
    bim_like = pd.read_csv('{}{}.bim_like'.format(plink_path, prefix), sep='\t', header=None)
    
    bgs_file = '{}{}_{}.bgs'.format(plink_path, prefix, pop)
    aa_prob_file = '{}{}_{}_AA.prob'.format(plink_path, prefix, pop)
    ar_prob_file = '{}{}_{}_AR.prob'.format(plink_path, prefix, pop)
    rr_prob_file = '{}{}_{}_RR.prob'.format(plink_path, prefix, pop)
    snp_nr = len(bim_like)
    # snp_nr = 10
    
    start_j = batch_j * chunk_size
    ped_mat = np.full((len(chunk_bs_ids), 2 * snp_nr), '', dtype='U1')
    
    inf = open(bgs_file, 'r')
    inf_aa = open(aa_prob_file, 'r')
    inf_ar = open(ar_prob_file, 'r')
    inf_rr = open(rr_prob_file, 'r')

    # check we have the same bs ids in header than in batch
    header_bsids = inf.readline()
    assert header_bsids.rstrip().split(',')[3:][start_j: start_j + len(chunk_bs_ids)] == chunk_bs_ids, 'bs_id mismatch'
    inf_aa.readline()
    inf_ar.readline()
    inf_rr.readline()

    # iterate snps and colonies
    for i in tqdm(range(snp_nr)):
        ref_numbers = inf.readline().rstrip().split(',')[3:]
        aa_probs = inf_aa.readline().rstrip().split(',')[3:]
        ar_probs = inf_ar.readline().rstrip().split(',')[3:]
        rr_probs = inf_rr.readline().rstrip().split(',')[3:]

        for j in range(start_j, start_j + len(chunk_bs_ids)):
            ref_nr = ref_numbers[j]

            # homozygote reference
            if (ref_nr == '2') and (float(rr_probs[j]) >= prob_threshold):
                ped_mat[j - start_j, 2 * i] = bim_like.iloc[i, 5]
                ped_mat[j - start_j, 2 * i + 1] = bim_like.iloc[i, 5]
            # heterozygote
            elif (ref_nr == '1') and (float(ar_probs[j]) >= prob_threshold):
                ped_mat[j - start_j, 2 * i] = bim_like.iloc[i, 5]
                ped_mat[j - start_j, 2 * i + 1] = bim_like.iloc[i, 4]
            # homozygote alternative
            elif (ref_nr == '0') and (float(aa_probs[j]) >= prob_threshold):
                ped_mat[j - start_j, 2 * i] = bim_like.iloc[i, 4]
                ped_mat[j - start_j, 2 * i + 1] = bim_like.iloc[i, 4]
            # below prob_threshold
            elif ref_nr in {'0', '1', '2'}:
                ped_mat[j - start_j, 2 * i] = '0'
                ped_mat[j - start_j, 2 * i + 1] = '0'
            else: 
                print('ERROR')
            
    inf.close()
    inf_aa.close()
    inf_ar.close()
    inf_rr.close()

    ped_df = pd.DataFrame({
        'FID': ['0'] * len(chunk_bs_ids), 
        'IID': chunk_bs_ids,
        'PID': ['0'] * len(chunk_bs_ids),
        'MID': ['0'] * len(chunk_bs_ids),
        'SEX': ['0'] * len(chunk_bs_ids), 
        'PHENOTYPE': ['-9'] * len(chunk_bs_ids)
    })
    ped_df = ped_df.join(pd.DataFrame(ped_mat))
    ped_df.to_csv('{}{}_{}_{}.ped'.format(plink_path, prefix, pop, batch_j), header=False, index=False, sep='\t')

In [27]:
batch_j = 0
chunk_bs_ids = batch_bs_ids[batch_j]

In [121]:
write_ped_pop(
    plink_path, prefix, pop, chunk_bs_ids, batch_j, chunk_size)

100%|██████████| 10/10 [00:00<00:00, 1696.11it/s]


prod

In [154]:
def write_write_ped_script(script_fn, array_str):
    runstr="""#!/bin/bash -l
#SBATCH --array=ARRAY_STRING
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --mem=100g
#SBATCH --time=6:00:00
#SBATCH --job-name=write_ped
#SBATCH --output=%x_%A_%a.out
#SBATCH --error=%x_%A_%a.err

modulesld
ebld
module use /software/anaconda3/envs/eb/easybuild/modules/all
conda activate planb

# file listing batches of BeeStrong ids (bs_id) for parallel computing
batch_file=$1
chunk_size=$2
plink_path=$3
prefix=$4
pop=$5
prob_threshold=$6
    
batch_id=$SLURM_ARRAY_TASK_ID

# getting bs_ids
chunk_bs_ids=$(sed -n ${batch_id}'{p;q}' ${batch_file})
    
python ~/scripts/plan-b-omics/bin/write_ped.py --chunk_bs_ids ${chunk_bs_ids} --batch_id ${batch_id} --chunk_size ${chunk_size} --plink_path ${plink_path} --prefix ${prefix} --pop ${pop} --prob_threshold ${prob_threshold}

echo DONE""".replace("ARRAY_STRING", array_str)
    with open(script_fn, 'w') as outf:
        outf.write(runstr)

In [162]:
chunk_size = 25 # should match the
plink_path = '{}data/queen_genotypes/hom_pop/'.format(working_path)
prefix = 'queen'
# populations = ['Caucasica', 'Ligustica_Carnica', 'Mellifera', 'hybrid']

prob_threshold = 0.95

In [163]:
# write batch files
for pop in populations:
    batch_file = '{}bsid_batch_{}.txt'.format(plink_path, pop)
    bs_ids = pheno_df[pheno_df['Genetic group'] == pop]['Colony'].to_list()
    print(len(bs_ids))
    
    batch_bs_ids = []
    with open(batch_file, "w") as outf:
        for i in range(0, len(bs_ids), chunk_size):
            chunk = bs_ids[i:i+chunk_size]
            batch_bs_ids.append(chunk)
            outf.write(",".join(chunk) + "\n")

20
645
405
372


In [176]:
pop = 'Ligustica_Carnica'
# pop = 'Caucasica'
# pop = 'Mellifera'
# pop = 'hybrid'

batch_file = '{}bsid_batch_{}.txt'.format(plink_path, pop)

In [177]:
with open(batch_file, "r") as inf:
    batch_bs_ids = [x.rstrip().split(',') for x in inf.readlines()]

In [178]:
array_str = '1-{}'.format(len(batch_bs_ids)) if len(batch_bs_ids) > 1 else '1'
script_fn = '{}write_ped.run'.format(script_path)
write_write_ped_script(script_fn, array_str)

In [179]:
%%bash -s "$script_fn" "$batch_file" "$chunk_size" "$plink_path" "$prefix" "$pop" "$prob_threshold" 
cd $4
sbatch $1 $2 $3 $4 $5 $6 $7

Submitted batch job 1380901


cat the ped files

### .map

.map file is also needed to convert ped to bed (only map + bed)

CHR   SNP_ID   GEN_DIST   BP_POS

basically bim wihout REF and ALT

In [8]:
prefix = 'queen'
plink_path = '{}data/queen_genotypes/hom_pop/'.format(working_path)
populations = ['Caucasica', 'Ligustica_Carnica', 'Mellifera', 'hybrid']

In [180]:
bim_like = pd.read_csv('{}{}.bim_like'.format(plink_path, prefix), sep='\t', header=None)

In [181]:
bim_like

Unnamed: 0,0,1,2,3,4,5
0,1,1:5671,0,5671,G,T
1,1,1:5698,0,5698,C,T
2,1,1:6621,0,6621,G,A
3,1,1:6717,0,6717,T,C
4,1,1:6823,0,6823,C,G
...,...,...,...,...,...,...
6764367,16,16:7225908,0,7225908,C,T
6764368,16,16:7225911,0,7225911,T,C
6764369,16,16:7226186,0,7226186,A,G
6764370,16,16:7226346,0,7226346,A,G


In [182]:
for pop in populations:
    print(pop)
    bim[[0, 1, 2, 3]].to_csv('{}{}_{}.map'.format(plink_path, prefix, pop), header=False, index=False, sep='\t')

Caucasica
Ligustica_Carnica
Mellifera
hybrid


In [7]:
bim

Unnamed: 0,0,1,2,3,4,5
0,1,1:5671,0,5671,0,T
1,1,1:5698,0,5698,C,T
2,1,1:6621,0,6621,G,A
3,1,1:6717,0,6717,0,C
4,1,1:6823,0,6823,C,G
...,...,...,...,...,...,...
6764367,16,16:7225908,0,7225908,C,T
6764368,16,16:7225911,0,7225911,T,C
6764369,16,16:7226186,0,7226186,A,G
6764370,16,16:7226346,0,7226346,0,G


In [33]:
prefix = 'queen'
snp_nr = 6764372
bs_ids = pheno_df['Colony'].to_list()

In [42]:
# read bim file
bim_file = '{}{}.bim'.format(allele_count_path, prefix)
bim_df = pd.read_csv(bim_file, sep='\t')
bim_df

Unnamed: 0,CHR,SNP_ID,GEN_DIST,BP_POS,ALT,REF
0,NC_037638.1,NC_037638.1_5671,0,5671,G,T
1,NC_037638.1,NC_037638.1_5698,0,5698,C,T
2,NC_037638.1,NC_037638.1_6621,0,6621,G,A
3,NC_037638.1,NC_037638.1_6717,0,6717,T,C
4,NC_037638.1,NC_037638.1_6823,0,6823,C,G
...,...,...,...,...,...,...
6764367,NC_037653.1,NC_037653.1_7225908,0,7225908,C,T
6764368,NC_037653.1,NC_037653.1_7225911,0,7225911,T,C
6764369,NC_037653.1,NC_037653.1_7226186,0,7226186,A,G
6764370,NC_037653.1,NC_037653.1_7226346,0,7226346,A,G


In [68]:
# write map file
bim_df[['CHR', 'SNP_ID', 'GEN_DIST', 'BP_POS']][:snp_nr].to_csv('{}{}.map'.format(plink_path, prefix), header=False, index=False, sep='\t')

In [70]:
# and fam file
fam_df = pd.DataFrame({
    'FID': bs_ids, 
    'IID': bs_ids,
    'PID ': ['0'] * len(bs_ids),
    'MID': ['0'] * len(bs_ids),
    'SEX': ['0'] * len(bs_ids), 
    'PHENOTYPE': ['-9'] * len(bs_ids)
})
fam_df.to_csv('{}{}.fam'.format(plink_path, prefix), header=False, index=False, sep='\t')

In [71]:
bim_file = '{}{}.bim'.format(plink_path, prefix)
bim_df.to_csv(bim_file, header=False, index=False, sep='\t')
bim_df = pd.read_csv(bim_file, sep='\t', header=None)
bim_df

Unnamed: 0,0,1,2,3,4,5
0,1,1:5671,0,5671,G,T
1,1,1:5698,0,5698,C,T
2,1,1:6621,0,6621,G,A
3,1,1:6717,0,6717,T,C
4,1,1:6823,0,6823,C,G
...,...,...,...,...,...,...
6764367,16,16:7225908,0,7225908,C,T
6764368,16,16:7225911,0,7225911,T,C
6764369,16,16:7226186,0,7226186,A,G
6764370,16,16:7226346,0,7226346,A,G


convert to bed in CLI. Not sure how long it will take for convertion to bed...

ml PLINK

plink --file queen --make-bed --out queen --allow-extra-chr

In [73]:
# recode bs_ids of suiss colonies (_ instead of -)
with open('{}suisse/suisse_original.fam'.format(plink_path), 'r') as inf:
    with open('{}suisse/suisse.fam'.format(plink_path), 'w') as outf:
        for l in inf:
            outf.write(l.replace('BS18-', 'BS18_'))

recode queen_phased (for comparison with queen)

In [8]:
bim_file = '{}suisse_queen_phased/queen_phased.bim'.format(plink_path)
bim_df = pd.read_csv(bim_file, sep='\t', header=None)
bim_df

Unnamed: 0,0,1,2,3,4,5
0,1,snp1,0,5671,.,T
1,1,snp2,0,5698,C,T
2,1,snp3,0,6621,.,A
3,1,snp4,0,6823,.,G
4,1,snp5,0,6832,.,G
...,...,...,...,...,...,...
5944678,16,snp5944679,0,7225891,A,G
5944679,16,snp5944680,0,7225908,.,T
5944680,16,snp5944681,0,7225911,T,C
5944681,16,snp5944682,0,7226346,.,G


In [22]:
len(bim_df)

5944683

In [23]:
new_snp_ids = [''] * len(bim_df)
i= 0
for row in tqdm(bim_df.iterrows()):
    new_snp_ids[i] = '{}:{}'.format(row[1][0], row[1][3])
    i +=1

5944683it [03:23, 29222.47it/s]


In [25]:
len(new_snp_ids)

5944683

In [27]:
bim_df[1] = new_snp_ids

In [28]:
bim_df

Unnamed: 0,0,1,2,3,4,5
0,1,1:5671,0,5671,.,T
1,1,1:5698,0,5698,C,T
2,1,1:6621,0,6621,.,A
3,1,1:6823,0,6823,.,G
4,1,1:6832,0,6832,.,G
...,...,...,...,...,...,...
5944678,16,16:7225891,0,7225891,A,G
5944679,16,16:7225908,0,7225908,.,T
5944680,16,16:7225911,0,7225911,T,C
5944681,16,16:7226346,0,7226346,.,G


In [29]:
bim_df.to_csv(bim_file, header=False, index=False, sep='\t')

In [30]:
bim_file


'/cluster/raid/home/f80878961/beestrong/data/queen_genotypes/plink/suisse_queen_phased/queen_phased.bim'

In [33]:
bim_df[[0, 1, 2, 3]].to_csv('/cluster/raid/home/f80878961/beestrong/data/queen_genotypes/plink/suisse_queen_phased/queen_phased.map', header=False, index=False, sep='\t')

In [None]:
# recode snp names of queen_phased

bim_df[['CHR', 'SNP_ID', 'GEN_DIST', 'BP_POS']].to_csv('{}{}.map'.format(plink_path, prefix), header=False, index=False, sep='\t')

In [12]:
# recode tfam 


tfam_df = pd.read_csv('{}suisse_queen_phased/suisse.tfam'.format(plink_path), sep='\s', header=None)

  tfam_df = pd.read_csv('{}suisse_queen_phased/suisse.tfam'.format(plink_path), sep='\s', header=None)


In [14]:
tfam_df[0] = len(tfam_df) * [0]

In [186]:
fam = pd.read_csv('/cluster/raid/home/f80878961/beestrong/data/queen_genotypes/suisse/queen_suisse/queen_suisse.fam', sep='\s', header=None)

  fam = pd.read_csv('/cluster/raid/home/f80878961/beestrong/data/queen_genotypes/suisse/queen_suisse/queen_suisse.fam', sep='\s', header=None)


In [188]:
fam[0] = ['0'] * len(fam)

In [189]:
fam.to_csv('/cluster/raid/home/f80878961/beestrong/data/queen_genotypes/suisse/queen_suisse_suiss_conc/queen_suisse.fam', header=False, index=False, sep='\t')

In [39]:
def recode_suisse_bsids(old_id):
    return '{}_{}'.format(old_id, old_id)

In [40]:
tfam_df[1] = tfam_df[1].map(recode_suisse_bsids)

In [16]:
tfam_df.to_csv('{}suisse_queen_phased/suisse.tfam'.format(plink_path), header=False, index=False, sep='\t')

## GREML

In [191]:
plink_path = '/cluster/raid/home/f80878961/beestrong/data/queen_genotypes/hom_pop/'


In [192]:
prefix = 'hom_pop'
out_path = plink_path
fam_df = pd.read_csv('{}{}.fam'.format(out_path, prefix), header=None, sep='\s') 

  fam_df = pd.read_csv('{}{}.fam'.format(out_path, prefix), header=None, sep='\s')


In [193]:
fam_df

Unnamed: 0,0,1,2,3,4,5
0,0,BS16_0060,0,0,0,-9
1,0,BS16_0061,0,0,0,-9
2,0,BS16_0162,0,0,0,-9
3,0,BS16_0163,0,0,0,-9
4,0,BS16_0164,0,0,0,-9
...,...,...,...,...,...,...
1437,0,BS17_0506,0,0,0,-9
1438,0,BS17_0511,0,0,0,-9
1439,0,BS17_0512,0,0,0,-9
1440,0,BS17_0513,0,0,0,-9


In [197]:
# reorder pheno df according to fam_df
pheno_df = pheno_df.set_index("Colony").loc[fam_df[1].to_list()].reset_index()
pheno_df

Unnamed: 0,Colony,date,Genetic group,percent_Ligustica_Carnica,percent_Mellifera,percent_Caucasica,bee_weigth,nbr_pho_varroa,nbr_pho_varroa_100bee,v_pho,...,Day of Year,Latitude,Longitude,v_meta,log_v_meta,log_v_pho,log_v_mito,Varroa group,Recapping group,DMR group
0,BS16_0060,06/07/2016,Caucasica,0.118259,1.369552e-04,0.881604,51.0,0.0,0.0,0.000000,...,188,47.00,2.583,0.000000e+00,-15.168735,-1.987004,-8.156349,Low,Low,
1,BS16_0061,06/07/2016,Caucasica,0.109495,9.200239e-05,0.890413,52.0,1.0,0.0,0.269231,...,188,47.00,2.583,6.124765e-06,-12.003170,-1.312186,-4.984972,Low,Low,Low
2,BS16_0162,31/08/2016,Caucasica,0.047143,1.376441e-05,0.952843,63.0,28.0,6.0,6.222222,...,244,47.00,2.583,5.620967e-05,-9.786422,1.828127,-2.992058,Medium,Medium,Medium
3,BS16_0163,31/08/2016,Caucasica,0.015421,1.019360e-05,0.984569,52.0,2.0,1.0,0.538462,...,244,47.00,2.583,0.000000e+00,-15.168735,-0.619039,-8.046489,Low,Low,Low
4,BS16_0164,31/08/2016,Caucasica,0.005619,4.687815e-08,0.994381,60.0,9.0,2.0,2.100000,...,244,47.00,2.583,2.652597e-05,-10.537386,0.741937,-3.941058,Medium,Medium,High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1437,BS17_0506,01/08/2017,hybrid,0.418368,5.687723e-01,0.012860,47.5,1.0,0.0,0.294737,...,213,43.25,-0.833,8.441952e-06,-11.682297,-1.221672,-5.483170,Low,Low,Low
1438,BS17_0511,01/08/2017,hybrid,0.044593,7.929777e-01,0.162429,41.0,0.0,0.0,0.000000,...,213,43.25,-0.833,4.292615e-07,-14.661200,-1.987004,-8.551460,Low,Low,High
1439,BS17_0512,01/08/2017,hybrid,0.113869,5.741510e-01,0.311980,39.2,0.0,0.0,0.000000,...,213,43.25,-0.833,0.000000e+00,-15.168735,-1.987004,-7.279276,Low,Low,High
1440,BS17_0513,01/08/2017,hybrid,0.554205,4.213804e-01,0.024415,43.0,0.0,0.0,0.000000,...,213,43.25,-0.833,2.434765e-05,-10.623075,-1.987004,-3.551751,Medium,Low,Low


In [198]:
# check consistent colony order between pheno df and plink
bs_ids = pheno_df['Colony'].to_list()
assert fam_df[1].to_list() == bs_ids, 'messed up colony order'

In [199]:
columns = ['Colony', 'log_v_pho', 'log_v_meta', 'Apiary', 'date', 'Year', 'Day of Year']

# Varroa-Associated families
vaf_abund = [
    'Bifidobacteriaceae',
    'Lactobacillaceae',
    'Pseudomonadaceae',   
    'Debaryomycetaceae',
    'Bartonellaceae',  
    'Morganellaceae'
]
vaf_prev = [
    'Oscillospiraceae',
    'Corynebacteriaceae',
    'Cannabaceae',
    'Aeromonadaceae',
    'Weeksellaceae',
    'Araliaceae',
    'Actinomycetaceae',
    'Trypanosomatidae'
]

fn = '{}tlf/Core_Family_LRA.csv'.format(working_path)
abund_df = pd.read_csv(fn) #, index_col='Colony').reset_index

In [200]:
gcta_pheno_df = pd.DataFrame(
    {
        '0' : [0] * len(bs_ids),
        # '0': bs_ids,
        'Colony': bs_ids
    }
)
gcta_pheno_df = gcta_pheno_df.merge(pheno_df[columns], on='Colony', how='left')
gcta_pheno_df = gcta_pheno_df.merge(abund_df[['Colony'] + vaf_abund + vaf_prev], on='Colony', how='left')

# replace abundance values by presence-absence for prevalence associations
lod = np.log(5e-06)
gcta_pheno_df[gcta_pheno_df[vaf_prev] >= lod] = int(1)
gcta_pheno_df[gcta_pheno_df[vaf_prev] < lod] = int(0)

gcta_pheno_df = gcta_pheno_df.fillna('NA')

In [201]:
gcta_pheno_df

Unnamed: 0,0,Colony,log_v_pho,log_v_meta,Apiary,date,Year,Day of Year,Bifidobacteriaceae,Lactobacillaceae,...,Bartonellaceae,Morganellaceae,Oscillospiraceae,Corynebacteriaceae,Cannabaceae,Aeromonadaceae,Weeksellaceae,Araliaceae,Actinomycetaceae,Trypanosomatidae
0,0,BS16_0060,-1.987004,-15.168735,88.0,06/07/2016,2016,188,-6.146180,-4.997377,...,-6.744082,-9.018644,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
1,0,BS16_0061,-1.312186,-12.003170,88.0,06/07/2016,2016,188,-6.114353,-4.710165,...,-6.098915,-10.080948,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
2,0,BS16_0162,1.828127,-9.786422,88.0,31/08/2016,2016,244,-5.950136,-4.598848,...,-7.069882,-7.210088,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
3,0,BS16_0163,-0.619039,-15.168735,88.0,31/08/2016,2016,244,-5.707229,-4.439819,...,-6.104517,-10.880938,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0
4,0,BS16_0164,0.741937,-10.537386,88.0,31/08/2016,2016,244,-5.405451,-4.150527,...,-7.050514,-9.954003,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1437,0,BS17_0506,-1.221672,-11.682297,128.0,01/08/2017,2017,213,-5.955760,-4.546061,...,-7.303314,-5.188458,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1438,0,BS17_0511,-1.987004,-14.661200,128.0,01/08/2017,2017,213,-5.556695,-3.947906,...,-5.801414,-8.585859,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
1439,0,BS17_0512,-1.987004,-15.168735,97.0,01/08/2017,2017,213,-5.081466,-4.056083,...,-9.312203,-9.878038,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
1440,0,BS17_0513,-1.987004,-10.623075,97.0,01/08/2017,2017,213,-5.038656,-4.301295,...,-7.182466,-4.987978,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0


In [202]:
# phenotypes
fn = '{}{}.v_pho'.format(out_path, prefix)
gcta_pheno_df[['0', 'Colony', 'log_v_pho']].to_csv(fn, header=None, sep='\t', index=False)

fn = '{}{}.v_meta'.format(out_path, prefix)
gcta_pheno_df[['0', 'Colony', 'log_v_meta']].to_csv(fn, header=None, sep='\t', index=False)

pd.read_table(fn, header=None) 

Unnamed: 0,0,1,2
0,0,BS16_0060,-15.168735
1,0,BS16_0061,-12.003170
2,0,BS16_0162,-9.786422
3,0,BS16_0163,-15.168735
4,0,BS16_0164,-10.537386
...,...,...,...
1437,0,BS17_0506,-11.682297
1438,0,BS17_0511,-14.661200
1439,0,BS17_0512,-15.168735
1440,0,BS17_0513,-10.623075


In [203]:
# categorical covariable
fn = '{}{}.covar'.format(out_path, prefix)
gcta_pheno_df[['0', 'Colony', 'Apiary']].to_csv(fn, header=None, sep='\t', index=False)
# gcta_pheno_df[['0', 'Colony', 'Apiary', 'Year'] + vaf_prev].to_csv(fn, header=None, sep='\t', index=False)
# gcta_pheno_df[['0', 'Colony'] + vaf_prev].to_csv(fn, header=None, sep='\t', index=False)
# gcta_pheno_df[['0', 'Colony', 'Apiary']].to_csv(fn, header=None, sep='\t', index=False)
pd.read_table(fn, header=None)

Unnamed: 0,0,1,2
0,0,BS16_0060,88.0
1,0,BS16_0061,88.0
2,0,BS16_0162,88.0
3,0,BS16_0163,88.0
4,0,BS16_0164,88.0
...,...,...,...
1437,0,BS17_0506,128.0
1438,0,BS17_0511,128.0
1439,0,BS17_0512,97.0
1440,0,BS17_0513,97.0


In [204]:
# continuous covariable
fn = '{}{}.qcovar'.format(out_path, prefix)
gcta_pheno_df[['0', 'Colony', 'Day of Year']].to_csv(fn, header=None, sep='\t', index=False)
# gcta_pheno_df[['0', 'Colony', 'Day of Year'] + vaf_abund].to_csv(fn, header=None, sep='\t', index=False)
#gcta_pheno_df[['0', 'Colony'] + vaf_abund].to_csv(fn, header=None, sep='\t', index=False)
pd.read_table(fn, header=None)

Unnamed: 0,0,1,2
0,0,BS16_0060,188
1,0,BS16_0061,188
2,0,BS16_0162,244
3,0,BS16_0163,244
4,0,BS16_0164,244
...,...,...,...
1437,0,BS17_0506,213
1438,0,BS17_0511,213
1439,0,BS17_0512,213
1440,0,BS17_0513,213
