In [4]:
import os
import pandas as pd
import numpy as np
import io
import subprocess

## 1. Concatenate fasta sequences of selected chromosomes

The sequences are expected to be in a single file. If they are not, combine them first using Step 0.

### Step 0

In [2]:
def combine_fasta(output_file, *fasta_files):
    """Concatenate several FASTA files into one file.

    Args:
        output_file: Path of the file to create (overwritten if it exists).
        *fasta_files: Paths of the FASTA files to append, in the given order.

    The inputs are copied line by line, so a whole genome never has to be
    held in memory. A newline is inserted between two inputs only when the
    preceding file does not already end with one, so no blank lines are
    introduced into the combined file.
    """
    with open(output_file, 'w') as out_f:
        for fasta_path in fasta_files:
            last_line = ''
            with open(fasta_path, 'r') as in_file:
                for last_line in in_file:
                    out_f.write(last_line)
            # Make sure the next file's '>' header starts on its own line.
            if last_line and not last_line.endswith('\n'):
                out_f.write('\n')
    

In [29]:
# Example: append first.fa and second.fa into a new file named 'combined_fasta'.
combine_fasta('combined_fasta', 'first.fa', 'second.fa')

### Step 1

In [3]:
def concat_fasta(input_file, output_prefix, chromosomes: list):
    """Join the sequences of the selected chromosomes into one FASTA record.

    Args:
        input_file: Path to a (multi-record) FASTA file.
        output_prefix: Prefix of the result; the record is written to
            ``{output_prefix}_concatenated_chromosomes.fa``.
        chromosomes: Names of the records to keep, without the leading '>'.
            A record name is the header text up to the first space.

    The selected sequences are joined in the order in which they occur in
    ``input_file`` (not in the order of ``chromosomes``). The output holds a
    single header ``>concatenated_<names joined by '_'> LN:<total length>``
    followed by the whole sequence on one line.
    """
    select_chr = []
    keep_record = False
    buf = io.StringIO()
    with open(input_file, 'r') as inp_file:
        for line in inp_file:
            line = line.strip()
            if line.startswith('>'):
                chr_name = line.lstrip('>').split(' ')[0]
                keep_record = chr_name in chromosomes
                if keep_record:
                    select_chr.append(chr_name)
            elif keep_record:
                buf.write(line)

    combined_seqs = buf.getvalue()
    length_genome = len(combined_seqs)
    select_chr_str = '_'.join(select_chr)

    # Write straight to the final path. Writing to the bare prefix and
    # renaming afterwards would clobber an unrelated file named like the prefix.
    output_file = f'{output_prefix}_concatenated_chromosomes.fa'
    with open(output_file, 'w') as out_f:
        out_f.write(f">concatenated_{select_chr_str} LN:{length_genome}\n{combined_seqs}")

In [5]:
# Reference FASTA; this is an absolute path on the author's machine, adjust it for your setup.
input_file = '/Users/kristina/Desktop/internship_biotech/GRCh38_full_analysis_set_plus_decoy_hla.fa'
# Despite its name this is passed as concat_fasta's output_prefix:
# the resulting file is combined_chr1_chr2_test_concatenated_chromosomes.fa
output_file = 'combined_chr1_chr2_test'
concat_fasta(input_file, output_file, ('chr1', 'chr2'))

## 2. Merge VCF files

 0. Before you merge the VCF files, you need to find the length of each chromosome in order to shift your SNP positions. I'll be using the combined file from Step 0.

In [6]:
# Pass chromosome names without the leading '>'.
def get_length(input_file, chromosomes: list):
    """Return the sequence length of each selected record of a FASTA file.

    Args:
        input_file: Path to a FASTA file.
        chromosomes: Record names to measure, without the leading '>'.
            A record name is the header text up to the first space.

    Returns:
        A list of ``(name, length)`` tuples in the order the records occur in
        the file. Lengths count the characters of every sequence line after
        surrounding whitespace has been stripped.
    """
    chr_lengths = []
    cur_chr = None
    cur_len = 0
    with open(input_file, 'r') as inp_file:
        for line in inp_file:
            line = line.strip()
            if line.startswith('>'):
                # A new header closes the record that was being measured.
                if cur_chr is not None:
                    chr_lengths.append((cur_chr, cur_len))
                chr_name = line.lstrip('>').split(' ')[0]
                cur_chr = chr_name if chr_name in chromosomes else None
                cur_len = 0
            elif cur_chr is not None:
                # Only the length is needed, so count instead of buffering
                # the whole chromosome in memory.
                cur_len += len(line)
    # The last record has no following header to close it.
    if cur_chr is not None:
        chr_lengths.append((cur_chr, cur_len))

    return chr_lengths

In [8]:
# Same reference FASTA as in the concat_fasta cell above (absolute path on the author's machine).
input_file = '/Users/kristina/Desktop/internship_biotech/GRCh38_full_analysis_set_plus_decoy_hla.fa'
print(get_length(input_file, ['chr1', 'chr2']))

[('chr1', 248956422), ('chr2', 242193529)]


1. Run these functions in exactly this order to obtain the concatenated VCF file at the end.

In [130]:
def sort_vcf(input_file, output_file):
    """Sort a VCF with ``bcftools sort`` and write it bgzip-compressed.

    Args:
        input_file: Path of the VCF/BCF file to sort.
        output_file: Path of the compressed, sorted VCF to create.

    Raises:
        subprocess.CalledProcessError: if bcftools exits with a non-zero status.

    bcftools writes the compressed output itself (``-O z``), so no shell
    pipeline is needed: paths containing spaces work and a failed sort is
    reported instead of silently leaving a truncated file behind.
    """
    subprocess.run(
        ['bcftools', 'sort', '-O', 'z', '-o', output_file, input_file],
        check=True,
    )

def bgzip_vcf(input_file):
    """Compress ``input_file`` with ``bgzip``.

    Args:
        input_file: Path of the file to compress.

    Raises:
        subprocess.CalledProcessError: if bgzip exits with a non-zero status.

    Note from the original author: this step "should be skipped" —
    presumably when the file is already bgzip-compressed; confirm before
    relying on it.
    """
    # Argument list without a shell: paths with spaces or shell
    # metacharacters are passed to bgzip unchanged.
    subprocess.run(['bgzip', input_file], check=True)

def shift_positions_one_chr(input_file, output_file, shift: int):
    """Add ``shift`` to the POS column of every record of a VCF.

    Can be run on each per-chromosome file independently before merging.

    Args:
        input_file: VCF/BCF file readable by ``bcftools view``.
        output_file: Path of the shifted VCF. The output is plain text,
            whatever suffix the path carries.
        shift: Integer offset added to column 2 of every non-header line.

    Raises:
        subprocess.CalledProcessError: if ``bcftools view`` or ``awk`` exits
            with a non-zero status.
    """
    # Header lines ('#...') pass through untouched; data lines get the offset
    # added to column 2 and are re-joined with tabs.
    awk_program = (
        'BEGIN{OFS="\\t"} '
        '/^#/ {print; next} '
        '{ $2 = $2 + offset; print }'
    )
    with open(output_file, 'w') as out_f:
        # Two processes connected by a pipe instead of a shell string, so
        # paths need no quoting and both exit codes can be checked.
        view = subprocess.Popen(['bcftools', 'view', input_file], stdout=subprocess.PIPE)
        try:
            subprocess.run(
                ['awk', '-v', f'offset={int(shift)}', awk_program],
                stdin=view.stdout,
                stdout=out_f,
                check=True,
            )
        finally:
            view.stdout.close()
            view_status = view.wait()
    if view_status != 0:
        raise subprocess.CalledProcessError(view_status, view.args)

def rename_chr(input_file, output_file, change_names):
    """Rename contigs of a VCF with ``bcftools annotate --rename-chrs``.

    Args:
        input_file: VCF/BCF file to rename.
        output_file: Path of the renamed VCF (written uncompressed).
        change_names: Text file with the old name in the first column and
            the new name in the second, separated by a space.

    Raises:
        subprocess.CalledProcessError: if bcftools exits with a non-zero status.
    """
    # '-O v' pins the uncompressed-VCF output that the former '> file'
    # redirection produced, independent of the output file's suffix.
    subprocess.run(
        [
            'bcftools', 'annotate',
            '--rename-chrs', change_names,
            '-O', 'v',
            '-o', output_file,
            input_file,
        ],
        check=True,
    )

def concat_vcf(output_file, vcf_file_1, vcf_file_2, *more_vcf_files):
    """Concatenate VCF files with ``bcftools concat --naive``.

    Args:
        output_file: Path of the concatenated file.
        vcf_file_1: First input file.
        vcf_file_2: Second input file.
        *more_vcf_files: Optional further inputs, appended in the given order
            (for merging more than two chromosomes).

    Raises:
        subprocess.CalledProcessError: if bcftools exits with a non-zero status.
    """
    # Argument list without a shell: no quoting problems, and a failed
    # concat raises instead of leaving an empty output file unnoticed.
    subprocess.run(
        [
            'bcftools', 'concat', '--naive',
            '-o', output_file,
            vcf_file_1, vcf_file_2, *more_vcf_files,
        ],
        check=True,
    )

In [122]:
# Sort the input VCF; the bgzip-compressed result is written relative to the working directory.
sort_vcf('/Users/kristina/Desktop/internship_biotech/py_ped_sim/2_chr_first_family.gz', '2_chr_first_family_sorted.gz')

Writing to /tmp/bcftools-sort.jdAhLs
Merging 6 temporary files
Cleaning
Done


In [123]:
# Shift every position by 248956422, the chr1 length printed by get_length above.
# NOTE(review): the sort cell wrote '2_chr_first_family_sorted.gz' to the working directory, while this
# cell reads it from multiple_chr_simulation_files/ — confirm the file is in the expected place.
shift_positions_one_chr('/Users/kristina/Desktop/internship_biotech/py_ped_sim/multiple_chr_simulation_files/2_chr_first_family_sorted.gz', '/Users/kristina/Desktop/internship_biotech/py_ped_sim/multiple_chr_simulation_files/2_chr_first_family_sorted_shifted.gz', 248956422)

In [125]:
# Compress the shifted VCF; the following cells refer to the result as '..._sorted_shifted.gz.gz'.
bgzip_vcf('/Users/kristina/Desktop/internship_biotech/py_ped_sim/multiple_chr_simulation_files/2_chr_first_family_sorted_shifted.gz')

In [133]:
# Rename the contigs as listed in change_chr_names.txt; the output path has no file extension.
rename_chr('/Users/kristina/Desktop/internship_biotech/py_ped_sim/multiple_chr_simulation_files/2_chr_first_family_sorted_shifted.gz.gz', '/Users/kristina/Desktop/internship_biotech/py_ped_sim/multiple_chr_simulation_files/2_chr_first_family_sorted_shifted_renamed', change_names='/Users/kristina/Desktop/internship_biotech/py_ped_sim/multiple_chr_simulation_files/change_chr_names.txt')

In [134]:
# Compress the renamed VCF produced by the previous cell.
bgzip_vcf('/Users/kristina/Desktop/internship_biotech/py_ped_sim/multiple_chr_simulation_files/2_chr_first_family_sorted_shifted_renamed')

In [136]:
# Concatenate first_family_biall3_sorted.vcf.gz with the shifted file from before the rename step
# ('..._sorted_shifted.gz.gz'), hence the 'not_renamed' output name.
concat_vcf('chr1_chr2_concat_not_renamed.vcf.gz', '/Users/kristina/Desktop/internship_biotech/py_ped_sim/multiple_chr_simulation_files/first_family_biall3_sorted.vcf.gz', '/Users/kristina/Desktop/internship_biotech/py_ped_sim/multiple_chr_simulation_files/2_chr_first_family_sorted_shifted.gz.gz')

## 3. Change positions in your recombination map

In this step you need to shift the positions inside your recombination map, add the first line '1   0.0', and add a recombination point between the two chromosomes with a recombination rate of 0.5 (to indicate that these parts are inherited independently). Also ensure that the positions are of int type, the rates are of float type, and the positions are in ascending order. Unfortunately, SLiM uses the same recombination rates for both sexes (or maybe I missed something), so I will use the recombination map for females from this source: https://www.nature.com/articles/s41586-024-08450-5

chr1 400
chr2 500 #for length of the first chromosome in the 1-based system
chr3 600
chr4 700

In [119]:
def shift_recomb(recomb_map, lengths: list, chromosomes: list,
                 output_file='recomb_map_shifted.tsv', skiprows=10):
    """Shift a per-chromosome recombination map onto one concatenated axis.

    Args:
        recomb_map: Tab-separated map with the columns 'Chr', 'pos' and
            'cMperMb', preceded by ``skiprows`` lines that are ignored.
        lengths: One entry per chromosome, in the same order as
            ``chromosomes``. Each entry is either a ``'name length'`` string
            or a ``(name, length)`` pair as returned by ``get_length``.
            Only the length is used; entries are matched by position.
        chromosomes: Chromosome names in concatenation order.
        output_file: Path of the shifted map. It is written tab-separated
            without header or index.
        skiprows: Number of leading lines of ``recomb_map`` to skip.

    Positions of each chromosome are increased by the summed lengths of the
    chromosomes before it. A first row ``(1, 0.0)`` is added, and at every
    chromosome boundary two rows with rate 0.5 are inserted at the boundary
    position and the position after it. Summary statistics of the result are
    printed.
    """
    columns = ['pos', 'cMperMb']
    df = pd.read_csv(recomb_map, skiprows=skiprows, sep='\t')
    # Copy the filtered rows so the position shift below modifies an
    # independent frame rather than a slice of the original.
    df = df[df['Chr'].isin(chromosomes)].copy()
    cur_shift = 0
    inter_chr_break = []

    for idx, chrom in enumerate(chromosomes):
        chrom_length = _chromosome_length(lengths[idx])
        mask = df['Chr'] == chrom
        # Offset by the total length of all preceding chromosomes.
        df.loc[mask, 'pos'] = df.loc[mask, 'pos'] + cur_shift
        cur_shift += chrom_length
        if idx < len(chromosomes) - 1:
            # Rate 0.5 at the junction marks adjacent chromosomes as independent.
            inter_chr_break.append([cur_shift, 0.5])
            inter_chr_break.append([cur_shift + 1, 0.5])

    frames = [pd.DataFrame([[1, 0.0]], columns=columns), df.loc[:, columns]]
    if inter_chr_break:
        # Positions are in the 1-based system.
        frames.append(pd.DataFrame(inter_chr_break, columns=columns))

    rec_pos_multiple_chr = pd.concat(frames, axis=0)
    rec_pos_multiple_chr['pos'] = rec_pos_multiple_chr['pos'].astype(int)
    rec_pos_multiple_chr['cMperMb'] = rec_pos_multiple_chr['cMperMb'].astype(float)
    # Stable sort keeps the row order deterministic when positions coincide.
    rec_pos_multiple_chr = rec_pos_multiple_chr.sort_values(by='pos', kind='mergesort')
    print(rec_pos_multiple_chr.describe())
    # No header and no index may be included in the output.
    rec_pos_multiple_chr.to_csv(output_file, sep='\t', header=False, index=False)


def _chromosome_length(entry):
    """Return the length from a 'name length' string or a (name, length) pair."""
    if isinstance(entry, str):
        return int(entry.split()[1])
    return int(entry[1])

In [60]:
# One 'name length' string per chromosome, in concatenation order; the lengths are the values printed by get_length above.
shifts = ['chr1 248956422', 'chr2 242193529']

In [120]:
# Shift the map for chr1 and chr2; the result is written to recomb_map_shifted.tsv in the working directory.
shift_recomb('/Users/kristina/Desktop/internship_biotech/py_ped_sim/multiple_chr_simulation_files/maps/maps.mat.tsv', shifts, ['chr1', 'chr2'])

                pos     cMperMb
count  4.940000e+02  494.000000
mean   2.449957e+08    1.330144
std    1.418634e+08    0.787715
min    1.000000e+00    0.000000
25%    1.227500e+08    0.778082
50%    2.460000e+08    1.198018
75%    3.672064e+08    1.859779
max    4.904564e+08    3.839971


## 4. Run simulation in the same way as for one chromosome

Filter your vcf file to get only biallelic SNPs

python py_ped_sim_init/run_ped_sim.py -t filter_vcf -v vcf_file.vcf.gz

Better do it in bash. You must have your simulated pedigree structure and at least one parent for each descendant except for founders. Example code:

python py_ped_sim_init/run_ped_sim.py -t sim_genomes_exact -n gen_ped.nx -e founders.txt -v first_family_chr1_chr2_sorted_concatenated_chr2_shifted_renamed_slim_fil.vcf.gz -f combined_chr1_chr2.fa -rm recomb_pos_mat_chr1_chr2.tsv -o first_family_simul_two_chr

python run_ped_sim.py -t sim_genomes_exact -n multiple_chr_simulation_files/gen_ped.nx -e multiple_chr_simulation_files/founders.txt -v multiple_chr_simulation_files/chr1_chr2_concat.vcf.gz -f multiple_chr_simulation_files/combined_chr1_chr2_test_concatenated_chromosomes.fa -rm /Users/kristina/Desktop/internship_biotech/py_ped_sim/multiple_chr_simulation_files/recomb_map_shifted.tsv -o first_family_simul_two_chr_test

## 5. Change chromosomes names for SNP positions in a vcf file with simulated SNPs 

Run this command in bash to rename chr:

bcftools view '/Users/kristina/Desktop/internship_biotech/py_ped_sim/multiple_chr_simulation_files/first_family_simul_two_chr_genomes.vcf'| awk -v splitpos=248956422 -v oldchr="chr1" -v newchr="chr2" 'BEGIN {OFS="\t"} /^##contig=/ {
    if ($0 ~ "ID="oldchr","){
        print $0
        sub("ID="oldchr",","ID="newchr",",$0) 
        print $0 
        next
    }
} 
!/^#/ {
    if ($1 == oldchr && $2 > splitpos) {
        $1 = newchr
        }
        } {print}' > '/Users/kristina/Desktop/internship_biotech/py_ped_sim/multiple_chr_simulation_files/first_family_simul_two_chr_genomes_backshifted.vcf'

After chromosome renaming:

In [144]:
def shift_back_one_chr(input_file, chr_lengths,
                       output_file='multiple_chr_renamed_shifted_back.vcf'):
    """Subtract a chromosome's offset from the positions of its VCF records.

    Args:
        input_file: Plain-text VCF in which the chromosome has already been
            renamed.
        chr_lengths: Two-item sequence ``[chromosome_name, offset]``; the
            offset is subtracted from column 2 of every record whose first
            column equals the chromosome name.
        output_file: Path of the corrected VCF.

    Raises:
        subprocess.CalledProcessError: if awk exits with a non-zero status.

    Header lines are copied unchanged. Records of other chromosomes keep
    their position (an unknown key yields an offset of 0) but are still
    re-joined with tabs.
    """
    chrom, offset = chr_lengths[0], chr_lengths[1]
    awk_program = (
        'BEGIN{OFS="\\t"; offsets[chrom] = offset} '
        '!/^#/ {$2 = $2 - offsets[$1]} '
        '{ print }'
    )
    with open(output_file, 'w') as out_f:
        # Values are passed with -v and the path as its own argument, so no
        # shell is involved and nothing needs quoting.
        subprocess.run(
            [
                'awk',
                '-v', f'chrom={chrom}',
                '-v', f'offset={int(offset)}',
                awk_program,
                input_file,
            ],
            stdout=out_f,
            check=True,
        )

In [146]:
# Subtract the chr1 length (248956422) from the positions of the records renamed to chr2.
# NOTE(review): the bash command above writes '..._genomes_backshifted.vcf', but this cell reads
# '..._genomes_back_renamed.vcf' — confirm which file name is intended.
shift_back_one_chr('/Users/kristina/Desktop/internship_biotech/py_ped_sim/multiple_chr_simulation_files/first_family_simul_two_chr_genomes_back_renamed.vcf', ['chr2', 248956422])