### __Pipeline 2: SNP processing with random sampling of the alternative allele__

#### __Requirements__
- a python environment (installed with conda for example);
- .TSV files obtained with pipeline 1;
- python `subprocess`;
- python `Numpy`;
- python `Pandas`;

Load some python libraries

In [1]:
import numpy as np
import pandas as pd
from snp_utils import ramdomly_sample_haplotypes, filter_short_introns_from_bed, filter_snps_by_interval, check_snp_codon_length
from sfs_utils import create_unfolded_sfs_from_df, fold_sfs
from mutational_context_utils import *

#### __Import the tables into Pandas__

We are going to use pandas to import the SNP table. Pandas is a great (if used with caution) Python package built on NumPy that allows easy DataFrame manipulation.

In [7]:
# Move to the directory holding the per-chromosome SNP tables and list its contents.
# NOTE: the path is relative to the kernel's current working directory, so this
# cell only lands in the intended directory when run from the notebook's start directory.
%cd ../../dgrp2dm6/tables
!ls 

/Users/tur92196/WorkDir/data/dgrp2dm6/tables
chr2L_exons_downsampled.pkl         chr3R_short_introns_downsampled.pkl
chr2L_short_introns_downsampled.pkl dgrp2dm6_chr2L_rooted_ann_table.tsv
chr2R_exons_downsampled.pkl         dgrp2dm6_chr2R_rooted_ann_table.tsv
chr2R_short_introns_downsampled.pkl dgrp2dm6_chr3L_rooted_ann_table.tsv
chr3L_exons_downsampled.pkl         dgrp2dm6_chr3R_rooted_ann_table.tsv
chr3L_short_introns_downsampled.pkl dgrp2dm6_chr4_rooted_ann_table.tsv
chr3R_exons_downsampled.pkl         dgrp2dm6_chrX_rooted_ann_table.tsv


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


Load the `.tsv` file for each chromosome arm (except for __chr4__ and __chrX__).

In [9]:
# Load the per-arm SNP tables (tab-separated) with pd.read_table(), one DataFrame per arm
chr2L_table, chr2R_table, chr3L_table, chr3R_table = map(
    pd.read_table,
    (
        "dgrp2dm6_chr2L_rooted_ann_table.tsv",
        "dgrp2dm6_chr2R_rooted_ann_table.tsv",
        "dgrp2dm6_chr3L_rooted_ann_table.tsv",
        "dgrp2dm6_chr3R_rooted_ann_table.tsv",
    ),
)

Check the number of SNPs in each file:

In [10]:
# Report the number of SNPs (rows) loaded for each chromosome arm
"chr2L = {}, chr2R = {}, chr3L = {}, chr3R = {} SNPs!".format(
    *(len(table) for table in (chr2L_table, chr2R_table, chr3L_table, chr3R_table))
)

'chr2L = 910877, chr2R = 740993, chr3L = 896257, chr3R = 884024 SNPs!'

In [11]:
# Preview the first five rows of the chr2L table
chr2L_table.head(n=5)

Unnamed: 0,chrom,pos,id,ref,alt,refcount,altcount,refflank,altflank,refcodon,...,snpeff_trnscid,sift_trnscid,sift_geneid,sift_genename,sift_region,sift_vartype,sifts_core,sift_median,sift_pred,deleteriousness
0,chr2L,4998,2L_4998_SNP,G,A,117,5,TAATGACTG,TAATAACTG,,...,FBtr0475186,,,,,,,,,
1,chr2L,5002,2L_5002_SNP,G,T,127,1,GACTGCCTC,GACTTCCTC,,...,FBtr0475186,,,,,,,,,
2,chr2L,5039,2L_5039_SNP,C,T,1,118,AAATCGACA,AAATTAACA,,...,FBtr0475186,,,,,,,,,
3,chr2L,5040,2L_5040_SNP,G,A,1,118,AATCGACAA,AATTAACAA,,...,FBtr0475186,,,,,,,,,
4,chr2L,5092,2L_5092_SNP,C,T,6,119,TTTTCTCTC,TTTTTTCAC,,...,FBtr0475186,,,,,,,,,


#### __Random sampling of the alternative allele__

Get the total number of haplotypes for each SNP.

In [12]:
# Total number of haplotypes at each SNP = reference count + alternative count.
# The new column is added in place to every per-arm table.
for snp_table in (chr2L_table, chr2R_table, chr3L_table, chr3R_table):
    snp_table['totalcount'] = snp_table['refcount'].add(snp_table['altcount'])

In [13]:
# Preview chr2L again to confirm that the totalcount column was appended
chr2L_table.head(n=5)

Unnamed: 0,chrom,pos,id,ref,alt,refcount,altcount,refflank,altflank,refcodon,...,sift_trnscid,sift_geneid,sift_genename,sift_region,sift_vartype,sifts_core,sift_median,sift_pred,deleteriousness,totalcount
0,chr2L,4998,2L_4998_SNP,G,A,117,5,TAATGACTG,TAATAACTG,,...,,,,,,,,,,122
1,chr2L,5002,2L_5002_SNP,G,T,127,1,GACTGCCTC,GACTTCCTC,,...,,,,,,,,,,128
2,chr2L,5039,2L_5039_SNP,C,T,1,118,AAATCGACA,AAATTAACA,,...,,,,,,,,,,119
3,chr2L,5040,2L_5040_SNP,G,A,1,118,AATCGACAA,AATTAACAA,,...,,,,,,,,,,119
4,chr2L,5092,2L_5092_SNP,C,T,6,119,TTTTCTCTC,TTTTTTCAC,,...,,,,,,,,,,125


Remove SNPs whose `totalcount` is below 160. This sets a lower bound for the random sampling (the number of haplotypes that the allele counts are sampled down to).

In [14]:
# Lower bound for the number of haplotypes per SNP.
# Used below both as the filtering threshold on 'totalcount' and as the size
# argument passed to the sampling and SFS helper functions.
min_number_of_haplotypes = 160

In [15]:
# Keep only SNPs whose total haplotype count reaches the lower bound
chr2L_table, chr2R_table, chr3L_table, chr3R_table = (
    snp_table.loc[snp_table['totalcount'].ge(min_number_of_haplotypes)]
    for snp_table in (chr2L_table, chr2R_table, chr3L_table, chr3R_table)
)

In [16]:
# Report the number of SNPs left on each arm after the haplotype-count filter
"chr2L = {}, chr2R = {}, chr3L = {}, chr3R = {} SNPs!".format(
    *(len(table) for table in (chr2L_table, chr2R_table, chr3L_table, chr3R_table))
)

'chr2L = 887541, chr2R = 723473, chr3L = 877098, chr3R = 868475 SNPs!'

Now, randomly sample the alternative allele count. The function will update the input DataFrame with new values for the total, reference, and alternative allele counts.

In [17]:
# Randomly sample alternative allele counts and update the total counts in each table.
# NOTE(review): no random seed is set anywhere in this notebook, so the sampled
# counts are presumably not reproducible across runs; confirm how
# ramdomly_sample_haplotypes draws its random numbers.
chr2L_table_sampled, chr2R_table_sampled, chr3L_table_sampled, chr3R_table_sampled = (
    ramdomly_sample_haplotypes(snp_table, min_number_of_haplotypes)
    for snp_table in (chr2L_table, chr2R_table, chr3L_table, chr3R_table)
)

#### __Subset SNPs__

__Subset each chromosome to retain only:__
- `SNPs in short-introns`;
- `Synonymous SNPs`;
- `Non-synonymous SNPs`;

__(1) Subset to retain only SNPs annotated as introns__

In [18]:

# Keep only the SNPs whose 'effect' annotation is exactly "INTRON"
chr2L_introns, chr2R_introns, chr3L_introns, chr3R_introns = (
    sampled.loc[sampled['effect'].eq("INTRON")]
    for sampled in (chr2L_table_sampled, chr2R_table_sampled, chr3L_table_sampled, chr3R_table_sampled)
)


In [19]:
# Report the number of intronic SNPs retained on each arm
"chr2L = {}, chr2R = {}, chr3L = {}, chr3R = {} SNPs!".format(
    *(len(table) for table in (chr2L_introns, chr2R_introns, chr3L_introns, chr3R_introns))
)

'chr2L = 159682, chr2R = 122286, chr3L = 164146, chr3R = 184795 SNPs!'

__(2) Keep only short intronic SNPs__

For each chromosome, retain only SNPs in `short introns`. To do that, you need to identify the short introns in the Dm6 genome and trim the first and last 8 bp from each short intron, because these ends might be under selective constraint. Download the intron regions as a `.BED` file from [here](https://genome.ucsc.edu/cgi-bin/hgTables). Select the *D. melanogaster* assembly known as *Dm6* (Aug. 2014, BDGP Release 6 + ISO 1 MT/dm6). Define the region of interest as `genome`, select the output format as `BED`, and name the output file. On the next page, select only `Introns plus 0`, then hit `Get BED`. Now you are ready to go. The next step is to create a Pandas DataFrame with the short-intron intervals.

In [22]:
# Move to the sibling 'reference' directory (where the intron BED files live) and list it.
# NOTE: relative to the current working directory, i.e. assumes the kernel is
# still in the 'tables' directory entered earlier.
%cd ../reference/
!ls

/Users/tur92196/WorkDir/data/dgrp2dm6/reference
dm6.fa                              dmel-all-chromosome-r6.52.fasta.fai
dm6.fa.fai                          dmel-all-chromosome-r6.52.fasta.gz
dm6_introns.bed                     dmel-all-r6.52.gff
dm6_short_introns.bed               dmel-all-r6.52.gff.gz
dmel-all-chromosome-r6.52.fasta


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [23]:
# Build the short-intron interval table from the UCSC intron BED file, restricted
# to the four autosomal arms. The two keyword arguments carry the size settings
# (86 and 8) described in the text above.
# NOTE(review): the second path looks like an output BED written by the helper; confirm.
short_introns = filter_short_introns_from_bed(
    "dm6_introns.bed",
    "dm6_short_introns.bed",
    ["chr2L", "chr2R", "chr3L", "chr3R"],
    short_intron_size=86,
    trailling_size=8,
)
short_introns.head(n=5)

Unnamed: 0,0,1,2,3,4,5
0,chr2L,8387838,8387882,NM_001201797.2_intron_5_0_chr2L_8387832_f,0,+
1,chr2L,8387838,8387882,NM_001201795.2_intron_5_0_chr2L_8387832_f,0,+
2,chr2L,8387838,8387882,NM_164812.5_intron_4_0_chr2L_8387832_f,0,+
3,chr2L,8387838,8387882,NM_205936.3_intron_5_0_chr2L_8387832_f,0,+
4,chr2L,8387838,8387882,NM_205935.3_intron_5_0_chr2L_8387832_f,0,+


Now, apply the filter based on the BED file intervals (to make sure to only keep `short-intronic` SNPs)

In [24]:
# Keep only short-intron SNPs: filter each arm's intronic SNPs against the
# short-intron BED intervals.
chr2L_short_introns, chr2R_short_introns, chr3L_short_introns, chr3R_short_introns = (
    filter_snps_by_interval(intron_snps, short_introns)
    for intron_snps in (chr2L_introns, chr2R_introns, chr3L_introns, chr3R_introns)
)

Dump these DataFrames as pickle files to avoid re-running it:

In [28]:
# Step up one level to the dataset root and list it.
# NOTE: relative move; re-running this cell climbs one more directory each time.
%cd ..
!ls 

/Users/tur92196/WorkDir/data/dgrp2dm6
[1m[36mannotations[dont_use-this_is_old_script][m[m
dgrp2dm6_chr2L_rooted.vcf
dgrp2dm6_chr2R_rooted.vcf
dgrp2dm6_chr3L_rooted.vcf
dgrp2dm6_chr3R_rooted.vcf
dgrp2dm6_chr4_rooted.vcf
dgrp2dm6_chrX_rooted.vcf
[1m[36mintervals[m[m
[1m[36mreference[m[m
[1m[36msfss[m[m
[1m[36msift4g[m[m
[1m[36msnpeff[m[m
[1m[36mtables[m[m
vcfs


In [29]:
# Pickle chrm short-introns DataFrames
# chr2L_short_introns.to_pickle("tables/chr2L_short_introns_sampled.pkl")
# chr2R_short_introns.to_pickle("tables/chr2R_short_introns_sampled.pkl")
# chr3L_short_introns.to_pickle("tables/chr3L_short_introns_sampled.pkl")
# chr3R_short_introns.to_pickle("tables/chr3R_short_introns_sampled.pkl")

In [18]:
# Load pickled chrm short-introns DataFrames
# chr2L_short_introns = pd.read_pickle("tables/chr2L_short_introns_sampled.pkl")
# chr2R_short_introns = pd.read_pickle("tables/chr2R_short_introns_sampled.pkl")
# chr3L_short_introns = pd.read_pickle("tables/chr3L_short_introns_sampled.pkl")
# chr3R_short_introns = pd.read_pickle("tables/chr3R_short_introns_sampled.pkl")

Check how many short-introns SNPs were retained:

In [30]:
# Report the number of short-intron SNPs retained on each arm
"chr2L = {}, chr2R = {}, chr3L = {}, chr3R = {} SNPs!".format(
    *(len(table) for table in (chr2L_short_introns, chr2R_short_introns, chr3L_short_introns, chr3R_short_introns))
)

'chr2L = 1478, chr2R = 1318, chr3L = 1075, chr3R = 1477 SNPs!'

__(3) Take synonymous and non-synonymous SNPs__

In [31]:
# Keep the coding SNPs: those annotated as non-synonymous or synonymous
chr2L_exons, chr2R_exons, chr3L_exons, chr3R_exons = (
    sampled.loc[sampled['effect'].isin(["NON_SYNONYMOUS_CODING", "SYNONYMOUS_CODING"])]
    for sampled in (chr2L_table_sampled, chr2R_table_sampled, chr3L_table_sampled, chr3R_table_sampled)
)

In [33]:
# Pickle chrm exons DataFrames
# chr2L_exons.to_pickle("tables/chr2L_exons_sampled.pkl")
# chr2R_exons.to_pickle("tables/chr2R_exons_sampled.pkl")
# chr3L_exons.to_pickle("tables/chr3L_exons_sampled.pkl")
# chr3R_exons.to_pickle("tables/chr3R_exons_sampled.pkl")

In [23]:
# Load pickled chrm exons DataFrames
# chr2L_exons = pd.read_pickle("tables/chr2L_exons_sampled.pkl")
# chr2R_exons = pd.read_pickle("tables/chr2R_exons_sampled.pkl")
# chr3L_exons = pd.read_pickle("tables/chr3L_exons_sampled.pkl")
# chr3R_exons = pd.read_pickle("tables/chr3R_exons_sampled.pkl")

Check how many synonymous and non-synonymous SNPs were retained:

In [34]:
# Report the number of coding (synonymous + non-synonymous) SNPs on each arm
"chr2L = {}, chr2R = {}, chr3L = {}, chr3R = {} SNPs!".format(
    *(len(table) for table in (chr2L_exons, chr2R_exons, chr3L_exons, chr3R_exons))
)

'chr2L = 118921, chr2R = 113115, chr3L = 113590, chr3R = 117815 SNPs!'

#### __Get the SFSs__

For introns:

In [39]:
# Unfolded SFS of the short-intron SNPs on each arm, built from the 'altcount'
# column with the lower-bound haplotype number as the size argument
chr2L_short_introns_sfs, chr2R_short_introns_sfs, chr3L_short_introns_sfs, chr3R_short_introns_sfs = (
    create_unfolded_sfs_from_df(snps, "altcount", min_number_of_haplotypes)
    for snps in (chr2L_short_introns, chr2R_short_introns, chr3L_short_introns, chr3R_short_introns)
)

Then combine the short-intron SFSs of all chromosomes:

In [40]:
# Stack the four per-arm SFSs (one row per arm) ...
short_introns_sfs_array = np.array(
    [chr2L_short_introns_sfs, chr2R_short_introns_sfs, chr3L_short_introns_sfs, chr3R_short_introns_sfs]
)

# ... and add them element-wise to obtain the genome-wide short-intron SFS
short_introns_sfs = short_introns_sfs_array.sum(axis=0).tolist()

For the synonymous SNPs

In [41]:
# Synonymous SNPs of each arm
chr2L_synonymous, chr2R_synonymous, chr3L_synonymous, chr3R_synonymous = (
    exons.loc[exons['effect'].eq("SYNONYMOUS_CODING")]
    for exons in (chr2L_exons, chr2R_exons, chr3L_exons, chr3R_exons)
)

# One unfolded SFS per arm (rows), built from the 'altcount' column
synonymous_sfs_array = np.array([
    create_unfolded_sfs_from_df(snps, "altcount", min_number_of_haplotypes)
    for snps in (chr2L_synonymous, chr2R_synonymous, chr3L_synonymous, chr3R_synonymous)
])

# Element-wise sum over the arms gives the combined synonymous SFS
synonymous_sfs = synonymous_sfs_array.sum(axis=0).tolist()

And for non-synonymous ones:

In [42]:
# Non-synonymous SNPs of each arm
chr2L_nonsynonymous, chr2R_nonsynonymous, chr3L_nonsynonymous, chr3R_nonsynonymous = (
    exons.loc[exons['effect'].eq("NON_SYNONYMOUS_CODING")]
    for exons in (chr2L_exons, chr2R_exons, chr3L_exons, chr3R_exons)
)

# One unfolded SFS per arm (rows), built from the 'altcount' column
nonsynonymous_sfs_array = np.array([
    create_unfolded_sfs_from_df(snps, "altcount", min_number_of_haplotypes)
    for snps in (chr2L_nonsynonymous, chr2R_nonsynonymous, chr3L_nonsynonymous, chr3R_nonsynonymous)
])

# Element-wise sum over the arms gives the combined non-synonymous SFS
nonsynonymous_sfs = nonsynonymous_sfs_array.sum(axis=0).tolist()

#### __Fold the SFSs__

In [43]:
# Fold each of the three combined SFSs with fold_sfs
short_introns_sfs_folded, synonymous_sfs_folded, nonsynonymous_sfs_folded = map(
    fold_sfs,
    (short_introns_sfs, synonymous_sfs, nonsynonymous_sfs),
)

Save the three SFSs to a file:

In [46]:
# Enter the SFS output directory and list it.
# NOTE: relative to the current working directory, i.e. assumes the kernel is
# at the dataset root; the output paths used below are relative to this directory.
%cd sfss/
!ls 

/Users/tur92196/WorkDir/data/dgrp2dm6/sfss
[1m[36mno-pairing[m[m [1m[36mpaired[m[m


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [47]:
# Destination of the unpaired folded SFSs (relative to the current working directory)
output_sfs_file = "no-pairing/dgrp2_sfs_si_nopairing_sampled_folded.txt"

# File layout: one header line, then the short-intron, synonymous and
# non-synonymous SFSs as tab-separated lines with a blank line between them.
with open(output_sfs_file, "w") as of:
    of.write("Introns, synonymous, and nonsynonymous SFS of " + str(min_number_of_haplotypes) + " samples" + "\n")
    of.write(
        "\n\n".join(
            "\t".join(map(str, sfs))
            for sfs in (short_introns_sfs_folded, synonymous_sfs_folded, nonsynonymous_sfs_folded)
        )
        + "\n"
    )

#### __Pair neutral and non-neutral SNPs by mutational context__

Now that we have a simple way to get the SFSs, we are going to add some complexity and pair neutral and non-neutral SNPs by their mutational context.

In [48]:
# List the current working directory (the SFS output directory entered earlier)
!ls

[1m[36mno-pairing[m[m [1m[36mpaired[m[m


Remove SNPs that are close to each other, as they might cause ambiguity on the sequence category.

For Short-introns:

In [49]:
# Flag each short-intron SNP: True for SNPs far apart from their neighbours and
# False otherwise, as returned by find_consecutive_positions.
# The flag is stored in place as the third column (zero-based index 2).
# DataFrame.insert raises ValueError when the column already exists, so frames
# that were already flagged are skipped; this keeps the cell safe to re-run.
for snp_table in (chr2L_short_introns, chr2R_short_introns, chr3L_short_introns, chr3R_short_introns):
    if "pos_to_keep" not in snp_table.columns:
        snp_table.insert(2, "pos_to_keep", find_consecutive_positions(list(snp_table["pos"])))

In [50]:
# Drop the short-intron SNPs that were not flagged to keep
chr2L_short_introns, chr2R_short_introns, chr3L_short_introns, chr3R_short_introns = (
    snps.loc[snps['pos_to_keep'].eq(True)]
    for snps in (chr2L_short_introns, chr2R_short_introns, chr3L_short_introns, chr3R_short_introns)
)

In [51]:
# Report the number of short-intron SNPs left after removing close neighbours
"chr2L = {}, chr2R = {}, chr3L = {}, chr3R = {} SNPs!".format(
    *(len(table) for table in (chr2L_short_introns, chr2R_short_introns, chr3L_short_introns, chr3R_short_introns))
)

'chr2L = 1196, chr2R = 1077, chr3L = 891, chr3R = 1220 SNPs!'

For Exons:

In [52]:
# Flag each coding SNP: True for SNPs far apart from their neighbours and
# False otherwise, as returned by find_consecutive_positions.
# The flag is stored in place as the third column (zero-based index 2).
# DataFrame.insert raises ValueError when the column already exists, so frames
# that were already flagged are skipped; this keeps the cell safe to re-run.
for snp_table in (chr2L_exons, chr2R_exons, chr3L_exons, chr3R_exons):
    if "pos_to_keep" not in snp_table.columns:
        snp_table.insert(2, "pos_to_keep", find_consecutive_positions(list(snp_table["pos"])))

In [53]:
# Drop the coding SNPs that were not flagged to keep
chr2L_exons, chr2R_exons, chr3L_exons, chr3R_exons = (
    snps.loc[snps['pos_to_keep'].eq(True)]
    for snps in (chr2L_exons, chr2R_exons, chr3L_exons, chr3R_exons)
)

In [54]:
# Report the number of coding SNPs left after removing close neighbours
"chr2L = {}, chr2R = {}, chr3L = {}, chr3R = {} SNPs!".format(
    *(len(table) for table in (chr2L_exons, chr2R_exons, chr3L_exons, chr3R_exons))
)

'chr2L = 111133, chr2R = 105929, chr3L = 106496, chr3R = 110931 SNPs!'

##### Add the sequence category for each SNP

In [55]:
# Annotate every SNP with its sequence category (computed by
# set_snp_sequence_category), stored in place as the ninth column
# (zero-based index 8).
# DataFrame.insert raises ValueError when the column already exists, so frames
# that were already annotated are skipped; this keeps the cell safe to re-run.
for snp_table in (
    # For introns:
    chr2L_short_introns,
    chr2R_short_introns,
    chr3L_short_introns,
    chr3R_short_introns,
    # For exons:
    chr2L_exons,
    chr2R_exons,
    chr3L_exons,
    chr3R_exons,
):
    if "sequence_category" not in snp_table.columns:
        snp_table.insert(8, "sequence_category", set_snp_sequence_category(snp_table))

##### Create the sequence category dict

In [56]:
# Build the sequence-categories dictionary and print it for inspection.
# As the printed output shows, it maps an integer category ID to a list of two
# context strings (e.g. 1 -> ['AA/CA', 'AC/AA']).
sequence_categories_dict = create_sequence_categories_dict()
print(sequence_categories_dict)

{1: ['AA/CA', 'AC/AA'], 2: ['CA/CA', 'CC/AA'], 3: ['GA/CA', 'GC/AA'], 4: ['TA/CA', 'TC/AA'], 5: ['AA/CC', 'AC/AC'], 6: ['CA/CC', 'CC/AC'], 7: ['GA/CC', 'GC/AC'], 8: ['TA/CC', 'TC/AC'], 9: ['AA/CG', 'AC/AG'], 10: ['CA/CG', 'CC/AG'], 11: ['GA/CG', 'GC/AG'], 12: ['TA/CG', 'TC/AG'], 13: ['AA/CT', 'AC/AT'], 14: ['CA/CT', 'CC/AT'], 15: ['GA/CT', 'GC/AT'], 16: ['TA/CT', 'TC/AT'], 17: ['AA/GA', 'AG/AA'], 18: ['CA/GA', 'CG/AA'], 19: ['GA/GA', 'GG/AA'], 20: ['TA/GA', 'TG/AA'], 21: ['AA/GC', 'AG/AC'], 22: ['CA/GC', 'CG/AC'], 23: ['GA/GC', 'GG/AC'], 24: ['TA/GC', 'TG/AC'], 25: ['AA/GG', 'AG/AG'], 26: ['CA/GG', 'CG/AG'], 27: ['GA/GG', 'GG/AG'], 28: ['TA/GG', 'TG/AG'], 29: ['AA/GT', 'AG/AT'], 30: ['CA/GT', 'CG/AT'], 31: ['GA/GT', 'GG/AT'], 32: ['TA/GT', 'TG/AT'], 33: ['AA/TA', 'AT/AA'], 34: ['CA/TA', 'CT/AA'], 35: ['GA/TA', 'GT/AA'], 36: ['TA/TA', 'TT/AA'], 37: ['AA/TC', 'AT/AC'], 38: ['CA/TC', 'CT/AC'], 39: ['GA/TC', 'GT/AC'], 40: ['TA/TC', 'TT/AC'], 41: ['AA/TG', 'AT/AG'], 42: ['CA/TG', 'CT/AG'], 

In [57]:
# Build the per-category SNP dictionaries for each chromosome arm.
# Short introns: the wrapper is called with "introns" and its result is kept as one object per arm.
(
    chr2L_short_introns_seqclasses,
    chr2R_short_introns_seqclasses,
    chr3L_short_introns_seqclasses,
    chr3R_short_introns_seqclasses,
) = (
    create_snp_dict_wrapper(snps, sequence_categories_dict, "introns")
    for snps in (chr2L_short_introns, chr2R_short_introns, chr3L_short_introns, chr3R_short_introns)
)

# Exons: the wrapper is called with "exons" and its result is unpacked into
# a (non-synonymous, synonymous) pair per arm.
(
    (chr2L_nonsyns_seqclasses, chr2L_syns_seqclasses),
    (chr2R_nonsyns_seqclasses, chr2R_syns_seqclasses),
    (chr3L_nonsyns_seqclasses, chr3L_syns_seqclasses),
    (chr3R_nonsyns_seqclasses, chr3R_syns_seqclasses),
) = (
    create_snp_dict_wrapper(snps, sequence_categories_dict, "exons")
    for snps in (chr2L_exons, chr2R_exons, chr3L_exons, chr3R_exons)
)


##### Find mutational context pairs and get the SFSs
Now, find the pairs and obtain the SFSs for each member of the pair. These are the valid pairs we are looking to have:
- `short-intron SNP` and `non-synonymous SNPs`;
- `short-intron SNP` and `synonymous SNPs`;
- `synonymous SNP` and `non-synonymous SNPs` (maybe?)

For `short-intron SNP` and `non-synonymous SNPs`

In [58]:
# Pair short-intron SNPs with the closest non-synonymous SNPs, arm by arm.
# Each call's result is unpacked into (short-intron members, non-synonymous members).
(
    (chr2L_si_nonsyn_pairs_si, chr2L_si_nonsyn_pairs_nonsyn),
    (chr2R_si_nonsyn_pairs_si, chr2R_si_nonsyn_pairs_nonsyn),
    (chr3L_si_nonsyn_pairs_si, chr3L_si_nonsyn_pairs_nonsyn),
    (chr3R_si_nonsyn_pairs_si, chr3R_si_nonsyn_pairs_nonsyn),
) = (
    find_closest_snp_pairs(si_classes, nonsyn_classes)
    for si_classes, nonsyn_classes in (
        (chr2L_short_introns_seqclasses, chr2L_nonsyns_seqclasses),
        (chr2R_short_introns_seqclasses, chr2R_nonsyns_seqclasses),
        (chr3L_short_introns_seqclasses, chr3L_nonsyns_seqclasses),
        (chr3R_short_introns_seqclasses, chr3R_nonsyns_seqclasses),
    )
)

In [59]:
# SFS of the short-intron member of each (short-intron, non-synonymous) pair.
# Remember: the data is now sampled, so the sample size passed to the helper is
# the lower bound set earlier (min_number_of_haplotypes).
# The "imputed" argument is used because the data was sampled; folded=True is
# passed so the helper's result is kept under *_folded names.
short_introns_paired_nonsyn_sfs_seqclasses_array = np.array([
    create_unfolded_sfs_from_snp_dict(pairs, "imputed", min_number_of_haplotypes, folded=True)
    for pairs in (
        chr2L_si_nonsyn_pairs_si,
        chr2R_si_nonsyn_pairs_si,
        chr3L_si_nonsyn_pairs_si,
        chr3R_si_nonsyn_pairs_si,
    )
])

# Element-wise sum over the four arms
short_introns_paired_nonsyn_seqclasses_sfs_folded = short_introns_paired_nonsyn_sfs_seqclasses_array.sum(axis=0).tolist()

# Same for the non-synonymous member of each pair
nonsynonymous_paired_si_sfs_seqclasses_array = np.array([
    create_unfolded_sfs_from_snp_dict(pairs, "imputed", min_number_of_haplotypes, folded=True)
    for pairs in (
        chr2L_si_nonsyn_pairs_nonsyn,
        chr2R_si_nonsyn_pairs_nonsyn,
        chr3L_si_nonsyn_pairs_nonsyn,
        chr3R_si_nonsyn_pairs_nonsyn,
    )
])

nonsynonymous_paired_si_seqclasses_sfs_folded = nonsynonymous_paired_si_sfs_seqclasses_array.sum(axis=0).tolist()

Working with imputed data...
Working with imputed data...
Working with imputed data...
Working with imputed data...
Working with imputed data...
Working with imputed data...
Working with imputed data...
Working with imputed data...


In [60]:
# Save the paired short-intron / non-synonymous SFSs
# (path is relative to the current working directory)
si_nonsyn_pair_output_sfs_file = "paired/dgrp2_sfs_si_paired_with_nonsynynous_sampled_folded.txt"

# File layout: one header line, then the two SFSs as tab-separated lines
# separated by a blank line.
with open(si_nonsyn_pair_output_sfs_file, "w") as of:
    of.write("Introns and nonsynonymous SFS of " + str(min_number_of_haplotypes) + " samples" + "\n")
    of.write(
        "\n\n".join(
            "\t".join(map(str, sfs))
            for sfs in (
                short_introns_paired_nonsyn_seqclasses_sfs_folded,
                nonsynonymous_paired_si_seqclasses_sfs_folded,
            )
        )
        + "\n"
    )

For `short-intron SNP` and `synonymous SNPs`

In [61]:
# Pair short-intron SNPs with the closest synonymous SNPs, arm by arm.
# Each call's result is unpacked into (short-intron members, synonymous members).
(
    (chr2L_si_syn_pairs_si, chr2L_si_syn_pairs_syn),
    (chr2R_si_syn_pairs_si, chr2R_si_syn_pairs_syn),
    (chr3L_si_syn_pairs_si, chr3L_si_syn_pairs_syn),
    (chr3R_si_syn_pairs_si, chr3R_si_syn_pairs_syn),
) = (
    find_closest_snp_pairs(si_classes, syn_classes)
    for si_classes, syn_classes in (
        (chr2L_short_introns_seqclasses, chr2L_syns_seqclasses),
        (chr2R_short_introns_seqclasses, chr2R_syns_seqclasses),
        (chr3L_short_introns_seqclasses, chr3L_syns_seqclasses),
        (chr3R_short_introns_seqclasses, chr3R_syns_seqclasses),
    )
)

In [62]:
# SFS of the short-intron member of each (short-intron, synonymous) pair,
# one row per arm, requested with folded=True
short_introns_paired_syn_sfs_seqclasses_array = np.array([
    create_unfolded_sfs_from_snp_dict(pairs, "imputed", min_number_of_haplotypes, folded=True)
    for pairs in (
        chr2L_si_syn_pairs_si,
        chr2R_si_syn_pairs_si,
        chr3L_si_syn_pairs_si,
        chr3R_si_syn_pairs_si,
    )
])

# Element-wise sum over the four arms
short_introns_paired_syn_seqclasses_sfs_folded = short_introns_paired_syn_sfs_seqclasses_array.sum(axis=0).tolist()

# Same for the synonymous member of each pair
synonymous_paired_si_sfs_seqclasses_array = np.array([
    create_unfolded_sfs_from_snp_dict(pairs, "imputed", min_number_of_haplotypes, folded=True)
    for pairs in (
        chr2L_si_syn_pairs_syn,
        chr2R_si_syn_pairs_syn,
        chr3L_si_syn_pairs_syn,
        chr3R_si_syn_pairs_syn,
    )
])

synonymous_paired_si_seqclasses_sfs_folded = synonymous_paired_si_sfs_seqclasses_array.sum(axis=0).tolist()

Working with imputed data...
Working with imputed data...
Working with imputed data...
Working with imputed data...
Working with imputed data...
Working with imputed data...
Working with imputed data...
Working with imputed data...


In [63]:
# Save the paired short-intron / synonymous SFSs
# (path is relative to the current working directory)
si_syn_pair_output_sfs_file = "paired/dgrp2_sfs_si_paired_with_synynous_sampled_folded.txt"

# File layout: one header line, then the two SFSs as tab-separated lines
# separated by a blank line.
with open(si_syn_pair_output_sfs_file, "w") as of:
    of.write("Introns and synonymous SFS of " + str(min_number_of_haplotypes) + " samples" + "\n")
    of.write(
        "\n\n".join(
            "\t".join(map(str, sfs))
            for sfs in (
                short_introns_paired_syn_seqclasses_sfs_folded,
                synonymous_paired_si_seqclasses_sfs_folded,
            )
        )
        + "\n"
    )

For `synonymous SNP` and `non-synonymous SNPs`

In [64]:
# Pair synonymous SNPs with the closest non-synonymous SNPs, arm by arm.
# Each call's result is unpacked into (synonymous members, non-synonymous members).
(
    (chr2L_syn_nonsyn_pairs_syn, chr2L_syn_nonsyn_pairs_nonsyn),
    (chr2R_syn_nonsyn_pairs_syn, chr2R_syn_nonsyn_pairs_nonsyn),
    (chr3L_syn_nonsyn_pairs_syn, chr3L_syn_nonsyn_pairs_nonsyn),
    (chr3R_syn_nonsyn_pairs_syn, chr3R_syn_nonsyn_pairs_nonsyn),
) = (
    find_closest_snp_pairs(syn_classes, nonsyn_classes)
    for syn_classes, nonsyn_classes in (
        (chr2L_syns_seqclasses, chr2L_nonsyns_seqclasses),
        (chr2R_syns_seqclasses, chr2R_nonsyns_seqclasses),
        (chr3L_syns_seqclasses, chr3L_nonsyns_seqclasses),
        (chr3R_syns_seqclasses, chr3R_nonsyns_seqclasses),
    )
)


In [65]:
# SFS of the synonymous member of each (synonymous, non-synonymous) pair,
# one row per arm, requested with folded=True
synonymous_paired_nonsyn_sfs_seqclasses_array = np.array([
    create_unfolded_sfs_from_snp_dict(pairs, "imputed", min_number_of_haplotypes, folded=True)
    for pairs in (
        chr2L_syn_nonsyn_pairs_syn,
        chr2R_syn_nonsyn_pairs_syn,
        chr3L_syn_nonsyn_pairs_syn,
        chr3R_syn_nonsyn_pairs_syn,
    )
])

# Element-wise sum over the four arms
synonymous_paired_nonsyn_seqclasses_sfs_folded = synonymous_paired_nonsyn_sfs_seqclasses_array.sum(axis=0).tolist()

# Same for the non-synonymous member of each pair
nonsynonymous_paired_syn_sfs_seqclasses_array = np.array([
    create_unfolded_sfs_from_snp_dict(pairs, "imputed", min_number_of_haplotypes, folded=True)
    for pairs in (
        chr2L_syn_nonsyn_pairs_nonsyn,
        chr2R_syn_nonsyn_pairs_nonsyn,
        chr3L_syn_nonsyn_pairs_nonsyn,
        chr3R_syn_nonsyn_pairs_nonsyn,
    )
])

nonsynonymous_paired_syn_seqclasses_sfs_folded = nonsynonymous_paired_syn_sfs_seqclasses_array.sum(axis=0).tolist()

Working with imputed data...
Working with imputed data...
Working with imputed data...
Working with imputed data...
Working with imputed data...
Working with imputed data...
Working with imputed data...
Working with imputed data...


In [66]:
# Save the paired synonymous / non-synonymous SFSs
# (path is relative to the current working directory)
syn_nonsyn_pair_output_sfs_file = "paired/dgrp2_sfs_synonymous_paired_with_nonsynynous_sampled_folded.txt"

# File layout: one header line, then the two SFSs as tab-separated lines
# separated by a blank line.
with open(syn_nonsyn_pair_output_sfs_file, "w") as of:
    of.write("Synonymous and nonsynonymous SFS of " + str(min_number_of_haplotypes) + " samples" + "\n")
    of.write(
        "\n\n".join(
            "\t".join(map(str, sfs))
            for sfs in (
                synonymous_paired_nonsyn_seqclasses_sfs_folded,
                nonsynonymous_paired_syn_seqclasses_sfs_folded,
            )
        )
        + "\n"
    )