In [1]:
import Bio
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord as SR
from Bio.Blast import NCBIXML
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
from Bio import AlignIO
from Bio.Align import AlignInfo
import numpy as np
import pandas as pd
from pandas import DataFrame as df
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import subprocess
import os
from mothur_py import Mothur
from shutil import copy
import random
import warnings
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
# from ipysankeywidget import SankeyWidget
warnings.filterwarnings("ignore")

import re

_nsre = re.compile('([0-9]+)')


def natural_sort_key(s):
    """Return a key that sorts strings with embedded numbers in natural order.

    The string is split on runs of ASCII digits. Because the pattern has a
    capturing group, the split result always alternates text, digits, text, ...
    so every odd position is a digit run and every even position is text.

    Parameters
    ----------
    s : str
        The string to build a key for (e.g. ``"barcode10"``).

    Returns
    -------
    list
        Lower-cased text pieces at even positions and ``int`` values at odd
        positions, e.g. ``['barcode', 10, '']``.
    """
    # Decide by position rather than str.isdigit(): isdigit() is also true for
    # characters such as '²' that int() cannot parse, which made the key raise
    # ValueError and could mix int/str at the same position.
    return [int(piece) if position % 2 else piece.lower()
            for position, piece in enumerate(_nsre.split(s))]

In [None]:
# Parse the merged FASTQ for this barcode into a dict of SeqRecords keyed by read id.
# SeqIO.to_dict materialises every record, so the whole file is held in memory.
fastq_dict = SeqIO.to_dict(SeqIO.parse("../../analysis/Concatenated/20171103_FAH15473/barcode02/merged.fastq", "fastq"))

In [None]:
# Length of every read in the merged file, then a histogram of those lengths.
lengths = [len(record.seq) for record in fastq_dict.values()]
print("The number of reads in this file is", len(fastq_dict))

ax = sns.distplot(lengths, color="k", kde=False, bins=5000)
ax.set_xlim((250, 3500))
ax.set_title("Reads spread 20171103_FAH15473/barcode02", fontsize=15)
ax.set_xlabel("Length of read", fontsize=13)
ax.set_ylabel("Number of reads", fontsize=13)

plt.show()

In [None]:
# Keep only the reads whose length falls inside each amplicon's window.
# range() excludes its upper bound, so the windows are 2700-3199 and 900-1399.
frDNA_window = range(2700, 3200)
EF1a_window = range(900, 1400)

frDNA = {read_id: record for read_id, record in fastq_dict.items()
         if len(record.seq) in frDNA_window}
print("The number of reads between 2700 and 3200 bp in length is", len(frDNA))

EF1a = {read_id: record for read_id, record in fastq_dict.items()
        if len(record.seq) in EF1a_window}
print("The number of reads between 900 and 1400 bp in length is", len(EF1a))

In [None]:
# Write the reads that fall inside the frDNA length window to a new FASTQ file
# (the output format is "fastq", so quality scores are kept).
SeqIO.write(frDNA.values(), "../../analysis/Python_Processing/20171103_FAH15473/barcode02/barcode02_frDNA_clipped.fastq", "fastq")

In [None]:
# Write the reads that fall inside the EF1a length window to a new FASTQ file
# (the output format is "fastq", so quality scores are kept).
SeqIO.write(EF1a.values(), "../../analysis/Python_Processing/20171103_FAH15473/barcode02/barcode02_EF1a_clipped.fastq", "fastq")

In [None]:
# Take the first 500 reads (in dict insertion order) of each window as a small test set.
frDNA_extract = dict(list(frDNA.items())[:500])
EF1a_extract = dict(list(EF1a.items())[:500])

In [None]:
# Label each extracted record with the amplicon window it came from.
# SeqRecord.annotations is a dictionary; replacing it with a bare string (as
# this cell used to) breaks any Biopython code that treats it as a mapping,
# so the label is stored under a key instead.
# Note: these are the same SeqRecord objects held by frDNA / EF1a / fastq_dict,
# so the label is visible through those dicts as well.
for record in frDNA_extract.values():
    record.annotations['amplicon'] = 'frDNA'
for record in EF1a_extract.values():
    record.annotations['amplicon'] = 'EF1a'

In [None]:
# Merge both extracts into one dict; on a shared read id the EF1a entry wins.
combined_extract = {**frDNA_extract, **EF1a_extract}
for extract in (frDNA_extract, EF1a_extract, combined_extract):
    print(len(extract))

In [None]:
# Save the three test extracts (frDNA only, EF1a only, both combined) as FASTQ files.
SeqIO.write(frDNA_extract.values(), "../../analysis/Python_Processing/20171103_FAH15473/barcode02/barcode02_frDNA_extract_test.fastq", "fastq")
SeqIO.write(EF1a_extract.values(), "../../analysis/Python_Processing/20171103_FAH15473/barcode02/barcode02_EF1a_extract_test.fastq", "fastq")
SeqIO.write(combined_extract.values(), "../../analysis/Python_Processing/20171103_FAH15473/barcode02/barcode02_combined_extract_test.fastq", "fastq")

In [None]:
# Load the alignment results as header-less, tab-separated tables (columns are numbered 0..n).
# These .paf files are not created in this notebook; per the notes further down they come from minimap2.
# NOTE(review): engine='python' is presumably used to tolerate rows with differing field counts - confirm.
frDNA_paf = pd.read_csv("../../analysis/Python_Processing/20171103_FAH15473/barcode02/frDNA_clipped_test.paf", sep='\t', header=None, engine='python')
EF1a_paf = pd.read_csv("../../analysis/Python_Processing/20171103_FAH15473/barcode02/EF1a_clipped_test.paf", sep='\t', header=None, engine='python')
combined_paf = pd.read_csv("../../analysis/Python_Processing/20171103_FAH15473/barcode02/combined_test.paf", sep='\t', header=None, engine='python')

In [None]:
# Preview the first rows of the frDNA alignment table.
frDNA_paf.head()

In [None]:
# Preview the first rows of the combined alignment table.
combined_paf.head()

In [None]:
# Report the minimum of column 1 for each alignment table.
# (In the PAF format column 1 is the query sequence length - confirm against the minimap2 docs.)
for label, paf_table in (("frDNA", frDNA_paf), ("EF1a", EF1a_paf), ("combined", combined_paf)):
    print("min len of match for %s is" % label, paf_table[1].min())

In [None]:
# Count the distinct values in column 0 (used as the read id throughout this notebook) per table.
for label, paf_table in (("frDNA", frDNA_paf), ("EF1a", EF1a_paf), ("combined", combined_paf)):
    print('num matches with unique ids for %s is' % label, len(paf_table[0].unique()))

Compare above (using minimap2) with BLAST approach
    - BLAST may be too slow on a larger dataset


Check with other alignment programs, e.g. lastz and BLAT (look for further candidates)



# Testing of full size-clipped files for alignment via minimap2
 - For each of the frDNA_clipped and EF1a_clipped files created above:
     - Count the number of unique read ids in the resulting .paf file
     - Determine the percentage of reads in this length range that have a homology match, relative to the total number of reads in the corresponding clipped.fastq file

In [None]:
# Alignment table for the full frDNA length-clipped read set.
# Note: this reads the same file path as frDNA_paf in the earlier test cell.
frDNA_clipped_paf = pd.read_csv("../../analysis/Python_Processing/20171103_FAH15473/barcode02/frDNA_clipped_test.paf", sep='\t', header=None, engine='python')

In [None]:
# Alignment table for the full EF1a length-clipped read set.
# Note: this reads the same file path as EF1a_paf in the earlier test cell, but with the default parser engine.
EF1a_clipped_paf = pd.read_csv("../../analysis/Python_Processing/20171103_FAH15473/barcode02/EF1a_clipped_test.paf", sep='\t', header=None)

In [None]:
# Share of reads with at least one alignment row.
# NOTE(review): 167054 is presumably the read count of the frDNA clipped FASTQ - confirm.
frDNA_matched_reads = len(frDNA_clipped_paf[0].unique())
print('Total number of matches found for frDNA is', frDNA_matched_reads)
print('Percentage of matches in region =', "{:.3%}".format(frDNA_matched_reads / 167054))

In [None]:
# Share of EF1a-window reads with at least one alignment row, reported as a false-positive rate.
# NOTE(review): 192712 is presumably the read count of the EF1a clipped FASTQ - confirm.
EF1a_matched_reads = len(EF1a_clipped_paf[0].unique())
print('Total number of matches found for EF1a is', EF1a_matched_reads)
print('False positive percentage =', "{:.3%}".format(EF1a_matched_reads / 192712))

In [None]:
# Share of all reads with at least one alignment row.
# NOTE(review): 413127 is presumably the total read count of merged.fastq - confirm.
combined_matched_reads = len(combined_paf[0].unique())
print('Total number of matches found for the total of reads is', combined_matched_reads)
print('Percentage of matches overall =', "{:.3%}".format(combined_matched_reads / 413127))

~~Plot distribution of matching reads as above~~

~~Repeat minimap2 for above to see if fluctuations~~

Explore non-mapping 25%

Scaling - plots saved out, file for statistics (loop over later)

In [None]:
# combined_ids = []
# for key in combined_paf[0].unique():
#     combined_ids.append(key)
# Re-read the merged FASTQ, then keep only the records whose id appears in the combined PAF table.
combined_dict = SeqIO.to_dict(SeqIO.parse("../../analysis/Concatenated/20171103_FAH15473/barcode02/merged.fastq", "fastq"))
comb_dict = {read_id: combined_dict[read_id] for read_id in combined_paf[0].unique()}

In [None]:
# Read ids and lengths of the homology-matched reads, plus mean / standard deviation of the lengths.
comb_keys = list(comb_dict)
lengths = [len(comb_dict[read_id].seq) for read_id in comb_keys]

mean = np.mean(lengths)
std = np.std(lengths)
print(mean)
print(std)
    
# stats_dict = {'number of frDNA reads':len(lengths),'minimum read length':min(lengths),'maximum read length':max(lengths),'mean read length':"{:.0f}".format(np.mean(lengths)),'median read length':"{:.0f}".format(np.median(lengths))}
# stats = pd.DataFrame(stats_dict, index=['20171103_FAH15473/barcode02'])
        
              
# ax = sns.distplot(lengths, color="k", kde=False, bins=5000)
# ax.set(xlim=(250, 3500))
# ax.set_title("frDNA reads for 20171103_FAH15473/barcode02", fontsize=15)
# ax.set_xlabel("Length of read", fontsize=13)
# ax.set_ylabel("Number of reads", fontsize=13)
# plt.show()


# ax = sns.distplot(lengths, color="k", kde=False, bins=5000)
# ax.set(xlim=(2400, 3500))
# ax.set_title("frDNA reads for 20171103_FAH15473/barcode02", fontsize=15)
# ax.set_xlabel("Length of read", fontsize=13)
# ax.set_ylabel("Number of reads", fontsize=13)
# plt.show()


# display(stats)

In [None]:
# Reload the frDNA length-clipped reads written earlier and report how many there are.
fr_dict = SeqIO.to_dict(SeqIO.parse("../../analysis/Python_Processing/20171103_FAH15473/barcode02/barcode02_frDNA_clipped.fastq", 'fastq'))
print(len(fr_dict))

In [None]:
# non_dict = {}
# for key in fr_dict:
#     if key not in frDNA_clipped_paf[0].unique():
#         non_dict[key] = fr_dict[key]
# print(len(non_dict))

In [None]:
# lengths = []
# non_keys = []
# for key in non_dict:
#     lengths.append(len(non_dict[key].seq))
#     non_keys.append(key)

# non_dict_stats = {'number of reads':len(lengths),'minimum read length':min(lengths),'maximum read length':max(lengths),'mean read length':"{:.0f}".format(np.mean(lengths)),'median read length':"{:.0f}".format(np.median(lengths))}
# stats = pd.DataFrame(non_dict_stats, index=['20171103_FAH15473/barcode02'])
        
              
# ax = sns.distplot(lengths, color="k", kde=False)
# # ax.set(xlim=(250, 3500))
# ax.set_title("non-frDNA reads for 20171103_FAH15473/barcode02", fontsize=15)
# ax.set_xlabel("Length of read", fontsize=13)
# ax.set_ylabel("Number of reads", fontsize=13)
# plt.show()
# display(stats)

In [None]:
%%writefile ../summary_statistics.py

"""
The goal of this program is to examine the distribution of reads within
each file, and for the files generated after homology analysis.
The program will generate summary statistics for the result of the homology
analysis and save figures illustrating the read distribution for the
frDNA reads
"""

import Bio
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord as SR
from Bio.Blast import NCBIXML
import numpy as np
import pandas as pd
from pandas import DataFrame as df
import seaborn as sns
import matplotlib.pyplot as plt
import argparse


# ANSI colour escape codes.
# Note: the print calls later in this script embed the literal escape
# sequences rather than referring to these names.
WHITE='\033[1;37m'
BLUE='\033[0;34m'
RED='\033[0;31m'
NC='\033[0m'
GREEN='\033[0;32m'
PURPLE='\033[1;35m'

# Command line: [--verbose | --quiet] full_file input_folder output_folder
parser = argparse.ArgumentParser(description="""
The goal of this program is to examine the distribution of reads within
each file, and for the files generated after homology analysis.
The program will generate summary statistics for the result of the homology
analysis and save figures illustrating the read distribution for the
frDNA reads
""")
# --verbose and --quiet cannot be given together; only args.verbose is read in this script.
group = parser.add_mutually_exclusive_group()
group.add_argument("--verbose", "-v", "--v", action="store_true")
group.add_argument("--quiet", "-q", "--q", action="store_true")
parser.add_argument("full_file", help="The full, unfiltered file containing all reads for this barcode")
parser.add_argument("input_folder", help="The destination folder within which the .paf files are generated")
parser.add_argument("output_folder", help="The destination folder for any outputs from this script - including summary statistics file and plots")
args = parser.parse_args()

print('\033[0;35m'+'START'+'\033[1;37m')

# Normalise both folder arguments so that '/'.join([folder, name]) below never
# produces a doubled separator.
# The previous rsplit('/', 1)[-2] only gave the right answer when the argument
# ended in '/': without a trailing slash it returned the parent folder, and
# with no slash at all it raised IndexError.
output_folder = args.output_folder.rstrip('/')
input_folder = args.input_folder.rstrip('/')
if args.verbose:
    print('\033[0;31m' + "Input folder is " + input_folder + '\033[1;37m')
    print('\033[0;31m' + "Output folder is " + output_folder + '\033[1;37m')
    print('\033[0;34m' + "Loading " + args.full_file + '\033[1;37m')

# Read every record of the unfiltered FASTQ for this barcode, keyed by read id
full_file_dict = SeqIO.to_dict(SeqIO.parse(args.full_file, "fastq"))

if args.verbose:
    print('\033[0;34m' + "Loaded " + args.full_file + '\033[1;37m')

# Sequence length of each read, and the number of reads
full_lengths = [len(record.seq) for record in full_file_dict.values()]
full_lengths_len = len(full_file_dict)


# Histogram of all read lengths for this barcode
# (two peaks are expected - one for EF1a and one for frDNA)
sample_label = '/'.join(args.full_file.rsplit('/')[-3:-1])
full_spread_png = '/'.join([output_folder, 'full_read_spread.png'])

ax = sns.distplot(full_lengths, color="k", kde=False, bins=5000)
ax.set_xlim((250, 3500))
ax.set_title("Read spread for %s" % sample_label, fontsize=15)
ax.set_xlabel("Length of read", fontsize=13)
ax.set_ylabel("Number of reads", fontsize=13)

# Save the figure, then clear it so the next plot starts from an empty figure
figure1 = ax.get_figure()
figure1.savefig(full_spread_png)
figure1.clf()
if args.verbose:
    print('\033[0;32m' + "Full spread image file saved to " + full_spread_png + '\033[1;37m')
    print('\033[0;34m' + "Loading " + input_folder+"/combined_test.paf" + '\033[1;37m')

# Read the PAF file produced by the minimap2 homology filtering
paf_path = input_folder+"/combined_test.paf"
full_paf = pd.read_csv(paf_path, sep='\t', header=None, engine='python')
if args.verbose:
    print('\033[0;34m' + "Loaded " + paf_path + '\033[1;37m')

# Column 0 holds the read ids; keep the full record of every read that appears
# in the homology-filtered table (a missing id raises KeyError)
full_dict = {read_id: full_file_dict[read_id] for read_id in full_paf[0].unique()}

# Ids and sequence lengths of the homology-filtered reads
full_keys = list(full_dict)
full_paf_lengths = [len(full_dict[read_id].seq) for read_id in full_keys]

mean = np.mean(full_paf_lengths)
std = np.std(full_paf_lengths)

if args.verbose:
    print('\033[1;33m' + 'Mean read length is %s' % mean + '\033[1;37m')
    print('\033[1;33m' + 'Standard deviation of read length is %s' % std + '\033[1;37m')

# Keep only reads whose length lies within mean +/- 1.645 standard deviations
shortest_allowed = mean-1.645*std
longest_allowed = mean+1.645*std
length_filt_dict = {
    read_id: record
    for read_id, record in full_dict.items()
    if len(record.seq) >= shortest_allowed and len(record.seq) <= longest_allowed
}

# Write the retained reads as FASTA (quality scores are not written)
length_restricted_fasta = '/'.join([output_folder, 'length_restricted_reads.fasta'])
SeqIO.write(length_filt_dict.values(), length_restricted_fasta, "fasta")
if args.verbose:
    print('\033[1;36m' + 'Saved %s' % (length_restricted_fasta) + '\033[1;37m')

# Ids and lengths of the reads that survived the length filter
len_filt_keys = list(length_filt_dict)
length_filt_lens = [len(length_filt_dict[read_id].seq) for read_id in len_filt_keys]

    
    


    
# Extract the qscores
# if args.verbose:
#     print('\033[0;34m' + "Loading " + 'Basecalled/'+'/'.join(args.full_file.rsplit('/')[-3:-1])+'/sequencing_summary.txt' + '\033[1;37m')
# summ_stats_csv = pd.read_csv('Basecalled/'+'/'.join(args.full_file.rsplit('/')[-3:-1])+'/sequencing_summary.txt', sep='\t', header=None, names=['filename', 'read_id', 'run_id', 'batch_id', 'channel', 'mux', 'start_time', 'duration', 'num_events', 'passes_filtering', 'template_start', 'num_events_template', 'template_duration', 'sequence_length_template', 'mean_qscore_template', 'strand_score_template', 'median_template', 'mad_template'], engine='python')
# summ_stats_csv = pd.DataFrame(summ_stats_csv[1:])
# summary_list = []
# for column, row in summ_stats_csv.iterrows():
#     if row['read_id'] in full_keys:
#         summary_list.append([row['read_id'], row['mean_qscore_template']])
# summary_frame = pd.DataFrame(summary_list)
# if args.verbose:
#     print('\033[0;34m' + "Finished with " + 'Basecalled/'+'/'.join(args.full_file.rsplit('/')[-3:-1])+'/sequencing_summary.txt' + '\033[1;37m')
    
# Create a dictionary containing the statistics for the filtered dataset
    # Total no. frDNA reads, Min. read length, Max. read length, Mean read length, Median read length, Quality score

# Summary of the length-filtered reads: count, min, max, mean, std dev and median length.
# Mean, std dev and median are stored as strings rounded to whole numbers.
# min()/max() raise ValueError if no read survived the length filter.
stats_dict = {'number of frDNA reads':len(length_filt_lens),'minimum read length':min(length_filt_lens),'maximum read length':max(length_filt_lens),'mean read length':"{:.0f}".format(np.mean(length_filt_lens)),'std dev':"{:.0f}".format(np.std(length_filt_lens)),'median read length':"{:.0f}".format(np.median(length_filt_lens))
#               ,'min_qscore':"{:.2f}".format(min(summary_frame[1].astype(float))), 'max_qscore':"{:.2f}".format(max(summary_frame[1].astype(float))), 'mean_qscore':"{:.2f}".format(np.mean(summary_frame[1].astype(float))), 'median_qscore':"{:.2f}".format(np.median(summary_frame[1].astype(float)))
             }
# One-row frame indexed by the last two folder names of the input path
stats = pd.DataFrame(stats_dict, index=['%s' % '/'.join(args.full_file.rsplit('/')[-3:-1])])    
              
frDNA_title = "frDNA reads for %s" % '/'.join(args.full_file.rsplit('/')[-3:-1])

# Length-filtered reads on the same x-range as the full-spread plot
frDNA_full_png = '/'.join([output_folder, 'frDNA_len_filt_full.png'])
bx = sns.distplot(length_filt_lens, color="k", kde=False)
bx.set_xlim((250, 3500))
bx.set_title(frDNA_title, fontsize=15)
bx.set_xlabel("Length of read", fontsize=13)
bx.set_ylabel("Number of reads", fontsize=13)
figure2 = bx.get_figure()
figure2.savefig(frDNA_full_png)
figure2.clf()
if args.verbose:
    print('\033[0;32m' + "frDNA spread image file saved to " + frDNA_full_png + '\033[1;37m')

# Same data, zoomed to the length-filter window plus a 100 bp margin on each side
frDNA_limited_png = '/'.join([output_folder, 'frDNA_len_filt_limited.png'])
cx = sns.distplot(length_filt_lens, color="k", kde=False)
cx.set_xlim(((mean-1.645*std)-100, (mean+1.645*std)+100))
cx.set_title(frDNA_title, fontsize=15)
cx.set_xlabel("Length of read", fontsize=13)
cx.set_ylabel("Number of reads", fontsize=13)
figure3 = cx.get_figure()
figure3.savefig(frDNA_limited_png)
figure3.clf()
if args.verbose:
    print('\033[0;32m' + "Zoomed-in frDNA spread image file saved to " + frDNA_limited_png + '\033[1;37m')

# Write the one-row summary table. index=False means the sample label used as
# the frame's index is NOT written to the CSV.
stats.to_csv('/'.join([output_folder, 'frDNA_len_filt_statistics.csv']), index=False)
if args.verbose:
    print('\033[0;32m' + "Summary statistics file saved to " + '/'.join([output_folder, 'frDNA_len_filt_statistics.csv']) + '\033[1;37m')
    
print('\033[0;35m'+'END'+'\033[1;37m')

In [None]:
import glob

# Collect the per-sample statistics CSVs under Stats/<run>/<barcode>/ into one
# table indexed by "<run>/<barcode>", then summarise the read counts.
stats_columns = ['number of frDNA reads','minimum read length','maximum read length','mean read length','standard deviation','median read length','min_qscore','max_qscore','mean_qscore','median_qscore']
# Barcodes left out of the summary (treated elsewhere in this notebook as non-existent barcodes).
excluded_samples = ('20171212_FAH18688/barcode10', '20171207_FAH18654/barcode10')

path = "/10tb/tmp/TE/honours/analysis/Stats/*/*/*.csv"
path_names = glob.glob(path)

sample_frames = []
for csv_path in path_names:
    # Take run and barcode from the path components instead of fixed character
    # offsets, which silently break if the root folder or file name changes.
    run_name, barcode_name = csv_path.split('/')[-3:-1]
    if barcode_name == 'unclassified':
        continue
    if any(sample in csv_path for sample in excluded_samples):
        continue
    path_stats_csv = pd.read_csv(csv_path, header=0)
    # Each CSV holds one row (index 0); label it with the sample name.
    sample_frames.append(path_stats_csv.rename(index={0: run_name + '/' + barcode_name}))

# Concatenate once: DataFrame.append was removed in pandas 2.0 and growing a
# frame row by row is quadratic.
if sample_frames:
    full_stats = pd.concat(sample_frames, sort=False)
    # Declared columns first, then any extra columns found in the CSVs.
    extra_columns = [name for name in full_stats.columns if name not in stats_columns]
    full_stats = full_stats.reindex(columns=stats_columns + extra_columns)
else:
    full_stats = pd.DataFrame(data=None, columns=stats_columns)
full_stats = full_stats.sort_index(ascending=True)
full_stats.to_csv('../../analysis/Stats/overall_frDNA_stats.csv')

# Min / max / mean / median number of reads per sample, rounded to whole numbers.
read_counts = full_stats['number of frDNA reads']
num_read_stats = pd.DataFrame(
    data=["{:.0f}".format(min(read_counts)),
          "{:.0f}".format(max(read_counts)),
          "{:.0f}".format(np.mean(read_counts)),
          "{:.0f}".format(np.median(read_counts))],
    index=['Min', 'Max', 'Mean', 'Median'],
    columns=['Number of reads'])
num_read_stats.to_csv('../../analysis/Stats/number_of_frDNA_reads_summary.csv')

In [None]:
# Total number of frDNA reads summed over all samples in full_stats.
print(sum(full_stats['number of frDNA reads']))

In [None]:
import glob

# Collect the per-sample statistics CSVs under Length_Filtered/<run>/<barcode>/
# into one table indexed by "<run>/<barcode>", then summarise the read counts.
# The column is named 'std dev' to match what summary_statistics.py writes;
# declaring 'standard deviation' here produced an extra, always-empty column.
stats_columns = ['number of frDNA reads','minimum read length','maximum read length','mean read length','std dev','median read length']
# Barcodes left out of the summary (treated elsewhere in this notebook as non-existent barcodes).
excluded_samples = ('20171212_FAH18688/barcode10', '20171207_FAH18654/barcode10')

path = "/10tb/tmp/TE/honours/analysis/Length_Filtered/*/*/*.csv"
path_names = glob.glob(path)

sample_frames = []
for csv_path in path_names:
    # Take run and barcode from the path components instead of fixed character
    # offsets, which silently break if the root folder or file name changes.
    run_name, barcode_name = csv_path.split('/')[-3:-1]
    if barcode_name == 'unclassified':
        continue
    if any(sample in csv_path for sample in excluded_samples):
        continue
    path_stats_csv = pd.read_csv(csv_path, header=0)
    # Each CSV holds one row (index 0); label it with the sample name.
    sample_frames.append(path_stats_csv.rename(index={0: run_name + '/' + barcode_name}))

# Concatenate once: DataFrame.append was removed in pandas 2.0 and growing a
# frame row by row is quadratic.
if sample_frames:
    full_stats2 = pd.concat(sample_frames, sort=False)
    # Declared columns first, then any extra columns found in the CSVs.
    extra_columns = [name for name in full_stats2.columns if name not in stats_columns]
    full_stats2 = full_stats2.reindex(columns=stats_columns + extra_columns)
else:
    full_stats2 = pd.DataFrame(data=None, columns=stats_columns)
full_stats2 = full_stats2.sort_index(ascending=True)
display(full_stats2)
full_stats2.to_csv('../../analysis/Length_Filtered/overall_frDNA_stats.csv')

# Min / max / mean / median number of reads per sample, rounded to whole numbers.
read_counts = full_stats2['number of frDNA reads']
num_read_stats = pd.DataFrame(
    data=["{:.0f}".format(min(read_counts)),
          "{:.0f}".format(max(read_counts)),
          "{:.0f}".format(np.mean(read_counts)),
          "{:.0f}".format(np.median(read_counts))],
    index=['Min', 'Max', 'Mean', 'Median'],
    columns=['Number of reads'])
num_read_stats.to_csv('../../analysis/Length_Filtered/number_of_frDNA_reads_summary.csv')

#### Create matrix for loss-of-reads when performing length filtering as percentage

In [None]:
# Fraction of reads lost per sample when the length filter is applied:
# (reads before - reads after) / reads before.
reads_column = 'number of frDNA reads'
full_stats_1 = list(full_stats[reads_column])
full_stats_2 = list(full_stats2[reads_column])

reads_before = full_stats[reads_column].astype(float)
reads_after = full_stats2[reads_column].astype(float)
# Series arithmetic pairs the two tables by sample label, so the result does
# not depend on how either frame happens to be sorted; samples present in only
# one table are dropped.
loss_by_sample = ((reads_before - reads_after) / reads_before).dropna()

result = [float("{:.2f}".format(fraction)) for fraction in loss_by_sample]
print(result)

# Take the sample name from the same index the losses were computed on.
# Looking it up in path_names (glob order, including excluded files) reported
# the wrong sample.
sample_names = list(loss_by_sample.index)
print('The maximum loss of reads is %s%% in %s' % (100*max(result), sample_names[result.index(max(result))]))
print('The minimum loss of reads is %s%% in %s' % (100*min(result), sample_names[result.index(min(result))]))

In [None]:
# Length summary and histogram for every read binned to 20171212_FAH18688/barcode10.
FAH18688_barcode10 = SeqIO.to_dict(SeqIO.parse("../../analysis/Concatenated/20171212_FAH18688/barcode10/merged.fastq", 'fastq'))
b10_keys = list(FAH18688_barcode10)
lengths = [len(FAH18688_barcode10[read_id].seq) for read_id in b10_keys]

non_dict_stats = {
    'number of reads': len(lengths),
    'minimum read length': min(lengths),
    'maximum read length': max(lengths),
    'mean read length': "{:.0f}".format(np.mean(lengths)),
    'median read length': "{:.0f}".format(np.median(lengths)),
}
stats = pd.DataFrame(non_dict_stats, index=['20171212_FAH18688/barcode10'])

ax = sns.distplot(lengths, color="k", kde=False)
ax.set_xlim((250, 3500))
ax.set_title("non-frDNA reads for 20171212_FAH18688/barcode10", fontsize=15)
ax.set_xlabel("Length of read", fontsize=13)
ax.set_ylabel("Number of reads", fontsize=13)
plt.show()
display(stats)

In [None]:
# Length summary and histogram for every read binned to 20171207_FAH18654/barcode10.
FAH18654_barcode10 = SeqIO.to_dict(SeqIO.parse("../../analysis/Concatenated/20171207_FAH18654/barcode10/merged.fastq", 'fastq'))
b10_keys = list(FAH18654_barcode10)
lengths = [len(FAH18654_barcode10[read_id].seq) for read_id in b10_keys]

non_dict_stats = {
    'number of reads': len(lengths),
    'minimum read length': min(lengths),
    'maximum read length': max(lengths),
    'mean read length': "{:.0f}".format(np.mean(lengths)),
    'median read length': "{:.0f}".format(np.median(lengths)),
}
stats = pd.DataFrame(non_dict_stats, index=['20171207_FAH18654/barcode10'])

ax = sns.distplot(lengths, color="k", kde=False)
ax.set_xlim((250, 3500))
ax.set_title("non-frDNA reads for 20171207_FAH18654/barcode10", fontsize=15)
ax.set_xlabel("Length of read", fontsize=13)
ax.set_ylabel("Number of reads", fontsize=13)
plt.show()
display(stats)

In [None]:
# Load the length-restricted reads of two barcodes and find the overall longest / shortest read.
barcode02 = SeqIO.to_dict(SeqIO.parse("../../analysis/Length_Filtered/20171103_FAH15473/barcode02/length_restricted_reads.fasta", "fasta"))
barcode06 = SeqIO.to_dict(SeqIO.parse("../../analysis/Length_Filtered/20171103_FAH15473/barcode06/length_restricted_reads.fasta", "fasta"))

# Lengths of barcode02 reads followed by barcode06 reads.
total_lens = [len(record.seq) for record in barcode02.values()]
total_lens += [len(record.seq) for record in barcode06.values()]
print(max(total_lens))
print(min(total_lens))

In [None]:
# Encode each barcode02 read as a fixed-length list of integers.
#  - the first and last 30 bases are trimmed
#  - A/C/G/T map to 0/1/2/3; any other character is skipped
#  - the list is right-padded with 0 up to the longest read length
# NOTE(review): the padding value 0 is the same as the code for "A", so padding
# cannot be told apart from real adenines downstream.
base_codes = {"A": 0, "C": 1, "G": 2, "T": 3}
# Computed once: calling max(total_lens) inside the loop rescans the list for every read.
padded_length = max(total_lens)

barcode02_numbers = {}
for key, record in barcode02.items():
    seq = [base_codes[base] for base in record.seq[30:-30] if base in base_codes]
    # [0] * n is empty for n <= 0, so already-long sequences are left unchanged.
    seq.extend([0] * (padded_length - len(seq)))
    barcode02_numbers[key] = seq

In [None]:
# Draw 10000 encoded barcode02 reads at random; random.choices samples WITH replacement,
# so the same read can appear more than once. No random seed is set in this notebook.
seq2 = np.array(random.choices(list(barcode02_numbers.values()),k=10000))

In [None]:
# Encode each barcode06 read as a fixed-length list of integers.
#  - the first and last 30 bases are trimmed
#  - A/C/G/T map to 0/1/2/3; any other character is skipped
#  - the list is right-padded with 0 up to the longest read length
# NOTE(review): the padding value 0 is the same as the code for "A", so padding
# cannot be told apart from real adenines downstream.
base_codes = {"A": 0, "C": 1, "G": 2, "T": 3}
# Computed once: calling max(total_lens) inside the loop rescans the list for every read.
padded_length = max(total_lens)

barcode06_numbers = {}
for key, record in barcode06.items():
    seq = [base_codes[base] for base in record.seq[30:-30] if base in base_codes]
    # [0] * n is empty for n <= 0, so already-long sequences are left unchanged.
    seq.extend([0] * (padded_length - len(seq)))
    barcode06_numbers[key] = seq

In [None]:
# Draw 10000 encoded barcode06 reads at random; random.choices samples WITH replacement,
# so the same read can appear more than once. No random seed is set in this notebook.
seq6 = np.array(random.choices(list(barcode06_numbers.values()),k=10000))

In [None]:
# Stack the barcode02 samples on top of the barcode06 samples (row-wise).
seq_comb = np.concatenate((seq2, seq6), axis=0)

In [None]:
# One label per sampled read: 2 for barcode02 rows, 6 for barcode06 rows.
ids2 = np.full(len(seq2), 2)
print(len(seq2))
ids6 = np.full(len(seq6), 6)
print(len(seq6))

In [None]:
# Labels in the same order as the rows of seq_comb (barcode02 first, then barcode06).
ids_comb = np.concatenate((ids2, ids6), axis=0)

In [None]:
# The number of sequences and the number of labels should be equal.
print(len(seq_comb))
print(len(ids_comb))

In [None]:
# Despite the ".csv" in the name this is a compressed NumPy archive: savez_compressed appends
# ".npz", giving "..._ids.csv.npz", and stores the array under the default key 'arr_0'.
np.savez_compressed('../../analysis/arrays_test/20171103_FAH15473_b2+b6_ids.csv', ids_comb)

In [None]:
# Preview the first ten encoded sequences.
seq_comb[:10]

In [None]:
# Despite the ".csv" in the name this is a compressed NumPy archive: savez_compressed appends
# ".npz", giving "..._seqs.csv.npz", and stores the array under the default key 'arr_0'.
np.savez_compressed('../../analysis/arrays_test/20171103_FAH15473_b2+b6_seqs.csv', seq_comb)

In [None]:
# Round-trip check: reload the saved sequences from the archive's default 'arr_0' entry.
# NOTE(review): allow_pickle=True is only required for object arrays and permits arbitrary
# code execution on untrusted files - consider dropping it if seq_comb is a plain integer array.
seqs_test = np.load('../../analysis/arrays_test/20171103_FAH15473_b2+b6_seqs.csv.npz', allow_pickle=True)['arr_0']

In [None]:
# Preview the first ten reloaded sequences (compare with seq_comb[:10]).
seqs_test[:10]

In [None]:
# Round-trip check: reload the saved labels from the archive's default 'arr_0' entry.
# NOTE(review): allow_pickle=True is only required for object arrays - consider dropping it.
ids_test = np.load('../../analysis/arrays_test/20171103_FAH15473_b2+b6_ids.csv.npz', allow_pickle=True)['arr_0']

In [None]:
# First ten labels (barcode02 rows come first in ids_comb).
ids_test[:10]

In [None]:
# Last ten labels (barcode06 rows come last in ids_comb).
ids_test[-10:]

In [None]:
import glob

# Count reads per category over every merged FASTQ under Concatenated/<run>/<barcode>/.
path = "/10tb/tmp/TE/honours/analysis/Concatenated/*/*/*.fastq"
path_names = glob.glob(path)

total_count = 0   # every read
can_count = 0     # reads assigned to any barcode (including the non-existent barcode10 bins)
mis_count = 0     # reads assigned to the two barcode10 bins
unc_count = 0     # reads in 'unclassified'

for path in path_names:
    temp_dict = SeqIO.to_dict(SeqIO.parse(path, "fastq"))
    reads_in_file = len(temp_dict)
    total_count += reads_in_file

    # path[61:-13] is the barcode folder name for paths of this fixed layout
    if path[61:-13] == 'unclassified':
        unc_count += reads_in_file
        continue

    can_count += reads_in_file
    if '20171212_FAH18688/barcode10' in path or '20171207_FAH18654/barcode10' in path:
        mis_count += reads_in_file

print('Total number of reads assigned to a barcode by Deepbinner is %s' % can_count)
print('Number of reads assigned to non-existent barcodes by Deepbinner is %s' % mis_count)
print('Estimated number of misassigned reads in total is %s' % (23*mis_count))
print('Percentage of misassigned reads based on estimated number is %s' % (2300*(mis_count)/can_count))
print('Number of unclassified reads is %s' % unc_count)
print('Percentage of unclassified reads is %s' % (100*unc_count/total_count))

In [None]:
import glob

# Bar chart of mean read length per sample (with std dev error bars), built
# from the per-sample statistics CSVs under Length_Filtered/<run>/<barcode>/.
stats_columns = ['number of frDNA reads','minimum read length','maximum read length','mean read length','std dev','median read length']
# Barcodes left out of the summary (treated elsewhere in this notebook as non-existent barcodes).
excluded_samples = ('20171212_FAH18688/barcode10', '20171207_FAH18654/barcode10')

path = "/10tb/tmp/TE/honours/analysis/Length_Filtered/*/*/*.csv"
path_names = glob.glob(path)

sample_frames = []
for csv_path in path_names:
    # Take run and barcode from the path components instead of fixed character
    # offsets, which silently break if the root folder or file name changes.
    run_name, barcode_name = csv_path.split('/')[-3:-1]
    if barcode_name == 'unclassified':
        continue
    if any(sample in csv_path for sample in excluded_samples):
        continue
    path_stats_csv = pd.read_csv(csv_path, header=0)
    # Each CSV holds one row (index 0); label it with the sample name.
    sample_frames.append(path_stats_csv.rename(index={0: run_name + '/' + barcode_name}))

# Concatenate once: DataFrame.append was removed in pandas 2.0 and growing a
# frame row by row is quadratic.
if sample_frames:
    full_stats2 = pd.concat(sample_frames, sort=False)
    # Declared columns first, then any extra columns found in the CSVs.
    extra_columns = [name for name in full_stats2.columns if name not in stats_columns]
    full_stats2 = full_stats2.reindex(columns=stats_columns + extra_columns)
else:
    full_stats2 = pd.DataFrame(data=None, columns=stats_columns)
full_stats2 = full_stats2.sort_values('mean read length', ascending=False)

sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(10,17))
ax = sns.barplot(x=full_stats2['mean read length'], y=full_stats2.index, palette="Blues_d", xerr=full_stats2['std dev'],ci=True)
ax.set_title("Mean Read Length by Sample")
figure1 = ax.get_figure()
figure1.savefig("/10tb/tmp/TE/honours/analysis/Length_Filtered/mean_reads.png",bbox_inches = "tight")
figure1.clf()

In [None]:
import glob

# Bar chart of the number of reads per sample after filtering, built from the
# per-sample statistics CSVs under Length_Filtered/<run>/<barcode>/.
stats_columns = ['number of frDNA reads','minimum read length','maximum read length','mean read length','std dev','median read length']
# Barcodes left out of the summary (treated elsewhere in this notebook as non-existent barcodes).
excluded_samples = ('20171212_FAH18688/barcode10', '20171207_FAH18654/barcode10')

path = "/10tb/tmp/TE/honours/analysis/Length_Filtered/*/*/*.csv"
path_names = glob.glob(path)

sample_frames = []
for csv_path in path_names:
    # Take run and barcode from the path components instead of fixed character
    # offsets, which silently break if the root folder or file name changes.
    run_name, barcode_name = csv_path.split('/')[-3:-1]
    if barcode_name == 'unclassified':
        continue
    if any(sample in csv_path for sample in excluded_samples):
        continue
    path_stats_csv = pd.read_csv(csv_path, header=0)
    # Each CSV holds one row (index 0); label it with the sample name.
    sample_frames.append(path_stats_csv.rename(index={0: run_name + '/' + barcode_name}))

# Concatenate once: DataFrame.append was removed in pandas 2.0 and growing a
# frame row by row is quadratic.
if sample_frames:
    full_stats2 = pd.concat(sample_frames, sort=False)
    # Declared columns first, then any extra columns found in the CSVs.
    extra_columns = [name for name in full_stats2.columns if name not in stats_columns]
    full_stats2 = full_stats2.reindex(columns=stats_columns + extra_columns)
else:
    full_stats2 = pd.DataFrame(data=None, columns=stats_columns)
full_stats2 = full_stats2.sort_values('number of frDNA reads', ascending=False)

sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(10,17))
ax = sns.barplot(x=full_stats2['number of frDNA reads'], y=full_stats2.index, palette="Blues_d")
ax.set_title("Number of Reads per Sample after Filtering")
figure1 = ax.get_figure()
figure1.savefig("/10tb/tmp/TE/honours/analysis/Length_Filtered/num_reads.png",bbox_inches = "tight")
figure1.clf()

#### Identify primers and orientation

In [None]:
# Create the mothur_py wrapper object used to run mothur commands from Python.
m = Mothur()

In [None]:
# Run mothur's help command (exploratory; not needed for the analysis itself).
m.help()

In [None]:
# Run mothur's pcr.seqs on the length-restricted reads using the ITS primer file.
# pdiffs=0 / rdiffs=0: presumably no mismatches allowed in the forward / reverse primer - confirm in the mothur docs.
# The result is read back in the next cell from "length_restricted_reads.pcr.fasta".
m.pcr.seqs(fasta="../../analysis/Length_Filtered/20171103_FAH15473/barcode02/length_restricted_reads.fasta",oligos="../../analysis/ITS_primers.oligos",pdiffs=0,rdiffs=0)

In [None]:
# Load the reads in the pcr.seqs output FASTA, keyed by read id.
pcr_dict = SeqIO.to_dict(SeqIO.parse("../../analysis/Length_Filtered/20171103_FAH15473/barcode02/length_restricted_reads.pcr.fasta","fasta"))

In [None]:
# Save the ids of all reads in the pcr.seqs output, one id per line.
ids = list(pcr_dict)

with open('../../analysis/Length_Filtered/20171103_FAH15473/barcode02/ids.txt','w') as handle:
    handle.write(''.join("%s\n" % name for name in ids))
print(len(ids))

In [None]:
# Pick 200 of the primer-matched read ids at random (without replacement) and keep
# only those records from the length-restricted FASTA.
tmp_dict = SeqIO.to_dict(SeqIO.parse("../../analysis/Length_Filtered/20171103_FAH15473/barcode02/length_restricted_reads.fasta","fasta"))
new_dict = tmp_dict.copy()
keys_list = random.sample(ids,k=200)
print(len(keys_list))

chosen_ids = set(keys_list)
tmp_dict = {read_id: record for read_id, record in new_dict.items() if read_id in chosen_ids}
print(len(tmp_dict))

In [None]:
%%writefile ../get_cons_ids.py

import Bio
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord as SR
from Bio.Blast import NCBIXML
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
from Bio import AlignIO
from Bio.Align import AlignInfo
import numpy as np
import pandas as pd
from pandas import DataFrame as df
# import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import subprocess
import os
from mothur_py import Mothur
from shutil import copy
import random
import warnings
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
# from ipysankeywidget import SankeyWidget
warnings.filterwarnings("ignore")
import argparse


# ANSI colour escape codes.
# Note: the print calls in this script embed the literal escape sequences
# rather than referring to these names.
WHITE='\033[1;37m'
BLUE='\033[0;34m'
RED='\033[0;31m'
NC='\033[0m'
GREEN='\033[0;32m'
PURPLE='\033[1;35m'

# Command line: [--verbose | --quiet] input_file
parser = argparse.ArgumentParser(description="""
The goal of this program is to extract the read ids of reads that
contain both the forward and reverse primer as exact matches
""")
# --verbose and --quiet cannot be given together; only args.verbose is read in this script.
group = parser.add_mutually_exclusive_group()
group.add_argument("--verbose", "-v", "--v", action="store_true")
group.add_argument("--quiet", "-q", "--q", action="store_true")
parser.add_argument("input_file", help="The input file for extraction")
args = parser.parse_args()

if args.verbose:
    print('\033[0;31m' + "Input file is " + args.input_file + '\033[1;37m')

# Paths are derived from input_file by fixed character offsets, so the script
# relies on input_file having the exact layout / file name it was written for.
primer_file = args.input_file[:9]+"ITS_primers.oligos"
pcr_fasta = args.input_file[:-5]+"pcr.fasta"
ids_file = args.input_file[:-29]+'ids.txt'

# Run mothur's pcr.seqs with zero primer differences allowed, then read back its output
m = Mothur()
m.pcr.seqs(fasta=args.input_file,oligos=primer_file,pdiffs=0,rdiffs=0)
pcr_dict = SeqIO.to_dict(SeqIO.parse(pcr_fasta,"fasta"))

# Save the ids of the reads in the pcr.seqs output, one per line
ids = list(pcr_dict)
with open(ids_file,'w') as handle:
    handle.write(''.join("%s\n" % name for name in ids))

if args.verbose:
    print('\033[0;34m' + "Ids file saved to " + '\033[0;35m' + (ids_file) + '\033[1;37m')
    print('\033[0;32m' + ("The number of reads is %s" % len(ids)) + '\033[1;37m')
    
    
# Choose the reads used to build the consensus: a random 100 of the
# primer-matched reads, or all of them when there are 100 or fewer.
tmp_dict = SeqIO.to_dict(SeqIO.parse(args.input_file,"fasta"))
if len(ids) > 100:
    keys_list = random.sample(ids,k=100)
else:
    print('\033[1;37m' + "LOW READS")
    # Previously keys_list was left undefined on this branch, so the script
    # died with NameError right after printing the warning. Fall back to every
    # primer-matched read instead.
    keys_list = list(ids)

chosen_ids = set(keys_list)
tmp_dict = {read_id: record for read_id, record in tmp_dict.items() if read_id in chosen_ids}

# Output path is built from fixed character offsets of input_file.
consensus_input_fasta = args.input_file[:9]+'Consensus'+args.input_file[24:-29]+'for_consensus.fasta'
SeqIO.write(tmp_dict.values(),consensus_input_fasta,'fasta')

if args.verbose:
    # This message used to say "Ids file saved to", copied from the ids.txt step above.
    print('\033[0;34m' + "Consensus input file saved to " + '\033[0;35m' + consensus_input_fasta + '\033[1;37m')

In [None]:
from Bio.Align import AlignInfo
# Read the multiple sequence alignment (FASTA format) and wrap it in a SummaryInfo
# object, which is used below to compute a consensus.
alignment = Bio.AlignIO.read("../../analysis/Consensus/20171103_FAH15473/barcode02/consensus_100.fasta","fasta")
summary_align = AlignInfo.SummaryInfo(alignment)

In [None]:
# Consensus with a 0.7 threshold; 'N' is passed as the ambiguous character
# (see the Biopython SummaryInfo.dumb_consensus docs for the exact rule).
consensus = summary_align.dumb_consensus(threshold=0.7, ambiguous='N')

In [None]:
# Show the consensus sequence.
print(consensus)

In [None]:
# Load the reference database FASTA into a dict of records keyed by sequence id.
database = SeqIO.to_dict(SeqIO.parse("../../database/sh_refs_qiime_ver8_dynamic_02.02.2019.fasta", "fasta"))

In [None]:
# Length of every sequence in the reference database.
lens = [len(record.seq) for record in database.values()]

In [None]:
# Longest, shortest and mean sequence length in the reference database.
print(max(lens))
print(min(lens))
print(np.mean(lens))

In [None]:
# Histogram of sequence lengths in the reference database.
# The message, title and axis labels previously described the barcode02 reads
# (copied from the read-spread cell), which mislabelled this figure.
print("The number of sequences in this database is", len(lens))
ax = sns.distplot(lens, color="k", kde=False)

# ax.set(xlim=(250, 3500))
ax.set_title("Sequence length spread for sh_refs_qiime_ver8_dynamic_02.02.2019", fontsize=15)
ax.set_xlabel("Length of sequence", fontsize=13)
ax.set_ylabel("Number of sequences", fontsize=13)

plt.show()

In [None]:
# Number of database sequences longer than 1000 bp.
sum(1 for length in lens if length > 1000)

In [None]:
# Number of database sequences longer than 2000 bp.
sum(1 for length in lens if length > 2000)

In [None]:
# Number of database sequences longer than 3000 bp.
sum(1 for length in lens if length > 3000)

In [None]:
# Number of database sequences longer than 500 bp.
sum(1 for length in lens if length > 500)

In [None]:
# Number of database sequences shorter than 500 bp (sequences of exactly 500 bp
# are counted by neither this cell nor the "> 500" cell).
sum(1 for length in lens if length < 500)

In [None]:
%%writefile ../get_align_seqs.py

import Bio
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord as SR
from Bio.Blast import NCBIXML
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
from Bio import AlignIO
from Bio.Align import AlignInfo
import numpy as np
import pandas as pd
from pandas import DataFrame as df
# import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import subprocess
import os
from shutil import copy
import random
import warnings
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
# from ipysankeywidget import SankeyWidget
warnings.filterwarnings("ignore")
import argparse


# ANSI colour escape codes used for terminal output.
WHITE='\033[1;37m'
BLUE='\033[0;34m'
RED='\033[0;31m'
NC='\033[0m'
GREEN='\033[0;32m'
PURPLE='\033[1;35m'

parser = argparse.ArgumentParser(description="""
This program extracts a specified number of reads from a fasta
file and saves them to a new fasta file
""")
group = parser.add_mutually_exclusive_group()
group.add_argument("--verbose", "-v", "--v", action="store_true")
group.add_argument("--quiet", "-q", "--q", action="store_true")
parser.add_argument("input_file", help="The input file for extraction")
parser.add_argument("num_reads", help="The number of reads to extract")
args = parser.parse_args()

# num_reads stays a string on `args` because it is embedded verbatim in the
# output file name; validate the integer value separately.
try:
    num_reads = int(args.num_reads)
except ValueError:
    parser.error("num_reads must be an integer, got %r" % args.num_reads)

if args.verbose:
    print(RED + "Input file is " + args.input_file + WHITE)
    print(RED + "The number of extracted reads is " + GREEN + args.num_reads + WHITE)

# Load every record of the input fasta, keyed by record id.
tmp_dict = SeqIO.to_dict(SeqIO.parse(args.input_file,"fasta"))
ids = list(tmp_dict)

# random.sample raises a bare ValueError for an out-of-range sample size;
# report the problem through argparse instead.
if not 0 <= num_reads <= len(ids):
    parser.error("num_reads must be between 0 and %d (the number of reads in %s)"
                 % (len(ids), args.input_file))

# Draw the requested number of ids without replacement, then keep the
# matching records in their original file order.
keys_list = random.sample(ids,k=num_reads)
selected = set(keys_list)
records = [record for key, record in tmp_dict.items() if key in selected]

# The output path is assembled from fixed-position slices of args.input_file.
output_path = args.input_file[:9]+'Alignment'+args.input_file[24:-29]+ args.num_reads + '_reads.fasta'
SeqIO.write(records,output_path,'fasta')

if args.verbose:
    # '\033[0;35m' is kept literal: it differs from the PURPLE constant above.
    print(BLUE + "Reads file saved to " + '\033[0;35m' + output_path + WHITE)

In [None]:
emboss_test = """nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnAGGTTAnnnnnAAnCAGnnACGAnCT
ACnnAAACnnnnGAAnnnTCnGAnnnnCnnAGCAnCnnnnCTGCAAGTCTGGTGCCAGCA
GCCnnnnGnCgGnTAATnTCCAGCTCnnnCAATnAGCGnnnnnTAnTATTnnAAAGnnnn
nnTTGnnnTTGCnnAGnTTAAAnnAAGCTCnnGTAnnnGTnnTGAAnCCTTGnnGGCnnC
TGGCTnnnnGGCCGGnnnTnnnnnCCGCnnCTCACCGnnnnCGTGnnnnnnnTACTnnGG
TnCCGnnnnnGCCGGnGCCTTnnnnTnnnnnnnTTCTGGGGAGCCGCnnnnATnnGCCnC
TTCACnTnGGGCGnnnTGTnnnnGGGGAACnCAGGACTTTTnnnnnnnnnnAAAAATTAn
nnnGAGnTGTTnCnnAAAGCAGGnnCnnnnCTTTGCTCGnnnAAnnTACnATTAGCATGn
nGAnATAATnnAnnnGAATAGGnnACGnTGnTnnnGGnnnTTCTATTTTnnGTTGGTTTn
nnnnCTnnAGGAnCCnGCCnnGTAAnnnTGnATTAATnAGnnnnGGATAGTnnnCGnnnn
GGGGCATnCCnnGTnATTCnnnnAATTGTnCAGAGGnnTnnnGAAAnnTTCTnnTGGAnT
TTATTnnGAnnnnAGAnCnGAACnnTACTGCnnnnGAAAGnCATnTTGCCnnAAGGnAnT
GnTTTTCAnTnnTAATnCnAGTGAnnnACnnnnGAAAGTTnAGnnGGnnGATCGnnnAAG
ACGATCAnnGnATACCnGTCnnnnGTnAGTCTnnnTAAnnnCnCATAAACnTnnnAnnnT
GCCGnnnnACnnTAGnGGATCnnGGTnnnGGATGnnTTATCnTTTTnTGACnnTCCATnn
nCGnGCACCTnTnACGAnnnGAAATCnAAAGnnnnnnTTTTTGGnnnGTTCTGnGGGGAn
nnGTATGGTnCGCnnAAGGCTnGAAnnACTTnAAAGnnAAATTGACnnGGAAnGnnGGCA
nCCACCnnAGGCnnGTGnnnnGnnnAGCnCTGCnnnnGGCTTAATTnnnnnTGAnCTnnC
AAnnnCACnnnnnGGGAAACTCAnnCCAGGTnCCAGnnACnnAnCAAGTAnGGATTnGAn
nnnnnCAGATTGAnGnnAGCTCnnTTTnCTnnTGnnnnnnATTTTnnGTGGnGTGGnTnn
nGGnnTGCnnnnnnnnnnnATGnGnnnCCGnTTnnnCTTAGnTTGGTGGnnAGTGAnTTT
GTnnCTGnnnnCTTnnAAnTTGCGAnTAAnnCnnnGAAnnCGAGACnnnnnnCTnnnTAn
nACnCTGnnnnCTAAnATnnAGCCnnAGGCCCnnGCnnnTTTGGCnnGGGnTnCGCCnnG
GCnTTCTnnnTAGAGGnnnnnACTAnnTCnGGCnnTCnnnAAGCCnGATnnGGAAnnnGT
TTnnnnnAGGCAAnnTnnnnAACAGGnTnnCTGnnTGATGCnnnCCTnnnnnnTAGAnTG
nnnTTCTGnnnGnGCnCGCACGCGCnnnnTACnACTGnnnnAnnnCnnGGAGnCCAACnG
nAGnnTTCATnnnnCACCTnTGnnnnnnnGnCCnnGAAAGGTCnnTGGGnTnnnAATCnT
TGnnnTTnAnAACTCnnCnnnnnnGTCGnnnnTGCTnnnGGGGATnAnnnnGAGCAnTTG
CnnAATTAnnTTGCTCnnnTTCAACnnnnGAGGnAATGnnCCTnnAGnTAAnnnnGCGnC
ATGnnTnCATnnCAGCATGCGnnnnTTGAnTTAnnCGTnnCCCTGCCnCTTTnnGTAnCn
nnnnACAnCnCGCCCGnnnnTCGnnCnTACTAnnCCGATTGnnnAAnnnnnTGnGnnCTC
nnnnAGTGAnnnGGCCnnTTCnnnGGAnnCTGGCTCAnnGGGAGGnnnTnnCGnGCAACn
nnnGAnnCCACnCCAGAGnnnnnnnnnnnnCCGGAAnnnnAGTTGnGTnCAAACnTnnCG
nnGTCAnnTTTAGnaGnnnGAAGnnnTAAnnAAGnnnnnnnnnTCGTnnAAnnCAAGGTn
CTCCGnTnnAnnnGnnGTGAAnCCTGCGGAnGGGATnCnnnATnnnTACnnCnnGAGCGn
nnAGGGCnnCTCCGGnGTCnnCGACnnCTCCnAACCCTTTGnnnTGAnAnnnCACATCCn
nnnCnnnnGTTGCTTCnnnGGGGnnnnnGCGAnCCnnCTGCCnGGGCGnnCCCCGGnnAG
GnnnnnnCACCnnAAAAAACACTGnCATCTnCTGnnCGTnnCGGAnGTTTAnnCGnAGnn
nnnnTnAAATCnnGAAACAnnnAAAnnCTTTCAAnnnCAAnnnCnnGGAnnTnnnCTCnT
TGGnTTCnTGGCATnnCGAnTGnnAAGAAnnnCGnCnnnnnAGCnnnGAAAnnnnTGCnn
GATnnnAAGnTnAATGTGnAnnATTGCAnnnnnGAATTnCnnnAGTnnGAATCnATnnCG
nnnAATnnCTTTnGAACnnGCACAnTTGCGCnnCCCCTGnnnnnnGTATTnnCCGGGGnn
nnGCAnnTGCCnCGnnnTTCnnGnAGCGnTCAnnnTTACACnnCACTCnCAGnCnnnCTC
GCnnnTGGGnnTAnTTGGGCGnnnnnTCTTTTnTCGnnnCGGGGAnTCACTnCCCnCGCG
CnnGCCTCAAnnnnnnTCTCCnnnnGGnCTnnnnnGnnAGCGGnnnnnnnTCTCnnGTCT
nnnCCCAGCnGTnnTGnTGGCATnCAnnCGTnCTCGnCCGCnnnGGnnnnnnnnGTTnnC
nACGAGCCCTnnCACGGnnCCGTTnAnAATnCAnCACCTCAGnnnnGTTnnnGAnCCTnn
nCGnGATCGGnnnGTAGGGnATACCnnCGCTnnGAACTTAAnGCnnAnTATnCAATnnnA
AGCnnnnGGAnnnGGAAAAnnGnnAAACCAACnnAGGGATTGCCCnnnnTAGnTAAnnCG
GCnnnnGAGTnnnnnnnnnGAAGCnnGnnnnGCnnAACnnAGCnnTCnAAATTTnnGAAA
TnnnCTGGCCCCnnCGGCnnnCGAnnGTTGTAnnAnnnTTTGnTnnAGAGGAnTGnnnnC
TTCTGGGnnTnnnAGCnnGACnnCGnGTnnnnnnCnnnnTAAGTTnnnCCTTnGGAACnn
nnAGGAnCnnnGTnCATAGAGnnnnGnnGTGAGAATCCnnCnnnnnGTAnnTGnCGACCn
nnnGGCCnnCGCGCCnCTCCAnnnCGnTAGCTCnnCnTTnCnGACnGnAnGTCnnGAGnT
TGTTTnGGGnnnAATGCnAGCTCTnnAAATGGGAGnGTAnnAATTTnCTTCTnnnnAnnA
AGCTAAATnnACCGnnnGCCnnnnnnnnnAGAGACCnnnGAnTAnnnGCGCnAnCAAGTn
nnAnGAGTGATnnnCnGAAAGAnnTGnAnnnnnnAAAGCACnnTTTGGnnnnAAGAGAGT
TnnAAAAAGCACGTnnnGAnnnnAATTGnTnnnTGAAAGGGAAnGCGCnTTACnnAAnnC
CnAGACnnTTTGGGnnnnnGCGGnnTGTTnnnnnnnCCGCCnnGGnnnTCTTnnCTGAnn
nCCnnGGTnCTACTnnnnCTCCGTCCnnnnGAnnGGnnnCCAAnnnnCnnnnnATnnCAT
CTnnnGGGAnCCGCCnGGACAnnnAGACCnnTCnAGnGAATnnnGnTAGCTCCCCCnnnn
nnGGGAGTnnnnnnnGnTnnTAnnnTAACnnnCTnGTGnnnGTnGATnnGCGGCGnnnnn
nCGTCCCGGnnnnnnnnnnTCCGnnCGCTTnnnCGGCAAGnnGATGnTTGGCnnnnGTAA
TnnGGTTnnGTCnnnnnAGCGnnGnCCCGTnCTTGAAnnACACGGACCnnnAAGGAGTnn
CTAnnnnnACATnnCTATGCGnnAGTGTTCnGGGnnnnnTGnnnTCAnAACCCCTAnCGC
nnnGGAATGAnnnnAAGTnnGAACnnnnnnGnnGAGGTnGGGnnnAGGGGCnnAAnnnCC
nnCTGnnCAnCCATnnCGAnnnnnnCCGnATCCnnnTGnnATGTnnCnCTCnGnnnnnnn
nnnnnnGATGnnnnnnGATTTGnAnnnGTAAGAnnnnGCAnnnTAGCTnGnnnTTnnGGG
ACCCGnnnAAAGATGnnGTGnnAACnnTnnnnATGnCCTGAAnnTnAGnnnGGTnnGAnA
GCCnnAGAGGAAACnnTCTGGnnTnGGAGGnnCTCGnCAGCnGnnnTnTCTnnnnGACGn
nnnTGCnnAAATCnGnnATCnGTCnnnAnnAATTTGGnnnGTAnnTAGGnGnGCGAnnAn
AGACTnnAATnCnnGAACnnnCATnCTnnAGnnnTnAGCnnnnnnTGGnnnnnnTTCCnT
GCnnCnnnGAAGnTTTnnnCCCTnnCAGGAnTAnnnnnGCnAGnnnnnTAACnnGTTTTn
nCAGnTTTTnnnnATGAGGnnTAnnnnnnAAGCnnnnnGAAnTGATTnnAGAGnnnGCnn
nnCTTGGGGnTTnnnGAAACAAnCCTnTAnnnACCTATnnTnnCTCAAAnCTTTnnAAAT
ATGnTnnAAGnAAGTCCnnTTGnTTnnACTTnAGTTnnnGAACGnnTGGnnAnnnCAnTT
TnnnGAATGTnnAnnTCnnnnnGnnnTTACTnAGTnGnnnnGGCCAnTTTTnTGGTAAGC
AGAACTGGCGAGnnGnnnTGCnTGnnTCGnATTCCnnGnTTTnnnnGTAnnnGTnnCGTn
CnnTGnTTTAACnnnCTTAnGCAnAnnTACnnGnTAACnnnnnnn"""

In [None]:
# Drop every 'N', 'n' and newline character from the EMBOSS consensus text.
emboss_new = emboss_test.translate(str.maketrans('', '', 'Nn\n'))

In [None]:
# Length of the consensus once 'N'/'n' and newlines are removed.
len(emboss_new)

In [None]:
# Display the cleaned consensus string.
emboss_new

##### Run loop over EMBOSS cons -identity option from 0 to 100 to get a range of different consensus sequences [x]
##### Compare each consensus with the GENEIOUS consensus and with ~20 randomly selected reads (same reads for each consensus)
###### Extract the score and determine what identity gives the best mean match score

In [None]:
# Take the record with id 'test' from test1.fasta and strip the ambiguous
# 'N'/'n' characters (and any newlines) from its sequence.
test_dict = SeqIO.to_dict(SeqIO.parse("../../analysis/Consensus/20171103_FAH15473/barcode02/test1.fasta","fasta"))
emboss_test = str(test_dict['test'].seq)
emboss_new = emboss_test.replace('N','').replace('n','').replace('\n','')

# Globally align the cleaned consensus against each read in 100_reads.fasta
# and keep only the integer alignment score (score_only=True).
# The per-iteration `tmp` list was never read and has been removed.
scores_matrix = []
test100 = SeqIO.to_dict(SeqIO.parse("../../analysis/Alignment/20171103_FAH15473/barcode02/100_reads.fasta","fasta"))
for key in test100:
    alignments = pairwise2.align.globalxx(emboss_new, test100[key].seq, score_only=True)
    scores_matrix.append(int(alignments))

In [None]:
%%writefile ../consensus_test.py

import Bio
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord as SR
from Bio.Blast import NCBIXML
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
from Bio import AlignIO
from Bio.Align import AlignInfo
import numpy as np
import pandas as pd
from pandas import DataFrame as df
# import seaborn as sns
import csv
import matplotlib
import matplotlib.pyplot as plt
import subprocess
import os
from shutil import copy
import random
import warnings
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
# from ipysankeywidget import SankeyWidget
warnings.filterwarnings("ignore")
import argparse

# ANSI colour escape codes used for terminal output.
WHITE='\033[1;37m'
BLUE='\033[0;34m'
RED='\033[0;31m'
NC='\033[0m'
GREEN='\033[0;32m'
PURPLE='\033[1;35m'

parser = argparse.ArgumentParser(description="""
This program strips the ambiguous bases (N/n) from the first sequence
of a fasta file, globally aligns it against a fixed set of reads and
saves the alignment scores
""")
group = parser.add_mutually_exclusive_group()
group.add_argument("--verbose", "-v", "--v", action="store_true")
group.add_argument("--quiet", "-q", "--q", action="store_true")
parser.add_argument("input_file", help="The input file for extraction")
args = parser.parse_args()

if args.verbose:
    print(RED + "Input file is " + args.input_file + WHITE)

# Only the first record of the input file is scored.
test_dict = SeqIO.to_dict(SeqIO.parse(args.input_file,"fasta"))
keys = list(test_dict)
if not keys:
    parser.error("no sequences found in " + args.input_file)
emboss_test = str(test_dict[keys[0]].seq)
emboss_new = emboss_test.replace('N','').replace('n','').replace('\n','')
print(emboss_new)

# Score the cleaned consensus against every read of the fixed read set;
# score_only=True makes globalxx return just the alignment score.
scores_matrix = []
test100 = SeqIO.to_dict(SeqIO.parse("analysis/Alignment/20171103_FAH15473/barcode02/100_reads.fasta","fasta"))
for key in test100:
    alignments = pairwise2.align.globalxx(emboss_new, test100[key].seq, score_only=True)
    scores_matrix.append(int(alignments))

print(scores_matrix)
mean = np.mean(scores_matrix)
median=np.median(scores_matrix)

# All outputs share a prefix taken from the first 47 characters of the input path.
out_prefix = args.input_file[:47]

# Append one summary line per run; the context manager closes the handle,
# which the previous bare open() never did.
with open(out_prefix+"scores.txt","a+") as f:
    f.write(">%s\t%s\t%s\t\n" % (keys[0], mean, median))

# One score per row, without index or header.
tmp = pd.DataFrame(scores_matrix)
tmp.to_csv(out_prefix+"%s.csv" % keys[0],index=False,header=False)


# Save the cleaned consensus under its original record id.
with open((out_prefix+"%s_new.fasta" % keys[0]),"w+") as handle:
    handle.write(">%s\n" % keys[0] +emboss_new)

In [None]:
# Write one id per line to an 'ids.txt' file whose path is derived from
# args.input_file with its last 29 characters removed.
# NOTE(review): this cell relies on `args` and `ids` already existing in the
# kernel; in the visible part of the notebook `args` is only created inside
# %%writefile scripts - TODO confirm this cell is still needed.
with open(args.input_file[:-29]+'ids.txt','w') as handle:
    handle.writelines("%s\n" % name for name in ids)

In [None]:
# Read the whole score log into memory. The context manager closes the
# handle; the former `if f.mode == "r"` check was always true and is dropped.
with open("../../analysis/Consensus/20171103_FAH15473/barcode02/scores.txt","r") as f:
    contents=f.read()

In [None]:
# Flatten the score log: tabs become commas and newlines are dropped, then
# the text is cut at each '>' so every entry is one comma-separated string.
flattened = contents.replace("\t", ",").replace("\n", "")
tmp = flattened.replace(",>", ">").split(">")[1:]
print(len(tmp))

In [None]:
# Each entry is "<name>,<mean>,<median>,"; the index is the integer that
# follows the first four characters of the name.
indices = []
data = []
for item in tmp:
    fields = item.split(",")
    indices.append(int(fields[0][4:]))
    data.append((float(fields[1]), float(fields[2])))
frame = pd.DataFrame(data=data, index=indices, columns=["Mean Score","Median Score"])

# Index label of the row with the highest mean score, then the row count.
best_row = frame.loc[frame['Mean Score'].idxmax()]
print(best_row.name)
print(len(frame))

In [None]:
# Order the rows by index, smallest first (sort_index defaults to ascending).
frame = frame.sort_index()

In [None]:
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#     print(frame)
    
#     # Index 0 is the Geneious-generated Consensus

In [None]:
from scipy.stats import ranksums

# Per-read score tables written for each consensus (one score per row, no header).
geneious = pd.read_csv("../../analysis/Consensus/20171103_FAH15473/barcode02/test0.csv",header=None)
test20 = pd.read_csv("../../analysis/Consensus/20171103_FAH15473/barcode02/test20.csv",header=None)
test1 = pd.read_csv("../../analysis/Consensus/20171103_FAH15473/barcode02/test1.csv",header=None)
test52 = pd.read_csv("../../analysis/Consensus/20171103_FAH15473/barcode02/test52.csv",header=None)
test75 = pd.read_csv("../../analysis/Consensus/20171103_FAH15473/barcode02/test75.csv",header=None)

# Rank-sum comparison of the test0 scores against each other table in turn.
for candidate in (test1, test20, test52, test75):
    print(ranksums(geneious,candidate))

In [None]:
# Collect the first (only) column of each score table under its own label.
plot = pd.DataFrame(columns=['geneious','test1','test20','test52','test75'])
for label, table in (('geneious', geneious),
                     ('test1', test1),
                     ('test20', test20),
                     ('test52', test52),
                     ('test75', test75)):
    plot[label] = table[0]

In [None]:
# One violin per column of `plot`.
ax = sns.violinplot(data=plot)
# Title typo fixed ("methos" -> "methods") to match the identity plot further down.
ax.set_title("Different consensus sequence generation methods for 20171103_FAH15473/barcode02", fontsize=12)
ax.set_xlabel("Consensus Generation Method", fontsize=10)
ax.set_ylabel("Score", fontsize=10)

plt.show()

In [None]:
# Records of 100_reads.fasta, keyed by record id.
test100 = SeqIO.to_dict(SeqIO.parse("../../analysis/Alignment/20171103_FAH15473/barcode02/100_reads.fasta","fasta"))
# Records of test0_new.fasta, keyed by record id.
test_dict = SeqIO.to_dict(SeqIO.parse("../../analysis/Consensus/20171103_FAH15473/barcode02/test0_new.fasta","fasta"))

In [None]:
# Globally align emboss_new against every read in test100, keeping the full
# alignment results (no score_only) alongside the read ids in the same order.
# NOTE(review): this aligns `emboss_new` from an earlier cell, not the
# `test_dict` record loaded just above - confirm that is intended.
tmp = list(test100)
scores_matrix = [
    pairwise2.align.globalxx(emboss_new, test100[read_id].seq)
    for read_id in tmp
]

1. Import paf
2. Extract columns 10/11 for each row
3. Save the alignment percentage identity to a list

In [None]:
# Parse the PAF file line by line: the first column is the row key, and
# columns 10 and 11 (1-based) are stored as integers.
# Reading via a context manager closes the handle, splitting on tabs only
# keeps the column positions right even if a field contains a comma, and the
# last record is no longer dropped when the file lacks a trailing newline.
tmp_dict = {}
with open("../../analysis/Consensus/20171103_FAH15473/barcode02/test0.paf","r") as f:
    for line in f:
        if not line.strip():
            continue
        fields = line.rstrip("\n").split("\t")
        tmp_dict[fields[0]] = [int(fields[9]),int(fields[10])]
geneious = pd.DataFrame.from_dict(tmp_dict,orient='index',columns=['matching bases','total bases'])
# Percentage of column 10 relative to column 11 for each row.
geneious['alignment identity'] = 100*(geneious['matching bases']/geneious['total bases'])

In [None]:
# Parse the PAF file line by line: the first column is the row key, and
# columns 10 and 11 (1-based) are stored as integers.
# Reading via a context manager closes the handle, splitting on tabs only
# keeps the column positions right even if a field contains a comma, and the
# last record is no longer dropped when the file lacks a trailing newline.
tmp_dict = {}
with open("../../analysis/Consensus/20171103_FAH15473/barcode02/test20.paf","r") as f:
    for line in f:
        if not line.strip():
            continue
        fields = line.rstrip("\n").split("\t")
        tmp_dict[fields[0]] = [int(fields[9]),int(fields[10])]
test20 = pd.DataFrame.from_dict(tmp_dict,orient='index',columns=['matching bases','total bases'])
# Percentage of column 10 relative to column 11 for each row.
test20['alignment identity'] = 100*(test20['matching bases']/test20['total bases'])

In [None]:
# Put the alignment identity of each consensus side by side, one column per method.
plot = pd.DataFrame(columns=['geneious','test20'])
for label, identities in (('geneious', geneious), ('test20', test20)):
    plot[label] = identities['alignment identity']

# One violin per consensus method.
ax = sns.violinplot(data=plot)
ax.set_xlabel("Consensus Generation Method", fontsize=10)
ax.set_ylabel("Alignment Percentage Identity", fontsize=10)
ax.set_title("Different consensus sequence generation methods for 20171103_FAH15473/barcode02", fontsize=12)

plt.show()

In [None]:
%%writefile ../cleanup.py

import Bio
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord as SR
from Bio.Blast import NCBIXML
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
from Bio import AlignIO
from Bio.Align import AlignInfo
import numpy as np
import pandas as pd
from pandas import DataFrame as df
# import seaborn as sns
import csv
import matplotlib
import matplotlib.pyplot as plt
import subprocess
import os
from shutil import copy
import random
import warnings
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
# from ipysankeywidget import SankeyWidget
warnings.filterwarnings("ignore")
import argparse


# The description now matches what the script does; it was previously copied
# from the read-extraction script.
parser = argparse.ArgumentParser(description="""
This program removes the ambiguous bases (N/n) from the first sequence
of a fasta file and saves the result as clean_consensus.fasta
""")
group = parser.add_mutually_exclusive_group()
group.add_argument("--verbose", "-v", "--v", action="store_true")
group.add_argument("--quiet", "-q", "--q", action="store_true")
parser.add_argument("input_file", help="The input file for extraction")
args = parser.parse_args()

if args.verbose:
    print("Input file is " + args.input_file + "\n")

# Only the first record of the input file is cleaned.
test_dict = SeqIO.to_dict(SeqIO.parse(args.input_file,"fasta"))
keys = list(test_dict)
if not keys:
    # Report an empty input through argparse instead of a bare IndexError.
    parser.error("no sequences found in " + args.input_file)
emboss_test = str(test_dict[keys[0]].seq)
emboss_new = emboss_test.replace('N','').replace('n','').replace('\n','')

# The output goes next to the input: the first 47 characters of the input
# path are reused as the prefix.
with open(args.input_file[:47]+"clean_consensus.fasta","w+") as handle:
    handle.write(">%s\n" % keys[0] + emboss_new)

In [None]:
%%writefile ../1000_minimap_result.py

import Bio
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord as SR
from Bio.Blast import NCBIXML
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
from Bio import AlignIO
from Bio.Align import AlignInfo
import numpy as np
import pandas as pd
from pandas import DataFrame as df
# import seaborn as sns
import csv
import matplotlib
import matplotlib.pyplot as plt
import subprocess
import os
from shutil import copy
import random
import warnings
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
# from ipysankeywidget import SankeyWidget
warnings.filterwarnings("ignore")
import argparse


# The description now matches what the script does; it was previously copied
# from the read-extraction script.
parser = argparse.ArgumentParser(description="""
This program tallies the sixth tab-separated column of the input file
(the target name in minimap2 PAF output) across reads and saves the
resulting distribution to a csv file
""")
group = parser.add_mutually_exclusive_group()
group.add_argument("--verbose", "-v", "--v", action="store_true")
group.add_argument("--quiet", "-q", "--q", action="store_true")
parser.add_argument("input_file", help="The input file for extraction")
args = parser.parse_args()

if args.verbose:
    print("Input file is " + args.input_file + "\n")

# Map the first column of each line to its sixth column; a repeated first
# column keeps the last value seen. The context manager closes the handle,
# splitting on tabs only keeps column positions right when a field contains
# a comma, and the final line is kept even without a trailing newline.
tmp_dict = {}
with open(args.input_file,"r") as f:
    for line in f:
        if not line.strip():
            continue
        fields = line.rstrip("\n").split("\t")
        tmp_dict[fields[0]] = str(fields[5])

# Count how many keys point at each sixth-column value.
count_dict = {}
for target in tmp_dict.values():
    count_dict[target] = count_dict.get(target, 0) + 1

tmp = pd.DataFrame.from_dict(count_dict,orient='index',columns=["Count"])
# Percentages are relative to a fixed total of 1000, not to the number of
# lines actually present in the file.
tmp["Percentage Match"] = 100*tmp["Count"]/1000
tmp.index.names = ['analysis/Consensus/'+args.input_file[19:-18]]
tmp = tmp.sort_values(by="Count",ascending=False)

if args.verbose:
    print(tmp)

# The csv is written next to the input, replacing its last 18 characters.
tmp.to_csv(args.input_file[:-18]+'match_distribution.csv',sep=',')

    1. Move all base files from Length_Restricted to the appropriate location
    2. For each file, alter the record.name for all elements to ensure each record is associated with the barcode/species/strain (as below) and rewrite the file
    3. Create the equal_number files and combined all-read files in the tree as was done above

In [None]:
%%writefile ../record_name.py

import Bio
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord as SR
from Bio.Blast import NCBIXML
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
from Bio import AlignIO
from Bio.Align import AlignInfo
import numpy as np
import pandas as pd
from pandas import DataFrame as df
# import seaborn as sns
import csv
import matplotlib
import matplotlib.pyplot as plt
import subprocess
import os
from shutil import copy
import random
import warnings
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
# from ipysankeywidget import SankeyWidget
warnings.filterwarnings("ignore")
import argparse


parser = argparse.ArgumentParser(description="""
This program alters the record.name of entries in a fasta file
""")
# --verbose and --quiet cannot be given together.
verbosity = parser.add_mutually_exclusive_group()
verbosity.add_argument("--verbose", "-v", "--v", action="store_true")
verbosity.add_argument("--quiet", "-q", "--q", action="store_true")
parser.add_argument("input_file", help="The input file for extraction")
args = parser.parse_args()

if args.verbose:
    print("Input file is " + args.input_file + "\n")


def _with_path_label(records):
    """Yield each record with its description set from the input path.

    The description becomes the third-from-last and second-from-last
    '/'-separated components of args.input_file, joined by '_'.
    """
    for record in records:
        parts = args.input_file.split('/')
        record.description = parts[-3] + '_' + parts[-2]
        yield record


# The output replaces the last 15 characters of the input path with
# 'labelled_read_pool.fasta'.
with open(args.input_file) as original, open(args.input_file[:-15]+'labelled_read_pool.fasta', 'w') as corrected:
    SeqIO.write(_with_path_label(SeqIO.parse(original, 'fasta')), corrected, 'fasta')

### Basidiomycota

In [2]:
# Labelled read pool of each Cryptococcus directory, keyed by read id.
gattii = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Basidiomycota/Tremellomycetes/Tremellales/Tremellaceae/Cryptococcus/gattii/labelled_read_pool.fasta", "fasta"))
neoformans = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Basidiomycota/Tremellomycetes/Tremellales/Tremellaceae/Cryptococcus/neoformans/labelled_read_pool.fasta", "fasta"))
zero = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Basidiomycota/Tremellomycetes/Tremellales/Tremellaceae/Cryptococcus/zero/labelled_read_pool.fasta", "fasta"))
lens = [len(pool) for pool in (gattii, neoformans, zero)]
smallest = min(lens)
print(smallest)

# Randomly draw as many read ids from every pool as the smallest pool holds.
new_gattii_keys = random.sample(list(gattii), smallest)
new_neoformans_keys = random.sample(list(neoformans), smallest)
new_zero_keys = random.sample(list(zero), smallest)

new_gattii = dict((key, gattii[key]) for key in new_gattii_keys)
new_neoformans = dict((key, neoformans[key]) for key in new_neoformans_keys)
new_zero = dict((key, zero[key]) for key in new_zero_keys)

for pool in (new_gattii, new_neoformans, new_zero):
    print(len(pool))

# Union of all reads; on a shared id the later pool wins.
full = {**gattii, **neoformans, **zero}
print(len(full))

# Equal-sized subsamples go next to their source; the combined pool is
# written at the three enclosing directory levels.
SeqIO.write(new_gattii.values(), "../../analysis/Fungi/Basidiomycota/Tremellomycetes/Tremellales/Tremellaceae/Cryptococcus/gattii/equal_read_pool.fasta", "fasta")
SeqIO.write(new_neoformans.values(), "../../analysis/Fungi/Basidiomycota/Tremellomycetes/Tremellales/Tremellaceae/Cryptococcus/neoformans/equal_read_pool.fasta", "fasta")
SeqIO.write(new_zero.values(), "../../analysis/Fungi/Basidiomycota/Tremellomycetes/Tremellales/Tremellaceae/Cryptococcus/zero/equal_read_pool.fasta", "fasta")
SeqIO.write(full.values(), "../../analysis/Fungi/Basidiomycota/Tremellomycetes/Tremellales/Tremellaceae/Cryptococcus/full_read_pool.fasta", "fasta")
SeqIO.write(full.values(), "../../analysis/Fungi/Basidiomycota/Tremellomycetes/Tremellales/Tremellaceae/full_read_pool.fasta", "fasta")
SeqIO.write(full.values(), "../../analysis/Fungi/Basidiomycota/Tremellomycetes/Tremellales/full_read_pool.fasta", "fasta")

23660
23660
23660
23660
215631


215631

In [3]:
# Full read pool of each Basidiomycota order directory, keyed by read id.
A = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Basidiomycota/Agaricomycetes/Agaricales/full_read_pool.fasta", "fasta"))
M = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Basidiomycota/Exobasidiomycetes/Microstromatales/full_read_pool.fasta", "fasta"))
S = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Basidiomycota/Microbotryomycetes/Sporidiobolales/full_read_pool.fasta", "fasta"))
P = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Basidiomycota/Pucciniomycetes/Pucciniales/full_read_pool.fasta", "fasta"))
T = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Basidiomycota/Tremellomycetes/Tremellales/full_read_pool.fasta", "fasta"))
lens = [len(pool) for pool in (A, M, S, P, T)]
smallest = min(lens)
print(smallest)

# Randomly draw as many read ids from every pool as the smallest pool holds.
new_A_keys = random.sample(list(A), smallest)
new_M_keys = random.sample(list(M), smallest)
new_S_keys = random.sample(list(S), smallest)
new_P_keys = random.sample(list(P), smallest)
new_T_keys = random.sample(list(T), smallest)

new_A = dict((key, A[key]) for key in new_A_keys)
new_M = dict((key, M[key]) for key in new_M_keys)
new_S = dict((key, S[key]) for key in new_S_keys)
new_P = dict((key, P[key]) for key in new_P_keys)
new_T = dict((key, T[key]) for key in new_T_keys)

for pool in (new_A, new_M, new_S, new_P, new_T):
    print(len(pool))

# Union of all reads; on a shared id the later pool wins.
full = {**A, **M, **S, **P, **T}
print(len(full))

# Equal-sized subsamples are written one directory above their source pool;
# the combined pool goes to the Basidiomycota directory.
SeqIO.write(new_A.values(), "../../analysis/Fungi/Basidiomycota/Agaricomycetes/equal_read_pool.fasta", "fasta")
SeqIO.write(new_M.values(), "../../analysis/Fungi/Basidiomycota/Exobasidiomycetes/equal_read_pool.fasta", "fasta")
SeqIO.write(new_S.values(), "../../analysis/Fungi/Basidiomycota/Microbotryomycetes/equal_read_pool.fasta", "fasta")
SeqIO.write(new_P.values(), "../../analysis/Fungi/Basidiomycota/Pucciniomycetes/equal_read_pool.fasta", "fasta")
SeqIO.write(new_T.values(), "../../analysis/Fungi/Basidiomycota/Tremellomycetes/equal_read_pool.fasta", "fasta")
SeqIO.write(full.values(), "../../analysis/Fungi/Basidiomycota/full_read_pool.fasta", "fasta")

46171
46171
46171
46171
46171
46171
610030


610030

### Ascomycota

In [4]:
# Full read pool of each Dothideomycetes family directory, keyed by read id.
B = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Dothideomycetes/Botryosphaeriales/Botryosphaeriaceae/full_read_pool.fasta", "fasta"))
C = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Dothideomycetes/Capnodiales/Mycosphaerellaceae/full_read_pool.fasta", "fasta"))
P = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Dothideomycetes/Pleosporales/Pleosporaceae/full_read_pool.fasta", "fasta"))
lens = [len(pool) for pool in (B, C, P)]
smallest = min(lens)
print(smallest)

# Randomly draw as many read ids from every pool as the smallest pool holds.
new_B_keys = random.sample(list(B), smallest)
new_C_keys = random.sample(list(C), smallest)
new_P_keys = random.sample(list(P), smallest)

new_B = dict((key, B[key]) for key in new_B_keys)
new_C = dict((key, C[key]) for key in new_C_keys)
new_P = dict((key, P[key]) for key in new_P_keys)

for pool in (new_B, new_C, new_P):
    print(len(pool))

# Union of all reads; on a shared id the later pool wins.
full = {**B, **C, **P}
print(len(full))

# Equal-sized subsamples are written one directory above their source pool;
# the combined pool goes to the Dothideomycetes directory.
SeqIO.write(new_B.values(), "../../analysis/Fungi/Ascomycota/Dothideomycetes/Botryosphaeriales/equal_read_pool.fasta", "fasta")
SeqIO.write(new_C.values(), "../../analysis/Fungi/Ascomycota/Dothideomycetes/Capnodiales/equal_read_pool.fasta", "fasta")
SeqIO.write(new_P.values(), "../../analysis/Fungi/Ascomycota/Dothideomycetes/Pleosporales/equal_read_pool.fasta", "fasta")
SeqIO.write(full.values(), "../../analysis/Fungi/Ascomycota/Dothideomycetes/full_read_pool.fasta", "fasta")

44257
44257
44257
44257
267361


267361

In [5]:
# Labelled read pool of each Aspergillus directory, keyed by read id.
F = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Eurotiomycetes/Eurotiales/Trichocomaceae/Aspergillus/flavus/labelled_read_pool.fasta", "fasta"))
N = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Eurotiomycetes/Eurotiales/Trichocomaceae/Aspergillus/niger/labelled_read_pool.fasta", "fasta"))
S = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Eurotiomycetes/Eurotiales/Trichocomaceae/Aspergillus/sp./labelled_read_pool.fasta", "fasta"))
lens = [len(F), len(N), len(S)]
print(min(lens))

# Randomly draw as many read ids from every pool as the smallest pool holds.
new_F_keys = random.sample(list(F), min(lens))
new_N_keys = random.sample(list(N), min(lens))
new_S_keys = random.sample(list(S), min(lens))

new_F = {key: F[key] for key in new_F_keys}
new_N = {key: N[key] for key in new_N_keys}
new_S = {key: S[key] for key in new_S_keys}

print(len(new_F))
print(len(new_N))
print(len(new_S))

# Union of all reads; on a shared id the later pool wins.
full = dict(F)
full.update(N)
full.update(S)
print(len(full))

# Each subsample is written next to the labelled pool it was drawn from.
# Previously all three were written to flavus/equal_read_pool.fasta, so the
# flavus and niger subsamples were overwritten and niger/ and sp./ got no file.
SeqIO.write(new_F.values(), "../../analysis/Fungi/Ascomycota/Eurotiomycetes/Eurotiales/Trichocomaceae/Aspergillus/flavus/equal_read_pool.fasta", "fasta")
SeqIO.write(new_N.values(), "../../analysis/Fungi/Ascomycota/Eurotiomycetes/Eurotiales/Trichocomaceae/Aspergillus/niger/equal_read_pool.fasta", "fasta")
SeqIO.write(new_S.values(), "../../analysis/Fungi/Ascomycota/Eurotiomycetes/Eurotiales/Trichocomaceae/Aspergillus/sp./equal_read_pool.fasta", "fasta")
SeqIO.write(full.values(), "../../analysis/Fungi/Ascomycota/Eurotiomycetes/Eurotiales/Trichocomaceae/Aspergillus/full_read_pool.fasta", "fasta")
SeqIO.write(full.values(), "../../analysis/Fungi/Ascomycota/Eurotiomycetes/Eurotiales/Trichocomaceae/full_read_pool.fasta", "fasta")

39988
39988
39988
39988
150734


150734

In [6]:
# Full read pool of each Eurotiomycetes family directory, keyed by read id.
C = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Eurotiomycetes/Chaetothyriales/Herpotrichiellaceae/full_read_pool.fasta", "fasta"))
E = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Eurotiomycetes/Eurotiales/Trichocomaceae/full_read_pool.fasta", "fasta"))
lens = [len(pool) for pool in (C, E)]
smallest = min(lens)
print(smallest)

# Randomly draw as many read ids from every pool as the smallest pool holds.
new_C_keys = random.sample(list(C), smallest)
new_E_keys = random.sample(list(E), smallest)

new_C = dict((key, C[key]) for key in new_C_keys)
new_E = dict((key, E[key]) for key in new_E_keys)

for pool in (new_C, new_E):
    print(len(pool))

# Union of all reads; on a shared id the later pool wins.
full = {**C, **E}
print(len(full))

# Equal-sized subsamples are written one directory above their source pool;
# the combined pool goes to the Eurotiomycetes directory.
SeqIO.write(new_C.values(), "../../analysis/Fungi/Ascomycota/Eurotiomycetes/Chaetothyriales/equal_read_pool.fasta", "fasta")
SeqIO.write(new_E.values(), "../../analysis/Fungi/Ascomycota/Eurotiomycetes/Eurotiales/equal_read_pool.fasta", "fasta")
SeqIO.write(full.values(), "../../analysis/Fungi/Ascomycota/Eurotiomycetes/full_read_pool.fasta", "fasta")

25917
25917
25917
176651


176651

In [7]:
# Labelled read pool of each Tapesia yallundae directory, keyed by read id.
t29 = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Leotiomycetes/Helotiales/Dermateaceae/Tapesia/yallundae/CCL029/labelled_read_pool.fasta", "fasta"))
t31 = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Leotiomycetes/Helotiales/Dermateaceae/Tapesia/yallundae/CCL031/labelled_read_pool.fasta", "fasta"))
lens = [len(pool) for pool in (t29, t31)]
smallest = min(lens)
print(smallest)

# Randomly draw as many read ids from every pool as the smallest pool holds.
new_t29_keys = random.sample(list(t29), smallest)
new_t31_keys = random.sample(list(t31), smallest)

new_t29 = dict((key, t29[key]) for key in new_t29_keys)
new_t31 = dict((key, t31[key]) for key in new_t31_keys)

for pool in (new_t29, new_t31):
    print(len(pool))

# Union of all reads; on a shared id the later pool wins.
full = {**t29, **t31}
print(len(full))

# Equal-sized subsamples go next to their source; the combined pool is
# written at every enclosing directory level up to Leotiomycetes.
SeqIO.write(new_t29.values(), "../../analysis/Fungi/Ascomycota/Leotiomycetes/Helotiales/Dermateaceae/Tapesia/yallundae/CCL029/equal_read_pool.fasta", "fasta")
SeqIO.write(new_t31.values(), "../../analysis/Fungi/Ascomycota/Leotiomycetes/Helotiales/Dermateaceae/Tapesia/yallundae/CCL031/equal_read_pool.fasta", "fasta")
SeqIO.write(full.values(), "../../analysis/Fungi/Ascomycota/Leotiomycetes/Helotiales/Dermateaceae/Tapesia/yallundae/full_read_pool.fasta", "fasta")
SeqIO.write(full.values(), "../../analysis/Fungi/Ascomycota/Leotiomycetes/Helotiales/Dermateaceae/Tapesia/full_read_pool.fasta", "fasta")
SeqIO.write(full.values(), "../../analysis/Fungi/Ascomycota/Leotiomycetes/Helotiales/Dermateaceae/full_read_pool.fasta", "fasta")
SeqIO.write(full.values(), "../../analysis/Fungi/Ascomycota/Leotiomycetes/Helotiales/full_read_pool.fasta", "fasta")
SeqIO.write(full.values(), "../../analysis/Fungi/Ascomycota/Leotiomycetes/full_read_pool.fasta", "fasta")

49481
49481
49481
105070


105070

In [8]:
# Labelled read pool of each Yamadazyma directory, keyed by read id.
m = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Debaryomycetaceae/Yamadazyma/mexicana/labelled_read_pool.fasta", "fasta"))
s = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Debaryomycetaceae/Yamadazyma/scolyti/labelled_read_pool.fasta", "fasta"))
lens = [len(pool) for pool in (m, s)]
smallest = min(lens)
print(smallest)

# Randomly draw as many read ids from every pool as the smallest pool holds.
new_m_keys = random.sample(list(m), smallest)
new_s_keys = random.sample(list(s), smallest)

new_m = dict((key, m[key]) for key in new_m_keys)
new_s = dict((key, s[key]) for key in new_s_keys)

for pool in (new_m, new_s):
    print(len(pool))

# Union of all reads; on a shared id the later pool wins.
full = {**m, **s}
print(len(full))

# Equal-sized subsamples go next to their source; the combined pool goes to
# the Yamadazyma directory.
SeqIO.write(new_m.values(), "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Debaryomycetaceae/Yamadazyma/mexicana/equal_read_pool.fasta", "fasta")
SeqIO.write(new_s.values(), "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Debaryomycetaceae/Yamadazyma/scolyti/equal_read_pool.fasta", "fasta")
SeqIO.write(full.values(), "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Debaryomycetaceae/Yamadazyma/full_read_pool.fasta", "fasta")

34841
34841
34841
77210


77210

In [9]:
# Full read pool of each Debaryomycetaceae genus directory, keyed by read id.
m = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Debaryomycetaceae/Meyerozyma/full_read_pool.fasta", "fasta"))
y = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Debaryomycetaceae/Yamadazyma/full_read_pool.fasta", "fasta"))
lens = [len(pool) for pool in (m, y)]
smallest = min(lens)
print(smallest)

# Randomly draw as many read ids from every pool as the smallest pool holds.
new_m_keys = random.sample(list(m), smallest)
new_y_keys = random.sample(list(y), smallest)

new_m = dict((key, m[key]) for key in new_m_keys)
new_y = dict((key, y[key]) for key in new_y_keys)

for pool in (new_m, new_y):
    print(len(pool))

# Union of all reads; on a shared id the later pool wins.
full = {**m, **y}
print(len(full))

# Equal-sized subsamples go next to their source; the combined pool goes to
# the Debaryomycetaceae directory.
SeqIO.write(new_m.values(), "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Debaryomycetaceae/Meyerozyma/equal_read_pool.fasta", "fasta")
SeqIO.write(new_y.values(), "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Debaryomycetaceae/Yamadazyma/equal_read_pool.fasta", "fasta")
SeqIO.write(full.values(), "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Debaryomycetaceae/full_read_pool.fasta", "fasta")

18944
18944
18944
96154


96154

In [10]:
# Full read pool of each Dipodascaceae genus directory, keyed by read id.
ga = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Dipodascaceae/Galactomyces/full_read_pool.fasta", "fasta"))
ge = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Dipodascaceae/Geotrichum/full_read_pool.fasta", "fasta"))
y = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Dipodascaceae/Yarrowia/full_read_pool.fasta", "fasta"))
lens = [len(pool) for pool in (ga, ge, y)]
smallest = min(lens)
print(smallest)

# Randomly draw as many read ids from every pool as the smallest pool holds.
new_ga_keys = random.sample(list(ga), smallest)
new_ge_keys = random.sample(list(ge), smallest)
new_y_keys = random.sample(list(y), smallest)

new_ga = dict((key, ga[key]) for key in new_ga_keys)
new_ge = dict((key, ge[key]) for key in new_ge_keys)
new_y = dict((key, y[key]) for key in new_y_keys)

for pool in (new_ga, new_ge, new_y):
    print(len(pool))

# Union of all reads; on a shared id the later pool wins.
full = {**ga, **ge, **y}
print(len(full))

# Equal-sized subsamples go next to their source; the combined pool goes to
# the Dipodascaceae directory.
SeqIO.write(new_ga.values(), "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Dipodascaceae/Galactomyces/equal_read_pool.fasta", "fasta")
SeqIO.write(new_ge.values(), "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Dipodascaceae/Geotrichum/equal_read_pool.fasta", "fasta")
SeqIO.write(new_y.values(), "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Dipodascaceae/Yarrowia/equal_read_pool.fasta", "fasta")
SeqIO.write(full.values(), "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Dipodascaceae/full_read_pool.fasta", "fasta")

7805
7805
7805
7805
68773


68773

In [11]:
# Equalise the Clavispora and Kodamaea read pools and merge them into one
# Metschnikowiaceae-level pool.
# NOTE(review): sampling uses the module-level `random` state and no seed is set
# in this cell, so the chosen reads may differ between runs.
c = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Metschnikowiaceae/Clavispora/full_read_pool.fasta", "fasta"))
k = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Metschnikowiaceae/Kodamaea/full_read_pool.fasta", "fasta"))

# Size of the smallest pool: every pool is subsampled down to this many reads.
lens = [len(pool) for pool in (c, k)]
smallest_pool_size = min(lens)
print(smallest_pool_size)

# Draw read ids without replacement (draw order: c, then k) and keep those records.
new_c_keys = random.sample(list(c), smallest_pool_size)
new_c = {read_id: c[read_id] for read_id in new_c_keys}
new_k_keys = random.sample(list(k), smallest_pool_size)
new_k = {read_id: k[read_id] for read_id in new_k_keys}

for subsample in (new_c, new_k):
    print(len(subsample))

# Union of the complete pools; a read id present in several pools keeps the
# record from the last pool merged.
full = {}
for pool in (c, k):
    full.update(pool)
print(len(full))

for subsample, destination in (
    (new_c, "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Metschnikowiaceae/Clavispora/equal_read_pool.fasta"),
    (new_k, "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Metschnikowiaceae/Kodamaea/equal_read_pool.fasta"),
):
    SeqIO.write(subsample.values(), destination, "fasta")
# Kept as the final bare expression so its return value stays the cell's displayed result.
SeqIO.write(full.values(), "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Metschnikowiaceae/full_read_pool.fasta", "fasta")

36478
36478
36478
168414


168414

In [12]:
# Equalise the membranifaciens and kudriavzevii labelled read pools and merge
# them; the merged pool is written to both the Pichia and the Pichiaceae
# directories.
# NOTE(review): sampling uses the module-level `random` state and no seed is set
# in this cell, so the chosen reads may differ between runs.
m = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Pichiaceae/Pichia/membranifaciens/labelled_read_pool.fasta", "fasta"))
k = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Pichiaceae/Pichia/kudriavzevii/labelled_read_pool.fasta", "fasta"))

# Size of the smallest pool: every pool is subsampled down to this many reads.
lens = [len(pool) for pool in (m, k)]
smallest_pool_size = min(lens)
print(smallest_pool_size)

# Draw read ids without replacement (draw order: m, then k) and keep those records.
new_m_keys = random.sample(list(m), smallest_pool_size)
new_m = {read_id: m[read_id] for read_id in new_m_keys}
new_k_keys = random.sample(list(k), smallest_pool_size)
new_k = {read_id: k[read_id] for read_id in new_k_keys}

for subsample in (new_m, new_k):
    print(len(subsample))

# Union of the complete pools; a read id present in several pools keeps the
# record from the last pool merged.
full = {}
for pool in (m, k):
    full.update(pool)
print(len(full))

for subsample, destination in (
    (new_m, "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Pichiaceae/Pichia/membranifaciens/equal_read_pool.fasta"),
    (new_k, "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Pichiaceae/Pichia/kudriavzevii/equal_read_pool.fasta"),
    (full, "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Pichiaceae/Pichia/full_read_pool.fasta"),
):
    SeqIO.write(subsample.values(), destination, "fasta")
# Same merged pool again, one directory up. Kept as the final bare expression so
# its return value stays the cell's displayed result.
SeqIO.write(full.values(), "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Pichiaceae/full_read_pool.fasta", "fasta")

26937
26937
26937
60181


60181

In [13]:
# Equalise the five Candida labelled read pools (albicans, metapsilosis,
# orthopsilosis, parapsilosis, zeylanoides) and merge them into one
# Candida-level pool.
# NOTE(review): sampling uses the module-level `random` state and no seed is set
# in this cell, so the chosen reads may differ between runs.
a = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Saccharomycetaceae/Candida/albicans/labelled_read_pool.fasta", "fasta"))
m = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Saccharomycetaceae/Candida/metapsilosis/labelled_read_pool.fasta", "fasta"))
o = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Saccharomycetaceae/Candida/orthopsilosis/labelled_read_pool.fasta", "fasta"))
p = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Saccharomycetaceae/Candida/parapsilosis/labelled_read_pool.fasta", "fasta"))
z = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Saccharomycetaceae/Candida/zeylanoides/labelled_read_pool.fasta", "fasta"))

# Size of the smallest pool: every pool is subsampled down to this many reads.
lens = [len(pool) for pool in (a, m, o, p, z)]
smallest_pool_size = min(lens)
print(smallest_pool_size)

# Draw read ids without replacement (draw order: a, m, o, p, z) and keep those records.
new_a_keys = random.sample(list(a), smallest_pool_size)
new_a = {read_id: a[read_id] for read_id in new_a_keys}
new_m_keys = random.sample(list(m), smallest_pool_size)
new_m = {read_id: m[read_id] for read_id in new_m_keys}
new_o_keys = random.sample(list(o), smallest_pool_size)
new_o = {read_id: o[read_id] for read_id in new_o_keys}
new_p_keys = random.sample(list(p), smallest_pool_size)
new_p = {read_id: p[read_id] for read_id in new_p_keys}
new_z_keys = random.sample(list(z), smallest_pool_size)
new_z = {read_id: z[read_id] for read_id in new_z_keys}

for subsample in (new_a, new_m, new_o, new_p, new_z):
    print(len(subsample))

# Union of the complete pools; a read id present in several pools keeps the
# record from the last pool merged.
full = {}
for pool in (a, m, o, p, z):
    full.update(pool)
print(len(full))

for subsample, destination in (
    (new_a, "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Saccharomycetaceae/Candida/albicans/equal_read_pool.fasta"),
    (new_m, "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Saccharomycetaceae/Candida/metapsilosis/equal_read_pool.fasta"),
    (new_o, "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Saccharomycetaceae/Candida/orthopsilosis/equal_read_pool.fasta"),
    (new_p, "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Saccharomycetaceae/Candida/parapsilosis/equal_read_pool.fasta"),
    (new_z, "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Saccharomycetaceae/Candida/zeylanoides/equal_read_pool.fasta"),
):
    SeqIO.write(subsample.values(), destination, "fasta")
# Kept as the final bare expression so its return value stays the cell's displayed result.
SeqIO.write(full.values(), "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Saccharomycetaceae/Candida/full_read_pool.fasta", "fasta")

25597
25597
25597
25597
25597
25597
168625


168625

In [14]:
# Equalise the marxianus and lactis labelled read pools and merge them into one
# Kluyveromyces-level pool.
# NOTE(review): sampling uses the module-level `random` state and no seed is set
# in this cell, so the chosen reads may differ between runs.
m = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Saccharomycetaceae/Kluyveromyces/marxianus/labelled_read_pool.fasta", "fasta"))
l = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Saccharomycetaceae/Kluyveromyces/lactis/labelled_read_pool.fasta", "fasta"))

# Size of the smallest pool: every pool is subsampled down to this many reads.
lens = [len(pool) for pool in (m, l)]
smallest_pool_size = min(lens)
print(smallest_pool_size)

# Draw read ids without replacement (draw order: m, then l) and keep those records.
new_m_keys = random.sample(list(m), smallest_pool_size)
new_m = {read_id: m[read_id] for read_id in new_m_keys}
new_l_keys = random.sample(list(l), smallest_pool_size)
new_l = {read_id: l[read_id] for read_id in new_l_keys}

for subsample in (new_m, new_l):
    print(len(subsample))

# Union of the complete pools; a read id present in several pools keeps the
# record from the last pool merged.
full = {}
for pool in (m, l):
    full.update(pool)
print(len(full))

for subsample, destination in (
    (new_m, "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Saccharomycetaceae/Kluyveromyces/marxianus/equal_read_pool.fasta"),
    (new_l, "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Saccharomycetaceae/Kluyveromyces/lactis/equal_read_pool.fasta"),
):
    SeqIO.write(subsample.values(), destination, "fasta")
# Kept as the final bare expression so its return value stays the cell's displayed result.
SeqIO.write(full.values(), "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Saccharomycetaceae/Kluyveromyces/full_read_pool.fasta", "fasta")

28382
28382
28382
63502


63502

In [15]:
# Equalise the Candida, Kluyveromyces and Saccharomyces read pools and merge
# them into one Saccharomycetaceae-level pool.
# NOTE(review): sampling uses the module-level `random` state and no seed is set
# in this cell, so the chosen reads may differ between runs.
c = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Saccharomycetaceae/Candida/full_read_pool.fasta", "fasta"))
k = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Saccharomycetaceae/Kluyveromyces/full_read_pool.fasta", "fasta"))
s = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Saccharomycetaceae/Saccharomyces/full_read_pool.fasta", "fasta"))

# Size of the smallest pool: every pool is subsampled down to this many reads.
lens = [len(pool) for pool in (c, k, s)]
smallest_pool_size = min(lens)
print(smallest_pool_size)

# Draw read ids without replacement (draw order: c, k, s) and keep those records.
new_c_keys = random.sample(list(c), smallest_pool_size)
new_c = {read_id: c[read_id] for read_id in new_c_keys}
new_k_keys = random.sample(list(k), smallest_pool_size)
new_k = {read_id: k[read_id] for read_id in new_k_keys}
new_s_keys = random.sample(list(s), smallest_pool_size)
new_s = {read_id: s[read_id] for read_id in new_s_keys}

for subsample in (new_c, new_k, new_s):
    print(len(subsample))

# Union of the complete pools; a read id present in several pools keeps the
# record from the last pool merged.
full = {}
for pool in (c, k, s):
    full.update(pool)
print(len(full))

for subsample, destination in (
    (new_c, "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Saccharomycetaceae/Candida/equal_read_pool.fasta"),
    (new_k, "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Saccharomycetaceae/Kluyveromyces/equal_read_pool.fasta"),
    (new_s, "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Saccharomycetaceae/Saccharomyces/equal_read_pool.fasta"),
):
    SeqIO.write(subsample.values(), destination, "fasta")
# Kept as the final bare expression so its return value stays the cell's displayed result.
SeqIO.write(full.values(), "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Saccharomycetaceae/full_read_pool.fasta", "fasta")

30260
30260
30260
30260
262387


262387

In [16]:
# Equalise the Blastobotrys and Zygoascus read pools and merge them into one
# Trichomonascaceae-level pool.
# NOTE(review): sampling uses the module-level `random` state and no seed is set
# in this cell, so the chosen reads may differ between runs.
b = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Trichomonascaceae/Blastobotrys/full_read_pool.fasta", "fasta"))
z = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Trichomonascaceae/Zygoascus/full_read_pool.fasta", "fasta"))

# Size of the smallest pool: every pool is subsampled down to this many reads.
lens = [len(pool) for pool in (b, z)]
smallest_pool_size = min(lens)
print(smallest_pool_size)

# Draw read ids without replacement (draw order: b, then z) and keep those records.
new_b_keys = random.sample(list(b), smallest_pool_size)
new_b = {read_id: b[read_id] for read_id in new_b_keys}
new_z_keys = random.sample(list(z), smallest_pool_size)
new_z = {read_id: z[read_id] for read_id in new_z_keys}

for subsample in (new_b, new_z):
    print(len(subsample))

# Union of the complete pools; a read id present in several pools keeps the
# record from the last pool merged.
full = {}
for pool in (b, z):
    full.update(pool)
print(len(full))

for subsample, destination in (
    (new_b, "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Trichomonascaceae/Blastobotrys/equal_read_pool.fasta"),
    (new_z, "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Trichomonascaceae/Zygoascus/equal_read_pool.fasta"),
):
    SeqIO.write(subsample.values(), destination, "fasta")
# Kept as the final bare expression so its return value stays the cell's displayed result.
SeqIO.write(full.values(), "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Trichomonascaceae/full_read_pool.fasta", "fasta")

34002
34002
34002
71317


71317

In [17]:
# Equalise the seven read pools found under Saccharomycetales (Debaryomycetaceae,
# Dipodascaceae, Metschnikowiaceae, Phaffomycetaceae, Pichiaceae,
# Saccharomycetaceae, Trichomonascaceae) and merge them; the merged pool is
# written to both the Saccharomycetales and the Saccharomycetes directories.
# NOTE(review): sampling uses the module-level `random` state and no seed is set
# in this cell, so the chosen reads may differ between runs.
de = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Debaryomycetaceae/full_read_pool.fasta", "fasta"))
di = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Dipodascaceae/full_read_pool.fasta", "fasta"))
m = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Metschnikowiaceae/full_read_pool.fasta", "fasta"))
ph = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Phaffomycetaceae/full_read_pool.fasta", "fasta"))
pi = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Pichiaceae/full_read_pool.fasta", "fasta"))
s = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Saccharomycetaceae/full_read_pool.fasta", "fasta"))
t = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Trichomonascaceae/full_read_pool.fasta", "fasta"))

# Size of the smallest pool: every pool is subsampled down to this many reads.
lens = [len(pool) for pool in (de, di, m, ph, pi, s, t)]
smallest_pool_size = min(lens)
print(smallest_pool_size)

# Draw read ids without replacement (draw order: de, di, m, ph, pi, s, t) and
# keep those records.
new_de_keys = random.sample(list(de), smallest_pool_size)
new_de = {read_id: de[read_id] for read_id in new_de_keys}
new_di_keys = random.sample(list(di), smallest_pool_size)
new_di = {read_id: di[read_id] for read_id in new_di_keys}
new_m_keys = random.sample(list(m), smallest_pool_size)
new_m = {read_id: m[read_id] for read_id in new_m_keys}
new_ph_keys = random.sample(list(ph), smallest_pool_size)
new_ph = {read_id: ph[read_id] for read_id in new_ph_keys}
new_pi_keys = random.sample(list(pi), smallest_pool_size)
new_pi = {read_id: pi[read_id] for read_id in new_pi_keys}
new_s_keys = random.sample(list(s), smallest_pool_size)
new_s = {read_id: s[read_id] for read_id in new_s_keys}
new_t_keys = random.sample(list(t), smallest_pool_size)
new_t = {read_id: t[read_id] for read_id in new_t_keys}

for subsample in (new_de, new_di, new_m, new_ph, new_pi, new_s, new_t):
    print(len(subsample))

# Union of the complete pools; a read id present in several pools keeps the
# record from the last pool merged.
full = {}
for pool in (de, di, m, ph, pi, s, t):
    full.update(pool)
print(len(full))

for subsample, destination in (
    (new_de, "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Debaryomycetaceae/equal_read_pool.fasta"),
    (new_di, "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Dipodascaceae/equal_read_pool.fasta"),
    (new_m, "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Metschnikowiaceae/equal_read_pool.fasta"),
    (new_ph, "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Phaffomycetaceae/equal_read_pool.fasta"),
    (new_pi, "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Pichiaceae/equal_read_pool.fasta"),
    (new_s, "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Saccharomycetaceae/equal_read_pool.fasta"),
    (new_t, "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/Trichomonascaceae/equal_read_pool.fasta"),
    (full, "../../analysis/Fungi/Ascomycota/Saccharomycetes/Saccharomycetales/full_read_pool.fasta"),
):
    SeqIO.write(subsample.values(), destination, "fasta")
# Same merged pool again, one directory up. Kept as the final bare expression so
# its return value stays the cell's displayed result.
SeqIO.write(full.values(), "../../analysis/Fungi/Ascomycota/Saccharomycetes/full_read_pool.fasta", "fasta")

42589
42589
42589
42589
42589
42589
42589
42589
769815


769815

In [18]:
# Equalise the Diaporthales, Hypocreales, Microascales and Xylariales read pools
# and merge them into one Sordariomycetes-level pool.
# NOTE(review): sampling uses the module-level `random` state and no seed is set
# in this cell, so the chosen reads may differ between runs.
d = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Sordariomycetes/Diaporthales/full_read_pool.fasta", "fasta"))
h = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Sordariomycetes/Hypocreales/full_read_pool.fasta", "fasta"))
m = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Sordariomycetes/Microascales/full_read_pool.fasta", "fasta"))
x = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Sordariomycetes/Xylariales/full_read_pool.fasta", "fasta"))

# Size of the smallest pool: every pool is subsampled down to this many reads.
lens = [len(pool) for pool in (d, h, m, x)]
smallest_pool_size = min(lens)
print(smallest_pool_size)

# Draw read ids without replacement (draw order: d, h, m, x) and keep those records.
new_d_keys = random.sample(list(d), smallest_pool_size)
new_d = {read_id: d[read_id] for read_id in new_d_keys}
new_h_keys = random.sample(list(h), smallest_pool_size)
new_h = {read_id: h[read_id] for read_id in new_h_keys}
new_m_keys = random.sample(list(m), smallest_pool_size)
new_m = {read_id: m[read_id] for read_id in new_m_keys}
new_x_keys = random.sample(list(x), smallest_pool_size)
new_x = {read_id: x[read_id] for read_id in new_x_keys}

for subsample in (new_d, new_h, new_m, new_x):
    print(len(subsample))

# Union of the complete pools; a read id present in several pools keeps the
# record from the last pool merged.
full = {}
for pool in (d, h, m, x):
    full.update(pool)
print(len(full))

for subsample, destination in (
    (new_d, "../../analysis/Fungi/Ascomycota/Sordariomycetes/Diaporthales/equal_read_pool.fasta"),
    (new_h, "../../analysis/Fungi/Ascomycota/Sordariomycetes/Hypocreales/equal_read_pool.fasta"),
    (new_m, "../../analysis/Fungi/Ascomycota/Sordariomycetes/Microascales/equal_read_pool.fasta"),
    (new_x, "../../analysis/Fungi/Ascomycota/Sordariomycetes/Xylariales/equal_read_pool.fasta"),
):
    SeqIO.write(subsample.values(), destination, "fasta")
# Kept as the final bare expression so its return value stays the cell's displayed result.
SeqIO.write(full.values(), "../../analysis/Fungi/Ascomycota/Sordariomycetes/full_read_pool.fasta", "fasta")

27941
27941
27941
27941
27941
276762


276762

In [19]:
# Equalise the six read pools found under Ascomycota (Dothideomycetes,
# Eurotiomycetes, Leotiomycetes, Pezizomycetes, Saccharomycetes,
# Sordariomycetes) and merge them into one Ascomycota-level pool.
# NOTE(review): sampling uses the module-level `random` state and no seed is set
# in this cell, so the chosen reads may differ between runs.
d = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Dothideomycetes/full_read_pool.fasta", "fasta"))
e = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Eurotiomycetes/full_read_pool.fasta", "fasta"))
l = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Leotiomycetes/full_read_pool.fasta", "fasta"))
p = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Pezizomycetes/full_read_pool.fasta", "fasta"))
sa = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Saccharomycetes/full_read_pool.fasta", "fasta"))
so = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/Sordariomycetes/full_read_pool.fasta", "fasta"))

# Size of the smallest pool: every pool is subsampled down to this many reads.
lens = [len(pool) for pool in (d, e, l, p, sa, so)]
smallest_pool_size = min(lens)
print(smallest_pool_size)

# Draw read ids without replacement (draw order: d, e, l, p, sa, so) and keep
# those records.
new_d_keys = random.sample(list(d), smallest_pool_size)
new_d = {read_id: d[read_id] for read_id in new_d_keys}
new_e_keys = random.sample(list(e), smallest_pool_size)
new_e = {read_id: e[read_id] for read_id in new_e_keys}
new_l_keys = random.sample(list(l), smallest_pool_size)
new_l = {read_id: l[read_id] for read_id in new_l_keys}
new_p_keys = random.sample(list(p), smallest_pool_size)
new_p = {read_id: p[read_id] for read_id in new_p_keys}
new_sa_keys = random.sample(list(sa), smallest_pool_size)
new_sa = {read_id: sa[read_id] for read_id in new_sa_keys}
new_so_keys = random.sample(list(so), smallest_pool_size)
new_so = {read_id: so[read_id] for read_id in new_so_keys}

for subsample in (new_d, new_e, new_l, new_p, new_sa, new_so):
    print(len(subsample))

# Union of the complete pools; a read id present in several pools keeps the
# record from the last pool merged.
full = {}
for pool in (d, e, l, p, sa, so):
    full.update(pool)
print(len(full))

for subsample, destination in (
    (new_d, "../../analysis/Fungi/Ascomycota/Dothideomycetes/equal_read_pool.fasta"),
    (new_e, "../../analysis/Fungi/Ascomycota/Eurotiomycetes/equal_read_pool.fasta"),
    (new_l, "../../analysis/Fungi/Ascomycota/Leotiomycetes/equal_read_pool.fasta"),
    (new_p, "../../analysis/Fungi/Ascomycota/Pezizomycetes/equal_read_pool.fasta"),
    (new_sa, "../../analysis/Fungi/Ascomycota/Saccharomycetes/equal_read_pool.fasta"),
    (new_so, "../../analysis/Fungi/Ascomycota/Sordariomycetes/equal_read_pool.fasta"),
):
    SeqIO.write(subsample.values(), destination, "fasta")
# Kept as the final bare expression so its return value stays the cell's displayed result.
SeqIO.write(full.values(), "../../analysis/Fungi/Ascomycota/full_read_pool.fasta", "fasta")

74232
74232
74232
74232
74232
74232
74232
1669891


1669891

In [20]:
# Equalise the Ascomycota and Basidiomycota read pools and merge them into one
# Fungi-level pool.
# NOTE(review): sampling uses the module-level `random` state and no seed is set
# in this cell, so the chosen reads may differ between runs.
a = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Ascomycota/full_read_pool.fasta", "fasta"))
b = SeqIO.to_dict(SeqIO.parse("../../analysis/Fungi/Basidiomycota/full_read_pool.fasta", "fasta"))

# Size of the smallest pool: every pool is subsampled down to this many reads.
lens = [len(pool) for pool in (a, b)]
smallest_pool_size = min(lens)
print(smallest_pool_size)

# Draw read ids without replacement (draw order: a, then b) and keep those records.
new_a_keys = random.sample(list(a), smallest_pool_size)
new_a = {read_id: a[read_id] for read_id in new_a_keys}
new_b_keys = random.sample(list(b), smallest_pool_size)
new_b = {read_id: b[read_id] for read_id in new_b_keys}

for subsample in (new_a, new_b):
    print(len(subsample))

# Union of the complete pools; a read id present in several pools keeps the
# record from the last pool merged.
full = {}
for pool in (a, b):
    full.update(pool)
print(len(full))

for subsample, destination in (
    # NOTE(review): the Ascomycota path contains a doubled "/" — presumably a typo; left unchanged.
    (new_a, "../../analysis/Fungi/Ascomycota//equal_read_pool.fasta"),
    (new_b, "../../analysis/Fungi/Basidiomycota/equal_read_pool.fasta"),
):
    SeqIO.write(subsample.values(), destination, "fasta")
# Kept as the final bare expression so its return value stays the cell's displayed result.
SeqIO.write(full.values(), "../../analysis/Fungi/full_read_pool.fasta", "fasta")

610030
610030
610030
2279921


2279921