In [None]:
# Script name       : NV_viruses_filter_reads_out_script
# Written by        : Yael Hazan
# Date              : 13-May-2018
# Script description: This script filters reads out of one file according to data in another file.
#                     Step 1: Load files into df's. 
#                     Step 2: removing NV reads from source data, using merge.
#                     Step 3: Writing the resulting df's to csv and fasta files

In [None]:
# Directory that holds the input FASTA / blast files and receives the output files.
# NOTE(review): absolute, user-specific path - adjust it before running elsewhere.
DIR = r'/cs/labs/michall/yaelh/Nematostella_viruses_project/Processed_files'
       
import os
import numpy as np
import pandas as pd 
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import back_transcribe

In [None]:
# Step 1: Load files into df's. 

# Load the Input R1 & R2 fasta files (NV transcriptome reads after filtering out NV reads using bowtie,
# will be called source_reads)

def parse_fasta(dir,file_name):
    """Read a FASTA file into a dictionary keyed by record id.

    Parameters
    ----------
    dir : str
        Directory that contains the file.
    file_name : str
        Name of the FASTA file inside ``dir``.

    Returns
    -------
    dict
        Maps each record id to ``[sequence as str, sequence length]``.
        When an id occurs more than once, the last record wins.
    """
    fasta_path = os.path.join(dir, file_name)
    return {
        rec.id: [str(rec.seq), len(rec.seq)]
        for rec in SeqIO.parse(fasta_path, 'fasta')
    }


def _reads_dict_to_df(reads_dict):
    """Convert a ``parse_fasta`` result into a DataFrame with a numerical index.

    Parameters
    ----------
    reads_dict : dict
        Maps a read id to ``[sequence, sequence length]``.

    Returns
    -------
    pandas.DataFrame
        One row per read, in the order of the dict, with the columns
        'Sequence id', 'Sequence' and 'Sequence Length'.
    """
    rows = [(read_id, seq, seq_len) for read_id, (seq, seq_len) in reads_dict.items()]
    # Naming the columns explicitly keeps the expected schema even when the
    # FASTA file holds no records.
    return pd.DataFrame(rows, columns=['Sequence id', 'Sequence', 'Sequence Length'])


def _load_blast_table(file_name):
    """Load a header-less, tab-separated blast output file found in DIR.

    Parameters
    ----------
    file_name : str
        Name of the blast output file inside DIR.

    Returns
    -------
    pandas.DataFrame
        The table with its first column renamed to 'Sequence id'; the
        remaining columns keep their integer labels.
    """
    # The first column is the merge key against the FASTA read ids, which are
    # always strings, so it is read as str even when the ids look numeric.
    blast_df = pd.read_table(os.path.join(DIR, file_name), header = None, dtype = {0: str})
    return blast_df.rename(columns = {0:'Sequence id'})


source_reads_dict = parse_fasta(DIR,'Input_R1_file.fasta')
source_R1_reads_df = _reads_dict_to_df(source_reads_dict)

source_reads_dict = parse_fasta(DIR,'Input_R2_file.fasta')
source_R2_reads_df = _reads_dict_to_df(source_reads_dict)


# Load the blast R1 & R2 output files (blast of NV transcriptome against NM genome):

blast_R1_out_df = _load_blast_table('Nematostella_R1_blast_tabular.out')
blast_R2_out_df = _load_blast_table('Nematostella_R2_blast_tabular.out')




In [None]:
# Step 2: removing NV reads from source data, using merge:

def _flag_blast_hits(source_df, blast_df):
    """Left-merge the reads with the blast output and flag the reads that were hit.

    Parameters
    ----------
    source_df : pandas.DataFrame
        The reads, with a 'Sequence id' column.
    blast_df : pandas.DataFrame
        The blast output, with a 'Sequence id' column.

    Returns
    -------
    pandas.DataFrame
        The merged frame with a boolean 'NV_sequence' column that is True
        when the read id also appears in ``blast_df``. A read that matches
        several blast rows appears once per matching row.
    """
    merged_df = pd.merge(source_df, blast_df, on =['Sequence id'], how = 'left', indicator = 'NV_sequence')
    # The merge indicator is 'both' for ids present in both frames.
    merged_df['NV_sequence'] = merged_df['NV_sequence'] == 'both'
    return merged_df


def _reads_without_hits(merged_df, read_columns):
    """Return the un-flagged reads, restricted to the original read columns.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Output of ``_flag_blast_hits``.
    read_columns : iterable
        Column labels of the source reads frame to keep.

    Returns
    -------
    pandas.DataFrame
        Rows whose 'NV_sequence' flag is False, with only ``read_columns``
        and a fresh 0..n-1 index.
    """
    not_hit = ~merged_df['NV_sequence']
    # Selecting the read columns (instead of dropping blast columns 1..11 by
    # label) works for any number of blast columns. The index is reset so the
    # result can be addressed by position after the filtering.
    return merged_df.loc[not_hit, list(read_columns)].reset_index(drop = True)


# Finding the requested reads in the source data using merge:
merged_R1_reads_df = _flag_blast_hits(source_R1_reads_df, blast_R1_out_df)
merged_R2_reads_df = _flag_blast_hits(source_R2_reads_df, blast_R2_out_df)


# Removing the requested reads (and the blast columns) from the source data:
no_NV_R1_reads_df = _reads_without_hits(merged_R1_reads_df, source_R1_reads_df.columns)
no_NV_R2_reads_df = _reads_without_hits(merged_R2_reads_df, source_R2_reads_df.columns)

In [None]:
# Step 3: Writing the resulting df's to csv and fasta files:

# Gzip-compressed CSV copies of the filtered reads, written without the index column.
r1_csv_path = os.path.join(DIR,'no_NV_R1_reads.csv.gz')
r2_csv_path = os.path.join(DIR,'no_NV_R2_reads.csv.gz')

no_NV_R1_reads_df.to_csv(r1_csv_path, compression = 'gzip', index = False)
no_NV_R2_reads_df.to_csv(r2_csv_path, compression = 'gzip', index = False)

def df_to_fasta(df, filename):
    """Write a reads DataFrame to a FASTA file inside DIR.

    Parameters
    ----------
    df : pandas.DataFrame
        Its first column holds the record ids and its second column the
        sequences (both as strings). Any further columns are ignored.
    filename : str
        Name of the output file; it is created (or overwritten) in DIR.

    Notes
    -----
    Rows are read by position, so the frame may carry any index - for
    example the gappy index left behind by a boolean filter.
    """
    with open(os.path.join(DIR, filename), "w") as f:
        # iloc selects the two columns by position, independent of their labels.
        for seq_id, seq in zip(df.iloc[:, 0], df.iloc[:, 1]):
            f.write('>' + seq_id + '\n' + seq + '\n')
            
            
# FASTA copies of the filtered R1 and R2 reads.
df_to_fasta(df = no_NV_R1_reads_df, filename = 'no_NV_R1_reads_df.fasta')
df_to_fasta(df = no_NV_R2_reads_df, filename = 'no_NV_R2_reads_df.fasta')