In [9]:
from Bio import SeqIO
import tqdm
import os
import pandas as pd
from collections import Counter
# Reference genome: the first record of the FASTA file.
genome = list(SeqIO.parse("../data/VEE_genome.fasta", format="fasta"))[0]

# Length of the anchor k-mers, and lengths of the 5' / 3' UTR windows that
# the functions below cut out of each read.
match_seq_len = 7
len_5utr = 44
len3_utr = 118

# Anchor k-mers used to locate the UTR windows inside reads.
# match_5utr: the match_seq_len bases that follow the first len_5utr genome bases.
match_5utr = str(genome.seq[len_5utr:len_5utr + match_seq_len])
# match_3utr: the last match_seq_len bases of genome[:11305].
match_3utr = str(genome.seq[:11305][-match_seq_len:])
# match_3utr_new: the last match_seq_len bases of the hard-coded sequence below.
match_3utr_new = 'catggtcctgctggagttcgtgaccgccgccgggatcactctcggcatggacgagctgtacaagtaa'.upper()[-match_seq_len:]
# polya: a run of match_seq_len "A" characters.
polya = 'aaaaaaaaaaaaaaaaaaaaaaaaaa'.upper()[:match_seq_len]

# NOTE(review): the "old " label is followed by match_5utr, and match_3utr is
# never printed - confirm that this label is the intended one.
print("old ", match_5utr, "new ", match_3utr_new, "polya ", polya)

def _count_3utr_inserts(fn):
    """Tally the candidate 3' UTR inserts found in the reads of one file.

    A read contributes when it contains ``match_3utr_new`` and, exactly
    ``len3_utr`` characters after the end of that anchor, the ``polya``
    k-mer. The ``len3_utr`` characters in between are the counted insert.
    Only the first occurrence of the anchor in a read is considered.

    Args:
        fn: Path handed to ``SeqIO.parse`` with ``format="fasta"``.

    Returns:
        collections.Counter mapping insert sequence -> number of reads,
        in order of first appearance.
    """
    counts = Counter()
    for record in tqdm.tqdm(SeqIO.parse(fn, format="fasta")):
        seq = str(record.seq)
        idx = seq.find(match_3utr_new)
        if idx == -1:
            continue
        insert_start = idx + match_seq_len
        tail_start = insert_start + len3_utr
        # The strict "<" is kept from the original filter: a read that ends
        # exactly at the end of the poly(A) anchor is skipped.
        if not tail_start + match_seq_len < len(seq):
            continue
        if seq[tail_start:tail_start + match_seq_len] != polya:
            continue
        counts[seq[insert_start:tail_start]] += 1
    return counts


def merge_3utr(fn1, fn2, min_count=5):
    """Pool the 3' UTR insert counts of two read files and write them as CSV.

    The inserts of both files are counted, the counts are summed per
    sequence, sequences seen fewer than ``min_count`` times in total are
    dropped, and the table is written to ``3UTR-merge.csv`` in the directory
    of ``fn1`` with the columns ``seq, counts, length, freq``. ``freq`` is
    normalised over the sequences that survive the filter, not over all reads.

    NOTE(review): the call site passes ``*.fastq`` paths while the parser
    format is "fasta" - confirm the files really are FASTA formatted.

    Args:
        fn1: First read file; its directory receives the output CSV.
        fn2: Second read file.
        min_count: Minimum pooled read count for a sequence to be kept.

    Returns:
        None. The number of retained sequences is printed.
    """
    pooled = _count_3utr_inserts(fn1) + _count_3utr_inserts(fn2)

    abundance_count_df = pd.DataFrame(
        {"seq": list(pooled.keys()), "counts": list(pooled.values())},
        columns=["seq", "counts"],
    )
    # .copy() makes the filtered frame independent, so the column assignments
    # below do not write into a slice of the unfiltered frame.
    abundance_count_df = abundance_count_df[
        abundance_count_df["counts"] >= min_count
    ].copy()
    abundance_count_df["length"] = abundance_count_df["seq"].map(len)
    seq_number_sum = abundance_count_df["counts"].sum()
    abundance_count_df["freq"] = abundance_count_df["counts"] / seq_number_sum
    print(len(abundance_count_df))

    base = os.path.dirname(fn1)
    abundance_count_df.to_csv(os.path.join(base, "3UTR-merge.csv"), index=False)
    


def gen_abundance_utr(fn="../data/VEE_3UTR/Cleandata/3-UTR-library-temDNA/3UTR-temDNA.fastq",is_3utr=True, min_count=5):
    """Count UTR inserts in one read file and write the abundance table as CSV.

    With ``is_3utr=True`` a read contributes when it contains
    ``match_3utr_new`` and, exactly ``len3_utr`` characters after the end of
    that anchor, the ``polya`` k-mer; the ``len3_utr`` characters in between
    are counted. With ``is_3utr=False`` a read contributes when it contains
    ``match_5utr``; the (up to) ``len_5utr`` characters directly before the
    anchor are counted, which may be fewer, or none, when the anchor sits
    near the start of the read. Only the first anchor occurrence is used.

    Sequences seen fewer than ``min_count`` times are dropped and the table
    (columns ``seq, counts, length, freq``) is written next to ``fn`` as
    ``<name>.csv``, where ``<name>`` is the file name up to its first dot.
    ``freq`` is normalised over the sequences that survive the filter.

    NOTE(review): the default path ends in ``.fastq`` while the parser format
    is "fasta" - confirm the files really are FASTA formatted.

    Args:
        fn: Read file handed to ``SeqIO.parse`` with ``format="fasta"``.
        is_3utr: Select the 3' UTR (True) or 5' UTR (False) extraction rule.
        min_count: Minimum read count for a sequence to be kept.

    Returns:
        None. The number of retained sequences is printed.
    """
    counts = Counter()
    for record in tqdm.tqdm(SeqIO.parse(fn, format="fasta")):
        seq = str(record.seq)
        if is_3utr:
            idx = seq.find(match_3utr_new)
            if idx == -1:
                continue
            insert_start = idx + match_seq_len
            tail_start = insert_start + len3_utr
            # The strict "<" is kept from the original filter: a read that
            # ends exactly at the end of the poly(A) anchor is skipped.
            if (
                tail_start + match_seq_len < len(seq)
                and seq[tail_start:tail_start + match_seq_len] == polya
            ):
                counts[seq[insert_start:tail_start]] += 1
        else:
            idx = seq.find(match_5utr)
            if idx != -1:
                counts[seq[:idx][-len_5utr:]] += 1

    abundance_count_df = pd.DataFrame(
        {"seq": list(counts.keys()), "counts": list(counts.values())},
        columns=["seq", "counts"],
    )
    # .copy() makes the filtered frame independent, so the column assignments
    # below do not write into a slice of the unfiltered frame.
    abundance_count_df = abundance_count_df[
        abundance_count_df["counts"] >= min_count
    ].copy()
    abundance_count_df["length"] = abundance_count_df["seq"].map(len)
    seq_number_sum = abundance_count_df["counts"].sum()
    abundance_count_df["freq"] = abundance_count_df["counts"] / seq_number_sum
    print(len(abundance_count_df))

    base = os.path.dirname(fn)
    # Keep the text before the FIRST dot. Slicing with str.find() would cut
    # the last character off a file name that contains no dot at all.
    name = os.path.basename(fn).split(".", 1)[0]
    abundance_count_df.to_csv(os.path.join(base, name + ".csv"), index=False)


old  ATGGAGA new  CAAGTAA polya  AAAAAAA


In [10]:
# Disabled per-library runs of gen_abundance_utr. Each one writes
# "<name>.csv" next to its input; the temDNA run and the RNA cDNA run produce
# the 3UTR-temDNA.csv / 3UTR-cDNA.csv tables read by the scoring cell.
# gen_abundance_utr(fn="../data/VEE_3UTR/Cleandata/3-UTR-library-temDNA/3UTR-temDNA.fastq")

# gen_abundance_utr(fn="../data/VEE_3UTR/add/VEEV-3-UTR-library-cDNA/3UTR-cDNA.fastq")
# gen_abundance_utr(fn="../data/VEE_3UTR/RNA/VEEV-3-UTR-library-cDNA/3UTR-cDNA.fastq")
# gen_abundance_utr("/Users/john/git/UTR/data/VEE_5UTR_0429/1ug-PVL-0412/1ug-PVL-0412.fasta",is_3utr=False)
# gen_abundance_utr("/Users/john/git/UTR/data/VEE_5UTR_0429/VL-CDNAKZ-1/VL-CDNAKZ-1.fasta",is_3utr=False)

# Pool the temDNA library with the "add" cDNA library; the result is written
# as 3UTR-merge.csv in the directory of the first file.
merge_3utr(
    "../data/VEE_3UTR/Cleandata/3-UTR-library-temDNA/3UTR-temDNA.fastq",
    "../data/VEE_3UTR/add/VEEV-3-UTR-library-cDNA/3UTR-cDNA.fastq",
)

15458372it [01:53, 136043.37it/s]
17651486it [02:16, 129750.49it/s]


184073


In [12]:
import Levenshtein
import numpy as np
# Abundance tables compared by calculate_replication_score: the plasmid
# (template DNA) library and the RNA (cDNA) library. The commented lines are
# alternative inputs that can be swapped in.
plasmid_data = "../data/VEE_3UTR/Cleandata/3-UTR-library-temDNA/3UTR-temDNA.csv"
# plasmid_data="../data/VEE_3UTR/Cleandata/3-UTR-library-temDNA/3UTR-merge.csv"
rna_data = "../data/VEE_3UTR/RNA/VEEV-3-UTR-library-cDNA/3UTR-cDNA.csv"
# plasmid_data="/Users/john/git/UTR/data/VEE_5UTR_0429/1ug-PVL-0412/1ug-PVL-0412.csv"
# rna_data="/Users/john/git/UTR/data/VEE_5UTR_0429/VL-CDNAKZ-1/VL-CDNAKZ-1.csv"

# Columns selected from both tables, in this order.
columns = [
    "seq",
    "counts",
    "length",
    "freq",
]

def calculate_replication_score(length=40, plasmid_path=None, rna_path=None):
    """Score the sequences shared by the plasmid and RNA abundance tables.

    Both CSV tables are loaded, restricted to the sequences that occur in
    both, and joined on a shared integer ``seqid``. The score of a sequence
    is ``ln(rna_freq / plasmid_freq)``; rows are returned sorted by score,
    highest first.

    ``seqid`` is the position of the sequence in the sorted list of common
    sequences, so the ids are the same on every run. Rows with a missing
    value in any column are dropped. The ``length`` column is taken from the
    plasmid table as-is; it is not recomputed after ``seq`` is truncated.

    Args:
        length: ``seq`` is truncated to its first ``length`` characters in
            the returned table (matching is done on the full sequence).
        plasmid_path: CSV with the plasmid table; defaults to the
            module-level ``plasmid_data``.
        rna_path: CSV with the RNA table; defaults to the module-level
            ``rna_data``.

    Returns:
        pandas.DataFrame indexed by ``seqid`` with the columns ``seq,
        plasmid_counts, length, plasmid_freq, rna_counts, rna_freq, score``.
    """
    if plasmid_path is None:
        plasmid_path = plasmid_data
    if rna_path is None:
        rna_path = rna_data

    # Load plasmid and RNA counts data
    print("Merging plasmid and RNA sequencing data ...")
    # .copy() detaches the column selection from the frame read from disk.
    plasmid_freq_df = pd.read_csv(plasmid_path)[columns].copy()
    rna_freq_df = pd.read_csv(rna_path)[columns].copy()
    print("\tRNA\t%d" % len(rna_freq_df))
    print("\tPlasmid\t%d" % len(plasmid_freq_df))

    # Find common sequences. Missing values are excluded and the result is
    # sorted so that the seqid numbering does not depend on set ordering.
    common_seqs = sorted(
        set(plasmid_freq_df.seq.dropna()) & set(rna_freq_df.seq.dropna())
    )
    common_seq_idx = {seq: i for i, seq in enumerate(common_seqs)}

    def _keep_common(freq_df):
        # Rows whose sequence is not shared get a NaN seqid and are dropped.
        kept = freq_df.assign(seqid=freq_df.seq.map(common_seq_idx)).dropna(
            how="any"
        )
        kept = kept.assign(
            seq=kept.seq.map(lambda s: s[:length]),
            seqid=kept.seqid.astype(int),
        )
        return kept.set_index("seqid")

    # Filter sequences
    plasmid_freq_df = _keep_common(plasmid_freq_df)
    rna_freq_df = _keep_common(rna_freq_df)
    print("\tRNA & Plasmid\t%d" % (len(common_seqs)))
    print(plasmid_freq_df.columns)

    # Merge dataset: seq/length come from the plasmid table only.
    plasmid_freq_df = plasmid_freq_df.rename(
        columns={"counts": "plasmid_counts", "freq": "plasmid_freq"}
    )
    rna_freq_df = rna_freq_df.drop(columns=["seq", "length"]).rename(
        columns={"counts": "rna_counts", "freq": "rna_freq"}
    )
    merged_freq_df = plasmid_freq_df.join(rna_freq_df)

    # Calculate replication score
    print("Done.\nCalculating replication scores ...", end=" ")
    merged_freq_df["score"] = np.log(
        merged_freq_df.rna_freq / merged_freq_df.plasmid_freq
    )

    # Sorting data according to score.
    print("Done.\nSorting data ...", end=" ")
    merged_freq_df = merged_freq_df.sort_values(by="score", ascending=False)
    print("Done.")
    return merged_freq_df
        

# Score the 3' UTR inserts, keeping the first 118 characters of each sequence
# (the same value as len3_utr), and store the ranked table, including the
# seqid index.
df = calculate_replication_score(length=118)
# Written relative to the notebook, like the input tables, instead of to a
# user-specific absolute path.
# NOTE(review): assumes "../data" is the same directory the absolute path
# pointed to - confirm.
df.to_csv("../data/VEE_3UTR/VEE_3UTR.csv")

Merging plasmid and RNA sequencing data ...
	RNA	105959
	Plasmid	75838
	RNA & Plasmid	21873
Index(['seq', 'counts', 'length', 'freq'], dtype='object')
Done.
Calculating replication scores ... Done.
Sorting data ... Done.
