# 合并测序数据

In [None]:
BASE_DIR="/Users/john/git/UTR/data/prompt_sev"
# NOTE(review): both read files end in "_raw_1" and differ only in the sample
# prefix; confirm the second one really is the reverse (R2) mate file.
fwd_fastq="${BASE_DIR}/15-8-25-1_raw_1.fq.gz"
rev_fastq="${BASE_DIR}/15-8-25-2_raw_1.fq.gz"
merge_fasta="${BASE_DIR}/sev240222_v1.fasta"
# PEAR treats -o as a base name and writes <base>.assembled.fastq (plus
# <base>.discarded.fastq and <base>.unassembled.{forward,reverse}.fastq), so
# keep the prefix separate and derive the real merged-read path from it.
pear_prefix="${BASE_DIR}/sev240222_v1"
merge_fastq="${pear_prefix}.assembled.fastq"

# 1. Merge the paired reads with pandaseq (writes FASTA directly).
pandaseq -f "${fwd_fastq}" -r "${rev_fastq}" -w "${merge_fasta}" -g log.txt

# 2. Merge the paired reads with pear, then convert with seqkit.
#    Both routes write ${merge_fasta}; running the whole cell leaves the
#    pear/seqkit result in place.
pear -f "${fwd_fastq}" -r "${rev_fastq}" -p 0.001 -j 20 -n 110 -o "${pear_prefix}"

# pear emits FASTQ, so convert the assembled reads to FASTA with seqkit.
seqkit fq2fa "${merge_fastq}" -o "${merge_fasta}"

# 处理DNA数据

In [None]:
from Bio import SeqIO
import tqdm
import os
import pandas as pd
import re
from collections import Counter

# Earlier pattern with longer flanks and a 9-21 letter second group, kept for reference:
# DNA_regexp = re.compile(r"ggtgcctgaaactag([a-z]+)atg([a-z]{9,21})ggtggttcgggcggt",re.I) # 0610
# Case-insensitive pattern applied to plasmid (DNA) reads.
#   group 1: the variable sequence between the "gaccaactag" flank and "atg"
#   group 2: the 15 letters between "atg" and "ggtggttcgg" (used as the barcode)
# NOTE(review): the trailing "# 0610" / "# 11" tags look like dataset labels — confirm.
DNA_regexp = re.compile(r"gaccaactag([a-z]+)atg([a-z]{15})ggtggttcgg",re.I) # 11



def record_dna(match, m,d):
    """Register one regex hit in the two running tallies.

    Args:
        match: Match object whose group 1 is the target sequence and whose
            group 2 is the barcode.
        m: dict mapping barcode -> Counter of target sequences; updated in place.
        d: dict mapping barcode -> total number of hits; updated in place.
    """
    insert, tag = match.group(1), match.group(2)
    d[tag] = d.get(tag, 0) + 1
    # setdefault creates the per-barcode Counter on first sight of the barcode.
    m.setdefault(tag, Counter())[insert] += 1


def gendna_abundance(fn):
    """Count (barcode, sequence) pairs in a FASTA of plasmid reads and save them as CSV.

    Each record is scanned with ``DNA_regexp`` twice: once as read and once as
    its reverse complement. Every hit is tallied through ``record_dna``.

    Args:
        fn: Path to the FASTA file.

    Returns:
        pandas.DataFrame with columns ``barcode``, ``dna_seq`` and
        ``plasmid_counts`` (one row per distinct barcode/sequence pair). The
        same table is written to ``<dir of fn>/<stem>.csv``, where ``stem`` is
        the file name up to its first dot.
    """
    # barcode_reads (per-barcode hit totals) is required by record_dna's
    # signature; only barcode_seqs feeds the output table.
    barcode_reads, barcode_seqs = {}, {}
    forward, reverse = 0, 0
    for record in tqdm.tqdm(SeqIO.parse(fn, format="fasta")):
        for match in DNA_regexp.finditer(str(record.seq)):
            forward += 1
            record_dna(match, barcode_seqs, barcode_reads)
        for match in DNA_regexp.finditer(str(record.seq.reverse_complement())):
            reverse += 1
            record_dna(match, barcode_seqs, barcode_reads)

    rows = [
        (barcode, seq, n)
        for barcode, seq_counts in barcode_seqs.items()
        for seq, n in seq_counts.items()
    ]
    abundance_count_df = pd.DataFrame(rows, columns=["barcode", "dna_seq", "plasmid_counts"])
    print(len(abundance_count_df))

    base, fname = os.path.dirname(fn), os.path.basename(fn)
    # split() keeps the whole name when there is no dot; slicing with
    # find(".") == -1 would silently drop the last character.
    name = fname.split(".", 1)[0]
    print(f"found {forward} sequences forward, {reverse} reverse")
    print(f"Converting {forward+reverse} sequences to dataframe and saving")
    abundance_count_df.to_csv(os.path.join(base, name + ".csv"), index=False)
    return abundance_count_df
    
# Build the plasmid abundance table for the SeV_231101 DNA reads; the function
# writes DNA.csv next to the input FASTA and the returned frame is the cell output.
gendna_abundance(fn="/data/home/jinyalong/data/SeV_231101/DNA.fa")

# 处理RNA数据

In [None]:
from Bio import SeqIO
import tqdm
import os
import pandas as pd
import re
from collections import Counter

# Earlier pattern anchored on a "GCAGAGTACAT" prefix with a 9-21 letter second group, kept for reference:
# RNA_regexp = re.compile(r"GCAGAGTACAT([a-z]+)atg([a-z]{9,21})ggtggttcgg",re.I)
# Case-insensitive pattern applied to RNA reads; "^" anchors it to the start of the read.
#   group 1: everything from the read start up to "atg"
#   group 2: the 15 letters between "atg" and "ggtggttcgg" (used as the barcode)
RNA_regexp = re.compile(r"^([a-z]+)atg([a-z]{15})ggtggttcgg",re.I)

def record_rna(match, m,d):
    """Add a single regex hit to the per-barcode tallies.

    Args:
        match: Match object; group 1 holds the target sequence, group 2 the barcode.
        m: dict of barcode -> Counter of target sequences, mutated in place.
        d: dict of barcode -> number of hits, mutated in place.
    """
    barcode = match.group(2)
    d[barcode] = 1 + d.get(barcode, 0)
    if barcode not in m:
        m[barcode] = Counter()
    m[barcode].update((match.group(1),))
    

def genrna_abundance(fn):
    """Count (barcode, sequence) pairs in a FASTA of RNA reads and save them as CSV.

    Each record is scanned with ``RNA_regexp`` twice: once as read and once as
    its reverse complement. Every hit is tallied through ``record_rna``.

    Args:
        fn: Path to the FASTA file.

    Returns:
        pandas.DataFrame with columns ``barcode``, ``rna_seq`` and
        ``rna_counts`` (one row per distinct barcode/sequence pair). The same
        table is written to ``<dir of fn>/<stem>.csv``, where ``stem`` is the
        file name up to its first dot.
    """
    # barcode_reads (per-barcode hit totals) is required by record_rna's
    # signature; only barcode_seqs feeds the output table.
    barcode_seqs, barcode_reads = {}, {}
    forward, reverse = 0, 0
    for record in tqdm.tqdm(SeqIO.parse(fn, format="fasta")):
        for match in RNA_regexp.finditer(str(record.seq)):
            forward += 1
            record_rna(match, barcode_seqs, barcode_reads)
        for match in RNA_regexp.finditer(str(record.seq.reverse_complement())):
            reverse += 1
            record_rna(match, barcode_seqs, barcode_reads)

    rows = [
        (barcode, seq, n)
        for barcode, seq_counts in barcode_seqs.items()
        for seq, n in seq_counts.items()
    ]
    abundance_count_df = pd.DataFrame(rows, columns=["barcode", "rna_seq", "rna_counts"])
    print(len(abundance_count_df))

    base, fname = os.path.dirname(fn), os.path.basename(fn)
    # split() keeps the whole name when there is no dot; slicing with
    # find(".") == -1 would silently drop the last character.
    name = fname.split(".", 1)[0]
    print(f"found {forward} sequences forward, {reverse} reverse")
    print(f"Converting {forward+reverse} sequences to dataframe and saving")
    abundance_count_df.to_csv(os.path.join(base, name + ".csv"), index=False)
    return abundance_count_df

# Disabled calls for the SeV_240618 inputs:
# genrna_abundance(fn="/data/home/jinyalong/data/SeV_240618/pl3-2-1.fa")
# genrna_abundance(fn="/data/home/jinyalong/data/SeV_240618/pl3-2-2.fa")
# Build one abundance CSV per RNA FASTA of the SeV_231101 data (RNA1.csv and
# RNA2.csv are written next to the inputs); only the last call's frame is shown
# as the cell output.
genrna_abundance(fn="/data/home/jinyalong/data/SeV_231101/RNA1.fa")
genrna_abundance(fn="/data/home/jinyalong/data/SeV_231101/RNA2.fa")

# 检查DNA和RNA数据

In [None]:
import pandas as pd
# dna_seq = pd.read_csv("/data/home/jinyalong/data/SeV_240530/DNA2.csv")
# rna_seq = pd.read_csv("/data/home/jinyalong/data/SeV_240530/pl3-2-1.csv")

# Load the abundance tables written by the two extraction cells.
dna_seq = pd.read_csv("/data/home/jinyalong/data/SeV_231101/DNA.csv")
rna_seq = pd.read_csv("/data/home/jinyalong/data/SeV_231101/RNA1.csv")

# Upper-cased set of the designed library sequences.
original_db = {v.upper() for v in pd.read_csv("mutation_db_118.csv")["seq"].values}

# DNA side: rows, distinct sequences and distinct barcodes, overall and restricted to the library.
dna_seq_db = dna_seq.loc[dna_seq["dna_seq"].isin(original_db)]
dna_seq_dedup = set(dna_seq["dna_seq"].unique())
dna_seq_dedup_db = dna_seq_dedup.intersection(original_db)
dna_seq_barcode_dedup = dna_seq["barcode"].unique()
dna_seq_barcode_dedup_db = dna_seq_db["barcode"].unique()
print(
    f'Read DNAs {dna_seq["plasmid_counts"].sum()} reads found, {dna_seq_db["plasmid_counts"].sum()} in db; \n'
    f'      {len(dna_seq_dedup)} types, {len(dna_seq_dedup_db)} in db; \n'
    f'      {len(dna_seq_barcode_dedup)} barcodes, {len(dna_seq_barcode_dedup_db)} in db'
)


# RNA side: total reads per distinct sequence and per barcode.
rna_seq_dedup = rna_seq.groupby("rna_seq")["rna_counts"].sum().reset_index(name="rna_counts")
rna_seq_barcode_dedup = rna_seq.groupby("barcode")["rna_counts"].sum().reset_index(name="barcode_counts")
print(
    f'Read RNAs {rna_seq["rna_counts"].sum()} reads found, \n'
    f'      {len(rna_seq_dedup)} types, {len(rna_seq_barcode_dedup)} barcodes'
)

# 清洗DNA和RNA数据

In [None]:
import pandas as pd
# A barcode observed with several sequences is kept only when its most abundant
# sequence has more than DOMINANCE_FOLD times the reads of the runner-up.
DOMINANCE_FOLD = 10

# NOTE(review): the extraction cells above write SeV_231101/DNA.csv; confirm
# that SeV_240530/DNA2.csv is the intended input here.
df = pd.read_csv("/data/home/jinyalong/data/SeV_240530/DNA2.csv")
# Optional low-count filter, currently disabled:
# df = df[df["plasmid_counts"]>5]
barcodes, seqs, counts = [], [], []
# bar1: barcodes with one sequence; barm: barcodes with several; hit: those of barm that pass.
bar1, barm, hit = 0, 0, 0
# Group by the column name (not a one-element list) so keys stay plain values
# instead of 1-tuples on current pandas.
for _, g in df.groupby("barcode"):
    gdf = g.sort_values(by="plasmid_counts", ascending=False)
    if len(gdf) == 1:
        bar1 += 1
    else:
        barm += 1
        top1, top2 = gdf.iloc[0]["plasmid_counts"], gdf.iloc[1]["plasmid_counts"]
        if not top1 > DOMINANCE_FOLD * top2:
            # Ambiguous barcode: no sequence clearly dominates, so drop it.
            continue
        hit += 1
    best = gdf.iloc[0]
    barcodes.append(best["barcode"])
    seqs.append(best["dna_seq"])
    counts.append(best["plasmid_counts"])
print(bar1, barm, hit)
cdf = pd.DataFrame({"barcode":barcodes,"dna_seq":seqs,"plasmid_counts":counts})
cdf.to_csv("/data/home/jinyalong/data/SeV_240530/DNA2_clean.csv",index=False)

# 合并DNA和RNA数据

In [None]:
# Per-(barcode, sequence) RNA tables from two RNA libraries.
rna1 = pd.read_csv("/data/home/jinyalong/data/SeV_240530/pl3-1-1.csv")
rna2 = pd.read_csv("/data/home/jinyalong/data/SeV_240530/pl3-1-2.csv")
original_db = {v.upper() for v in pd.read_csv("./data/mutation_db_118.csv")["seq"].values}

# Cleaned barcode -> sequence table, restricted to designed library members.
ddf = pd.read_csv("/data/home/jinyalong/data/SeV_240530/DNA2_clean.csv")
ddf = ddf[ddf["dna_seq"].isin(original_db)]

# Collapse RNA reads to one total per barcode in each library.
rna1 = rna1.groupby("barcode")["rna_counts"].sum().reset_index(name="rna1")
rna2 = rna2.groupby("barcode")["rna_counts"].sum().reset_index(name="rna2")

# Left-join so every DNA barcode is kept; barcodes missing from an RNA library get 0 reads.
mdf = (
    ddf.merge(rna1, on="barcode", how="left")
    .merge(rna2, on="barcode", how="left")
    .fillna(0)
)
# Keep barcodes seen in at least one RNA library. copy() makes mdf an
# independent frame so later column assignments do not write into a slice.
mdf = mdf[(mdf["rna1"] > 0) | (mdf["rna2"] > 0)].copy()

def std(r1,r2):
    """Return the fold difference between two counts.

    The result is the larger of ``r1/r2`` and ``r2/r1``. When either value is
    falsy (e.g. zero) the sentinel ``100`` is returned instead.
    """
    if not (r1 and r2):
        return 100
    return max(r1 / r2, r2 / r1)
    
# Fold difference between the two RNA counts plus per-library expression scores
# (RNA reads / plasmid reads). assign() returns a new frame, so nothing is
# written into a filtered slice, and the list form also works on an empty frame.
mdf = mdf.assign(
    std=[std(r1, r2) for r1, r2 in zip(mdf["rna1"], mdf["rna2"])],
    score1=mdf["rna1"] / mdf["plasmid_counts"],
    score2=mdf["rna2"] / mdf["plasmid_counts"],
)
# Scale the score gap by the average of the two libraries' mean scores.
mean = (mdf["score1"].mean()+mdf["score2"].mean())/2
mdf = mdf.assign(sstd=(mdf["score1"] - mdf["score2"]).abs() / mean)

# Keep barcodes whose two scores differ by less than 0.2 of that mean; copy()
# so the new columns below land on an independent frame.
cdf = mdf[mdf["sstd"] < 0.2].copy()
# Disabled alternative: aggregate barcodes per sequence before export.
# fdf = cdf.groupby("dna_seq")["plasmid_counts","rna1","rna2"].sum().reset_index()
fdf = cdf
# Exported sequence is the variable region, the ATG and the barcode joined together.
fdf["seq"] = fdf["dna_seq"] + "ATG" + fdf["barcode"]
fdf["rna_counts"] = (fdf["rna1"] + fdf["rna2"]) / 2
# Disabled alternative: use the rna2 total per barcode as rna_counts.
# mrna=cdf.groupby("barcode")["rna2"].sum().to_dict()
# cdf["rna_counts"] = cdf["barcode"].map(mrna)
fdf.to_csv("./data/DNA_RNA_DAY5_BARCODE.csv",index=False)
fdf.shape

In [None]:
from tqdm import tqdm
import Levenshtein
import numpy as np
# Reference sequence; edit distances below are measured against it.
original_seq = 'atcccgggtgaggcatcccaccatcctcagtcacagagagacccaatctaccatcagcatcagccagtaaagattaagaaaaacttagggtgaaagaaatttcacctaacacggcgca'.upper()
dna_seq = pd.read_csv("/data/home/jinyalong/data/SeV_240530/DNA2.csv")
rna_seq = pd.read_csv("/data/home/jinyalong/data/SeV_240530/pl3-1-1.csv")
# NOTE(review): another cell reads this table from "./data/mutation_db_118.csv";
# confirm which location is correct.
original_db = {v.upper() for v in pd.read_csv("mutation_db_118.csv")["seq"].values}
# Plasmid rows restricted to library members. rename() returns a new frame, so
# no column is renamed in place on a filtered slice.
dna_seq_db = dna_seq[dna_seq.dna_seq.isin(original_db)].rename(columns={"dna_seq": "seq"})
rna_barcode = rna_seq.groupby('barcode')["rna_counts"].sum().reset_index(name='rna_counts')
# Only barcodes that were also observed in the RNA library can be scored.
dna_rna_db = dna_seq_db[dna_seq_db.barcode.isin(rna_barcode.barcode)]
# When a barcode maps to several sequences, it is usable only if a single
# sequence remains after dropping those with at most
# max_reads / barcode_1vn_threshold plasmid reads.
barcode_1vn_threshold = 10
raw_path = "raw_data.csv"
score_path = "score_file.csv"
raw_columns = ['barcode', 'barcode_matched_dna_types', 'seq', 'plasmid_counts', 'rna_counts']
kept = []
errors = []
errors_group_count = 0

for g, group in tqdm(dna_rna_db.groupby('barcode')):
    # Reads per sequence come from plasmid_counts: the table holds one row per
    # (barcode, sequence) pair, so counting rows would always give 1 and the
    # threshold below would never remove anything.
    reads = group.groupby('seq')['plasmid_counts'].sum().reset_index(name='dna_reads')
    group = (
        group[['barcode', 'seq']]
        .assign(dna_types=group['seq'].nunique())[['barcode', 'dna_types', 'seq']]
        .merge(reads, on='seq')
        .drop_duplicates('seq')
        .merge(rna_barcode, on='barcode')
    )
    group = group[group.dna_reads > (group.dna_reads.max() / barcode_1vn_threshold)]
    if len(group) > 1:
        # Still ambiguous after thresholding: set aside and report at the end.
        errors.append(group)
        errors_group_count += 1
    else:
        kept.append(group)

if not kept:
    raise ValueError("no barcode maps to a single library sequence; nothing to score")

# Per-barcode table, sorted and saved (tab-separated) to raw_path.
dna_rna_db_merge = pd.concat(kept, ignore_index=True)
dna_rna_db_merge.columns = raw_columns
dna_rna_db_merge = dna_rna_db_merge.sort_values(by=['rna_counts', 'plasmid_counts', 'barcode_matched_dna_types'], ascending=False)
dna_rna_db_merge.to_csv(raw_path, sep='\t', index=False)

# Collapse barcodes to one row per sequence; sum only the numeric count
# columns so the string barcode column is not concatenated along the way.
dna_rna_db_merge = dna_rna_db_merge.groupby('seq')[['plasmid_counts', 'rna_counts']].sum().reset_index()
dna_rna_db_merge['length'] = dna_rna_db_merge.seq.apply(len)
# Plasmid counts are replaced by the total over every library row of the
# sequence (dna_seq_db), not just the barcodes that survived the filter.
plasmid_counts_dict = dna_seq_db.groupby('seq')["plasmid_counts"].sum().to_dict()
dna_rna_db_merge['plasmid_counts'] = dna_rna_db_merge.seq.map(plasmid_counts_dict)
dna_rna_db_merge['plasmid_freq'] = dna_rna_db_merge.plasmid_counts / dna_rna_db_merge.plasmid_counts.sum()
dna_rna_db_merge['rna_freq'] = dna_rna_db_merge.rna_counts / dna_rna_db_merge.rna_counts.sum()
# Score = natural log of RNA frequency over plasmid frequency.
dna_rna_db_merge['score'] = np.log(dna_rna_db_merge.rna_freq / dna_rna_db_merge.plasmid_freq)
dna_rna_db_merge['distance'] = dna_rna_db_merge.seq.apply(lambda x: Levenshtein.distance(original_seq, x))
# Four distance bins of width max_distance // 5, the last one open-ended. The
# width is clamped to 1 so a maximum distance below 5 does not give np.arange
# a zero step.
bin_step = max(1, dna_rna_db_merge.distance.max() // 5)
distance_bins = np.arange(0, bin_step * 4, bin_step)
distance_bins = np.append(distance_bins, np.inf)
dna_rna_db_merge['distance_cat'] = pd.cut(dna_rna_db_merge.distance, bins=distance_bins, labels=list(range(1, len(distance_bins))), right=False)
dna_rna_db_merge = dna_rna_db_merge[['seq', 'plasmid_counts', 'length', 'plasmid_freq', 'rna_counts', 'rna_freq', 'score', 'distance', 'distance_cat']]

dna_rna_db_merge.to_csv(score_path)
# pd.concat raises on an empty list, so fall back to an empty frame when every
# barcode was unambiguous.
errors = pd.concat(errors) if errors else pd.DataFrame(columns=['barcode', 'dna_types', 'seq', 'dna_reads', 'rna_counts'])
# Each ambiguous barcode appears once per candidate sequence with the same
# rna_counts, so count its RNA reads only once.
errors_rna_reads = errors.drop_duplicates('barcode').rna_counts.sum()
print(f'finally used {dna_rna_db_merge.plasmid_counts.sum()} DNA reads, {dna_rna_db_merge.rna_counts.sum()} RNA reads, {len(dna_rna_db_merge)} different sequences')
print(f'{errors_group_count} types of RNA cannot be matched to DNA sequence due to 1vn problem, {errors_rna_reads} reads in summary')
print('wild type exist', str(bool(len(dna_rna_db_merge[dna_rna_db_merge.seq == original_seq]))))