In [51]:
import os
os.environ["NUM_THREADS"] = "1"
import pandas as pd
import numpy as np

In [22]:
from Bio import SeqIO

In [27]:
# read trinity fasta
fasta_file = "/data6/xzg_data/trinity.Trinity.fasta"

fasta_dict = {}
for seq_record in SeqIO.parse(fasta_file, "fasta"):
    fasta_dict[seq_record.id] = seq_record


In [42]:
# read salmon output 
salmon_output_sf = [
    "/data6/xzg_data/salmon/L_quant.sf",
    "/data6/xzg_data/salmon/St_quant.sf",
    "/data6/xzg_data/salmon/Tip_quant.sf"
]


# read salmon output and merged them 
expr_df = pd.DataFrame()

for sf in salmon_output_sf:
    sample_name = sf.split("/")[-1].split("_")[0]
    tmp = pd.read_csv(sf, sep="\t")
    # add sample name column
    tmp["sample"] = sample_name
    expr_df = pd.concat([expr_df, tmp])

In [29]:
# read transdecoder output

# Define a function to parse the line and extract information
def parse_bed_line(line):
    fields = line.strip().split('\t')
    
    # Extract information from fields
    transcript_name = fields[0]
    transcript_start = int(fields[1])
    transcript_end = int(fields[2])
    info = fields[3]
    strand = fields[5]
    orf_start = int(fields[6])
    orf_end = int(fields[7])
    
    # Parse the info field to extract ORF ID and ORF type
    info_parts = info.split(';')
    orf_id = info_parts[0].split('=')[1]
    orf_type_with_score = info_parts[2].split(',')[0].split(':')[1].strip()
    
    return {
        'transcript_name': transcript_name,
        'transcript_start': transcript_start,
        'transcript_end': transcript_end,
        'orf_id': orf_id,
        'orf_type': orf_type_with_score,
        'strand': strand,
        'orf_start': orf_start,
        'orf_end': orf_end
    }

# Read and parse the BED file
bed_file = "/data6/xzg_data/transdecoder.standard.code/trinity.Trinity.fasta.transdecoder.bed"

bed_dict = {}

with open(bed_file, 'r') as file:
    # Skip the header
    next(file)
    
    # Parse each line
    for line in file:
        parsed_data = parse_bed_line(line)
        transcript_name = parsed_data['transcript_name']
        if transcript_name not in bed_dict:
            bed_dict[transcript_name] = []
        bed_dict[transcript_name].append(parsed_data)


# omit

In [31]:
bed_dict['TRINITY_DN0_c0_g2_i1']

[{'transcript_name': 'TRINITY_DN0_c0_g2_i1',
  'transcript_start': 0,
  'transcript_end': 870,
  'orf_id': 'TRINITY_DN0_c0_g2_i1.p3',
  'orf_type': 'complete_(+)',
  'strand': '+',
  'orf_start': 127,
  'orf_end': 664}]

In [147]:
pivot_df = expr_df.pivot_table(index='Name', columns='sample', values='TPM', aggfunc='max')
# 使用idxmax获取TPM值最大的sample
pivot_df['max_tpm'] = pivot_df.max(axis=1)
tpm_dict = pivot_df["max_tpm"].to_dict()

In [202]:
# 筛选标准
# 1. 

keep_tx = []
min_tpm = 1.0

for tx, predictions in bed_dict.items():
    for orf in predictions:
        if "complete" in orf['orf_type']:
            keep_tx.append(tx)
            break
    if tpm_dict[tx] > min_tpm:
        keep_tx.append(tx)


In [205]:
keep_tx

['TRINITY_DN0_c0_g2_i1',
 'TRINITY_DN0_c0_g2_i1',
 'TRINITY_DN0_c0_g2_i2',
 'TRINITY_DN0_c0_g2_i4',
 'TRINITY_DN0_c0_g2_i5',
 'TRINITY_DN0_c0_g2_i5',
 'TRINITY_DN0_c0_g2_i6',
 'TRINITY_DN0_c0_g2_i6',
 'TRINITY_DN0_c0_g2_i7',
 'TRINITY_DN10001_c0_g1_i3',
 'TRINITY_DN10001_c0_g1_i3',
 'TRINITY_DN10001_c0_g1_i4',
 'TRINITY_DN10001_c0_g3_i2',
 'TRINITY_DN100059_c0_g1_i1',
 'TRINITY_DN10005_c0_g2_i1',
 'TRINITY_DN10005_c0_g2_i2',
 'TRINITY_DN10005_c0_g2_i2',
 'TRINITY_DN10005_c0_g2_i3',
 'TRINITY_DN10005_c0_g2_i4',
 'TRINITY_DN10005_c0_g2_i6',
 'TRINITY_DN10005_c0_g2_i6',
 'TRINITY_DN10005_c0_g2_i7',
 'TRINITY_DN10006_c0_g1_i1',
 'TRINITY_DN10006_c0_g1_i1',
 'TRINITY_DN10008_c0_g1_i4',
 'TRINITY_DN10008_c0_g1_i5',
 'TRINITY_DN10008_c1_g1_i1',
 'TRINITY_DN10008_c1_g1_i1',
 'TRINITY_DN10008_c1_g1_i2',
 'TRINITY_DN10008_c1_g2_i1',
 'TRINITY_DN10008_c1_g2_i1',
 'TRINITY_DN10009_c0_g1_i1',
 'TRINITY_DN10009_c0_g1_i1',
 'TRINITY_DN10009_c0_g2_i1',
 'TRINITY_DN1000_c0_g1_i1',
 'TRINITY_DN1000_c0_g

In [207]:
ss = set(["_".join(x.split("_")[:-1]) for x in keep_tx])

In [52]:
# 如果都是小于


True

In [121]:
paf_file = "/data5/xzg_data/2023/09/xhc-sj-trnscriptome-assembly/output.paf"


base_column_names = ["q_name", "q_length", "q_start", "q_end", "strand",
                     "t_name", "t_length", "t_start", "t_end",
                     "num_matches", "block_length", "mapq"]

data = []

with open(paf_file, "r") as file:
    for line in file:
        parts = line.strip().split("\t")
        base_data = parts[:12]
        tags = parts[12:]

        # 解析标签为字典
        tag_dict = {}
        for tag in tags:
            key, _, value = tag.split(":")
            tag_dict[key] = value

        # 合并基础数据和标签字典
        row_data = base_data + [tag_dict]
        data.append(row_data)

import pandas as pd

# 定义列的数据类型字典
dtype_dict = {
    "q_name": str,
    "q_length": int,
    "q_start": int,
    "q_end": int,
    "strand": str,
    "t_name": str,
    "t_length": int,
    "t_start": int,
    "t_end": int,
    "num_matches": int,
    "block_length": int,
    "mapq": float,  # 如果mapq是浮点数
    "tags": object  # 如果tags是字典
}

# 创建DataFrame并指定列的数据类型
df = pd.DataFrame(data, columns=base_column_names + ["tags"]).astype(dtype_dict)

df['q2t'] =  df['q_length'] / df['t_length'] 

df['q_cov'] =  df['num_matches'] / df['q_length']
df['t_cov'] =  df['num_matches'] / df['t_length']
