In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import pyfastx

In [45]:
def telomeres_from_tidehunter(sample: Path):
    """
    Load the TideHunter telomere tables of a sample and stack them.

    The sample directory is searched recursively for
    ``tidehunter/left_telomeres.csv`` and ``tidehunter/right_telomere.csv``.
    The first match of each pattern is read, and the rows of the left table
    are placed before the rows of the right table. Row labels of the two
    tables are kept as they were read, so index values may repeat.

    Raises
    ------
    IndexError
        If one of the two files cannot be found below ``sample``.
    """
    # NOTE(review): the right-hand file name is singular ("telomere") while the
    # left-hand one is plural; kept exactly as is because it must match the
    # files on disk.
    patterns = ("tidehunter/left_telomeres.csv", "tidehunter/right_telomere.csv")
    tables = []
    for pattern in patterns:
        first_match = list(sample.rglob(pattern))[0]
        tables.append(pd.read_csv(first_match))
    return pd.concat(tables)


def extract_just_telomeres(sample: Path, fraction: float = 0.7):
    """
    Return the TideHunter reads that consist mostly of telomere.

    Two columns are added to the table from ``telomeres_from_tidehunter``:

    * ``fraction_telomere`` - ``region_length / read_len``
    * ``n_telomeres`` - number of rows in the table *before* filtering

    Only rows whose ``fraction_telomere`` is at least ``fraction`` are returned.
    """
    reads = telomeres_from_tidehunter(sample)
    reads = reads.assign(fraction_telomere=reads.region_length / reads.read_len)
    # The count is taken before the filter below, so it is the total number of
    # TideHunter hits, not the number of rows returned.
    reads = reads.assign(n_telomeres=reads.shape[0])
    mostly_telomere = reads.fraction_telomere >= fraction
    return reads.loc[mostly_telomere]

def aligned_telomeres_from_tidehunter(sample: Path, NUM_FROM_END = 10_000):
    """
    Load the aligned telomere results of a sample and label the chromosome arm.

    Reads ``chm13v2_chr_lengths.csv`` from the current working directory
    (columns ``chr`` and ``total_length`` are used) and the first file matching
    ``results/*telomere_results.csv`` below ``sample``.

    A read is labelled ``"left"`` when ``align_start < NUM_FROM_END`` and
    ``"right"`` when ``align_start > chr_length - NUM_FROM_END``. Rows that get
    neither label are removed, together with any row holding a missing value
    in any other column. The previous row labels are kept in an ``index``
    column and the result is sorted by ``telomere_length``, longest first.

    Raises
    ------
    IndexError
        If no results file is found below ``sample``.
    """
    chr_lengths = pd.read_csv("chm13v2_chr_lengths.csv")
    results_csv = list(sample.rglob("results/*telomere_results.csv"))[0]

    hits = pd.read_csv(results_csv).rename(columns={"region_length": "telomere_length"})
    hits = hits.merge(chr_lengths, on="chr").rename(columns={"total_length": "chr_length"})

    near_left_end = hits.align_start < NUM_FROM_END
    near_right_end = hits.align_start > hits.chr_length - NUM_FROM_END
    hits = hits.assign(
        arm=np.select([near_left_end, near_right_end], ["left", "right"], default=pd.NA)
    )
    hits = hits.assign(fraction_telomere=hits.telomere_length / hits.read_len)

    # dropna() discards the reads without an arm label (and any other row
    # containing a missing value).
    telomere_df = hits.dropna().reset_index()
    return telomere_df.sort_values("telomere_length", ascending=False)
    
def all_reads_aligned_telomeres(sample: Path, NUM_FROM_END: int = 10_000):
    """
    Load the BED file of all aligned reads and keep those near a chromosome end.

    Reads ``chm13v2_chr_lengths.csv`` from the current working directory
    (columns ``chr`` and ``total_length`` are used) and the first file matching
    ``all_reads_aligned/*all_reads.bed`` below ``sample``. The BED file is
    parsed as tab-separated, header-less, with the columns
    ``chr, start, end, id, number, strand``.

    ``read_length`` is ``end - start``. A read is labelled ``"left"`` when
    ``start < NUM_FROM_END`` and ``"right"`` when
    ``start > total_length - NUM_FROM_END``. Rows that get neither label are
    removed, together with any row holding a missing value in any other
    column. The previous row labels are kept in an ``index`` column and the
    result is sorted by ``start``, ascending.

    Raises
    ------
    IndexError
        If no BED file is found below ``sample``.
    """
    chr_lengths = pd.read_csv("chm13v2_chr_lengths.csv")
    bed_path = list(sample.rglob("all_reads_aligned/*all_reads.bed"))[0]

    bed_columns = ["chr", "start", "end", "id", "number", "strand"]
    reads = pd.read_csv(bed_path, sep="\t", header=None, names=bed_columns)
    reads = reads.assign(read_length=reads["end"] - reads["start"])
    reads = reads.merge(chr_lengths, on="chr")

    near_left_end = reads["start"] < NUM_FROM_END
    near_right_end = reads["start"] > reads["total_length"] - NUM_FROM_END
    reads = reads.assign(
        arm=np.select([near_left_end, near_right_end], ["left", "right"], default=pd.NA)
    )

    # dropna() discards the reads without an arm label (and any other row
    # containing a missing value).
    telomere_df = reads.dropna().reset_index()
    return telomere_df.sort_values("start", ascending=True)

def attach_fasta_sequence_all(sample: Path):
    """
    Attach the read sequence to every read aligned near a chromosome end.

    The reads come from ``all_reads_aligned_telomeres(sample)``; the sequences
    are taken from the first file matching
    ``filtered-fastq/*raw_porechop.fastq.gz`` below ``sample``.

    Returns
    -------
    pd.DataFrame
        One row per FASTQ record whose name occurs in the ``id`` column of the
        aligned-read table, with the columns ``id`` and ``fasta`` (the read
        sequence) followed by the columns of the aligned-read table.

    Raises
    ------
    IndexError
        If no FASTQ file is found below ``sample``.
    """
    telomere_df = all_reads_aligned_telomeres(sample)
    fastq_file = list(sample.rglob("filtered-fastq/*raw_porechop.fastq.gz"))[0]

    # A set gives constant-time membership tests; testing against the numpy
    # array would scan every id for every read in the FASTQ file.
    wanted_ids = set(telomere_df["id"].values)

    # Walk the (gzip-compressed) FASTQ file once and collect name and sequence
    # together, instead of re-reading the whole file once per column.
    sequence_name = []
    sequence_fasta = []
    for read in pyfastx.Fastq(str(fastq_file), build_index=False):
        # read = (name, sequence, quality); the quality string is not used here.
        if read[0] in wanted_ids:
            sequence_name.append(read[0])
            sequence_fasta.append(read[1])

    sequence_df = pd.DataFrame().assign(id=sequence_name, fasta=sequence_fasta)
    return sequence_df.merge(telomere_df, on="id")
    
    

In [14]:
import altair as alt
from altair_saver import save
def plot_telomeres(sample: Path):
    """
    Build an Altair chart of the aligned TideHunter telomeres of a sample.

    The data come from ``aligned_telomeres_from_tidehunter(sample)``. Each read
    is drawn as a circle: chromosome on the x axis, telomere length on the
    y axis, colour by ``telomere_type`` and size by ``read_len``. The chart is
    faceted into one column per ``arm`` and titled with the number of reads.
    """
    telomere_df = aligned_telomeres_from_tidehunter(sample)

    # Fields shown when hovering over a point.
    hover_fields = [
        alt.Tooltip("telomere_length:Q", title="Telomere length"),
        alt.Tooltip("telomere_type:N", title="Telomere type"),
        alt.Tooltip("read_len:Q", title="Read length"),
        alt.Tooltip("id:N", title="ID"),
        alt.Tooltip("telomere_pattern_start:Q", title="Start of telomere in read"),
        alt.Tooltip("telomere_pattern_end:Q", title="End of telomere in read"),
        alt.Tooltip("fraction_telomere:Q", title="Fraction of telomere / read"),
    ]

    points = alt.Chart(telomere_df).mark_circle().encode(
        alt.X("chr:N", title="Chromosome"),
        alt.Color("telomere_type:N"),
        alt.Y("telomere_length", title="Length of Telomere"),
        alt.Size("read_len:Q", scale=alt.Scale(range=[50, 500]), title= "Read length"),
        tooltip=hover_fields,
    )
    tall_points = points.properties(height=700)

    n_reads = telomere_df.shape[0]
    return tall_points.facet(column="arm", title=f"Number of telomeric reads: {n_reads}")


In [None]:
# Functions to extract the sequence and PHRED scores of the reads listed in a telomere DataFrame, and to plot the scores of one read
import pyfastx
import matplotlib.pyplot as plt
import pandas as pd

def extract_fasta_telomeres(fastq: str, telomere_df: pd.DataFrame) -> pd.DataFrame:
    """
    Attach sequence and per-base PHRED scores to the reads in ``telomere_df``.

    Parameters
    ----------
    fastq
        Path of the FASTQ file that contains the reads.
    telomere_df
        Table with an ``id`` column holding the read names to look up.

    Returns
    -------
    pd.DataFrame
        One row per FASTQ record whose name occurs in ``telomere_df["id"]``,
        with the columns ``id``, ``fasta`` (read sequence) and ``PHRED``
        (list of integer quality scores) followed by the columns of
        ``telomere_df``.
    """
    # A set gives constant-time membership tests; testing against the numpy
    # array would scan every id for every read in the FASTQ file.
    wanted_ids = set(telomere_df["id"].values)

    # Walk the FASTQ file once and collect all three fields together, instead
    # of re-reading the whole file once per column.
    sequence_name = []
    sequence_fasta = []
    qualities = []
    for read in pyfastx.Fastq(fastq, build_index=False):
        # read = (name, sequence, quality string)
        if read[0] not in wanted_ids:
            continue
        sequence_name.append(read[0])
        sequence_fasta.append(read[1])
        # PHRED score: quality characters are decoded with an offset of 33.
        qualities.append([ord(symbol) - 33 for symbol in read[2]])

    PHRED_df = pd.DataFrame().assign(
        id=sequence_name,
        fasta=sequence_fasta,
        PHRED=qualities,
    )
    return PHRED_df.merge(telomere_df, on="id")


def plot_telomere_score(phred_df: pd.Series) -> tuple:
    """
    Plot the per-base PHRED scores of one read and shade its telomere region.

    Parameters
    ----------
    phred_df
        A single read. Despite the parameter name this has to be one row
        (e.g. ``df.iloc[i]``) rather than a whole table: ``PHRED`` is plotted
        as one sequence of scores and ``telomere_pattern_start`` /
        ``telomere_pattern_end`` are used as the two edges of the shaded span.
        The fields ``id``, ``telomere_type``, ``telomere_length`` and
        ``fasta`` are read as well.

    Returns
    -------
    tuple
        ``(fig, sequence)`` - the matplotlib figure and the ``fasta`` field of
        the read.
    """
    fig, ax = plt.subplots()
    fig.set_figwidth(15)
    ax.plot(phred_df.PHRED)
    # Highlight the part of the read that was called as telomere.
    ax.axvspan(
        phred_df.telomere_pattern_start,
        phred_df.telomere_pattern_end,
        facecolor='lightgreen',
        alpha=0.4,
    )
    ax.set_ylabel('PHRED score')
    ax.set_xlabel('Position')
    # To put the sequence on the x-axis
    #plt.xticks(list(range(len(phred_df.PHRED))), phred_df.fasta, fontsize=5)

    # Set the title on the axes created above rather than through pyplot's
    # implicit "current axes".
    ax.set_title(f"""
    Read: {phred_df.id}
    Motif: {phred_df.telomere_type}
    Telomere Length: {phred_df.telomere_length}
    """
    )
    return fig, phred_df.fasta
    
    


#plot.savefig("telomere-3'-example.jpg")

# ONT's way to calculate the mean PHRED (Q) score:
# https://labs.epi2me.io/notebooks/Introduction_to_fastq_file.html


# open the file and iterate through its records
#with FastxFile("all_records.fastq") as fq:
#    for rec in fq:
#        # ONT calculation for "mean Q score"
#        quals = np.fromiter(
#            (ord(x) - 33 for x in rec.quality),
#            dtype=int, count=len(rec.quality))
#        mean_p = np.mean(np.power(10, quals/-10))
#        mean_qualities.append(-10*np.log10(mean_p))
#        # all qualities
#        qualities.extend(quals)
#        lengths.append(len(quals))

In [47]:
# Export, for every sample, the aligned TideHunter telomeres, the sequences of
# the reads aligned to chromosome ends, the "mostly telomere" reads and the
# telomere plot.
sample_dir = Path("/home/nanopore/M622339_ONT_DATA/analyzed-data/")
# Keep visible directories only and leave out anything with "Lambda" in its name.
samples = [
    x
    for x in sample_dir.iterdir()
    if x.is_dir() and not x.stem.startswith(".") and "Lambda" not in x.stem
]

# NOTE(review): the first directory returned by iterdir() is skipped here, while
# the later export loop processes every sample. iterdir() gives no ordering
# guarantee, so which sample is skipped is not well defined - confirm whether
# this is intentional or a leftover from a partial re-run.
for sample in samples[1:]:

    destination = Path("/home/nanopore/analyzed-data/dorado-excel-and-plots/") / sample.stem
    # parents=True so the output tree is created when it does not exist yet
    # (same as the later export loop).
    destination.mkdir(exist_ok=True, parents=True)

    # creating files
    aligned_telomeres_tidehunter = aligned_telomeres_from_tidehunter(sample)
    sequence_df = attach_fasta_sequence_all(sample)
    just_telomeres = extract_just_telomeres(sample)
    plot = plot_telomeres(sample)

    # saving files
    aligned_telomeres_tidehunter.to_excel(f"{destination}/{sample.stem}_aligned_telomeres_tidehunter.xlsx")

    (
        sequence_df
        # the sequence is moved to the last column under the name "sequence"
        .assign(sequence=lambda x: x.fasta)
        .drop(columns=["fasta", "index", "number", "total_length"])
        .to_excel(f"{destination}/{sample.stem}_aligned_to_ends_sequence.xlsx")
    )

    just_telomeres.to_excel(f"{destination}/{sample.stem}_just_telomeres.xlsx")

    save(plot, f"{destination}/{sample.stem}_tidehunter_aligned_telomere_plot.html")

# New plot and df

In [103]:
def all_reads_aligned_telomeres_merged_with_tidehunter(sample: Path):
    """
    Combine the reads aligned to chromosome ends with their TideHunter hits.

    The TideHunter table (``telomeres_from_tidehunter``) gets its columns
    renamed (``start``/``end`` -> ``telomere_start``/``telomere_end``,
    ``region_length`` -> ``telomere_length``, ``read_len`` -> ``read_length``)
    and two columns added:

    * ``telomere_type`` - ``"CCCTAA"`` if ``cons_seq`` contains that motif,
      otherwise ``"TTAGGG"`` if it contains that one, otherwise missing
      (``pd.NA``)
    * ``fraction_telomere`` - ``telomere_length / read_length``

    It is then merged on ``id`` with ``attach_fasta_sequence_all(sample)``
    (whose own ``read_length`` and ``index`` columns are dropped), so only
    reads present in both tables are returned.
    """
    telomeres_tidehunter = (
        telomeres_from_tidehunter(sample)
        .rename(
            columns={
                "start": "telomere_start",
                "end": "telomere_end",
                "region_length": "telomere_length",
                "read_len": "read_length"
            }
        )
        .assign(
            telomere_type=lambda x: np.select(
                [
                    # na=False keeps the conditions strictly boolean even when
                    # cons_seq has missing entries.
                    x.cons_seq.str.contains("CCCTAA", na=False),
                    x.cons_seq.str.contains("TTAGGG", na=False),
                ],
                ["CCCTAA", "TTAGGG"],
                # Explicit default: np.select's implicit integer 0 cannot be
                # combined with string choices on recent NumPy versions and
                # showed up as the string "0" on older ones.
                default=pd.NA,
            )
        )
        .assign(fraction_telomere=lambda x: x.telomere_length / x.read_length)
    )

    # The read length is taken from TideHunter, so the BED-derived one is dropped.
    fasta_df = attach_fasta_sequence_all(sample).drop(columns="read_length")
    fasta_df = fasta_df.merge(telomeres_tidehunter, on="id").drop(columns="index")
    return fasta_df
    

def plot_all_reads_aligned_telomeres_merged_with_tidehunter(telomere_df: pd.DataFrame, sample_name: str = None):
    """
    Build an Altair chart of the merged aligned-read / TideHunter table.

    Each read is drawn as a circle: chromosome on the x axis, telomere length
    on the y axis, colour by ``telomere_type`` and size by ``read_length``.
    The chart is faceted into one column per ``arm``.

    Parameters
    ----------
    telomere_df
        Table to plot.
    sample_name
        Name shown in the chart title. When omitted, the name is taken from
        the ``stem`` of a module-level variable called ``sample``; this keeps
        existing calls inside a ``for sample in ...`` loop working. New code
        should pass the name explicitly so the function does not depend on
        notebook state.

    Raises
    ------
    NameError
        If ``sample_name`` is omitted and no module-level ``sample`` exists.
    """
    if sample_name is None:
        # Backward-compatible fallback on the notebook's loop variable.
        sample_name = sample.stem

    plot = (
        alt.Chart(telomere_df)
        .mark_circle()
        .encode(
            alt.X("chr:N", title="Chromosome"),
            alt.Color("telomere_type:N"),
            alt.Y("telomere_length", title="Length of Telomere"),
            alt.Size("read_length:Q", scale=alt.Scale(range=[50, 500]), title= "Read length"),
            tooltip=[alt.Tooltip("telomere_length:Q", title="Telomere length"),
                     alt.Tooltip("telomere_type:N", title="Telomere type"),
                     alt.Tooltip("read_length:Q", title="Read length"),
                     alt.Tooltip("id:N", title="ID"),
                     alt.Tooltip("telomere_start:Q", title="Start of telomere in read"),
                     alt.Tooltip("telomere_end:Q", title="End of telomere in read"),
                     alt.Tooltip("fraction_telomere:Q", title="Fraction of telomere / read"),
                    ]
        )
        .properties(
            height=700,
        )
        .facet(column="arm", title=f"{sample_name}, Number telomeres: {telomere_df.shape[0]}")
    )

    return plot

In [104]:
# Export, for every sample, the merged aligned-read / TideHunter table and its plot.
sample_dir = Path("/home/nanopore/M622339_ONT_DATA/analyzed-data/")
# Keep visible directories only and leave out anything with "Lambda" in its
# name (presumably the lambda-phage control runs - confirm).
samples = list(
    filter(
        lambda x: x.is_dir() and not x.stem.startswith(".") and "Lambda" not in x.stem,
        sample_dir.iterdir(),
    )
)

for sample in samples:
    # One output folder per sample, created together with any missing parents.
    destination = Path("/home/nanopore/analyzed-data/dorado-excel-and-plots/") / sample.stem
    destination.mkdir(parents=True, exist_ok=True)

    # Build the table first; the plot is derived from it.
    fasta_df = all_reads_aligned_telomeres_merged_with_tidehunter(sample)
    plot = plot_all_reads_aligned_telomeres_merged_with_tidehunter(fasta_df)

    fasta_df.to_excel(f"{destination}/{sample.stem}_aligned_to_ends_merged_tidehunter_and_sequence.xlsx")
    save(plot, f"{destination}/{sample.stem}_aligned_to_ends_merged_tidehunter_telomere_plot.html")

# Telsize tool