In [1]:
from collections import Counter

from preprocessing import preprocess
import plotly.express as px
import pandas as pd
from pathlib import Path
from Bio import SeqIO
import matplotlib.pyplot as plt
from itertools import pairwise

In [2]:
# Parse every *.fasta file found under ../data/raw, keyed by its path string.
raw_records = {}
for fasta_path in Path("../data/raw").glob("*.fasta"):
    raw_records[str(fasta_path)] = list(SeqIO.parse(fasta_path, format="fasta"))

# Run each file's records through `preprocess` and store every bin it yields
# under the key "<path>_<len of the bin's first element>".
datasets = {}
for filename, genes in raw_records.items():
    for bin_ in preprocess(list(genes), bin_width=100, min_bin_size=30):
        datasets[f'{filename}_{len(bin_[0])}'] = bin_

In [4]:
# Long-format table with one row per gene: the dataset key it came from and
# the length of its sequence.
length_rows = []
for filename, genes in datasets.items():
    length_rows.extend((filename, len(gene.seq)) for gene in genes)

df = pd.DataFrame.from_records(length_rows, columns=["filename", "gene_length"])

# Box plot of the gene lengths, coloured by dataset key.
px.box(df, color="filename", x="gene_length")

In [None]:
# Character composition per entry of `datasets`: the share of each character
# among all of that entry's sequences concatenated together.
records = []
for filename, genes in datasets.items():
    all_nucs = "".join(str(gene.seq) for gene in genes)
    counts = Counter(all_nucs)
    # Dataset keys are built as "<path>_<length>". Strip only that trailing
    # suffix so underscores inside the path itself are preserved
    # (split("_")[0] would cut the path at its first underscore).
    source_name = filename.rsplit("_", 1)[0]
    for nuc, count in counts.items():
        records.append((source_name, nuc, count / len(all_nucs)))


df = pd.DataFrame.from_records(
    records, columns=["filename", "nucleotide", "Relative Frequency"]
)

# Grouped bars: one group per character, one colour per source file.
# NOTE(review): several bins of the same file share one label here, so a
# (file, character) pair can appear more than once — confirm this is intended.
px.bar(
    df,
    barmode="group",
    x="nucleotide",
    y="Relative Frequency",
    color="filename",
)

In [None]:
def kmerize(sequence: str, k=3):
    """Return every overlapping substring of length ``k`` in ``sequence``.

    Args:
        sequence: The string to slice into k-mers.
        k: Length of each k-mer; expected to be a positive integer.

    Returns:
        A list of the ``len(sequence) - k + 1`` k-mers in left-to-right order,
        or an empty list when ``sequence`` is shorter than ``k``.
    """
    # The last valid start index is len(sequence) - k, so the range must stop
    # one past it; otherwise the final k-mer is silently dropped.
    return [sequence[i : i + k] for i in range(len(sequence) - k + 1)]


# Relative frequency of every k-mer within each entry of `datasets`.
records = []
for filename, genes in datasets.items():
    kmer_counts = Counter()
    for gene in genes:
        kmer_counts.update(kmerize(str(gene.seq)))
    # Total number of k-mers seen for this entry.
    total = sum(kmer_counts.values())
    records.extend(
        (filename, kmer, count / total) for kmer, count in kmer_counts.items()
    )


df = pd.DataFrame.from_records(
    records, columns=["filename", "kmer", "Relative Frequency"]
)

# One violin per dataset key; every k-mer is also drawn as an individual point.
px.violin(
    df,
    y="Relative Frequency",
    color="filename",
    hover_name="kmer",
    violinmode="group",
    points="all",
)

In [None]:
import ssw

In [5]:
# Write every bin to its own FASTA file under ../data/preprocessed, named after
# the final path component of its dataset key.
for dataset_name, bin_ in datasets.items():
    path = Path(dataset_name)
    destination = Path(f"../data/preprocessed/{path.name}")
    SeqIO.write(bin_, destination, format="fasta")

In [None]:
# Interactive lookup: print the built-in help for `ssw.force_align`.
help(ssw.force_align)

In [None]:
# Interactive lookup: print the help for `AlignmentMgr.build_dna_score_matrix`.
# The method object itself is handed to help(); calling it first would
# document whatever it returns instead of the method.
mgr = ssw.AlignmentMgr()
help(mgr.build_dna_score_matrix)

In [None]:
# Read all records of the FASTA file into a list; later cells treat "-" in
# these sequences as a gap marker.
aligned = [
    record for record in SeqIO.parse("../data/random.fasta_1203", format="fasta")
]


In [None]:
import numpy as np

# Character matrix: one row per record in `aligned`, one entry per sequence
# character.
rows = []
for gene in aligned:
    rows.append(np.array(list(gene.seq)))
matrix = np.array(rows)

In [None]:
# Display the dimensions of the character matrix.
matrix.shape

In [None]:
# Peek at columns 10 through 19 of every row.
matrix[:, 10:20]

In [None]:
from collections import Counter

# Count the "-" entries in every column of `matrix`, then tally how many
# columns share each count and plot that tally as bars.
gap_counts_per_column = [Counter(column)["-"] for column in matrix.T]
gap_count_histogram = Counter(gap_counts_per_column)

px.bar(pd.DataFrame.from_dict(gap_count_histogram, orient="index"))

In [None]:
# Indices of the columns in which fewer than 75% of the entries are "-".
lesser_sparsed_columns = []
for i, column in enumerate(matrix.T):
    gap_count = Counter(column)["-"]
    if gap_count < len(column) * 0.75:
        lesser_sparsed_columns.append(i)

# Keep only those columns: select them as rows of the transpose, then
# transpose back.
kept_columns = matrix.T[lesser_sparsed_columns]
denser_matrix = kept_columns.T

In [None]:
# Display the dimensions of the column-filtered matrix.
denser_matrix.shape

In [None]:
def average_neighborhood(matrix: np.ndarray) -> np.ndarray:
    """Average the four direct neighbours (left, right, up, down) of every cell.

    Neighbours that fall outside ``matrix`` count as 0, so edge cells never
    pick up values from the opposite edge.

    Args:
        matrix: 2-D array of numeric or boolean values.

    Returns:
        A ``float32`` array of shape ``(rows + 2, cols + 2)``. The average for
        input cell ``(i, j)`` is stored at ``(i + 1, j + 1)``; the surrounding
        one-cell border is zero.
    """
    values = np.asarray(matrix, dtype=np.float32)
    # Zero border: gives every real cell four addressable neighbours and avoids
    # the negative-index wrap-around that plain matrix[i - 1, j] would cause.
    padded = np.pad(values, pad_width=(1, 1))
    averaged = np.zeros_like(padded)
    # Shifted views of the padded array line each cell up with its right, left,
    # lower and upper neighbour; this covers every row and column of the input.
    averaged[1:-1, 1:-1] = (
        padded[1:-1, 2:] + padded[1:-1, :-2] + padded[2:, 1:-1] + padded[:-2, 1:-1]
    ) / 4
    return averaged


# Boolean mask of the filtered matrix: True wherever the entry is not "-".
averaged = denser_matrix != "-"
# Apply the neighbourhood average twice in succession.
averaged = average_neighborhood(average_neighborhood(averaged))

plt.matshow(averaged)

In [None]:
# Display the list of retained column indices.
lesser_sparsed_columns

In [None]:
# Collapse the ascending column indices in `lesser_sparsed_columns` into
# inclusive (first, last) runs of consecutive indices.
intervals = []

# Guard against an empty selection, which would otherwise fail on [0].
if lesser_sparsed_columns:
    beginning = lesser_sparsed_columns[0]
    # Walk every adjacent pair, starting with the very first one, so a break
    # right after the first index is detected as well.
    for n, m in pairwise(lesser_sparsed_columns):
        if m != n + 1:
            intervals.append((beginning, n))
            beginning = m
    # The run that is still open when the list ends has to be recorded too.
    intervals.append((beginning, lesser_sparsed_columns[-1]))

In [None]:
# Keep only the intervals whose end lies more than 8 positions after the start.
cool_intervals = [(start, end) for start, end in intervals if end - start > 8]

In [None]:
# Display how many intervals passed the length filter.
len(cool_intervals)

In [None]:
# One figure per selected interval, showing which cells differ from "-".
# Intervals store their last column inclusively, so the slice runs to j + 1;
# a plain i:j slice would leave that final column out of the plot.
for i, j in cool_intervals:
    plt.matshow(matrix[:, i : j + 1] != "-")

In [None]:
# Show, for the whole matrix, which cells differ from "-".
plt.matshow(matrix != "-")