In [4]:
import numpy as np
import os

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

## Dataset curation

In [2]:
# Read all signals and sequences from dataset
datapath = "popavg_reacts_dataset"

# Only regular files are loaded, and they are visited in sorted order:
# os.scandir yields entries in arbitrary order, which would otherwise make
# the order of `sequences` / `signals` differ between runs and machines.
file_paths = sorted(entry.path for entry in os.scandir(datapath) if entry.is_file())
n_seq = len(file_paths)

# conversion between integer encoding and base character
base2idx = {
    "A": 2,
    "C": 1,
    "U": -1,
    "T": -1,
    "G": -2,
}

# Inverse lookup, indexed by (code + 2). Slot 2 ('X') stands for code 0,
# which base2idx never assigns.
idx2base = np.array(['G', 'U', 'X', 'C', 'A'])

# Go over all files in dataset
sequences, signals = [], []
for path in file_paths:

    # skiprows=1 drops the first line of each file (presumably a header - confirm).
    # ndmin=2 keeps the column indexing below valid for a file with a single data row.
    data = np.loadtxt(path, delimiter=",", skiprows=1, dtype=str, ndmin=2)

    # Column 2 is parsed as the signal; NaN entries are replaced by 0.
    signal = np.nan_to_num(data[:, 2].astype(np.float64))

    # Column 3 holds the base letters, converted to their integer codes.
    # A letter missing from base2idx raises KeyError.
    sequence = np.array([base2idx[base] for base in data[:, 3]]).astype(np.int8)

    sequences.append(sequence)
    signals.append(signal)

# Concatenate once at the end instead of re-allocating the full arrays on every file.
full_sequence = np.concatenate(sequences) if sequences else np.empty(0, dtype=np.int8)
full_signal = np.concatenate(signals) if signals else np.empty(0)


In [3]:
## Create dataset of signals for various sliding windows

# The dataset is a dictionary, each key corresponding to one possible window of bases
win_len = 3
center = win_len // 2  # offset of the central base inside a window

# Enumerate every window of the concatenated sequence. The "+ 1" keeps the final
# window, which a range of len - win_len would drop.
# NOTE(review): windows taken from full_sequence also straddle the junction between
# two consecutive sequences; such a window only adds a key that stays empty below.
n_windows = len(full_sequence) - win_len + 1
seq_window_set = np.unique(
    np.vstack([full_sequence[i:i + win_len] for i in range(n_windows)]), axis=0
)

# Keep only windows whose central base has a positive code (A or C in base2idx).
seq_window_set = seq_window_set[seq_window_set[:, center] > 0]
seq_window_set_key = [''.join(idx2base[seq_window + 2]) for seq_window in seq_window_set]
print('Number of unique windows', len(seq_window_set))

dataset = {key: [] for key in seq_window_set_key}

# Go over each sequence, extract sliding window array, and find matches with key window
n_points = 0
for sequence, signal in zip(sequences, signals):
    # Row j of sliding_win_seq is sequence[j:j + win_len]
    idx_win = np.arange(len(sequence) - win_len + 1)
    sliding_win_seq = sequence[idx_win[:, np.newaxis] + np.arange(win_len)]

    for seq_window, seq_key in zip(seq_window_set, seq_window_set_key):

        # idx holds the START position of each matching window
        idx = np.where((sliding_win_seq == seq_window).all(axis=1))[0]

        # Windows are selected (and later grouped) by their central base, so the
        # value to store is the signal at the centre, not at the window start.
        dataset[seq_key].extend(signal[idx + center])

        n_points += len(idx)

print('Length of curated dataset', n_points)
print('Length of original dataset', len(full_sequence) - (win_len-1)*len(sequences))

Number of unique windows 32
Length of curated dataset 70149
Length of original dataset 142397


## Histogram plots

In [47]:
# Distribution of every signal value in the concatenated dataset.
# update_layout returns the figure itself, so the call can be chained; the
# trailing bare `fig` makes the notebook render it.
fig = px.histogram(
    full_signal,
    title="Histogram of DMS signal for full dataset",
).update_layout(showlegend=False)
fig

In [57]:
# Pool the per-triplet signals by the identity of the central base.
# The pooled lists are stored back into `dataset` under the one-letter keys
# 'A' and 'C'; they are reset first so re-running the cell does not double them.
pooled_bases = ['A', 'C']
for base in pooled_bases:
    dataset[base] = []

for triplet, values in dataset.items():
    # Skip the one-letter pooled entries themselves.
    if len(triplet) != 3:
        continue
    middle = triplet[1]
    if middle in pooled_bases:
        dataset[middle].extend(values)

# Side-by-side histograms: A on the left, C on the right.
fig = make_subplots(rows=1, cols=2)
for column, base in enumerate(pooled_bases, start=1):
    histogram = go.Histogram(x=dataset[base], showlegend=False)
    fig.add_trace(histogram, row=1, col=column)
    fig.update_xaxes(title_text=base, row=1, col=column)

fig.update_layout(
    title_text="Histogram of DMS signals for A and C",
    width=1200,
    height=500,
)
fig.show()

In [55]:
# Grid of histograms, one per triplet: triplets whose central base is 'A' go in
# the left column, all other triplets in the right column.

# Only three-letter keys are triplets; `dataset` may also hold the pooled
# one-letter 'A' / 'C' entries, which must not be counted when sizing the grid.
triplet_keys = [key for key in dataset if len(key) == 3]
n_left = sum(key[1] == 'A' for key in triplet_keys)
n_right = len(triplet_keys) - n_left

# Size the grid from the taller column so every subplot address exists even
# when the two columns hold a different number of triplets (at least one row).
fig = make_subplots(rows=max(n_left, n_right, 1), cols=2)

# Next free row in each column
next_row = {1: 1, 2: 1}
for key in triplet_keys:
    col = 1 if key[1] == 'A' else 2
    row = next_row[col]
    next_row[col] += 1

    fig.add_trace(
        go.Histogram(x=dataset[key], showlegend=False), row=row, col=col)

    fig.update_xaxes(title_text=key, row=row, col=col)

fig.update_layout(
    height=4000,
    width=1200,
    title_text="Histogram of DMS signals for each triplet bases")
fig.show()