# 0) Data Loading for Efficacy Model

This notebook loads the CRISPR gRNA dataset and performs basic inspection.


## v0: Basic Data Manipulation

In [None]:
# IMPORTS

from pathlib import Path
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from mypackage.config import *
from mypackage.utils import *

In [None]:
# LOADING IN DATASET (csv file from GitHub)
# Reads "efficacy.csv" from the DATA_RAW directory into `df`.
# NOTE(review): DATA_RAW is not defined in this notebook; presumably it comes
# from the star import of mypackage.config — confirm.

efficacy_data = DATA_RAW / "efficacy.csv"
df = pd.read_csv(efficacy_data)

# Last expression of the cell: renders the loaded frame as the cell output.
df

In [None]:
# BASIC EXPLORATION

# df.head()
# df.info()

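# df.size # 184239: total number of elements
# df.shape # (16849, 11): (rows, columns)
# NOTE(review): 16849 * 11 = 185339, while 184239 = 16749 * 11 — one of the two
# recorded values above contains a typo; re-run both to confirm.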

# df.columns # ['chr', 'start', 'end', 'direction', '23-nt sequence', 'ctcf', 'dnase',
             #  'h3k4me3', 'rrbs', 'Normalized efficacy', 'Efficacy']

# df.describe() # information about efficiency scores

In [None]:
# CLEANING COLUMN NAMES
# Normalise the headers in three steps: trim surrounding whitespace,
# lowercase, then replace each literal space with an underscore.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
)
df.columns

In [None]:
# Sanity check: list the distinct string lengths found in the sequence column.
df["23-nt_sequence"].str.len().unique() # is always 23

In [None]:
# ARE THERE ANY INCONSISTENCIES IN THE EFFICACY SCORES?
# Show rows labelled efficacy == 1 whose normalized score is nevertheless < 0.5.
related_cols = ["normalized_efficacy", "efficacy"]
low_score_but_effective = (df["normalized_efficacy"] < 0.5) & (df["efficacy"] == 1)
# Yes, such rows exist -> the boundary of the normalized efficacy score must be below 0.5
df.loc[low_score_but_effective, related_cols]

In [None]:
# Smallest normalized_efficacy among rows labelled efficacy == 1 (candidate cutoff).
mn = df[df.efficacy == 1]['normalized_efficacy'].min()

# Filtering the efficacy == 1 rows for values below `mn` can never return
# anything, because `mn` is the minimum of exactly that subset. To test whether
# `mn` really separates the labels, look at the other side instead: rows that
# are NOT labelled 1 but still reach `mn`.
# An empty result means `mn` is a clean cutoff between the two labels.
df[(df.efficacy != 1) & (df.normalized_efficacy >= mn)][related_cols]

## v1: Data Manipulation for Dataset without consideration of epigenetics

### v1.0

In [None]:
# v1.0 keeps only the sequence and the normalized efficacy score; every other
# column of the cleaned frame is dropped. `drop` returns a new frame, so `df`
# itself is left untouched.
columns_to_drop = ["chr", "start", "end", "direction", "efficacy", "ctcf", "dnase", "h3k4me3", "rrbs"]
df1_0 = df.drop(columns=columns_to_drop)

# Small fixed-seed sample (5 rows) for quick experiments on the encoding step.
df1_0_small = df1_0.sample(n=5, random_state=42).copy()
df1_0_small

In [None]:
# A row-wise one-hot helper (`add_onehot_columns`) was previously sketched here
# as commented-out code; the vectorised `onehot_encode_sequences` defined in
# the next cell supersedes it.

# Fraction of sequences that contain the motif "TTT" at least once.
contains_ttt = df1_0["23-nt_sequence"].str.contains("TTT")
df1_0[contains_ttt].shape[0] / df1_0.shape[0]

In [None]:
def onehot_encode_sequences(df, seq_col="23-nt_sequence"):
    """Replace a sequence column with flat one-hot indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``seq_col``. It is not modified.
    seq_col : str
        Name of the column holding equal-length sequence strings.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``seq_col``, followed by integer columns named
        ``pos_{position}_{base}`` (position-major, bases ordered A, C, G, T).
        Rows keep the order and index of ``df``. A character other than
        A/C/G/T leaves all four indicators of that position at 0.

    Raises
    ------
    ValueError
        If the sequences do not all share the same length.
    """
    bases = ["A", "C", "G", "T"]
    sequences = df[seq_col].values
    n_sequences = len(sequences)
    # An empty frame has no first sequence to measure; treat its length as 0.
    seq_length = len(sequences[0]) if n_sequences else 0

    if any(len(seq) != seq_length for seq in sequences):
        raise ValueError(
            f"All sequences in column {seq_col!r} must have the same length"
        )

    # Split every sequence into characters once (hoisted out of the base loop).
    # The explicit reshape keeps the array 2-D even when there are no rows.
    chars = np.array([list(seq) for seq in sequences], dtype=str).reshape(
        n_sequences, seq_length
    )

    # onehot[row, position, k] == 1 iff the character at `position` is bases[k]
    onehot = np.zeros((n_sequences, seq_length, 4), dtype=int)
    for i, base in enumerate(bases):
        onehot[:, :, i] = chars == base

    # Flatten (position, base) into columns; names follow the reshape order.
    col_names = [f"pos_{pos}_{base}" for pos in range(seq_length) for base in bases]
    onehot_df = pd.DataFrame(
        onehot.reshape(n_sequences, seq_length * 4),
        columns=col_names,
        index=df.index,
    )

    # Both frames carry the identical index, so a positional column-wise concat
    # is exact. An index merge would multiply rows when index labels repeat.
    return pd.concat([df.drop(columns=[seq_col]), onehot_df], axis=1)

# Apply
# Rebinds df1_0_small to its encoded version. The encoder drops the sequence
# column, so running this cell a second time without re-creating df1_0_small
# fails with a KeyError on "23-nt_sequence".
df1_0_small = onehot_encode_sequences(df1_0_small)
#df1_0_small

In [None]:
# One-hot encode the full v1.0 frame. The sequence column is removed by the
# encoder, so this cell is not re-runnable unless df1_0 is rebuilt first.
df1_0 = onehot_encode_sequences(df1_0)
#df1_0

In [None]:
# EXPORT DATA
# NOTE(review): export_as_csv_and_pkl and DATA_PROCESSED_EFFICACY are not
# defined in this notebook (presumably provided by the mypackage star imports).
# Judging by the name, this writes df1_0 as CSV and pickle tagged "v1.0" — confirm.
export_as_csv_and_pkl(df1_0, "v1.0", DATA_PROCESSED_EFFICACY)

### v1.1

In [None]:
# Start v1.1 from the encoded v1.0 frame and re-attach the raw sequence column
# (matched on the row index) so composition features can be derived from it.
# `merge` returns a new frame, so df1_0 is not modified.
sequence_column = df[["23-nt_sequence"]]
df1_1 = df1_0.merge(sequence_column, left_index=True, right_index=True)
#df1_1

In [None]:
def count_nucleotide(row: pd.DataFrame, nucleotide: str, length: float = 23.0) -> pd.Series:
    """Return, per sequence, the number of ``nucleotide`` matches divided by ``length``.

    Despite its name, ``row`` is a whole frame: the ``.str`` accessor is applied
    to its "23-nt_sequence" column, so the result is a Series aligned to
    ``row``'s index rather than a single float.

    Parameters
    ----------
    row : pandas.DataFrame
        Frame with a string column named "23-nt_sequence".
    nucleotide : str
        Pattern to count. ``Series.str.count`` interprets it as a regular
        expression, which is equivalent to a literal match for single letters.
    length : float
        Divisor applied to the counts. Pass 1 to obtain raw counts.

    Returns
    -------
    pandas.Series
        ``count / length`` for every sequence.
    """
    occurrences = row["23-nt_sequence"].str.count(nucleotide)
    return occurrences / length

def apply_nucleotide_content(df1, df2):
    """Add one ``pct_<base>`` column to ``df1`` for every entry of ``BASES``.

    The values come from ``count_nucleotide(df2, base)``, i.e. they are computed
    from the sequences stored in ``df2``. ``df1`` is modified in place and
    nothing is returned.
    """
    for base in BASES:
        column = f"pct_{base}"
        df1[column] = count_nucleotide(df2, base)

# Adds the pct_<base> columns to df1_1 in place; the fractions are computed
# from the sequences in the cleaned frame `df`.
apply_nucleotide_content(df1_1, df)
# df1_1

In [None]:
# GC content = (#G + #C) / 23. Passing length=1 makes count_nucleotide return
# raw counts instead of fractions.
g_count = count_nucleotide(df1_1, "G", 1)
c_count = count_nucleotide(df1_1, "C", 1)
df1_1["gc_content"] = (g_count + c_count) / 23.0

In [None]:
# NOTE(review): drop_seq is not defined in this notebook (presumably from
# mypackage.utils). Its return value is discarded here, so it is expected to
# remove the sequence column from df1_1 in place — TODO confirm.
drop_seq(df1_1)
df1_1

In [None]:
# NOTE(review): export_as_csv_and_pkl is defined outside this notebook;
# judging by the name, it writes df1_1 as CSV and pickle tagged "v1.1" — confirm.
export_as_csv_and_pkl(df1_1, "v1.1", DATA_PROCESSED_EFFICACY)

## v2: Data Manipulation for NNs (with consideration of epigenetics)

In [None]:
# v2 works on its own copy, so columns added below do not appear in `df`.
df2 = df.copy()

def onehote(row: str) -> np.ndarray:
    """One-hot encode a nucleotide string.

    ``row`` is the sequence itself. Each character is looked up in the fixed
    order A, C, G, T, and the matching row of a 4x4 identity matrix is selected,
    giving a float array of shape ``(len(row), 4)``. Any other character raises
    ``KeyError``.
    """
    base_to_index = {"A": 0, "C": 1, "G": 2, "T": 3}
    identity = np.eye(4)
    indices = [base_to_index[base] for base in row]
    return identity[indices]

# Cast the sequences to str once, store them back, and derive the per-sequence
# one-hot matrix from the same values.
sequence_strings = df2["23-nt_sequence"].astype(str)
df2["23-nt_sequence"] = sequence_strings
df2["23-nt_sequence_onehot"] = sequence_strings.apply(onehote) # type: ignore
df2