# 0 Data Loading for Efficacy Model

This notebook loads the CRISPR gRNA dataset and performs basic inspection.


## Basic Data Manipulation

In [24]:
# IMPORTS
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [27]:
# LOADING IN DATASET (csv file from GitHub)
# The file is read straight from a URL, so running this cell requires network access.

# Raw-file URL on the repository's `main` branch; the data can change if that branch is updated.
FILE_PATH = "https://raw.githubusercontent.com/VKonstantakos/CRISPR-Deep-Learning/refs/heads/main/Data/2.%20Training%20datasets/1.%20DeepCRISPR-CNN-CRNNCrispr.csv"

# Parse the remote CSV into the working DataFrame used by the cells below.
df = pd.read_csv(FILE_PATH)
df  # last expression of the cell, so the frame is rendered as the cell output

Unnamed: 0,chr,start,end,direction,23-nt sequence,ctcf,dnase,h3k4me3,rrbs,Normalized efficacy,Efficacy
0,chr17,33469132,33469154,-,CTTGCTCGCGCAGGACGAGGCGG,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,0.164020,1
1,chr4,184605984,184606006,-,ACATCAGGTTACCTCTACCAAGG,AAAAAAAAAAAAANNNNNNNNNN,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,NNNNNNNNNNNNNNNNNNNNNNN,0.311196,1
2,chr1,11736866,11736888,+,CTGATGCCAGCTAGTGGGCGAGG,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,NNNNNNNNNNNNNNNNNNNNNNN,0.179397,0
3,chr4,25379067,25379089,+,CTGTTTCCCATCCTTCCGGGTGG,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNAANNNNN,0.498712,1
4,chr12,57936737,57936759,-,AATGTATGCACAGGGAACAGAGG,AAAAAAAAAAAAAAANNNNNNNN,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,NNNNNNNNNNNNNNNNNNNNNNN,0.233023,1
...,...,...,...,...,...,...,...,...,...,...,...
16744,chr19,54710145,54710167,+,CAACGCCCTGCTGCGGCGGCTGG,NNNNNNNNNNNNNNNNNNNNNNN,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,0.287346,1
16745,chr19,54710220,54710242,-,CTAAGAAATCCTCTATCTTCAGG,NNNNNNNNNNNNNNNNNNNNNNN,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,0.209103,0
16746,chr19,54710309,54710331,+,TGATCCGCCAGCGCCATATCAGG,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,0.061508,0
16747,chr19,54710322,54710344,-,ATCCGAGGTGGTACCTGATATGG,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,0.181219,0


In [28]:
# BASIC EXPLORATION

# df.head()
# df.info()

# df.size # 184239: total number of elements
# df.shape # (16849, 11): (rows, columns)

# df.columns # ['chr', 'start', 'end', 'direction', '23-nt sequence', 'ctcf', 'dnase',
             #  'h3k4me3', 'rrbs', 'Normalized efficacy', 'Efficacy']

# df.describe() # information about efficiency scores

In [29]:
# Sanity check: list the distinct string lengths found in the sequence column.
# Uses the original column name (with a space), so it must run before the column names are cleaned.
df["23-nt sequence"].str.len().unique() # recorded output is array([23]): every sequence has 23 characters

array([23])

In [30]:
# CLEANING COLUMN NAMES
# Normalise every header: trim surrounding whitespace, lowercase it,
# then swap spaces for underscores (literal replacement, not a regex).
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
)
df.columns

Index(['chr', 'start', 'end', 'direction', '23-nt_sequence', 'ctcf', 'dnase',
       'h3k4me3', 'rrbs', 'normalized_efficacy', 'efficacy'],
      dtype='object')

## Data Manipulation for Dataset without consideration of epigenetics

In [31]:
# Sequence-only dataset: everything except the sequence and its normalized efficacy is removed.
columns_to_drop = ["chr", "start", "end", "direction", "efficacy", "ctcf", "dnase", "h3k4me3", "rrbs"]

# DataFrame.drop returns a new frame, so `df` stays untouched and no
# copy-then-mutate (inplace=True) step is needed.
df1 = df.drop(columns=columns_to_drop)

# Small sample with a fixed seed, so the same five rows are drawn on every run.
df1_small = df1.sample(n=5, random_state=42).copy()
df1_small

Unnamed: 0,23-nt_sequence,normalized_efficacy
3314,CTTCTTGAACCCACGCAAGGTGG,0.214443
3623,AGGCGGAATCGCTCACCCGACGG,0.457338
11529,GACCAACACTCCAGCTGAGCTGG,0.144906
10389,CCCAGCAATCGCCAATCCCAGGG,0.540867
15057,CCGGACTCAGGACTCCGAACTGG,0.51267


In [14]:
# def add_onehot_columns(row) -> None:
#     sequence = str(row["23-nt_sequence"])
#     for pos, nucleotide in enumerate(sequence):
#         for base in ["A", "C", "G", "T"]:
#             row[f"pos_{pos}_{base}"] = 1 if nucleotide == base else 0

# df1_small.apply(add_onehot_columns, axis=1)
# df1_small


In [32]:
def onehot_encode_sequences(df, seq_col="23-nt_sequence", bases=("A", "C", "G", "T")):
    """Replace a column of equal-length sequences with flat one-hot columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the sequences. It is not modified.
    seq_col : str, default "23-nt_sequence"
        Name of the column containing the sequence strings.
    bases : sequence of str, default ("A", "C", "G", "T")
        Single-character alphabet to encode; one output column is created
        per position and base.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``seq_col``, followed by integer 0/1 columns named
        ``pos_{position}_{base}`` (position-major, base-minor order).
        The result has exactly one row per input row, with the same index
        labels in the same order. For an empty ``df`` only the remaining
        columns are returned, because the sequence length is unknown.

    Raises
    ------
    ValueError
        If the sequences do not all have the same length.

    Notes
    -----
    A character that is not in ``bases`` (for example ``N``) produces zeros
    in every column of its position.
    """
    bases = list(bases)
    remaining = df.drop(columns=[seq_col])
    sequences = df[seq_col].to_numpy()
    n_sequences = len(sequences)

    # Nothing to encode: avoids indexing sequences[0] on an empty frame.
    if n_sequences == 0:
        return remaining

    seq_length = len(sequences[0])
    if any(len(seq) != seq_length for seq in sequences):
        raise ValueError(
            f"All sequences in column {seq_col!r} must have the same length "
            f"(expected {seq_length})."
        )

    # Character matrix of shape (n_sequences, seq_length); the explicit dtype
    # and reshape keep the shape well-defined even for zero-length sequences.
    chars = np.array([list(seq) for seq in sequences], dtype="U1")
    chars = chars.reshape(n_sequences, seq_length)

    # Broadcast against the alphabet: shape (n_sequences, seq_length, len(bases)).
    onehot = (chars[:, :, np.newaxis] == np.array(bases, dtype="U1")).astype(int)

    # Flatten to columns
    col_names = [f"pos_{pos}_{base}" for pos in range(seq_length) for base in bases]
    onehot_df = pd.DataFrame(
        onehot.reshape(n_sequences, seq_length * len(bases)),
        columns=col_names,
    )

    # Combine by row position rather than by index label: a label-based
    # merge/join multiplies rows whenever the index contains duplicates.
    result = pd.concat([remaining.reset_index(drop=True), onehot_df], axis=1)
    result.index = df.index
    return result

# Apply
# Guarded so the cell can be re-run: the encoder removes the sequence column
# from its result, so a second unguarded call would fail with a KeyError.
if "23-nt_sequence" in df1_small.columns:
    df1_small = onehot_encode_sequences(df1_small)
df1_small

Unnamed: 0,normalized_efficacy,pos_0_A,pos_0_C,pos_0_G,pos_0_T,pos_1_A,pos_1_C,pos_1_G,pos_1_T,pos_2_A,...,pos_20_G,pos_20_T,pos_21_A,pos_21_C,pos_21_G,pos_21_T,pos_22_A,pos_22_C,pos_22_G,pos_22_T
3314,0.214443,0,1,0,0,0,0,0,1,0,...,0,1,0,0,1,0,0,0,1,0
3623,0.457338,1,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
11529,0.144906,0,0,1,0,1,0,0,0,0,...,0,1,0,0,1,0,0,0,1,0
10389,0.540867,0,1,0,0,0,1,0,0,0,...,1,0,0,0,1,0,0,0,1,0
15057,0.51267,0,1,0,0,0,1,0,0,0,...,0,1,0,0,1,0,0,0,1,0


In [37]:
# Encode the full sequence-only frame.
# Guarded so the cell can be re-run: the encoder removes the sequence column
# from its result, so a second unguarded call would fail with a KeyError.
if "23-nt_sequence" in df1.columns:
    df1 = onehot_encode_sequences(df1)
df1

Unnamed: 0,normalized_efficacy,pos_0_A,pos_0_C,pos_0_G,pos_0_T,pos_1_A,pos_1_C,pos_1_G,pos_1_T,pos_2_A,...,pos_20_G,pos_20_T,pos_21_A,pos_21_C,pos_21_G,pos_21_T,pos_22_A,pos_22_C,pos_22_G,pos_22_T
0,0.164020,0,1,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
1,0.311196,1,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,1,0
2,0.179397,0,1,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
3,0.498712,0,1,0,0,0,0,0,1,0,...,0,1,0,0,1,0,0,0,1,0
4,0.233023,1,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16744,0.287346,0,1,0,0,1,0,0,0,1,...,0,1,0,0,1,0,0,0,1,0
16745,0.209103,0,1,0,0,0,0,0,1,1,...,0,0,0,0,1,0,0,0,1,0
16746,0.061508,0,0,0,1,0,0,1,0,1,...,0,0,0,0,1,0,0,0,1,0
16747,0.181219,1,0,0,0,0,0,0,1,0,...,0,1,0,0,1,0,0,0,1,0


In [39]:
#df1.info()
# Persist the encoded frame in two formats; both paths are relative,
# so the files are written to the current working directory.
# index=False leaves the row index out of the CSV file.
df1.to_csv("DeepCRISPR_CNN_CRNN_Crispr_preprocessed.csv", index=False)
# Pickle stores the DataFrame object itself; only load pickle files from trusted sources.
df1.to_pickle("DeepCRISPR_CNN_CRNN_Crispr_preprocessed.pkl")
# NOTE(review): the message names only the .pkl file although a .csv is written as well.
print("Preprocessed data saved to DeepCRISPR_CNN_CRNN_Crispr_preprocessed.pkl")

Preprocessed data saved to DeepCRISPR_CNN_CRNN_Crispr_preprocessed.pkl


## Data Manipulation for NNs (with consideration of epigenetics)

In [36]:
# Work on a copy so the one-hot column added further down does not appear on `df`.
df2 = df.copy()

def onehote(row: str) -> np.ndarray:
    """One-hot encode a nucleotide string.

    Each character of ``row`` becomes one row of the result, with a 1.0 in
    the column of its base (A, C, G, T -> columns 0, 1, 2, 3) and 0.0
    elsewhere, giving a float array of shape ``(len(row), 4)``.

    Raises ``KeyError`` for any character other than A, C, G or T.
    """
    column_of = {base: idx for idx, base in enumerate("ACGT")}
    identity = np.eye(4)

    picked = []
    for base in row:
        picked.append(column_of[base])

    # Selecting identity rows by column id yields the one-hot rows.
    return identity[picked]

# Cast to str first: onehote iterates over each value character by character.
df2["23-nt_sequence"] = df2["23-nt_sequence"].astype(str)
# Each cell of the new column holds the array returned by onehote for that row's sequence.
df2["23-nt_sequence_onehot"] = df2["23-nt_sequence"].apply(onehote) # type: ignore
df2

Unnamed: 0,chr,start,end,direction,23-nt_sequence,ctcf,dnase,h3k4me3,rrbs,normalized_efficacy,efficacy,23-nt_sequence_onehot
0,chr17,33469132,33469154,-,CTTGCTCGCGCAGGACGAGGCGG,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,0.164020,1,"[[0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [..."
1,chr4,184605984,184606006,-,ACATCAGGTTACCTCTACCAAGG,AAAAAAAAAAAAANNNNNNNNNN,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,NNNNNNNNNNNNNNNNNNNNNNN,0.311196,1,"[[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [..."
2,chr1,11736866,11736888,+,CTGATGCCAGCTAGTGGGCGAGG,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,NNNNNNNNNNNNNNNNNNNNNNN,0.179397,0,"[[0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [..."
3,chr4,25379067,25379089,+,CTGTTTCCCATCCTTCCGGGTGG,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNAANNNNN,0.498712,1,"[[0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [..."
4,chr12,57936737,57936759,-,AATGTATGCACAGGGAACAGAGG,AAAAAAAAAAAAAAANNNNNNNN,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,NNNNNNNNNNNNNNNNNNNNNNN,0.233023,1,"[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [..."
...,...,...,...,...,...,...,...,...,...,...,...,...
16744,chr19,54710145,54710167,+,CAACGCCCTGCTGCGGCGGCTGG,NNNNNNNNNNNNNNNNNNNNNNN,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,0.287346,1,"[[0.0, 1.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [..."
16745,chr19,54710220,54710242,-,CTAAGAAATCCTCTATCTTCAGG,NNNNNNNNNNNNNNNNNNNNNNN,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,0.209103,0,"[[0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [..."
16746,chr19,54710309,54710331,+,TGATCCGCCAGCGCCATATCAGG,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,0.061508,0,"[[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 0.0], [..."
16747,chr19,54710322,54710344,-,ATCCGAGGTGGTACCTGATATGG,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,0.181219,0,"[[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [..."
