# 0) Data Loading for Efficacy Model

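This notebook loads the CRISPR gRNA efficacy dataset, performs basic inspection, and derives and exports the processed feature tables (v1.0–v1.2 without epigenetic features; v2 with them).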


## v0: Basic Data Manipulation

In [None]:
#IMPORTS

from pathlib import Path
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from mypackage.config import *
from mypackage.utils import *

In [2]:
# LOAD THE RAW DATASET (CSV file from GitHub)

# DATA_RAW is not defined in this notebook; presumably it arrives through one of
# the mypackage star-imports -- confirm.
efficacy_data = DATA_RAW / "efficacy.csv"
df = pd.read_csv(filepath_or_buffer=efficacy_data)

# Show the loaded frame as the cell output.
df

Unnamed: 0,chr,start,end,direction,23-nt sequence,ctcf,dnase,h3k4me3,rrbs,Normalized efficacy,Efficacy
0,chr17,33469132,33469154,-,CTTGCTCGCGCAGGACGAGGCGG,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,0.164020,1
1,chr4,184605984,184606006,-,ACATCAGGTTACCTCTACCAAGG,AAAAAAAAAAAAANNNNNNNNNN,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,NNNNNNNNNNNNNNNNNNNNNNN,0.311196,1
2,chr1,11736866,11736888,+,CTGATGCCAGCTAGTGGGCGAGG,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,NNNNNNNNNNNNNNNNNNNNNNN,0.179397,0
3,chr4,25379067,25379089,+,CTGTTTCCCATCCTTCCGGGTGG,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNAANNNNN,0.498712,1
4,chr12,57936737,57936759,-,AATGTATGCACAGGGAACAGAGG,AAAAAAAAAAAAAAANNNNNNNN,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,NNNNNNNNNNNNNNNNNNNNNNN,0.233023,1
...,...,...,...,...,...,...,...,...,...,...,...
16744,chr19,54710145,54710167,+,CAACGCCCTGCTGCGGCGGCTGG,NNNNNNNNNNNNNNNNNNNNNNN,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,0.287346,1
16745,chr19,54710220,54710242,-,CTAAGAAATCCTCTATCTTCAGG,NNNNNNNNNNNNNNNNNNNNNNN,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,0.209103,0
16746,chr19,54710309,54710331,+,TGATCCGCCAGCGCCATATCAGG,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,0.061508,0
16747,chr19,54710322,54710344,-,ATCCGAGGTGGTACCTGATATGG,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,0.181219,0


In [3]:
# BASIC EXPLORATION

# df.head()
# df.info()

# df.size # 184239: total number of elements
# df.shape # (16749, 11): (rows, columns); 16749 * 11 = 184239

# df.columns # ['chr', 'start', 'end', 'direction', '23-nt sequence', 'ctcf', 'dnase',
             #  'h3k4me3', 'rrbs', 'Normalized efficacy', 'Efficacy']

# df.describe() # information about efficiency scores

In [4]:
# CLEANING COLUMN NAMES
# Trim surrounding whitespace and lowercase first, then turn the remaining
# spaces into underscores (literal replacement, not a regex).
trimmed_lowercase = df.columns.str.strip().str.lower()
df.columns = trimmed_lowercase.str.replace(" ", "_", regex=False)
df.columns

Index(['chr', 'start', 'end', 'direction', '23-nt_sequence', 'ctcf', 'dnase',
       'h3k4me3', 'rrbs', 'normalized_efficacy', 'efficacy'],
      dtype='object')

In [5]:
# Distinct sequence lengths in the dataset; a single value of 23 means every
# guide sequence has the same length.
sequence_lengths = df["23-nt_sequence"].str.len()
sequence_lengths.unique()

array([23])

In [6]:
# ARE THERE ANY INCONSISTENCIES IN THE EFFICACY SCORES?
related_cols = ["normalized_efficacy", "efficacy"]

# Rows labelled effective (efficacy == 1) although their normalized score is below 0.5.
# Such rows exist, so the cut-off behind the binary label must lie below 0.5.
labelled_effective_below_half = (df["normalized_efficacy"] < 0.5) & (df["efficacy"] == 1)
df.loc[labelled_effective_below_half, related_cols]

Unnamed: 0,normalized_efficacy,efficacy
0,0.164020,1
1,0.311196,1
3,0.498712,1
4,0.233023,1
5,0.250399,1
...,...,...
16714,0.244959,1
16717,0.234877,1
16738,0.231043,1
16741,0.319152,1


In [7]:
# Smallest normalized_efficacy among the rows labelled efficacy == 1.
effective_rows = df.loc[df["efficacy"] == 1]
mn = effective_rows["normalized_efficacy"].min()

# Rows labelled efficacy == 1 whose normalized_efficacy lies below that minimum.
# NOTE(review): `mn` is the minimum over exactly this subset, so the selection is
# empty by construction and cannot reveal anything about the labelling. If the aim
# is to test for a single cut-off, the informative query is probably the
# efficacy == 0 rows with normalized_efficacy >= mn -- confirm the intent.
df.loc[(df["efficacy"] == 1) & (df["normalized_efficacy"] < mn), related_cols]

Unnamed: 0,normalized_efficacy,efficacy


## v1: Data Manipulation for Dataset without consideration of epigenetics

### v1.0

In [8]:
# v1.0 keeps only the sequence and the continuous target: the genomic
# coordinates, the binary label and the four epigenetic tracks are removed.
columns_to_drop = ["chr", "start", "end", "direction", "efficacy", "ctcf", "dnase", "h3k4me3", "rrbs"]
df1_0 = df.drop(columns=columns_to_drop)  # drop() returns a new frame; df itself is untouched

# Fixed-seed 5-row sample for quick experiments.
df1_0_small = df1_0.sample(n=5, random_state=42).copy()
df1_0_small

Unnamed: 0,23-nt_sequence,normalized_efficacy
3314,CTTCTTGAACCCACGCAAGGTGG,0.214443
3623,AGGCGGAATCGCTCACCCGACGG,0.457338
11529,GACCAACACTCCAGCTGAGCTGG,0.144906
10389,CCCAGCAATCGCCAATCCCAGGG,0.540867
15057,CCGGACTCAGGACTCCGAACTGG,0.51267


In [9]:
# def add_onehot_columns(row) -> None:
#     sequence = str(row["23-nt_sequence"])
#     for pos, nucleotide in enumerate(sequence):
#         for base in ["A", "C", "G", "T"]:
#             row[f"pos_{pos}_{base}"] = 1 if nucleotide == base else 0

# df1_small.apply(add_onehot_columns, axis=1)
# df1_small
# Fraction of sequences that contain the substring "TTT".
contains_ttt = df1_0["23-nt_sequence"].str.contains("TTT")
len(df1_0[contains_ttt]) / len(df1_0)

0.11582781061555913

In [10]:

# Apply the one-hot encoder to the 5-row sample.
# NOTE(review): onehot_encode_sequences is not defined in this notebook; presumably
# it comes from the mypackage.utils star-import -- confirm. The result overwrites
# df1_0_small, so re-running this cell encodes the already-encoded frame.
df1_0_small = onehot_encode_sequences(df1_0_small)
#df1_0_small

In [11]:
# Apply the same encoder to the full v1.0 frame; the result replaces df1_0.
# NOTE(review): overwriting df1_0 means this cell is not safe to re-run on its
# own without first re-running the cell that builds df1_0.
df1_0 = onehot_encode_sequences(df1_0)
#df1_0

In [12]:
# EXPORT DATA
# The helper reports writing v1.0.csv and v1.0.pkl under the processed-efficacy
# directory (see its printed message below).
# NOTE(review): export_as_csv_and_pkl and DATA_PROCESSED_EFFICACY are presumably
# provided by the mypackage star-imports -- confirm.
export_as_csv_and_pkl(df1_0, "v1.0", DATA_PROCESSED_EFFICACY)

Data saved to /Users/thewildchip/Desktop/Coding/crispr-genie-lab-jf/data/processed/efficacy/v1.0.csv and /Users/thewildchip/Desktop/Coding/crispr-genie-lab-jf/data/processed/efficacy/v1.0.pkl


### v1.1

In [13]:
# Start v1.1 from a copy of the encoded v1.0 frame and attach the raw sequence
# column from df again, aligned on the row index.
df1_1 = pd.merge(
    df1_0.copy(),
    df[["23-nt_sequence"]],
    left_index=True,
    right_index=True,
)
#df1_1

In [14]:
def count_nucleotide(row: pd.DataFrame, nucleotide: str, length: float = 23.0) -> pd.Series:
    """Count occurrences of ``nucleotide`` in every sequence, divided by ``length``.

    Despite its name, ``row`` is a whole DataFrame: the ``.str`` accessor is used on
    its ``"23-nt_sequence"`` column, so the result has one value per row.

    Args:
        row: DataFrame with a string column named ``"23-nt_sequence"``.
        nucleotide: Pattern to count. ``Series.str.count`` treats it as a regular
            expression, so pass plain letters such as ``"A"``.
        length: Divisor applied to the raw count. The default 23.0 yields the
            fraction of a 23-nt sequence; pass 1 to get raw counts.

    Returns:
        Series aligned with ``row``'s index holding ``count / length``.
    """
    return row["23-nt_sequence"].str.count(nucleotide) / length

def apply_nucleotide_content(df1, df2):
    """Add one ``pct_<base>`` column to ``df1`` for every entry of ``BASES``.

    The values are computed from ``df2`` via ``count_nucleotide`` with its default
    divisor and assigned to ``df1`` in place (pandas aligns them on the index).
    Nothing is returned.
    """
    for base in BASES:
        column_name = f"pct_{base}"
        df1[column_name] = count_nucleotide(df2, base)

# Add the pct_<base> columns to df1_1 in place; the sequences are read from the
# cleaned raw frame df, and the values are matched to df1_1 by row index.
apply_nucleotide_content(df1_1, df)
# df1_1

In [15]:
# GC content = (number of G + number of C) / 23.
# Passing length=1 makes count_nucleotide return the raw per-sequence counts.
df1_1["gc_content"] = (
    count_nucleotide(df1_1, "G", 1)
    .add(count_nucleotide(df1_1, "C", 1))
    .div(23.0)
)

In [16]:
# Remove the helper sequence column again now that the derived features exist.
# NOTE(review): drop_seq is not defined in this notebook; its return value is
# ignored here, so it presumably modifies df1_1 in place -- confirm.
drop_seq(df1_1)
df1_1

Unnamed: 0,normalized_efficacy,pos_0_A,pos_0_C,pos_0_G,pos_0_T,pos_1_A,pos_1_C,pos_1_G,pos_1_T,pos_2_A,...,pos_21_T,pos_22_A,pos_22_C,pos_22_G,pos_22_T,pct_A,pct_C,pct_G,pct_T,gc_content
0,0.164020,0,1,0,0,0,0,0,1,0,...,0,0,0,1,0,0.130435,0.304348,0.434783,0.130435,0.739130
1,0.311196,1,0,0,0,0,1,0,0,1,...,0,0,0,1,0,0.304348,0.304348,0.173913,0.217391,0.478261
2,0.179397,0,1,0,0,0,0,0,1,0,...,0,0,0,1,0,0.173913,0.217391,0.434783,0.173913,0.652174
3,0.498712,0,1,0,0,0,0,0,1,0,...,0,0,0,1,0,0.043478,0.347826,0.260870,0.347826,0.608696
4,0.233023,1,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0.391304,0.130435,0.347826,0.130435,0.478261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16744,0.287346,0,1,0,0,1,0,0,0,1,...,0,0,0,1,0,0.086957,0.391304,0.391304,0.130435,0.782609
16745,0.209103,0,1,0,0,0,0,0,1,1,...,0,0,0,1,0,0.304348,0.260870,0.130435,0.304348,0.391304
16746,0.061508,0,0,0,1,0,0,1,0,1,...,0,0,0,1,0,0.217391,0.347826,0.260870,0.173913,0.608696
16747,0.181219,1,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0.217391,0.173913,0.347826,0.260870,0.521739


In [17]:
# Export the v1.1 feature table; the helper reports writing v1.1.csv and v1.1.pkl
# (see its printed message below).
export_as_csv_and_pkl(df1_1, "v1.1", DATA_PROCESSED_EFFICACY)

Data saved to /Users/thewildchip/Desktop/Coding/crispr-genie-lab-jf/data/processed/efficacy/v1.1.csv and /Users/thewildchip/Desktop/Coding/crispr-genie-lab-jf/data/processed/efficacy/v1.1.pkl


### v1.2

In [18]:
# v1.2 extends v1.1: copy it and attach the raw sequence column from df once more
# (index-aligned) so the poly-T flag can be derived from it.
df1_2 = pd.merge(
    left=df1_1.copy(),
    right=df[["23-nt_sequence"]],
    left_index=True,
    right_index=True,
)

In [19]:
def add_poly_t_determination(df):
    """Flag sequences that contain the run ``"TTTT"``.

    A ``poly_T`` column is written into ``df`` itself: -1 where the
    ``"23-nt_sequence"`` value contains ``"TTTT"``, 0 otherwise.

    Returns:
        The same DataFrame object that was passed in, now carrying ``poly_T``.
    """
    has_poly_t = df["23-nt_sequence"].str.contains("TTTT")
    # Cast the boolean mask to 0/1 and negate it so that a hit is encoded as -1.
    df["poly_T"] = -has_poly_t.astype(int)
    return df

df1_2 = add_poly_t_determination(df1_2)

# Summary statistics of the target, first for sequences with a TTTT run
# (poly_T == -1), then for those without one (poly_T == 0).
for poly_t_flag in (-1, 0):
    print(df1_2.loc[df1_2["poly_T"] == poly_t_flag, "normalized_efficacy"].describe())

count    9.000000
mean     0.338181
std      0.150255
min      0.094516
25%      0.234846
50%      0.421583
75%      0.454260
max      0.469433
Name: normalized_efficacy, dtype: float64
count    16740.000000
mean         0.250446
std          0.164877
min          0.000000
25%          0.129401
50%          0.209588
75%          0.344391
max          1.000000
Name: normalized_efficacy, dtype: float64


In [20]:
# Remove the helper sequence column, then export the v1.2 feature table.
# NOTE(review): drop_seq's return value is ignored, so it presumably modifies
# df1_2 in place -- confirm. The helper below reports writing v1.2.csv and v1.2.pkl.
drop_seq(df1_2)
export_as_csv_and_pkl(df1_2, "v1.2", DATA_PROCESSED_EFFICACY)

Data saved to /Users/thewildchip/Desktop/Coding/crispr-genie-lab-jf/data/processed/efficacy/v1.2.csv and /Users/thewildchip/Desktop/Coding/crispr-genie-lab-jf/data/processed/efficacy/v1.2.pkl


## v2: Data Manipulation for NNs (with consideration of epigenetics)

In [21]:
# v2 starts from an independent (deep) copy of the cleaned raw frame, so the
# epigenetic columns stay available and df itself is not modified.
df2 = df.copy(deep=True)

def onehote(row: str) -> np.ndarray:
    """One-hot encode a nucleotide string.

    Args:
        row: Sequence made up of the characters A, C, G and T.

    Returns:
        Float array of shape ``(len(row), 4)``; the columns follow the order
        A, C, G, T and each row has a single 1.0 at the column of its base.

    Raises:
        KeyError: If ``row`` contains any character other than A, C, G or T.
    """
    base_to_column = dict(zip("ACGT", range(4)))
    column_indices = np.array([base_to_column[base] for base in row], dtype=np.intp)
    # Selecting rows of the 4x4 identity matrix yields the one-hot vectors.
    identity = np.eye(4)
    return identity[column_indices]

# Cast the sequences to str once, store the cast column back, and derive the
# one-hot matrix of every sequence from it (one array per row).
df2_sequences = df2["23-nt_sequence"].astype(str)
df2["23-nt_sequence"] = df2_sequences
df2["23-nt_sequence_onehot"] = df2_sequences.apply(onehote) # type: ignore
df2

Unnamed: 0,chr,start,end,direction,23-nt_sequence,ctcf,dnase,h3k4me3,rrbs,normalized_efficacy,efficacy,23-nt_sequence_onehot
0,chr17,33469132,33469154,-,CTTGCTCGCGCAGGACGAGGCGG,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,0.164020,1,"[[0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [..."
1,chr4,184605984,184606006,-,ACATCAGGTTACCTCTACCAAGG,AAAAAAAAAAAAANNNNNNNNNN,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,NNNNNNNNNNNNNNNNNNNNNNN,0.311196,1,"[[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [..."
2,chr1,11736866,11736888,+,CTGATGCCAGCTAGTGGGCGAGG,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,NNNNNNNNNNNNNNNNNNNNNNN,0.179397,0,"[[0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [..."
3,chr4,25379067,25379089,+,CTGTTTCCCATCCTTCCGGGTGG,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNAANNNNN,0.498712,1,"[[0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [..."
4,chr12,57936737,57936759,-,AATGTATGCACAGGGAACAGAGG,AAAAAAAAAAAAAAANNNNNNNN,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,NNNNNNNNNNNNNNNNNNNNNNN,0.233023,1,"[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [..."
...,...,...,...,...,...,...,...,...,...,...,...,...
16744,chr19,54710145,54710167,+,CAACGCCCTGCTGCGGCGGCTGG,NNNNNNNNNNNNNNNNNNNNNNN,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,0.287346,1,"[[0.0, 1.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [..."
16745,chr19,54710220,54710242,-,CTAAGAAATCCTCTATCTTCAGG,NNNNNNNNNNNNNNNNNNNNNNN,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,0.209103,0,"[[0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [..."
16746,chr19,54710309,54710331,+,TGATCCGCCAGCGCCATATCAGG,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,0.061508,0,"[[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 0.0], [..."
16747,chr19,54710322,54710344,-,ATCCGAGGTGGTACCTGATATGG,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAA,NNNNNNNNNNNNNNNNNNNNNNN,0.181219,0,"[[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [..."
