<a href="https://colab.research.google.com/github/thewildchip/crispr-genie-lab/blob/main/efficiency/0_data_loading.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0 Data Loading for Efficacy Model

This notebook loads the CRISPR gRNA dataset and performs basic inspection.


## Basic Data Manipulation

In [7]:
# IMPORTS
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [8]:
# LOAD THE DATASET
# The source is a CSV file hosted on GitHub; pandas reads it straight from the URL.

FILE_PATH = "https://raw.githubusercontent.com/VKonstantakos/CRISPR-Deep-Learning/refs/heads/main/Data/2.%20Training%20datasets/1.%20DeepCRISPR-CNN-CRNNCrispr.csv"

df = pd.read_csv(filepath_or_buffer=FILE_PATH)

In [9]:
# BASIC EXPLORATION

# df.head()
# df.info()

# df.size # 184239: total number of elements
# df.shape # (16749, 11): (rows, columns)

# df.columns # ['chr', 'start', 'end', 'direction', '23-nt sequence', 'ctcf', 'dnase',
             #  'h3k4me3', 'rrbs', 'Normalized efficacy', 'Efficacy']

# df.describe() # information about efficiency scores

In [10]:
# Distinct string lengths found in the sequence column (expected: only 23)
(
    df["23-nt sequence"]
    .str.len()
    .unique()
)

array([23])

In [11]:
# NORMALISE COLUMN NAMES
# Trim surrounding whitespace, lowercase everything, and turn spaces into underscores.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
)
df.columns

Index(['chr', 'start', 'end', 'direction', '23-nt_sequence', 'ctcf', 'dnase',
       'h3k4me3', 'rrbs', 'normalized_efficacy', 'efficacy'],
      dtype='object')

In [12]:
def onehote(row: str) -> np.ndarray:
    """One-hot encode a nucleotide sequence.

    Each base is mapped to a column index (A=0, C=1, G=2, T=3) and the
    matching rows of a 4x4 identity matrix are returned, one row per base.
    Lookup is case-insensitive, so "acgt" encodes the same as "ACGT".

    Args:
        row: Sequence made up of the characters A, C, G and T (any case).

    Returns:
        Float array of shape ``(len(row), 4)`` with exactly one 1.0 per row.
        An empty sequence yields an array of shape ``(0, 4)``.

    Raises:
        KeyError: If the sequence contains a character other than A, C, G
            or T (for example the ambiguity code N).
    """
    mapping = {"A": 0, "C": 1, "G": 2, "T": 3}
    try:
        indices = [mapping[base.upper()] for base in row]
    except KeyError as err:
        # Same exception type as a plain dict lookup, but say which base and
        # which sequence were at fault.
        raise KeyError(
            f"unsupported nucleotide {err.args[0]!r} in sequence {row!r}; "
            "expected only A, C, G or T"
        ) from None
    # An explicit integer dtype keeps the fancy index valid for empty input.
    return np.eye(len(mapping))[np.asarray(indices, dtype=np.intp)]

# Cast to str first so every entry can be walked character by character,
# then store the encoded matrix for each sequence in a new column.
sequences = df["23-nt_sequence"].astype(str)
df["23-nt_sequence"] = sequences
df["23-nt_sequence_onehot"] = sequences.apply(onehote)  # type: ignore
df["23-nt_sequence_onehot"]


0        [[0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [...
1        [[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [...
2        [[0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [...
3        [[0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [...
4        [[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [...
                               ...                        
16744    [[0.0, 1.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [...
16745    [[0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [...
16746    [[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 0.0], [...
16747    [[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [...
16748    [[0.0, 0.0, 0.0, 1.0], [0.0, 1.0, 0.0, 0.0], [...
Name: 23-nt_sequence_onehot, Length: 16749, dtype: object

## Data Manipulation for the Dataset Without Epigenetic Features

In [15]:
# Build the sequence-only frame: everything except `normalized_efficacy` and
# the one-hot encoded sequence is removed (coordinates, raw sequence, the
# `efficacy` column, and the ctcf / dnase / h3k4me3 / rrbs columns).
columns_to_drop = ["chr", "start", "end", "direction", "23-nt_sequence", "efficacy", "ctcf", "dnase", "h3k4me3", "rrbs"]
# drop() returns a new DataFrame and leaves `df` untouched, so there is no
# need to copy every column first and then mutate the copy in place.
df1 = df.drop(columns=columns_to_drop)
df1

Unnamed: 0,normalized_efficacy,23-nt_sequence_onehot
0,0.164020,"[[0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [..."
1,0.311196,"[[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [..."
2,0.179397,"[[0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [..."
3,0.498712,"[[0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [..."
4,0.233023,"[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [..."
...,...,...
16744,0.287346,"[[0.0, 1.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [..."
16745,0.209103,"[[0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [..."
16746,0.061508,"[[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 0.0], [..."
16747,0.181219,"[[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0], [..."
