<a href="https://colab.research.google.com/github/thewildchip/crispr-genie-lab/blob/main/off_target/00_data_loading.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0) Data Loading for the Off-Target Model

In [3]:
from pathlib import Path
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
#from mypackage.utils import  *
#from mypackage.config import *
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score, classification_report

In [5]:
def is_valid_sequence(seq, length=23):
    """Return True if *seq* has exactly *length* items, all of them A, C, G or T.

    Matching is case-sensitive: lowercase bases and ambiguity codes such as
    ``"N"`` are rejected.

    Args:
        seq: Candidate sequence, normally a string.
        length: Required number of bases.

    Returns:
        ``True`` when *seq* has the required length and contains only the
        four uppercase bases; ``False`` otherwise, including when *seq* has
        no length at all (``None``, NaN floats, plain numbers).
    """
    valid_bases = ("A", "C", "G", "T")
    try:
        if len(seq) != length:
            return False
    except TypeError:
        # Values without a length (e.g. a NaN left in a column) are simply
        # invalid; raising here would abort a whole-column ``.apply``.
        return False
    return all(base in valid_bases for base in seq)

def onehot_encode_sequences(df: pd.DataFrame, seq_col="23-nt_sequence"):
    """Replace a column of equal-length sequences with one-hot base columns.

    Each position ``p`` of the sequence becomes four integer columns named
    ``pos_{p}_A``, ``pos_{p}_C``, ``pos_{p}_G`` and ``pos_{p}_T``. A character
    that is not one of those four bases yields all zeros at that position.

    Args:
        df: Input frame; it is not modified.
        seq_col: Name of the column holding the sequences.

    Returns:
        A new DataFrame with ``seq_col`` removed, the remaining columns of
        ``df`` first and the one-hot columns appended, carrying ``df``'s
        index. For an empty ``df`` only the remaining columns are returned,
        because the sequence length cannot be determined.

    Raises:
        ValueError: If the sequences do not all have the same length.
    """
    bases = ["A", "C", "G", "T"]
    sequences = df[seq_col].values
    n_sequences = len(sequences)
    remaining = df.drop(columns=[seq_col])

    # No rows means no first sequence to take the length from.
    if n_sequences == 0:
        return remaining

    seq_length = len(sequences[0])
    if any(len(seq) != seq_length for seq in sequences):
        raise ValueError(
            f"All sequences in column {seq_col!r} must have the same length "
            f"({seq_length}, taken from the first row)."
        )

    # Character matrix of shape (n_sequences, seq_length), built once.
    chars = np.array([list(seq) for seq in sequences], dtype=str)

    # Broadcasting against the base list gives (n_sequences, seq_length, 4).
    onehot = (chars[:, :, np.newaxis] == np.array(bases)).astype(int)

    # Flatten to columns; position varies slowest, base fastest.
    col_names = [f"pos_{pos}_{base}" for pos in range(seq_length) for base in bases]
    onehot_flat = onehot.reshape(n_sequences, seq_length * 4)

    # Align rows by position rather than by index label: merging on an index
    # with duplicate labels is many-to-many and would multiply rows.
    onehot_df = pd.DataFrame(onehot_flat, columns=col_names)
    result = remaining.reset_index(drop=True).merge(
        onehot_df, left_index=True, right_index=True
    )
    result.index = df.index
    return result

In [None]:
# Load the raw off-target table, one-hot encode both sequences, and fit a
# shallow decision tree as a baseline classifier for "label".

# NOTE(review): DATA_RAW is not defined in the visible part of this notebook;
# presumably it comes from the commented-out `mypackage.config` import.
# Confirm it is in scope before running this cell.
off_target_data = DATA_RAW/"off_target.csv"
df = pd.read_csv(off_target_data)
df = df.dropna()

# Keep only rows where both sequences pass is_valid_sequence
# (23 characters, uppercase A/C/G/T only).
df = df[
    df["sgRNA_seq"].apply(is_valid_sequence) &
    df["off_seq"].apply(is_valid_sequence)
]

original_cols = set(df.columns)

# Both encodings emit the same "pos_*" names, so tag each set with the
# sequence it came from; otherwise the concatenated matrix would contain
# pairs of identically named columns.
X_sq = onehot_encode_sequences(df, seq_col="sgRNA_seq")
X_sq = X_sq.drop(columns=["off_seq", "label"])
X_sq = X_sq.rename(columns=lambda c: c if c in original_cols else f"sgRNA_{c}")

X_off = onehot_encode_sequences(df, seq_col="off_seq")
X_off = X_off.drop(columns=["sgRNA_seq", "label"])
X_off = X_off.rename(columns=lambda c: c if c in original_cols else f"off_{c}")

# The non-sequence columns are present in both frames; keep a single copy.
# NOTE(review): every remaining non-sequence column (e.g. "Read") is used as
# a feature. If any of them is derived from "label", this leaks the target.
# Confirm against the dataset description.
shared_cols = [c for c in X_off.columns if c in original_cols]
X = pd.concat([X_sq, X_off.drop(columns=shared_cols)], axis=1)
y = df["label"]

train_X, X_test, train_y, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

clf = DecisionTreeClassifier(
    max_depth=6,
    min_samples_leaf=20,
    random_state=42
)

clf.fit(train_X, train_y)
y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

