In [1]:
%load_ext autoreload
%autoreload 2
import glob
import numpy as np
from sklearn.metrics import r2_score
from sklearn import preprocessing
import xgboost as xgb
from collections import defaultdict
from matplotlib import pyplot as plt
import pandas as pd
from itertools import product
import sys
# Extend the module search path so the project-local `data` module below can be imported.
sys.path.append('../project/genotype')
from data import GenotypeDataModule
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(42)

In [2]:
# Template for the dataset CSV paths; the `%s` placeholder is filled with the
# split name ("train", "valid", "test") when the data module is constructed.
path_pattern = "../datasets/genotype/cas9/cas9_pairs_10nm_%s.csv"

In [3]:
gene = "GACGCATAAAGATGAGACGCTGG"
# Enumerate all 16 ordered base pairs once (AA=0, AC=1, ..., TT=15) and derive
# the reverse lookup by inverting that enumeration.
int2pair = dict(enumerate(product('ACGT', repeat=2)))
pair2int = {pair: idx for idx, pair in int2pair.items()}

In [4]:
# Short alias for the path template, then one concrete CSV path per split.
pp = path_pattern
dm = GenotypeDataModule(paths=[pp % split for split in ("train", "valid", "test")])

In [5]:
# Run the data module's preparation and setup steps before its split tensors are read.
# NOTE(review): presumably these load the CSVs and populate dm.X_* / dm.y_* — confirm in data.GenotypeDataModule.
dm.prepare_data()
dm.setup()

In [6]:
# Convert each split's feature tensor to a NumPy array.
X_train, X_valid, X_test = (
    features.numpy() for features in (dm.X_train, dm.X_valid, dm.X_test)
)
# Convert the targets as well, keeping only the first column of each.
Y_train, Y_valid, Y_test = (
    targets.numpy()[:, 0] for targets in (dm.y_train, dm.y_valid, dm.y_test)
)

In [11]:
# Integer code per nucleotide, starting at 1 (0 is used by to_hamming for matching positions).
base2int = {base: code for code, base in enumerate('ACGT', start=1)}

In [7]:
def to_binary(X):
    """Collapse pair codes into a match/mismatch indicator.

    Entries of ``X`` equal to the code of an identical-base pair
    (AA, CC, GG, TT, looked up in ``pair2int``) become 0; every other
    entry becomes 1.  ``X`` itself is not modified: the result is a copy
    with the same shape and dtype.
    """
    # One boolean array per identical-base pair, OR-ed together.
    is_match = np.logical_or.reduce([X == pair2int[(base, base)] for base in 'ACGT'])
    binary = X.copy()
    binary[is_match] = 0
    binary[~is_match] = 1
    return binary

In [16]:
def to_string(X):
    """Encode every pair code by the integer code of the pair's second base.

    Each entry of ``X`` is looked up in ``int2pair``; the second element of
    that pair is then mapped through ``base2int``.  Returns a new NumPy array
    built from the per-row lists.
    """
    # Pair code -> integer code of the pair's second base.
    second_base_code = {code: base2int[pair[1]] for code, pair in int2pair.items()}
    encoded_rows = [[second_base_code[code] for code in row] for row in X]
    return np.asarray(encoded_rows)

In [32]:
def to_hamming(X):
    """Encode pair codes by their second base, with matching pairs set to 0.

    Every entry of ``X`` is mapped to the ``base2int`` code of the second base
    of its pair (via ``int2pair``).  Positions whose pair code is one of the
    identical-base pairs (AA, CC, GG, TT in ``pair2int``) are then overwritten
    with 0.  Returns a new NumPy array; ``X`` is left untouched.
    """
    # Pair code -> integer code of the pair's second base.
    second_base_code = {code: base2int[pair[1]] for code, pair in int2pair.items()}
    hamming = np.asarray([[second_base_code[code] for code in row] for row in X])
    # Positions holding an identical-base pair carry no mismatch, so zero them.
    is_match = np.logical_or.reduce([X == pair2int[(base, base)] for base in 'ACGT'])
    hamming[is_match] = 0
    return hamming

In [26]:
# Apply the second-base encoding to each of the three splits.
X_train_string, X_valid_string, X_test_string = map(to_string, (X_train, X_valid, X_test))

# XGBoost string

In [27]:
from sklearn.model_selection import PredefinedSplit, RandomizedSearchCV

In [28]:
# Candidate values for each XGBoost hyper-parameter sampled by the randomized search.
param_grid = dict(
    gamma=[0, 0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4, 12.8, 25.6, 51.2, 102.4, 200],
    learning_rate=[0.01, 0.03, 0.06, 0.1, 0.15, 0.2, 0.25, 0.300000012, 0.4, 0.5, 0.6, 0.7],
    max_depth=[5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
    n_estimators=[25, 50, 65, 80, 100, 115, 130, 150, 200, 400, 800],
    reg_alpha=[0, 0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4, 12.8, 25.6, 51.2, 102.4, 200],
    reg_lambda=[0, 0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4, 12.8, 25.6, 51.2, 102.4, 200],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
)

In [29]:
# Stack the train and validation splits so one search object can see both.
X_train_valid_string = np.concatenate((X_train_string, X_valid_string), axis=0)
Y_train_valid = np.concatenate((Y_train, Y_valid), axis=0)

# `ps` and `Y_train_valid` were used here without being defined anywhere in the
# notebook, so the cell failed on a fresh kernel.  Build the predefined split
# explicitly: -1 marks rows that are only ever used for fitting (the train
# split), 0 marks rows that form the single held-out fold (the validation split).
# NOTE(review): this assumes the intended split was "fit on train, score on valid" — confirm.
test_fold = np.concatenate((
    np.full(len(X_train_string), -1),
    np.zeros(len(X_valid_string), dtype=int),
))
ps = PredefinedSplit(test_fold)

# Randomized hyper-parameter search over `param_grid`, scored on the validation fold.
cv = RandomizedSearchCV(xgb.XGBRegressor(), param_grid, n_iter=1000, cv=ps)
cv.fit(X_train_valid_string, Y_train_valid)
print(cv.best_params_)

{'subsample': 0.8, 'reg_lambda': 200, 'reg_alpha': 0, 'n_estimators': 400, 'max_depth': 14, 'learning_rate': 0.4, 'gamma': 0.2, 'colsample_bytree': 0.6}


In [37]:
# Hyper-parameters are written out literally (they match the search output printed
# above), so this cell does not need the `cv` object; the equivalent call right
# after the string search would be xgb.XGBRegressor(**cv.best_params_).
est = xgb.XGBRegressor(
    subsample=0.8,
    reg_lambda=200,
    reg_alpha=0,
    n_estimators=400,
    max_depth=14,
    learning_rate=0.4,
    gamma=0.2,
    colsample_bytree=0.6,
)
est.fit(X_train_string, Y_train)

# Predictions on the training split (Y_pred2) and on the held-out test split (Y_pred).
Y_pred2 = est.predict(X_train_string)
Y_pred = est.predict(X_test_string)

# R^2 and relative L2 error ||y - y_hat|| / ||y|| for each split.
r2_train, r2 = r2_score(Y_train, Y_pred2), r2_score(Y_test, Y_pred)
rel_train = np.linalg.norm(Y_train - Y_pred2) / np.linalg.norm(Y_train)
rel = np.linalg.norm(Y_test - Y_pred) / np.linalg.norm(Y_test)

print('TRAIN  R2 score: %2.4f, relative error: %2.4f' % (r2_train, rel_train))
print('TEST   R2 score: %2.4f, relative error: %2.4f' % (r2, rel))

TRAIN  R2 score: 0.6220, relative error: 0.6147
TEST   R2 score: 0.4033, relative error: 0.7700


In [31]:
# String encoding (to_string features)
# {'subsample': 0.8, 'reg_lambda': 200, 'reg_alpha': 0, 'n_estimators': 400, 'max_depth': 14, 'learning_rate': 0.4, 'gamma': 0.2, 'colsample_bytree': 0.6}
# TRAIN  R2 score: 0.6220, relative error: 0.6147
# TEST   R2 score: 0.4033, relative error: 0.7700

In [34]:
# Apply the match-zeroed second-base encoding to each of the three splits.
X_train_hamming, X_valid_hamming, X_test_hamming = map(to_hamming, (X_train, X_valid, X_test))

In [42]:
# Stack the train and validation splits so one search object can see both.
X_train_valid_hamming = np.concatenate((X_train_hamming, X_valid_hamming), axis=0)
Y_train_valid = np.concatenate((Y_train, Y_valid), axis=0)

# `ps` and `Y_train_valid` were used here without being defined anywhere in the
# notebook, so the cell failed on a fresh kernel.  Build the predefined split
# explicitly: -1 marks rows that are only ever used for fitting (the train
# split), 0 marks rows that form the single held-out fold (the validation split).
# NOTE(review): this assumes the intended split was "fit on train, score on valid" — confirm.
test_fold = np.concatenate((
    np.full(len(X_train_hamming), -1),
    np.zeros(len(X_valid_hamming), dtype=int),
))
ps = PredefinedSplit(test_fold)

# Randomized hyper-parameter search over `param_grid`, scored on the validation fold.
cv = RandomizedSearchCV(xgb.XGBRegressor(), param_grid, n_iter=1000, cv=ps)
cv.fit(X_train_valid_hamming, Y_train_valid)
print(cv.best_params_)

{'subsample': 0.8, 'reg_lambda': 12.8, 'reg_alpha': 0.2, 'n_estimators': 400, 'max_depth': 14, 'learning_rate': 0.300000012, 'gamma': 1.6, 'colsample_bytree': 1.0}


In [43]:
# Refit on the training split only, using the best hyper-parameters found by the search in `cv`.
est = xgb.XGBRegressor(**cv.best_params_)
est.fit(X_train_hamming, Y_train)

# Predictions on the training split (Y_pred2) and on the held-out test split (Y_pred).
Y_pred2 = est.predict(X_train_hamming)
Y_pred = est.predict(X_test_hamming)

# R^2 and relative L2 error ||y - y_hat|| / ||y|| for each split.
r2_train, r2 = r2_score(Y_train, Y_pred2), r2_score(Y_test, Y_pred)
rel_train = np.linalg.norm(Y_train - Y_pred2) / np.linalg.norm(Y_train)
rel = np.linalg.norm(Y_test - Y_pred) / np.linalg.norm(Y_test)

print('TRAIN  R2 score: %2.4f, relative error: %2.4f' % (r2_train, rel_train))
print('TEST   R2 score: %2.4f, relative error: %2.4f' % (r2, rel))

TRAIN  R2 score: 0.7139, relative error: 0.5348
TEST   R2 score: 0.5050, relative error: 0.7013


In [44]:
# Hamming
# {'subsample': 0.8, 'reg_lambda': 12.8, 'reg_alpha': 0.2, 'n_estimators': 400, 'max_depth': 14, 'learning_rate': 0.300000012, 'gamma': 1.6, 'colsample_bytree': 1.0}
# TRAIN  R2 score: 0.7139, relative error: 0.5348
# TEST   R2 score: 0.5050, relative error: 0.7013