In [1]:
import numpy as np
import pandas as pd

In [2]:
import tensorflow as tf
import tensorflow.keras as k
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Input, Conv2D, MaxPool2D, Flatten, Reshape

In [3]:
import silence_tensorflow

In [4]:
from keras_bed_sequence import BedSequence
from keras_mixed_sequence import MixedSequence

# Data

In [5]:
df = pd.read_csv("./mendelian_snv.csv.gz")

In [6]:
chromosomes = df.chrom.unique()

In [7]:
train_bed = df.head(int(1e5))

In [8]:
X = BedSequence(
    assembly="hg19",
    bed=train_bed,
    batch_size=128
)

HBox(children=(IntProgress(value=0, description='Loading chromosomes for genome hg19', layout=Layout(flex='2')…



HBox(children=(IntProgress(value=0, description='Rendering sequences in hg19', layout=Layout(flex='2'), max=11…



HBox(children=(IntProgress(value=0, description='Converting nucleotides to numeric classes', layout=Layout(fle…



In [9]:
y = train_bed.labels.values

In [10]:
X[0].shape

(128, 500, 4)

In [11]:
y.shape

(100000,)

In [12]:
mixed = MixedSequence(X, y, 128)

In [13]:
mixed.on_epoch_end()

# Model

In [14]:
def build_model(x, filters, kernels, pools, dense):
    # Convolutional part
    for  _filter, _kernel, _pool in zip(filters, kernels, pools):
        x = Conv2D(_filter, _kernel, activation="relu", padding="same")(x)
        x = MaxPool2D(_pool)(x)

    x = Flatten()(x)
    
    # Dense part
    for _dense in dense:
        x = Dense(_dense, activation="relu")(x)
        
    x = Dense(1, activation="sigmoid")(x)
    return x

In [15]:
filters = [128, 64, 64, 32]
kernels = [(9, 4), (6, 2), (3, 1), (3, 1)]
pools   = [(3, 1), (2, 1), (2, 2), (2, 2)]
dense   = [256, 64, 32]

In [16]:
i = Input(
    shape=(500, 4)
)
x = Reshape((500, 4, 1))(i)
classifier = Model(
    inputs=i,
    outputs=build_model(
        x,
        filters,
        kernels,
        pools,
        dense
    )
)
classifier.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 500, 4)]          0         
_________________________________________________________________
reshape (Reshape)            (None, 500, 4, 1)         0         
_________________________________________________________________
conv2d (Conv2D)              (None, 500, 4, 128)       4736      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 166, 4, 128)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 166, 4, 64)        98368     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 83, 4, 64)         0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 83, 4, 64)         12352 

In [23]:
def balanced_binary_crossentropy(weight=1):
    def loss(y_true, y_pred):
        l1 = k.backend.categorical_crossentropy(y_true[y_true==1], y_pred[y_true==1])
        l0 = k.backend.categorical_crossentropy(y_true[y_true==0], y_pred[y_true==0])
        return l0 + weight * l1
    return loss

In [24]:
classifier.compile(
    optimizer="nadam",
    loss=balanced_binary_crossentropy(2)
)

In [25]:
histoty = classifier.fit_generator(
    generator=mixed,
    steps_per_epoch=mixed.steps_per_epoch // 5,
    epochs=100,
    verbose=1,
    use_multiprocessing=False,
    shuffle=True
).history

Train for 156 steps
Epoch 1/100
  1/156 [..............................] - ETA: 10s

TypeError: Input 'y' of 'Mul' Op has type float32 that does not match type int64 of argument 'x'.