# Experiments with ConvNet Module for CpGNet

In [2]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Conv2D
from keras.layers import Embedding, GlobalAveragePooling1D, MaxPooling1D, MaxPooling2D,Flatten,Input,LeakyReLU
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Model

from CpG_Net import CpGNet
from CpG_Bin import Bin
import numpy as np
import cPickle as pickle
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from random import shuffle
from sklearn.metrics import roc_curve, auc

import random
from sklearn.metrics import roc_curve, auc
%load_ext autoreload
%autoreload 2
%matplotlib inline

ValueError: Attempted relative import in non-package

In [None]:
# Number of CpG sites (matrix columns) per bin; used below to select bins
# and to size the network input.
CPG_DENSITY = 6

In [None]:
# Load the pickled bins. The context manager guarantees the file handle is
# closed even if unpickling fails.
# NOTE(review): pickle can execute arbitrary code while loading -- only use
# this on a trusted "HAMbins.p".
with open("HAMbins.p", "rb") as bins_file:
    data = pickle.load(bins_file)

In [None]:
# Project helper object; its methods are used below for read filtering, mask
# extraction and feature collection.
net = CpGNet(CPG_DENSITY)

Create complete bins

In [None]:
# Minimum number of reads (matrix rows) a bin needs to be kept.
min_read_depth = 20


def _deep_enough(candidate):
    """Return True when the bin's matrix has at least `min_read_depth` rows."""
    return candidate.matrix.shape[0] >= min_read_depth


# Preliminary depth filter, applied first to speed up the later steps.
read_filtered_data = [bin_ for bin_ in data if _deep_enough(bin_)]
# Keep only bins with exactly CPG_DENSITY columns.
cpg_bins = [bin_ for bin_ in read_filtered_data if bin_.matrix.shape[1] == CPG_DENSITY]
shuffle(cpg_bins)  # in-place shuffle; NOTE(review): no seed is set, so the order differs per run
# Filters out missing data (per the original comment -- behaviour lives in CpGNet).
cpg_bins_complete = net.filter_bad_reads(cpg_bins)
# Secondary depth filter on the result of filter_bad_reads
# (presumably because it can remove reads -- TODO confirm).
cpg_bins_complete_depth = [bin_ for bin_ in cpg_bins_complete if _deep_enough(bin_)]



Create masks

In [None]:
# Masks are extracted from the bins *before* bad-read filtering (cpg_bins, not
# cpg_bins_complete). The result is looked up by matrix shape further down.
masks = net.extract_masks(cpg_bins)

In [None]:
# Apply a randomly chosen mask to every complete bin, producing an
# (observed, truth, mask) triple stored on the bin as `tag2`.
# The loop variable is `bin_` rather than `Bin` so that it does not rebind the
# `Bin` class imported from CpG_Bin at the top of the notebook.
ready_bins = []
for bin_ in cpg_bins_complete_depth:
    truth_matrix = bin_.matrix
    m_shape = truth_matrix.shape
    # Bins whose shape has no candidate mask are skipped.
    if m_shape in masks and len(masks[m_shape]) > 0:
        # NOTE(review): unseeded random choice -- results differ between runs.
        mask = random.choice(masks[m_shape])
        # Element-wise minimum of truth and mask (presumably the mask holds a
        # low "missing" value where data should be hidden -- TODO confirm).
        observed = np.minimum(truth_matrix, mask)
        bin_.tag2 = {"truth":truth_matrix, "observed":observed, "mask":mask}
        ready_bins.append(bin_)

In [None]:
# Number of entries in `masks` (it is indexed by matrix shape when masks are applied).
len(masks)

In [None]:
# Collect model inputs/targets from the masked bins. Downstream, X is padded
# and fed to the conv input, Y is scaled and fed to the numerical input, and Z
# is the training target.
X,Y,Z = net.advanced_feature_collect(ready_bins)

Preprocessing

In [None]:
# Sanity-check the shapes of the collected arrays. A parenthesised single
# argument prints exactly the same text under Python 2.
print(X.shape)
print(Y.shape)
print(Z.shape)

In [None]:
# Inspect the first element of X before padding.
X[0]

In [None]:
# Pad every matrix in X to a common number of rows so the samples can be
# stacked into a single (samples, max_depth, CPG_DENSITY) array.
max_depth = max(len(m) for m in X)

X_pad = np.zeros((len(X), max_depth, CPG_DENSITY))
# Temporary sentinel written into the padded rows. It must differ from every
# value that is remapped below (1, -1, 0) so padding stays distinguishable
# until the very last step.
temp = -5
# Wrap X itself (not enumerate(X)) so tqdm can read its length and show a
# progress bar with a total.
for i, x in enumerate(tqdm(X)):
    X_pad[i] = np.pad(x, ((0, max_depth - len(x)), (0, 0)), "constant", constant_values=temp)

# Re-encode the data so that 0 is free to mean "padding":
# 1 -> 3, missing (-1) -> 1, 0 -> -1.
# The order of these assignments matters: each target value must not be present
# in the array when it is written (assumes the raw values are only 1, 0 and -1
# -- TODO confirm).
X_pad[X_pad == 1] = 3
X_pad[X_pad == -1] = 1
X_pad[X_pad == 0] = -1

# Finally turn the sentinel (padded rows) into zeros.
X_pad[X_pad == temp] = 0

In [None]:
# Raw (unpadded) sample 100, for comparison with its padded/re-encoded version.
X[100]

In [None]:
# The same sample after padding and value re-encoding.
X_pad[100]

In [None]:
# Give X a trailing channel axis of size 1, matching the
# (max_depth, CPG_DENSITY, 1) input shape declared for the conv module.
# A single reshape yields the same array as the former expand_dims + reshape pair.
X_exp = X_pad.reshape(len(X_pad), max_depth, CPG_DENSITY, 1)
# Standardise the numerical features with sklearn's preprocessing.scale.
Y_norm = preprocessing.scale(Y)
# Z with an extra trailing axis (not used by the visible training cells, which pass Z).
Z_exp = np.expand_dims(Z, axis=-1)
#Y_exp = np.expand_dims(Y, axis=2) # add extra dimension to make keras happy

In [None]:
# Shapes of the arrays that are fed to the network. A parenthesised single
# argument prints exactly the same text under Python 2.
print(X_exp.shape)
print(Y_norm.shape)
print(Z.shape)

In [None]:
# Flatten each sample into one row so whole samples can be compared/deduplicated.
# The row count and width are derived from X_exp instead of being hard-coded
# (previously 49229 x 498), so this keeps working when the data or max_depth changes.
Xf = X_exp.flatten()
Xfr = Xf.reshape(len(X_exp), -1)

In [None]:
# Number of distinct samples in X_exp (uniqueness taken along the sample axis).
len(np.unique(X_exp,axis=0))

In [None]:
# Display the flattened sample matrix (one row per sample).
Xfr

In [None]:
# Persist the conv-module input; np.save appends ".npy" to the given name.
np.save("npX",X_exp)

In [None]:
# Persist the scaled numerical features.
np.save("npY",Y_norm)

In [None]:
# Persist the targets (Z itself, not the expanded Z_exp).
np.save("npZ",Z)

In [None]:
# ---------------------------------------------------------------------------
# Conv Module: three strided Conv2D layers over the padded read matrix, each
# followed by a LeakyReLU, then flattened.
# ---------------------------------------------------------------------------
convInput = Input(shape=(max_depth,CPG_DENSITY,1), dtype='float', name='input2')

# NOTE(review): filter_size / stride are not used by the layers below, which
# hard-code their kernel sizes and strides. Kept in case other cells read them.
filter_size = CPG_DENSITY
stride = filter_size
convLayer = Conv2D(32, kernel_size=(4,4), strides=2, padding="same",activation="linear")(convInput)
convLayer = LeakyReLU(alpha=.001)(convLayer)
convLayer = Conv2D(16, kernel_size=(2,2), strides=2, padding="same",activation="linear")(convLayer)
convLayer = LeakyReLU(alpha=.001)(convLayer)
convLayer = Conv2D(8, kernel_size=(2,2), strides=2, padding="same",activation="linear")(convLayer)
convLayer = LeakyReLU(alpha=.001)(convLayer)

#convLayer = MaxPooling2D()(convLayer)

convLayer = Flatten()(convLayer)

#convLayer = Flatten()(convInput)
#convLayer = Dense(1000, activation="relu")(convLayer)

# ---------------------------------------------------------------------------
# Numerical Module: dense tower over the numerical features.
# NOTE(review): `layer1` is never connected to the output -- the combined
# module below concatenates the raw `numericalInput`, not `layer1`. Confirm
# whether this tower was meant to feed the concatenation.
# ---------------------------------------------------------------------------
numericalInput = Input(shape=(Y[0].size,), dtype='float', name='input1')
layer1 = Dense(1000, activation="linear")(numericalInput)
layer1 = LeakyReLU(alpha=.01)(layer1)
# The dropouts in this tower previously read `combined`, which is not defined
# until the combined module below (NameError on a fresh kernel). They now act
# on `layer1`, the tensor being built here.
layer1 = Dropout(0.9)(layer1)
layer1 = Dense(800, activation="linear")(layer1)
layer1 = LeakyReLU(alpha=.01)(layer1)
layer1 = Dropout(0.9)(layer1)

layer1 = Dense(600, activation="linear")(layer1)
layer1 = LeakyReLU(alpha=.01)(layer1)
layer1 = Dropout(0.9)(layer1)
layer1 = Dense(200, activation="linear")(layer1)
layer1 = LeakyReLU(alpha=.01)(layer1)


layer1 = Dense(100, activation="linear")(layer1)
layer1 = LeakyReLU(alpha=.01)(layer1)

layer1 = Dense(10, activation="linear")(layer1)
layer1 = LeakyReLU(alpha=.01)(layer1)

layer1 = Dense(3, activation="linear")(layer1)
layer1 = LeakyReLU(alpha=.01)(layer1)

layer1 = Dense(10, activation="linear")(layer1)
layer1 = LeakyReLU(alpha=.01)(layer1)

# ---------------------------------------------------------------------------
# Combined Module: conv features + raw numerical input -> dense head -> one
# sigmoid unit.
# NOTE(review): Dropout's argument is the fraction of units *dropped*; 0.9
# removes 90% of activations after every dense layer. Confirm this is not
# meant to be a keep-probability.
# ---------------------------------------------------------------------------

combined = keras.layers.concatenate([convLayer, numericalInput])
combined = Dense(1000, activation="linear")(combined)
combined = LeakyReLU(alpha=.01)(combined)
combined = Dropout(0.9)(combined)

combined = Dense(800, activation="linear")(combined)
combined = LeakyReLU(alpha=.01)(combined)
combined = Dropout(0.9)(combined)
combined = Dense(400, activation="linear")(combined)

combined = LeakyReLU(alpha=.01)(combined)
combined = Dropout(0.9)(combined)
combined = Dense(1, activation="sigmoid")(combined)



In [None]:
# Two-input model: the padded read matrix (conv module) and the numerical features.
model = Model(inputs=[convInput, numericalInput], outputs=[combined])
adam = keras.optimizers.Adam(lr=0.0001)

# Pass the configured optimizer instance. The string "adam" would build a
# fresh Adam with default settings and silently ignore the learning rate above.
model.compile(optimizer=adam, loss="binary_crossentropy", metrics=["acc"])


In [None]:
# Deduplicate on the flattened samples: `indices` holds the row index of the
# first occurrence of each distinct sample.
u, indices = np.unique(Xfr, return_index=True, axis=0)
X_u, Y_u, Z_u = X_exp[indices], Y_norm[indices], Z[indices]

# Train on the full (non-deduplicated) data; the deduplicated variant is kept
# below as a commented-out alternative.
history = model.fit(
    [X_exp, Y_norm],
    [Z],
    batch_size=16,
    epochs=100,
    validation_split=0.2,
)
#history = model.fit([X_u, Y_u], [Z_u], epochs=100, validation_split=0.2, batch_size=16)

In [None]:
# Row indices of the first occurrence of each distinct sample.
indices

In [None]:
# Unscaled numerical feature vector of the first sample.
Y[0]

In [None]:
# model = Sequential()
# model.add(Conv1D(16, 3, strides=3, activation='relu', input_shape=(seq_length,1)))
# model.add(Conv1D(5, 1, strides=1, activation='relu'))

# model.add(GlobalAveragePooling1D())
# model.add(Flatten())
# model.add(Dense(2, activation='sigmoid'))

# model.compile(loss='mse',
#               optimizer='adam',
#               metrics=['mse'])

# history = model.fit(X_exp, y, batch_size=1, epochs=1000, validation_split=0.0)

In [None]:
# The model has two inputs (conv matrix + numerical features), so both arrays
# must be supplied, in the order given to Model(inputs=[...]).
model.predict([X_exp, Y_norm])

In [None]:
# NOTE(review): in the visible notebook `y` is first assigned in a later cell
# (the toy example), so this fails on a fresh top-to-bottom run -- likely
# leftover scratch state.
y.shape

In [None]:
# NOTE(review): `y_exp` is never assigned anywhere else in the visible
# notebook, so this self-referential reshape depends on hidden kernel state.
y_exp = y_exp.reshape(2,1,1)

In [None]:
# Display the reshaped array (see the hidden-state caveat on `y_exp`).
y_exp

# Multiple input

Let's see if we can get a neural net to add two numbers together, x+y=z
We did it!

In [None]:
# Toy data set: two 3x3 binary matrices with one label each.
x = np.array([
    [[1, 0, 0], [0, 1, 1], [0, 1, 1]],
    [[0, 0, 0], [0, 1, 1], [0, 1, 1]],
])
# Insert a singleton axis at position 2 -> shape (2, 3, 1, 3).
# NOTE(review): the main pipeline puts the singleton axis last (channels);
# confirm that axis 2 is intended here.
x_exp = x[:, :, np.newaxis, :]
y = np.array([1, 0])



In [None]:
# Minimal conv classifier for the toy data: one Conv2D layer, flatten, a small
# dense layer and a single sigmoid output.
# Note: this rebinds `model`, replacing the two-input model defined earlier.
toy_conv = Conv2D(
    3,
    kernel_size=(2, 2),
    strides=(1, 1),
    activation='relu',
    padding="same",
    input_shape=x_exp[0].shape,
)
model = Sequential([
    toy_conv,
    Flatten(),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Train the toy classifier with binary cross-entropy and the default Adam optimizer.
model.compile(
    optimizer="adam",
    loss='binary_crossentropy',
    metrics=["acc"],
)
model.fit(x_exp, y, epochs=1000, batch_size=32)

In [None]:
# Predictions of the toy classifier on its own training inputs.
model.predict(x_exp)

In [None]:
# Small check of np.pad: extend a 2-row matrix with zero rows up to max_len rows.
test = np.array([[1, 1, 2], [2, 1, 1]])
max_len = 5
rows_to_add = max_len - len(test)
# Pad only at the bottom of axis 0; leave the columns untouched.
test_pad = np.pad(test, ((0, rows_to_add), (0, 0)), mode="constant", constant_values=0)

In [None]:
# Show the padded example.
test_pad

In [None]:
# Random-forest baseline on the numerical features only.
# NOTE(review): no random_state is set, so scores vary between runs.
rf = RandomForestClassifier(n_estimators=100)

In [None]:
# Fit on the first 10000 samples.
rf.fit(Y_norm[:10000], Z[:10000])

In [None]:
# Accuracy on samples 0-999, which are part of the training slice -- i.e. a
# training-set score, not a held-out estimate.
rf.score(Y_norm[:1000], Z[:1000])

In [None]:
# Accuracy on samples 10000-10999, which were not used for fitting.
rf.score(Y_norm[10000:11000], Z[10000:11000])