<a href="https://colab.research.google.com/github/zhuchunlin1995/Deep-Learning/blob/master/Protein_Tertiary_Structure_Predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM

import pickle
import pandas as pd
import numpy as np
from sklearn import preprocessing

from google.colab import files

Using TensorFlow backend.


In [0]:
# Ask Colab for file(s); the returned mapping is keyed by uploaded file name.
uploaded = files.upload()

# Echo each uploaded file's name together with the size of its payload.
for fn, payload in uploaded.items():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(payload)))

Saving train_fold_1.pkl to train_fold_1.pkl
User uploaded file "train_fold_1.pkl" with length 119801324 bytes


In [0]:
def encoder_initialize(corpus):
  """Fit a one-hot encoder whose categories are the symbols in `corpus`.

  Args:
    corpus: a string (split into its characters) or a 1-D sequence of symbols.

  Returns:
    The fitted `preprocessing.OneHotEncoder` instance.
  """
  # isinstance rather than an exact type comparison, so that str subclasses
  # (e.g. numpy.str_) are split into characters too instead of reaching
  # np.expand_dims as a single 0-d scalar.
  if isinstance(corpus, str):
    corpus = list(corpus)
  corpus_arr = np.expand_dims(corpus, 1)  # n x 1 column: one sample per symbol
  encoder = preprocessing.OneHotEncoder()
  encoder.fit(corpus_arr)
  return encoder

# dictionary for amino acid and local structure type
AA_CHR = 'ACDEFGHIKLMNPQRSTVWXY'
Q8_CHR = 'GHITEBS-'
aa_encoder = encoder_initialize(AA_CHR)
q8_encoder = encoder_initialize(Q8_CHR)

def encoding(seq, encoder):
  """One-hot encode `seq` with an already fitted encoder.

  Args:
    seq: a string (split into its characters) or a 1-D sequence of symbols.
    encoder: fitted encoder exposing `transform(...)`, whose result provides
      `toarray()` (e.g. the encoders returned by `encoder_initialize`).

  Returns:
    The dense array produced by `encoder.transform(...).toarray()`: one row
    per symbol of `seq` (n x 21 for amino acids, n x 8 for Q8). Callers that
    need the channel-first layout transpose the result.
  """
  # isinstance rather than an exact type comparison, so that str subclasses
  # (e.g. numpy.str_) are split into characters as well.
  if isinstance(seq, str):
    seq = list(seq)
  seq_arr = np.expand_dims(seq, 1)  # n x 1 column: one sample per symbol
  oh_encoded = encoder.transform(seq_arr).toarray()
  return oh_encoded


def colToMat(x):
  """Repeat the transposed input `x.shape[1]` times along a new leading axis.

  For a 2-D array of shape (c, n) the result has shape (n, n, c) and
  result[i, j, :] equals x[:, j] for every i.
  """
  transposed = x.transpose()
  copies = x.shape[1]
  return np.tile(transposed, (copies, 1, 1))

def colToMatTranspose(x):
  """Return `colToMat(x)` with all of its axes reversed.

  For a 2-D input of shape (c, n) the result has shape (c, n, n) and
  result[k, j, i] equals x[k, j] for every i.
  """
  return np.transpose(colToMat(x))


def sequencesToVolume(a, aa_encoder, q, q8_encoder, msa):
  """Build the pairwise (n x n x channels) feature volume for one protein.

  Args:
    a: amino-acid sequence of length n (string or sequence of symbols).
    aa_encoder: fitted encoder handed to `encoding` for `a`.
    q: local-structure (Q8) sequence, same length as `a`.
    q8_encoder: fitted encoder handed to `encoding` for `q`.
    msa: NumPy array with one column per residue, i.e. shape (c_msa, n).

  Returns:
    A tuple `(A_new, Q_new, final_matrix)`:
      A_new: transposed one-hot encoding of `a` (channels x n).
      Q_new: transposed one-hot encoding of `q` (channels x n).
      final_matrix: array of shape (n, n, 2 * (c_msa + c_aa + c_q8)). For
        each feature in the order msa, amino acid, Q8 it holds first the
        feature column of residue i and then the feature column of residue j
        at position [i, j].

  Raises:
    ValueError: if `a`, `q` and the columns of `msa` do not share one length.
  """
  n = len(a)
  # Fail early with a readable message; otherwise the mismatch only shows up
  # as a shape error inside np.concatenate.
  if len(q) != n or msa.shape[1] != n:
    raise ValueError(
        "sequencesToVolume: inconsistent lengths (a={}, q={}, msa columns={})"
        .format(n, len(q), msa.shape[1]))

  A_new = encoding(a, aa_encoder).transpose()
  Q_new = encoding(q, q8_encoder).transpose()

  blocks = []
  for feature in (msa, A_new, Q_new):
    by_col = colToMat(feature)                  # [i, j, :] == feature[:, j]
    blocks.append(by_col.transpose((1, 0, 2)))  # [i, j, :] == feature[:, i]
    blocks.append(by_col)

  final_matrix = np.concatenate(blocks, axis=2)

  return A_new, Q_new, final_matrix


In [0]:
# Per-sample accumulators filled while walking through the training folds.
train_matrix, train_aa_seq, train_ss_seq = [], [], []
train_casp_id, train_msas, train_phis, train_psis = [], [], [], []
# NOTE(review): train_casp_id is created here but nothing is appended to it in this cell.
for i in range(1):
  # Fold files are numbered starting at 1.
  fold = i + 1
  data_loc = '/content/train_fold_{}.pkl'.format(fold)
  datafile = data_loc
  # NOTE(security): pickle.load can run arbitrary code; only load trusted fold files.
  with open(datafile, 'rb') as f:
    train_input_f = pickle.load(f)
  (indices, pdbs, length_aas, pdb_aas, q8s,
   dcalphas, psis, phis, msas) = train_input_f
  # how many proteins this fold contains
  print("num of data pts in train fold {}".format(fold), len(pdbs))
  for j in range(len(length_aas)):
    msa_j = np.array(msas[j])
    A, Q, matrix = sequencesToVolume(pdb_aas[j], aa_encoder, q8s[j], q8_encoder, msa_j)
    print(matrix.shape)

    # Sequence encodings, torsion angles, MSA features and the pairwise volume.
    train_aa_seq.append(A)
    train_ss_seq.append(Q)
    train_phis.append(phis[j])
    train_psis.append(psis[j])
    train_msas.append(msa_j)
    train_matrix.append(matrix)

    print("number of sample added: {} ".format(j))
    print("shape of matrix is: {}".format(matrix.shape))
# Every accumulator that is filled above should report the same length.
print(len(train_aa_seq), len(train_ss_seq), len(train_psis), len(train_phis), len(train_msas), len(train_matrix))


num of data pts in train fold 1 335
(234, 234, 100)
number of sample added: 0 
shape of matrix is: (234, 234, 100)
(204, 204, 100)
number of sample added: 1 
shape of matrix is: (204, 204, 100)
(94, 94, 100)
number of sample added: 2 
shape of matrix is: (94, 94, 100)
(79, 79, 100)
number of sample added: 3 
shape of matrix is: (79, 79, 100)
(189, 189, 100)
number of sample added: 4 
shape of matrix is: (189, 189, 100)
(71, 71, 100)
number of sample added: 5 
shape of matrix is: (71, 71, 100)
(399, 399, 100)
number of sample added: 6 
shape of matrix is: (399, 399, 100)
(276, 276, 100)
number of sample added: 7 
shape of matrix is: (276, 276, 100)
(106, 106, 100)
number of sample added: 8 
shape of matrix is: (106, 106, 100)
(91, 91, 100)
number of sample added: 9 
shape of matrix is: (91, 91, 100)
(303, 303, 100)
number of sample added: 10 
shape of matrix is: (303, 303, 100)
(93, 93, 100)
number of sample added: 11 
shape of matrix is: (93, 93, 100)
(47, 47, 100)
number of sample add

In [0]:


# load protein names
# NOTE(review): despite the "test" naming, this reads the fold-1 training CSV.
testfile = "../content/train_fold_1.csv"
test_input = pd.read_csv(testfile, header=None)
protein_names = np.array(test_input.iloc[:,1])
protein_len = np.array(test_input.iloc[:,2])

# Flatten every protein's predictions into one long (Id, Predicted) table.
# NOTE(review): dist, psi and phi are not defined in the cells shown above;
# they must already exist in the kernel before this cell runs.
all_data = []
all_names = []
for i, pname in enumerate(protein_names):
    length = protein_len[i]

    # Values, in the order: flattened distance matrix, psi angles, phi angles.
    dist_flat = dist[i].ravel()
    array = np.concatenate([dist_flat, psi[i], phi[i]])

    # Matching Ids in the same order. The comprehension indices get their own
    # names so they do not shadow the outer loop's `i`.
    dist_names = ["{}_d_{}_{}".format(pname, row + 1, col + 1)
                  for row in range(length) for col in range(length)]
    psi_names = ["{}_psi_{}".format(pname, pos + 1) for pos in range(length)]
    phi_names = ["{}_phi_{}".format(pname, pos + 1) for pos in range(length)]
    row_names = np.array(dist_names + psi_names + phi_names)

    # Without this check a wrongly shaped prediction would silently shift
    # every following Id relative to its value.
    if len(array) != len(row_names):
        raise ValueError(
            "protein {!r}: {} predicted values but {} ids expected for length {}"
            .format(pname, len(array), len(row_names), length))

    all_data.append(array)
    all_names.append(row_names)

all_data = np.concatenate(all_data)
all_names = np.concatenate(all_names)
output = {"Id": all_names, "Predicted": all_data}
output = pd.DataFrame(output)
# NOTE(review): "SAVE_PATH" is used literally as the output file name; it
# looks like a placeholder, so replace it with the intended destination.
output.to_csv("SAVE_PATH", index=False)
