In [1]:
import math
import os
import random
import statistics
from collections import defaultdict

import numpy as np
import pandas as pd
import xgboost as xgb
from Bio import SeqIO
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer

# Work from the competition data directory so the relative paths used by the
# later cells ("Train/...", "Eembedding/...", "IA.txt") resolve.
# Set the CAFA5_DATA_DIR environment variable to point somewhere else; the
# fallback is the original hard-coded location.
os.chdir(os.environ.get("CAFA5_DATA_DIR", "/Users/zhongyuanchen/Desktop/CAFA-5-Protein-Function-Prediction"))

In [2]:
import obonet
# Parse the Gene Ontology OBO file into a graph object; later cells look up
# per-term attributes (e.g. "is_a", "alt_id") through `graph.nodes[term]`.
graph = obonet.read_obo("Train/go-basic.obo")

In [None]:
# Inspect the node view of the GO graph loaded from go-basic.obo.
graph.nodes

In [80]:
# First GO term in the annotation list of the currently selected protein.
# NOTE(review): `my_annotation` is assigned in a cell further down the notebook.
my_annotation[0]

'GO:0005515'

In [77]:
# Show the attribute dict the GO graph stores for that first annotated term.
graph.nodes[my_annotation[0]]

{'name': 'protein binding',
 'namespace': 'molecular_function',
 'alt_id': ['GO:0001948', 'GO:0045308'],
 'def': '"Binding to a protein." [GOC:go_curators]',
 'subset': ['goslim_candida',
  'goslim_chembl',
  'goslim_metagenomics',
  'goslim_pir',
  'goslim_plant'],
 'synonym': ['"glycoprotein binding" NARROW []',
  '"protein amino acid binding" EXACT []'],
 'is_a': ['GO:0005488']}

In [92]:
# Check whether this GO ID ever appears in the "term" column of the training annotations.
'GO:0045308' in terms_df["term"].to_list()

False

In [81]:
# Try to look up the same GO ID as a graph node. This raises KeyError: the ID is
# not a node key in the graph, so such IDs cannot be indexed directly.
graph.nodes['GO:0045308']["is_a"]

KeyError: 'GO:0045308'

In [37]:
# Dump the full GO-term annotation list of the currently selected protein.
print(my_annotation)

['GO:0008152', 'GO:0044249', 'GO:0006259', 'GO:0009059', 'GO:0009987', 'GO:1901362', 'GO:0009058', 'GO:0044271', 'GO:0006725', 'GO:0034641', 'GO:0044237', 'GO:1901360', 'GO:0008150', 'GO:1901576', 'GO:0019058', 'GO:0071704', 'GO:0006139', 'GO:0044260', 'GO:0090304', 'GO:0043170', 'GO:0046483', 'GO:0034654', 'GO:0019438', 'GO:0018130', 'GO:0016032', 'GO:0044238', 'GO:0006807', 'GO:0005515', 'GO:0005488', 'GO:0003674']


In [53]:
def f(term, annotation=None, go_graph=None):
    """Return True when every ``is_a`` entry of ``term`` is also annotated.

    Parameters
    ----------
    term : str
        GO term ID to check; must be a key of ``go_graph.nodes``.
    annotation : container of str, optional
        Collection of GO term IDs to test membership against. Defaults to the
        notebook-level ``my_annotation``.
    go_graph : object exposing ``.nodes[term]`` as a mapping, optional
        Ontology graph. Defaults to the notebook-level ``graph``.

    Returns
    -------
    bool
        True if the term has no ``"is_a"`` attribute, or if every ID listed
        under ``"is_a"`` is contained in ``annotation``.

    Raises
    ------
    KeyError
        If ``term`` is not a node of the graph.
    """
    if annotation is None:
        annotation = my_annotation
    if go_graph is None:
        go_graph = graph
    # A term without "is_a" yields an empty sequence, for which all() is True.
    parents = go_graph.nodes[term].get("is_a", ())
    return all(parent in annotation for parent in parents)

# Select one protein by its position in `entry_ids` and fetch its GO annotation list.
my_term = entry_ids[10]
my_annotation = id_terms[my_term]
# True when, for every annotated term, all of its "is_a" entries are annotated too.
all(f(term) for term in my_annotation)

True

In [9]:
# Select the first protein in `entry_ids` and fetch its GO annotation list;
# the inspection cells read `my_annotation`.
my_term = entry_ids[0]
my_annotation = id_terms[my_term]

In [82]:
# Load the training annotations: one row per (EntryID, term) pair.
# The "aspect" column is discarded and EntryID becomes the index.
terms_df = (
    pd.read_csv('Train/train_terms.tsv', sep="\t")
    .drop(columns=["aspect"])
    .set_index("EntryID")
)
# How many times each GO term is annotated, most frequent first.
freq_counts = terms_df["term"].value_counts()

In [61]:
# Number of distinct GO terms present in the training annotations.
len(freq_counts)

31466

In [76]:
# Annotation count of the first "is_a" entry of the first annotated term.
freq_counts.loc[graph.nodes[my_annotation[0]]["is_a"][0]]


57380

In [74]:
# List the "is_a" entries recorded in the graph for the first annotated term.
graph.nodes[my_annotation[0]]["is_a"]

['GO:0005488']

In [7]:
# Peek at the first protein EntryID loaded from the embedding ID file.
entry_ids[0]

'P20536'

In [4]:
# Keep only the num_labels most frequent GO terms and restrict the prediction
# task to them. `freq_counts` comes from value_counts(), so its index is already
# ordered from most to least frequent.
num_labels = 1500
chosen_terms = freq_counts.index[:num_labels]
filt = terms_df["term"].isin(chosen_terms)
# NOTE: this overwrites `terms_df` with the filtered view; re-run the loading
# cell to get the unfiltered table back.
terms_df = terms_df[filt]
terms = terms_df["term"]
# Fraction of all annotations that the kept terms account for.
freq_counts.iloc[:num_labels].sum() / freq_counts.sum()

0.824170378699083

In [5]:
# Multilabel encoder: learn the label vocabulary from the retained terms.
# The whole `terms` series is passed as a single sample, so the fitted classes
# are the distinct terms it contains.
terms_mlb = MultiLabelBinarizer().fit([terms])
terms_mlb

In [6]:
# Precomputed training embeddings and the EntryIDs they belong to.
# NOTE(review): row i of `embedded_sqs` is presumably the embedding of
# `entry_ids[i]` — confirm against how the .npy files were produced.
embedded_sqs = np.load("Eembedding/train_embeds.npy")
entry_ids = np.load("Eembedding/train_ids.npy")
# EntryID -> row position in `embedded_sqs`.
embedded_ids = {entry_id: indx for indx, entry_id in enumerate(entry_ids)}

# EntryID -> list of retained GO terms. zip over the index and the column
# avoids building a Series per row as iterrows() does.
id_terms = defaultdict(list)
for entry_id, term in zip(terms_df.index, terms_df["term"]):
    id_terms[entry_id].append(term)

In [None]:
# One binary indicator row per embedded protein, in `entry_ids` order.
# All annotation lists are encoded in a single transform() call instead of one
# call per protein; list() keeps `y` a list of 1-D rows as before.
# Proteins with no retained term get an empty list from the defaultdict and
# therefore an all-zero row.
y = list(terms_mlb.transform([id_terms[entry_id] for entry_id in entry_ids]))

In [None]:
# Naive Bayes baseline on the embeddings.
X = embedded_sqs

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

from sklearn.naive_bayes import MultinomialNB
# `y` holds one binary indicator row per protein (multilabel). MultinomialNB.fit
# only accepts a 1-D target, so one classifier per label column is fitted via
# MultiOutputClassifier.
# NOTE(review): MultinomialNB rejects negative feature values — confirm the
# embeddings are non-negative, or switch to an estimator that allows them.
clf = MultiOutputClassifier(MultinomialNB(force_alpha=True))
clf.fit(X_train, np.asarray(y_train))

# accuracy_score on multilabel targets is exact-match (subset) accuracy.
print('Accuracy on test data: {:.1f}%'.format(accuracy_score(np.asarray(y_test), clf.predict(X_test))*100))

In [None]:
# NOTE(review): `multilabel_model` is not assigned in any visible cell; on a
# fresh kernel this line raises NameError. Likely a leftover — confirm or remove.
multilabel_model

In [None]:
# Batch generator feeding (embedding, multilabel target) pairs to Keras.
# It reads the notebook-level lookups `id_terms`, `embedded_ids`, `embedded_sqs`
# and `terms_mlb`.
# Original author's note: without a generator, the size of all sequences adds up
# to 30 gigabytes.
# NOTE(review): `tf` is not imported in any visible cell — confirm that
# `import tensorflow as tf` runs before this cell.
class Data_Generator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that yields batches built from a list of EntryIDs.

    Parameters
    ----------
    indexs : sequence
        EntryIDs to serve; each must be a key of ``embedded_ids``.
    batch_size : int
        Maximum number of samples per batch.
    """

    def __init__(self, indexs, batch_size):
        super().__init__()
        # Keep a private copy so the per-epoch shuffle does not reorder the
        # caller's list.
        self.indexs = list(indexs)
        self.batch_size = batch_size

    def __len__(self):
        """Number of batches per epoch (the last one may be partial)."""
        return math.ceil(len(self.indexs) / self.batch_size)

    def on_epoch_end(self):
        """Reshuffle the EntryIDs so each epoch sees a different batch order."""
        random.shuffle(self.indexs)

    def __getitem__(self, idx):
        """Return batch number ``idx`` as ``(batch_x, batch_y)`` numpy arrays."""
        low = idx * self.batch_size
        # Cap the upper bound at the list length; the last batch may be smaller
        # if the total number of items is not a multiple of the batch size.
        high = min(low + self.batch_size, len(self.indexs))
        batch_ids = self.indexs[low:high]

        batch_x = np.array([embedded_sqs[embedded_ids[entry_id]] for entry_id in batch_ids])
        # Encode every annotation list of the batch in one transform() call
        # rather than once per sample.
        batch_y = np.array(terms_mlb.transform([id_terms[entry_id] for entry_id in batch_ids]))

        return batch_x, batch_y


In [None]:
# Per-term weights read from IA.txt: a headerless, tab-separated file with two
# columns, named here "term" and "weight".
ia_df = pd.read_csv("IA.txt", header=None, delimiter='\t', names=["term", "weight"])
# term -> weight lookup.
weight_dict = dict(zip(ia_df["term"], ia_df["weight"]))
# label column position in terms_mlb -> weight of that label's term.
# Raises KeyError if a fitted class is absent from IA.txt.
weights = {indx: weight_dict[term] for indx, term in enumerate(terms_mlb.classes_)}

In [None]:
# Create training and validation ID lists by shuffling and splitting the EntryIDs.
# NOTE(review): with TRAIN_FRACTION = 0.3 only 30% of the IDs are used for
# training and 70% for validation — confirm this is the intended direction.
# NOTE(review): the shuffle is unseeded, so the split differs between runs.
TRAIN_FRACTION = 0.3

indexs = list(entry_ids)
random.shuffle(indexs)
# Deliberately not named `train_test_split`: that would overwrite the sklearn
# function imported at the top and break the Naive Bayes cell on re-run.
split_point = int(TRAIN_FRACTION*len(indexs))
train_indexs = indexs[:split_point]
test_indexs = indexs[split_point:]

# Define the respective generators for training and validation
train_generator = Data_Generator(train_indexs,512)
test_generator = Data_Generator(test_indexs,512)

# Fully connected network for multilabel classification: batch-normalised
# 1024-dimensional input, three 512-unit ReLU layers, and one sigmoid output
# per label.
model = tf.keras.Sequential([
    tf.keras.layers.BatchNormalization(input_shape=[1024]),    
    tf.keras.layers.Dense(units=512, activation='relu'),
    tf.keras.layers.Dense(units=512, activation='relu'),
    tf.keras.layers.Dense(units=512, activation='relu'),
    tf.keras.layers.Dense(units=num_labels,activation='sigmoid')
])

# Stop after 2 epochs without improvement, checkpoint every epoch (epoch number
# and val_loss go into the file name), and log to TensorBoard.
my_callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=2),
    tf.keras.callbacks.ModelCheckpoint(filepath='model.{epoch:02d}-{val_loss:.2f}.h5'),
    tf.keras.callbacks.TensorBoard(log_dir='./logs'),
]

# Compile and train the model
model.compile(loss='binary_crossentropy', 
              optimizer='adam',
              metrics=['binary_accuracy', tf.keras.metrics.AUC()],
             )
model.fit(x = train_generator,validation_data = test_generator,epochs = 20, callbacks=my_callbacks)

In [None]:
# Restore a checkpoint written by the ModelCheckpoint callback; the file name
# follows its 'model.{epoch:02d}-{val_loss:.2f}.h5' pattern (epoch 11, val_loss 0.06).
# NOTE(review): the name is hard-coded and depends on the training run — confirm the file exists.
model.load_weights("model.11-0.06.h5")
# Report the loss and compiled metrics on the validation generator.
model.evaluate(test_generator)

In [None]:
# Load the precomputed test-set embeddings and their EntryIDs, then run the
# trained model on every test embedding.
# NOTE(review): rows of the two arrays are presumably aligned — confirm.
embedded_sqs_test = np.load("Eembedding/test_embeds.npy")
embedded_ids_test = np.load("Eembedding/test_ids.npy")
prediction = model.predict(embedded_sqs_test)

In [None]:
# Stream every (EntryID, term, score) triple to a tab-separated file.
n_test = len(embedded_ids_test)
# The handle is named `out_file` so it does not overwrite the function `f`
# defined earlier in the notebook.
with open("submission_test.tsv","w") as out_file:
    for indx,entry_id in enumerate(embedded_ids_test):
        # Pair each label with its score directly instead of indexing up to a
        # hard-coded 1500.
        for term,score in zip(terms_mlb.classes_, prediction[indx]):
            print (f"{entry_id}\t{term}\t{score}",file = out_file)
        if indx%3000 == 0:
            # Format the fraction as a real percentage.
            print(f"{indx/n_test:.1%} completed")
    print("Completed")

In [None]:
# Build the long-format submission table (one row per protein/term pair)
# directly from arrays instead of appending rows to a Python list.
n_test, n_terms = prediction.shape
assert n_test == len(embedded_ids_test), "prediction rows must match test EntryIDs"
assert n_terms == len(terms_mlb.classes_), "prediction columns must match encoder classes"

submission = pd.DataFrame({
    # Each EntryID repeated once per term ...
    "EntryID": np.repeat(np.asarray(embedded_ids_test), n_terms),
    # ... alongside the full term list cycled once per protein ...
    "Terms": np.tile(np.asarray(terms_mlb.classes_), n_test),
    # ... and the scores flattened row by row, which matches that ordering.
    # Cast to float64 so thresholding and CSV text match a frame built from
    # individual Python-level values.
    "Prediction": np.asarray(prediction).reshape(-1).astype(np.float64),
})
print("Completed")

submission.to_csv('submission.tsv', sep="\t",header = False,index = False)

In [None]:
# Keep only the rows whose predicted score is strictly above 0.4.
keep_mask = submission["Prediction"].gt(0.4)
trimed = submission[keep_mask]

In [None]:
# Number of prediction rows that survive the score threshold.
len(trimed)

In [None]:
# Write only the thresholded rows, tab-separated with no header or index.
# This replaces any existing 'submission.tsv'.
trimed.to_csv('submission.tsv', sep="\t",header = False,index = False)

In [None]:
# Write the full prediction table, tab-separated with no header or index.
# This replaces any existing 'submission.tsv'.
submission.to_csv('submission.tsv', sep="\t",header = False,index = False)

In [None]:
# Emits only blank lines; appears to be a visual spacer with no effect on the analysis.
print("\n\n\n\n\n\n\n\n\n\n\n\n\n")