In [1]:
# Show the GPUs present on this host (driver/CUDA version, memory, utilisation).
!nvidia-smi

Sat Jan 28 02:36:49 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.106.00   Driver Version: 460.106.00   CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-DGXS...  Off  | 00000000:07:00.0 Off |                    0 |
| N/A   41C    P0    52W / 300W |      0MiB / 32508MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-DGXS...  Off  | 00000000:08:00.0 Off |                    0 |
| N/A   40C    P0    52W / 300W |      0MiB / 32508MiB |      0%      Default |
|       

In [2]:
import os
# Expose only physical GPU 2 to this process. This is set before TensorFlow is
# first imported (next cell), and the device listing below then shows one GPU.
os.environ["CUDA_VISIBLE_DEVICES"]="2"

In [3]:
# Sanity check: print the devices TensorFlow can see once CUDA_VISIBLE_DEVICES
# has been restricted (the captured output shows one CPU and one GPU).
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

2023-01-28 02:36:58.319780: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-28 02:36:58.484706: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 17474990127066958060
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 32476168192
locality {
  bus_id: 1
  links {
  }
}
incarnation: 11983482615229923725
physical_device_desc: "device: 0, name: Tesla V100-DGXS-32GB, pci bus id: 0000:0e:00.0, compute capability: 7.0"
xla_global_id: 416903419
]


2023-01-28 02:37:01.605855: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-28 02:37:04.926955: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /device:GPU:0 with 30971 MB memory:  -> device: 0, name: Tesla V100-DGXS-32GB, pci bus id: 0000:0e:00.0, compute capability: 7.0


In [4]:
import gc
import gdown
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from collections import defaultdict
import tqdm.notebook as tqdm

In [5]:
# GO aspect handled by this notebook. `aspect` is joined with a space to build
# the CSV column name 'Gene Ontology (molecular function)'.
aspect = ['molecular','function']
# Short tag for the same aspect (matches the 'mf' prefix of the data files used
# below); it is not referenced by any code visible in this notebook.
aspect_abbr = 'mf'

In [6]:
# Path of the training table for this aspect; the file is expected to already
# exist locally because the download below is disabled.
file = 'cafa3_train/mf_df.csv'
# Optional one-off download of the table from Google Drive (disabled):
# if not os.path.exists(file):
#     url = "https://drive.google.com/file/d/1RyeLQPFTMWAIr-OzELTWIx60ln-mZ7g_/view?usp=sharing"
#     output = file
#     gdown.download(url=url, output=output, quiet=False, fuzzy=True)

In [7]:
# Load the training table. Per the displayed output its columns are an unnamed
# index, Entry (protein id), Sequence, and a ';'-separated GO term list.
df = pd.read_csv(file)
df

Unnamed: 0,Entry,Sequence,Gene Ontology (molecular function)
0,A0A060X6Z0,MPISSSSSSSTKSMRRAASELERSDSVTSPRFIGRRQSLIEDARKE...,GO:0003824;GO:0016491;GO:0016705;GO:0004497;GO...
1,A0A078CGE6,MARQMTSSQFHKSKTLDNKYMLGDEIGKGAYGRVYIGLDLENGDFV...,GO:0016773;GO:0003824;GO:0004672;GO:0004674;GO...
2,A0A086F3E3,MTKGRLEAFSDGVLAIIITIMVLELKVPEGSSWASLQPILPRFLAY...,GO:0022892;GO:0022838;GO:0005216;GO:0015075;GO...
3,A0A087X1C5,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...,GO:0003824;GO:0016491;GO:0016705;GO:0004497;GO...
4,A0A096SRM5,MAANGGDHTSARPHVVLLPSAGMGHLVPFARLAVALSEGHGCNVSV...,GO:0003824;GO:0016758;GO:0016740;GO:0008194;GO...
...,...,...,...
36105,V5JFY4,MGPWTLLLLHLPLVVSMLPAPTNVSIVSFNLEHTLTWLPGPETPDN...,GO:0004872;GO:0004888;GO:0003674;GO:0005488;GO...
36106,V5YM14,MRPNLLAAAIAVPLSLLAAQIAQAGEGMWVPQQLPEIAGPLKKAGL...,GO:0003824;GO:0005515;GO:0042803;GO:0046983;GO...
36107,V5YMB3,MRHPAFRLTLLASTVAFALAPQAAQAAPSAADRIAGTELIARDALF...,GO:0003824;GO:0008233;GO:0016787;GO:0003674;GO...
36108,V9GXG1,MPYAEITVNLGKVTLGEENRKKMTNSCLKRHENSSLVQAVCALLNS...,GO:0004518;GO:0003824;GO:0016787;GO:0044877;GO...


In [8]:
# Label vocabulary: every distinct GO term that occurs in the training table,
# sorted so that a term's position in the list is its label index.
# Despite the `_bp` suffix, the terms come from whichever `aspect` is configured.
go_terms_bp = sorted({
    term
    for _, row in df.iterrows()
    for term in row['Gene Ontology ('+' '.join(aspect)+')'].split(';')
})
print(len(go_terms_bp))

677


In [9]:
def get_segments(sequence,segment_size=100,gap=30):
    """Cut `sequence` into overlapping windows plus a trailing remainder.

    Windows of exactly `segment_size` characters are taken at offsets
    0, gap, 2*gap, ... for as long as a full window fits. Whatever lies
    between the next offset and the end of the sequence is then always
    appended as one final segment (it is shorter than `segment_size`, and is
    the whole sequence when the sequence itself is shorter than a window).

    Args:
        sequence: the string (or other sliceable) to split.
        segment_size: length of each full window.
        gap: stride between consecutive window starts.

    Returns:
        List of segments in left-to-right order; never empty.
    """
    total_length = len(sequence)
    pieces = []
    offset = 0
    # Full-size windows first.
    while offset + segment_size <= total_length:
        pieces.append(sequence[offset:offset + segment_size])
        offset += gap
    # Remainder starting at the first offset whose window no longer fits.
    pieces.append(sequence[offset:])
    return pieces

def get_training_data(df,segment_size=100,gap=30):
    """Expand a protein table into per-segment training records.

    Relies on the notebook globals `aspect` (to build the GO column name),
    `go_terms_bp` (the ordered label vocabulary) and `get_segments`.

    Args:
        df: DataFrame with 'Entry', 'Sequence' and the
            'Gene Ontology (<aspect>)' column holding ';'-separated GO terms.
        segment_size: window length forwarded to `get_segments`.
        gap: window stride forwarded to `get_segments`.

    Returns:
        List of `[entry, segment, labels]` records, one per segment, in table
        order. `labels` is a multi-hot list over `go_terms_bp`; all segments
        of one protein share the *same* list object.

    Raises:
        ValueError: if a row contains a GO term that is not in `go_terms_bp`.
    """
    go_column = 'Gene Ontology ('+' '.join(aspect)+')'

    # Build the term -> index map once instead of a linear `list.index` scan
    # per term. `setdefault` keeps the first position, like `list.index`.
    term_to_index = {}
    for position, term in enumerate(go_terms_bp):
        term_to_index.setdefault(term, position)

    training_data = list()
    # `total` lets the progress bar show a percentage (iterrows has no len()).
    for idx,row in tqdm.tqdm(df.iterrows(), total=len(df)):
        labels = [0] * len(go_terms_bp)
        for term in row[go_column].split(';'):
            try:
                labels[term_to_index[term]] = 1
            except KeyError:
                # Same exception type and message as `list.index`.
                raise ValueError(f'{term!r} is not in list') from None
        segments = get_segments(row['Sequence'],segment_size,gap)
        for segment in segments:
            training_data.append([row['Entry'],segment,labels])
    return training_data

In [10]:
# Segment every training protein (default 100-character windows, stride 30)
# and report how many [entry, segment, labels] records were produced.
training_data = get_training_data(df,gap=30)
print(len(training_data))

0it [00:00, ?it/s]

601353


In [11]:
def get_ngrams(segment,n=3):
    """Return all contiguous length-`n` substrings of `segment`.

    The substrings are listed left to right and overlap by `n - 1`
    characters. A segment shorter than `n` yields an empty list.
    """
    window_count = len(segment) - n + 1
    return [segment[start:start + n] for start in range(window_count)]

In [12]:
# Replace every training segment by its list of overlapping 4-grams, keeping
# the protein entry and the label vector: [entry, ngrams, labels].
print('Preparing from scratch...')
training_data_ngrams = [
    [entry, get_ngrams(segment, n=4), labels]
    for entry, segment, labels in tqdm.tqdm(training_data)
]

print(len(training_data_ngrams))

Preparing from scratch...


  0%|          | 0/601353 [00:00<?, ?it/s]

601353


In [13]:
def get_skip_grams(segment,skip=1,n=3):
    """Build skip-grams by deleting one character from each sliding window.

    A window of `skip + n` characters slides over `segment` one position at
    a time. For every window, one character is removed at a time, starting
    with the last position and ending with position 1 (position 0 is never
    removed), and each resulting string of `skip + n - 1` characters is
    emitted.

    Returns:
        List of skip-gram strings, grouped by window in left-to-right order.
        Empty when the segment is shorter than the window.
    """
    span = skip + n
    collected = []
    for begin in range(len(segment) - span + 1):
        chunk = segment[begin:begin + span]
        # Removed position runs span-1, span-2, ..., 1.
        for dropped in range(span - 1, 0, -1):
            collected.append(chunk[:dropped] + chunk[dropped + 1:])
    return collected

In [14]:
# Replace every training segment by its skip-grams (skip=1 over 4+1 character
# windows), keeping entry and labels: [entry, skip_grams, labels].
print('Preparing from scratch...')
training_data_skip_grams = [
    [entry, get_skip_grams(segment, n=4), labels]
    for entry, segment, labels in tqdm.tqdm(training_data)
]
print(len(training_data_skip_grams))

Preparing from scratch...


  0%|          | 0/601353 [00:00<?, ?it/s]

601353


In [15]:
import tensorflow as tf
import tensorflow_addons as tfa

In [16]:
#Assuming training_data as global variable

def train_test_split(X,y,fold_no,prev_index,Kfolds=5):
    """Carve out test fold `fold_no` without splitting a protein across sets.

    The nominal fold boundary is `(fold_no + 1) / Kfolds` of the way through
    `X`; it is then moved to the nearest edge of the protein it falls in, so
    that every segment of a protein ends up on the same side of the split.

    Relies on the notebook global `training_data` for the entry id of each
    sample; `X` and `y` are assumed to be Python lists aligned index-for-index
    with it.

    Args:
        X: list of features, one per segment.
        y: list of labels, one per segment.
        fold_no: zero-based index of the fold to use as the test set.
        prev_index: first index of this fold (the value returned as the last
            element by the previous call, or 0 for the first fold).
        Kfolds: total number of folds.

    Returns:
        `(X_train, y_train, X_test, y_test, start_index, next_start_index)`
        where the test fold is `[start_index, next_start_index)`.
    """
    test_split = 1/Kfolds

    start_index = prev_index
    end_index = round((fold_no + 1) * test_split * len(X))

    # Keep the index valid when the nominal boundary is the end of the data.
    if end_index==len(X):
        end_index -= 1

    entry = training_data[end_index][0]
    entries = [sample[0] for sample in training_data]

    first_occurence = entries.index(entry)
    last_occurence = len(entries) - 1 - entries[::-1].index(entry)

    del entries
    gc.collect()

    # Snap the inclusive fold end to a protein boundary. If the protein's
    # start is nearer, the fold must end just BEFORE it (first_occurence - 1).
    # Ending AT first_occurence would put one segment of the protein in the
    # test fold and the rest in training. That option is only usable when it
    # leaves at least one sample in the fold.
    closer_to_start = abs(end_index-first_occurence) < abs(end_index-last_occurence)
    if closer_to_start and first_occurence > start_index:
        end_index = first_occurence - 1
    else:
        end_index = last_occurence

    X_test = X[start_index:end_index+1]
    y_test = y[start_index:end_index+1]
    X_train = X[:start_index]
    X_train.extend(X[end_index+1:])
    y_train = y[:start_index]
    y_train.extend(y[end_index+1:])

    return X_train, y_train, X_test, y_test, start_index, end_index + 1

In [17]:
# Tokenizer / padding limits used with 4-grams on 100-character segments.
# 331776 == 24**4; presumably the number of possible 4-grams over a
# 24-letter amino-acid alphabet — TODO confirm.
MAX_WORDS = 331776
# 97 == 100 - 4 + 1: number of 4-grams in a full 100-character segment.
MAX_LEN_NG = 97 #100
# 384 == (100 - 5 + 1) * 4: skip-grams from get_skip_grams(skip=1, n=4) on a
# full 100-character segment.
MAX_LEN_SG = 384 #300
# Alternative settings kept for reference (7962624 == 24**5; presumably for
# 5-grams — TODO confirm):
# MAX_WORDS = 7962624
# MAX_LEN_NG = 96 #100
# MAX_LEN_SG = 475 #300

def tokenization(X_train,X_test,maxlen):
    """Turn whitespace-separated gram texts into fixed-length integer arrays.

    A Keras `Tokenizer` (capped at the global `MAX_WORDS`) is fitted on the
    training texts only and then applied to both sets; each encoded sequence
    is padded at the end (`padding='post'`) to `maxlen`.

    Args:
        X_train: list of training texts.
        X_test: list of test texts.
        maxlen: length every encoded sequence is padded to.

    Returns:
        `(X_train, X_test, vocab_size, tokenizer)` where the first two are the
        padded integer arrays, `vocab_size` is the number of distinct tokens
        seen in training plus one, and `tokenizer` is the fitted tokenizer.
    """
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=MAX_WORDS)
    tokenizer.fit_on_texts(X_train)

    def encode(texts):
        # texts -> integer id lists -> padded 2-D array
        id_lists = tokenizer.texts_to_sequences(texts)
        return tf.keras.preprocessing.sequence.pad_sequences(id_lists, padding='post', maxlen=maxlen)

    train_ids = encode(X_train)
    test_ids = encode(X_test)

    # +1 accounts for index 0, which the tokenizer never assigns to a word.
    vocab_size = len(tokenizer.word_index) + 1

    return train_ids, test_ids, vocab_size, tokenizer

In [18]:
from tensorflow.keras import backend as K

# Weights read by custom_loss: the positive-label (missed term) part of the
# cross-entropy is multiplied by 6, the negative-label part by 1.
false_negative_penalty = 6
false_positive_penalty = 1

def custom_loss(y_true, y_logit):
    """Class-weighted binary cross-entropy, averaged over all elements.

    The positive-label term is scaled by the global `false_negative_penalty`
    and the negative-label term by `false_positive_penalty`.

    Args:
        y_true: multi-hot ground-truth tensor.
        y_logit: predicted values, same shape as `y_true`. Despite the name,
            the formula takes `log(y_logit)` directly, so these must already
            be probabilities in [0, 1] (the model in this notebook ends in a
            sigmoid).

    Returns:
        Scalar tensor with the mean weighted loss.
    """
    y_true = tf.cast(y_true, tf.float32)
    y_logit = tf.cast(y_logit, tf.float32)

    # No `float(...)` around the tensors: after the cast it is a no-op, and it
    # only works when AutoGraph rewrites the builtin. Calling the loss eagerly
    # on a batch would raise.
    first_term = false_negative_penalty * y_true * - K.log(y_logit + K.epsilon())
    second_term = false_positive_penalty * (1 - y_true) * - K.log(1 - y_logit + K.epsilon())

    return K.mean(first_term+second_term)

def precision(y_true, y_pred):
    """Batch-mean of per-sample precision with predictions rounded to 0/1.

    For each sample (axis 1 is summed) this is the number of rounded
    predicted positives that are true, divided by the number of rounded
    predicted positives; `K.epsilon()` keeps the division finite.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    per_sample = K.sum(hits, axis=1) / (K.sum(flagged, axis=1) + K.epsilon())
    return K.mean(per_sample)


def recall(y_true, y_pred):
    """Batch-mean of per-sample recall with predictions rounded to 0/1.

    For each sample (axis 1 is summed) this is the number of true labels
    that were predicted, divided by the number of true labels;
    `K.epsilon()` keeps the division finite.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    per_sample = K.sum(hits, axis=1) / (K.sum(actual, axis=1) + K.epsilon())
    return K.mean(per_sample)

def f1_score(y_true, y_pred):
    """Harmonic mean of the batch `precision` and `recall` metrics.

    `K.epsilon()` is added to the denominator so that a batch with zero
    precision and zero recall yields 0 instead of NaN, matching how the
    `precision` and `recall` metrics guard their own divisions.
    """
    rec = recall(y_true,y_pred)
    prec = precision(y_true,y_pred)
    f1 = 2*prec*rec/(prec+rec+K.epsilon())
    return f1

In [19]:
class attention(tf.keras.layers.Layer):
    """Additive attention over the time axis of a (batch, steps, features) input.

    A score `tanh(x . W + b)` is computed per time step, normalised with a
    softmax over axis 1 and used to weight the inputs.

    Args:
        return_sequences: if True, return the weighted sequence (same shape
            as the input); if False, return its sum over the time axis.
        **kwargs: standard Keras layer arguments (`name`, `dtype`,
            `trainable`, ...), forwarded to the base class.
    """

    def __init__(self, return_sequences=True,**kwargs):
        # Forward kwargs so that name/dtype/trainable are honoured, including
        # when the layer is rebuilt from `get_config()`. Attributes are set
        # after the base constructor has run.
        super(attention,self).__init__(**kwargs)
        self.return_sequences = return_sequences

    def get_config(self):
        """Return the base layer config extended with `return_sequences`."""
        config = super().get_config().copy()
        config.update({
            'return_sequences': self.return_sequences
        })
        return config

    def build(self, input_shape):
        """Create the scoring weights.

        `W` maps each feature vector to one score. `b` holds one bias per time
        step, so the layer is tied to the sequence length seen at build time.
        """
        self.W=self.add_weight(name="att_weight", shape=(input_shape[-1],1),
                               initializer="normal")
        self.b=self.add_weight(name="att_bias", shape=(input_shape[1],1),
                               initializer="zeros")

        super(attention,self).build(input_shape)

    def call(self, x):
        """Apply attention weights to `x` (see class docstring for shapes)."""
        e = K.tanh(K.dot(x,self.W)+self.b)   # one score per time step
        a = K.softmax(e, axis=1)             # normalise across time steps
        output = x*a

        if self.return_sequences:
            return output

        return K.sum(output, axis=1)

In [20]:
# Number of output units = number of GO terms. 677 equals len(go_terms_bp)
# printed earlier for the molecular-function data; update it when the aspect
# (and therefore the vocabulary) changes.
NUM_CLASSES = 677 #For bp (Change according to aspects)

def get_model_ng_sg(vocab_size_ng, vocab_size_sg):
    """Build the two-branch n-gram / skip-gram classifier (uncompiled).

    Each branch is Embedding(32) -> Bidirectional LSTM(70, sequences) ->
    Dropout(0.2) -> attention (summed over time) -> Dropout(0.2) ->
    Dense(NUM_CLASSES, sigmoid). The two per-class probability vectors are
    merged with an element-wise maximum.

    Args:
        vocab_size_ng: embedding input size for the n-gram branch.
        vocab_size_sg: embedding input size for the skip-gram branch.

    Returns:
        A `tf.keras` Model taking `[ngram_ids, skip_gram_ids]` (padded to
        MAX_LEN_NG and MAX_LEN_SG) and returning NUM_CLASSES probabilities.
    """
    #Input layers

    input_ngrams = tf.keras.layers.Input(shape=(MAX_LEN_NG,)) 
    input_skip_grams = tf.keras.layers.Input(shape=(MAX_LEN_SG,)) 

    #embeddings
    embedding_layer_ngrams = tf.keras.layers.Embedding(vocab_size_ng, 32)(input_ngrams)
    embedding_layer_skip_grams = tf.keras.layers.Embedding(vocab_size_sg, 32)(input_skip_grams)

    #BI-LSTMs for each of the inputs
    # Branch 1: n-grams
    sequence_output_1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(70,return_sequences=True))(embedding_layer_ngrams)
    dropout_0 = tf.keras.layers.Dropout(0.2)(sequence_output_1)
    attention_output_1 = attention(return_sequences=False)(dropout_0)
    dropout_1 = tf.keras.layers.Dropout(0.2)(attention_output_1)
    dense_layer_1 = tf.keras.layers.Dense(NUM_CLASSES, activation='sigmoid')(dropout_1)

    # Branch 2: skip-grams (same architecture, separate weights)
    sequence_output_2 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(70, return_sequences=True))(embedding_layer_skip_grams)
    dropout_2_0 = tf.keras.layers.Dropout(0.2)(sequence_output_2)
    attention_output_2 = attention(return_sequences=False)(dropout_2_0)
    dropout_2 = tf.keras.layers.Dropout(0.2)(attention_output_2)
    dense_layer_2 = tf.keras.layers.Dense(NUM_CLASSES, activation='sigmoid')(dropout_2)

    # Per class, keep the more confident of the two branch predictions.
    max_layer = tf.keras.layers.Maximum()([dense_layer_1,dense_layer_2])

    model = tf.keras.models.Model(
        inputs=[
            input_ngrams,
            input_skip_grams
        ], 
        outputs=max_layer)
    
    return model

In [21]:
def evaluate_annotations(real_annots, pred_annots):
    """Protein-centric precision, recall and F-score for predicted term sets.

    Proteins with no real annotations are skipped entirely. Recall is
    averaged over the remaining proteins; precision is averaged only over
    those of them that have at least one prediction.

    Args:
        real_annots: sequence of term collections, one per protein.
        pred_annots: sequence of predicted term collections, aligned with
            `real_annots`.

    Returns:
        `(f, p, r, fps, fns)`: F-score, mean precision, mean recall, and the
        per-protein sets of false positives and false negatives (only for
        proteins that were not skipped). All three scores are 0.0 and both
        lists are empty when no protein has real annotations.
    """
    total = 0
    p = 0.0
    r = 0.0
    p_total= 0
    fps = []
    fns = []
    for i, real in enumerate(real_annots):
        if len(real) == 0:
            continue
        real_set = set(real)
        pred_set = set(pred_annots[i])
        tp = real_set & pred_set
        fp = pred_set - tp
        fn = real_set - tp

        fps.append(fp)
        fns.append(fn)
        tpn = len(tp)
        fpn = len(fp)
        fnn = len(fn)
        total += 1
        r += tpn / (1.0 * (tpn + fnn))
        if len(pred_annots[i]) > 0:
            p_total += 1
            p += tpn / (1.0 * (tpn + fpn))
    # Guard: with nothing to evaluate the averages are defined as 0 instead
    # of raising ZeroDivisionError.
    if total > 0:
        r /= total
    if p_total > 0:
        p /= p_total
    f = 0.0
    if p + r > 0:
        f = 2 * p * r / (p + r)
    return f, p, r, fps, fns

### Model using skip grams and ngrams

In [22]:
# Flatten the training records into model inputs: each segment's grams become
# one space-separated text (the form the tokenizer expects); labels are taken
# from the matching record in `training_data`.
X_train_ng = [' '.join(ngrams) for _, ngrams, _ in training_data_ngrams]
X_train_sg = [' '.join(skip_grams) for _, skip_grams, _ in training_data_skip_grams]
y_train = [labels for _, _, labels in training_data]

# The gram lists are no longer needed once joined; release them.
del training_data_ngrams, training_data_skip_grams

In [23]:
# Sanity check: both feature views should hold one text per training segment.
print(len(X_train_ng),len(X_train_sg))

601353 601353


In [24]:
# Load the held-out test table; per the displayed output it has the same
# columns as the training table.
test_df = pd.read_csv('cafa3_test/mf_df.csv')
test_df

Unnamed: 0,Entry,Sequence,Gene Ontology (molecular function)
0,T100900000026,MAESFKELDPDSSMGKALEMTCAIQNQLARILAEFEMTLERDVLQP...,GO:0003824;GO:0016818;GO:0016462;GO:0003924;GO...
1,T100900000046,MRLCIPQVLLALFLSMLTAPGEGSRRRATQEDTTQPALLRLSDHLL...,GO:0005515;GO:0005488;GO:0003674;GO:0042802
2,T100900000115,MNNLSFSELCCLFCCPPCPGKIASKLAFLPPDPTYTLMCDESGSRW...,GO:0016790;GO:0003824;GO:0016788;GO:0016787;GO...
3,T100900000116,MPEPGPRMNGFSLGELCWLFCCPPCPSRIAAKLAFLPPEPTYTVLA...,GO:0016790;GO:0003824;GO:0016788;GO:0016787;GO...
4,T100900000161,MADDLEQQPQGWLSSWLPTWRPTSMSQLKNVEARILQCLQNKFLAR...,GO:0004620;GO:0003824;GO:0052689;GO:0016788;GO...
...,...,...,...
1132,T992870001087,MIDGKTANEIFDSIRQHIIAGTLRAEDSLPPVRELASELKVNRNTV...,GO:0030170;GO:0003700;GO:1901363;GO:0001071;GO...
1133,T992870001259,MKQGLQLRLSQQLAMTPQLQQAIRLLQLSTLELQQELQQALENNPL...,GO:1901363;GO:1990837;GO:0001071;GO:0003690;GO...
1134,T992870001336,MDYQNNVSEERVAEMIWDAVSEGATLKDVHGIPQDMMDGLYAHAYE...,GO:0005515;GO:0005488;GO:0003674;GO:0042802
1135,T992870001601,MTVDSNTSSGRGNDPEQIDLIELLLQLWRGKMTIIVAVIIAILLAV...,GO:0005515;GO:0005488;GO:0003674;GO:0042802


In [25]:
# Segment the test proteins with the same window/stride as training. Labels
# are encoded against the training vocabulary `go_terms_bp`, so a test GO term
# that never occurred in training makes get_training_data raise ValueError.
testing_data = get_training_data(test_df,gap=30)
print(len(testing_data))

0it [00:00, ?it/s]

16015


In [26]:
# 4-grams for every test segment, as [entry, ngrams, labels].
print('Preparing from scratch...')
testing_data_ngrams = [
    [entry, get_ngrams(segment, n=4), labels]
    for entry, segment, labels in tqdm.tqdm(testing_data)
]

print(len(testing_data_ngrams))

Preparing from scratch...


  0%|          | 0/16015 [00:00<?, ?it/s]

16015


In [27]:
# Skip-grams for every test segment, as [entry, skip_grams, labels].
print('Preparing from scratch...')
testing_data_skip_grams = [
    [entry, get_skip_grams(segment, n=4), labels]
    for entry, segment, labels in tqdm.tqdm(testing_data)
]

print(len(testing_data_skip_grams))

Preparing from scratch...


  0%|          | 0/16015 [00:00<?, ?it/s]

16015


In [28]:
# Flatten the test records into model inputs, mirroring the training cell:
# one space-separated gram text per segment plus the matching label vector.
X_test_ng = [' '.join(ngrams) for _, ngrams, _ in testing_data_ngrams]
X_test_sg = [' '.join(skip_grams) for _, skip_grams, _ in testing_data_skip_grams]
y_test = [labels for _, _, labels in testing_data]

# The gram lists are no longer needed once joined; release them.
del testing_data_ngrams, testing_data_skip_grams

In [29]:
# Sanity check: both feature views should hold one text per test segment.
print(len(X_test_ng),len(X_test_sg))

16015 16015


In [30]:
def compute_metrics(predictions):
    """Print protein-level precision/recall/F-score over a threshold sweep.

    Segment-level predictions are first averaged per protein: consecutive
    rows of the global `testing_data` with the same entry id form one
    protein. For each threshold 0.00, 0.01, ..., 1.00 the averaged scores
    are turned into GO term sets and scored with `evaluate_annotations`.
    The average false-positive count and the scores are printed per
    threshold, followed by the best F-score (Fmax) and its threshold.

    Relies on the notebook globals `testing_data`, `NUM_CLASSES`,
    `go_terms_bp`, `evaluate_annotations` and `tqdm`.

    Args:
        predictions: array-like of shape (len(testing_data), NUM_CLASSES),
            in the same order as `testing_data`.

    Returns:
        None; results are printed.

    Raises:
        ValueError: if `predictions` is empty or its length differs from
            `testing_data` (previously this printed a message and then
            failed later with a ZeroDivisionError).
    """
    if len(predictions) != len(testing_data):
        raise ValueError('Lengths of predictions dont match with test data')
    if len(predictions) == 0:
        raise ValueError('No predictions to evaluate')

    # --- 1. Average the segment predictions of each protein -----------------
    final_predictions = []
    actual_y_test = []
    current_entry = None
    running_sum = None
    counter = 0
    for i in range(len(predictions)):
        entry = testing_data[i][0]
        if i == 0 or entry != current_entry:
            # Close the previous protein before starting a new one.
            if i != 0:
                final_predictions.append(running_sum / counter)
            current_entry = entry
            running_sum = np.zeros(NUM_CLASSES)
            running_sum += np.array(predictions[i])
            counter = 1
            actual_y_test.append(testing_data[i][2])
        else:
            running_sum += np.array(predictions[i])
            counter += 1
    final_predictions.append(running_sum / counter)

    final_predictions = np.array(final_predictions, dtype=float)
    actual_y_test = np.array(actual_y_test, dtype=float)

    # --- 2. Ground-truth term lists do not depend on the threshold ----------
    real_annots = [
        [go_terms_bp[j] for j in np.flatnonzero(truth_row == 1)]
        for truth_row in actual_y_test
    ]

    # --- 3. Threshold sweep for the maximal F-score --------------------------
    fmax = 0.0
    tmax = 0.0
    for t in tqdm.tqdm(range(0, 101)):
        threshold = t / 100.0
        pred_annots = [
            [go_terms_bp[j] for j in np.flatnonzero(score_row >= threshold)]
            for score_row in final_predictions
        ]

        fscore, prec, rec, fps, fns = evaluate_annotations(real_annots, pred_annots)
        # fps is empty when no protein has real annotations.
        avg_fp = sum(map(len, fps)) / len(fps) if fps else 0.0
        print(f'{avg_fp}')
        print(f'Fscore: {fscore}, Precision: {prec}, Recall: {rec}, threshold: {threshold}')
        if fmax < fscore:
            fmax = fscore
            tmax = threshold
    print('\nFinal Results (Maximal F1-score):')
    print(f'Fmax: {fmax:0.3f}, threshold: {tmax}')

In [31]:

# Bookkeeping lists kept from the (currently disabled) k-fold loop.
recs = []
precs = []
f1s = []

# NOTE: this cell rebinds X_train_*/X_test_*/y_train to their tokenized array
# forms, so it is not idempotent. Rebuild the raw texts (cells above) before
# running it a second time.
print('Tokenizing...')
X_train_ng, X_test_ng, vocab_size_ng, tokenizer1 = tokenization(X_train_ng, X_test_ng, MAX_LEN_NG)
X_train_sg, X_test_sg, vocab_size_sg, tokenizer2 = tokenization(X_train_sg, X_test_sg, MAX_LEN_SG)

print('Shuffling...')
# One shared permutation keeps the two feature views and the labels aligned,
# without materialising a Python list of ~600k [ngram, skipgram, label] triples.
shuffle_order = np.random.permutation(len(X_train_ng))
X_train_ng = np.asarray(X_train_ng)[shuffle_order]
X_train_sg = np.asarray(X_train_sg)[shuffle_order]
# float32 labels: custom_loss casts to float32 anyway, and this halves the
# memory of the default int64 label matrix.
y_train = np.asarray(y_train, dtype=np.float32)[shuffle_order]

model = get_model_ng_sg(vocab_size_ng, vocab_size_sg)

model.compile(
    loss=custom_loss,
    optimizer='adam',
    metrics=[
        recall,
        precision,
        f1_score
    ])

print('Training...')
# NOTE(review): validation_split takes a slice of the already-shuffled segments,
# so segments of one protein can land in both the training and the validation
# part. Treat the validation metrics as optimistic.
history = model.fit([X_train_ng,X_train_sg], y_train, batch_size=32, epochs=5,validation_split=0.2)

print('Evaluating model...')
predictions = model.predict([X_test_ng,X_test_sg])

print('Computing Metrics...\n')
compute_metrics(predictions)

Tokenizing...
Shuffling...


2023-01-28 02:48:49.785559: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 30971 MB memory:  -> device: 0, name: Tesla V100-DGXS-32GB, pci bus id: 0000:0e:00.0, compute capability: 7.0


Training...


2023-01-28 02:48:51.848193: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 2605540112 exceeds 10% of free system memory.
2023-01-28 02:48:53.721556: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 2605540112 exceeds 10% of free system memory.


Epoch 1/5


2023-01-28 02:49:02.353749: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Evaluating model...
Computing Metrics...



  0%|          | 0/101 [00:00<?, ?it/s]

668.5637642919964
Fscore: 0.024615668879219788, Precision: 0.01246120488626814, Recall: 1.0, threshold: 0.0
225.4160070360598
Fscore: 0.06534556477447555, Precision: 0.033888963965542584, Recall: 0.9104288082692883, threshold: 0.01
152.74582233948988
Fscore: 0.09004648983611811, Precision: 0.047480384950314654, Recall: 0.8700037329412451, threshold: 0.02
117.09938434476693
Fscore: 0.1111912193388366, Precision: 0.05950725303996494, Recall: 0.8457678011804206, threshold: 0.03
95.70976253298153
Fscore: 0.12914311961653277, Precision: 0.07007257644090344, Recall: 0.8225198836390525, threshold: 0.04
81.2928759894459
Fscore: 0.14512880330824232, Precision: 0.07976917967892214, Recall: 0.8034116767949911, threshold: 0.05
70.56200527704486
Fscore: 0.1602159665736338, Precision: 0.08919247567523773, Recall: 0.7865083755047079, threshold: 0.06
62.061565523306946
Fscore: 0.17424528834636133, Precision: 0.09825959325495508, Recall: 0.768669724257101, threshold: 0.07
55.38434476693052
Fscore: 0.18

2.6341248900615657
Fscore: 0.4693207736755468, Precision: 0.5982325047731251, Recall: 0.38611726274882685, threshold: 0.68
2.5277044854881265
Fscore: 0.4676043878899123, Precision: 0.6065363000765491, Recall: 0.3804575844612619, threshold: 0.69
2.4291996481970095
Fscore: 0.46619090241681777, Precision: 0.6143289963968609, Recall: 0.3756156728935009, threshold: 0.7
2.3368513632365877
Fscore: 0.4652175951788699, Precision: 0.6227923362113877, Recall: 0.37127905750633683, threshold: 0.71
2.241864555848725
Fscore: 0.464046469293065, Precision: 0.6337373823043302, Recall: 0.3660358944687584, threshold: 0.72
2.143359718557608
Fscore: 0.4623531307925529, Precision: 0.6434865278212422, Recall: 0.3607941086496599, threshold: 0.73
2.040457343887423
Fscore: 0.4610833389209206, Precision: 0.6536185454027839, Recall: 0.35616768302888285, threshold: 0.74
1.9560246262093228
Fscore: 0.4591066020946532, Precision: 0.661979654006768, Recall: 0.3514115496510304, threshold: 0.75
1.8583992963940192
Fscore:

In [32]:
# Continue training the SAME model object for 5 more epochs (the weights carry
# over from the previous fit call), then re-score the test set.
history_new = model.fit([X_train_ng,X_train_sg], y_train, batch_size=32, epochs=5,validation_split=0.2)
    
print('Evaluating model...')
predictions = model.predict([X_test_ng,X_test_sg])

print('Computing Metrics...\n')
compute_metrics(predictions)

2023-01-28 03:48:34.586118: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 2605540112 exceeds 10% of free system memory.
2023-01-28 03:48:36.494192: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 2605540112 exceeds 10% of free system memory.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Evaluating model...
Computing Metrics...



  0%|          | 0/101 [00:00<?, ?it/s]

668.5637642919964
Fscore: 0.024615668879219788, Precision: 0.01246120488626814, Recall: 1.0, threshold: 0.0
120.97625329815304
Fscore: 0.1091243385683682, Precision: 0.05840529624124293, Recall: 0.8291996845835232, threshold: 0.01
83.4547053649956
Fscore: 0.14236342123225051, Precision: 0.07827614086370059, Recall: 0.7853808428113424, threshold: 0.02
65.43447669305189
Fscore: 0.16905481177610446, Precision: 0.09510288397984894, Recall: 0.7601358544276727, threshold: 0.03
54.41424802110818
Fscore: 0.19161454621671975, Precision: 0.11002568784258548, Recall: 0.7413809008324621, threshold: 0.04
46.59982409850484
Fscore: 0.21099040072867786, Precision: 0.12347411869158281, Recall: 0.7245111554592357, threshold: 0.05
40.766051011433596
Fscore: 0.22926533579074915, Precision: 0.13663167554509978, Recall: 0.7119618182843641, threshold: 0.06
36.143359718557605
Fscore: 0.24684495602279477, Precision: 0.14984532356267577, Recall: 0.699935255520967, threshold: 0.07
32.338610378188214
Fscore: 0.26

2.0202286719437117
Fscore: 0.4738901127156319, Precision: 0.6853764468050044, Recall: 0.36214360607669543, threshold: 0.68
1.9648197009674582
Fscore: 0.47310130547793044, Precision: 0.6914230767883475, Recall: 0.3595658287592987, threshold: 0.69
1.9058927000879506
Fscore: 0.4707201172076977, Precision: 0.6988840687768317, Recall: 0.3548670441542511, threshold: 0.7
1.8425681618293754
Fscore: 0.4680229477383675, Precision: 0.7046929302114834, Recall: 0.35035633760777746, threshold: 0.71
1.781882145998241
Fscore: 0.466127332817407, Precision: 0.7126224831564322, Recall: 0.34633167588509206, threshold: 0.72
1.7264731750219877
Fscore: 0.46427285725086026, Precision: 0.7206785434026295, Recall: 0.34243873392319313, threshold: 0.73
1.6772207563764292
Fscore: 0.46188507417592206, Precision: 0.725490640488275, Recall: 0.3387873656696864, threshold: 0.74
1.6121372031662269
Fscore: 0.459976348234244, Precision: 0.7332301796142503, Recall: 0.33509577574007404, threshold: 0.75
1.5664028144239226
Fs

In [78]:

# recs = []
# precs = []
# f1s = []

# for i in range(1):
    
# #     print('Splitting into train-test...')
# #     X_train_ng, y_train, X_test_ng, y_test, start_index, prev_index1 = train_test_split(X_ng,y,i,prev_index,Kfolds)
# #     X_train_sg, _, X_test_sg, _, _, _= train_test_split(X_sg,y,i,prev_index,Kfolds)
    
# #     prev_index = prev_index1
    
#     print('Tokenizing...')
#     X_train_ng, X_test_ng, vocab_size_ng, tokenizer1 = tokenization(X_train_ng, X_test_ng, MAX_LEN_NG)  
#     X_train_sg, X_test_sg, vocab_size_sg, tokenizer2 = tokenization(X_train_sg, X_test_sg, MAX_LEN_SG)
    #print('Shuffling...')   
#     shuffled = [[X_train_ng[i],X_train_sg[i],y_train[i]] for i in range(len(X_train_ng))]
#     np.random.shuffle(shuffled)

#     X_train_ng = [shuffled[i][0] for i in range(len(shuffled))]
#     X_train_sg = [shuffled[i][1] for i in range(len(shuffled))]
#     y_train = [shuffled[i][2] for i in range(len(shuffled))]
#     X_train_ng = np.array(X_train_ng)
#     X_train_sg = np.array(X_train_sg)
#     y_train = np.array(y_train)
    
    
#     model = get_model_ng_sg(vocab_size_ng, vocab_size_sg)
    
#     model.compile(
#         loss=custom_loss, 
#         optimizer='adam', 
#         metrics=[
#             recall,
#             precision,
#             f1_score
#         ])

#     print('Training...')
#     history = model.fit([X_train_ng,X_train_sg], y_train, batch_size=32, epochs=5,validation_split=0.2)
    
#     print('Evaluating model...')
#     predictions = model.predict([X_test_ng,X_test_sg])
    
#     print('Computing Metrics...\n')
#     compute_metrics(predictions)
    
# #     recs.append(rec.numpy())
# #     precs.append(prec.numpy())
# #     f1s.append(f1.numpy())

# # print('Recall:',sum(recs)/len(recs))
# # print('Precision:',sum(precs)/len(precs))
# # print('F1-Score:',sum(f1s)/len(f1s))

Tokenizing...
Shuffling...
Training...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Evaluating model...
Computing Metrics...



  0%|          | 0/101 [00:00<?, ?it/s]

668.5637642919964
Fscore: 0.024615668879219788, Precision: 0.01246120488626814, Recall: 1.0, threshold: 0.0
107.41864555848724
Fscore: 0.13687349413773942, Precision: 0.07462921486932615, Recall: 0.8247730737629253, threshold: 0.01
71.59454705364996
Fscore: 0.18064335161697084, Precision: 0.10210483756357953, Recall: 0.782665996493573, threshold: 0.02
55.26912928759894
Fscore: 0.21104794976249536, Precision: 0.12275616485312657, Recall: 0.751716321466902, threshold: 0.03
45.54881266490765
Fscore: 0.2349369007137557, Precision: 0.1400761551075522, Recall: 0.7278283688636286, threshold: 0.04
38.86719437115215
Fscore: 0.25675292162712715, Precision: 0.15661089758323382, Recall: 0.7120791153223296, threshold: 0.05
33.792436235708
Fscore: 0.27687539591177557, Precision: 0.17276851951498973, Recall: 0.6966823115411549, threshold: 0.06
29.99208443271768
Fscore: 0.2943159381832155, Precision: 0.18732349019740804, Recall: 0.6863136247626433, threshold: 0.07
26.941952506596305
Fscore: 0.30860446

2.059806508355321
Fscore: 0.48048433757140113, Precision: 0.6407321002983046, Recall: 0.38435640268134247, threshold: 0.68
1.990325417766051
Fscore: 0.4793977078000418, Precision: 0.6476645620074115, Recall: 0.3805330941785182, threshold: 0.69
1.940193491644679
Fscore: 0.4775482265899588, Precision: 0.6528041585454426, Recall: 0.37647686659471685, threshold: 0.7
1.8891820580474934
Fscore: 0.47584883835958847, Precision: 0.6576660339250494, Recall: 0.3727884099146618, threshold: 0.71
1.832014072119613
Fscore: 0.47502083820877206, Precision: 0.6630712159376867, Recall: 0.37006774018027055, threshold: 0.72
1.7616534740545295
Fscore: 0.4753962373005117, Precision: 0.6719220023613672, Recall: 0.3678162391161046, threshold: 0.73
1.7115215479331574
Fscore: 0.4735895404904162, Precision: 0.67747545179912, Recall: 0.3640337565737277, threshold: 0.74
1.660510114335972
Fscore: 0.47185026113151024, Precision: 0.6836756974300958, Recall: 0.3602369044676129, threshold: 0.75
1.6077396657871592
Fscore

In [32]:
# Sanity check: one prediction row per test segment (compare with the
# len(testing_data) printed earlier).
print(len(predictions))

16015


In [33]:
# Another 5 epochs on the same model object; this overwrites `history_new`.
history_new = model.fit([X_train_ng,X_train_sg], y_train, batch_size=32, epochs=5,validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [34]:
# Re-score the test set with the current weights; compute_metrics prints the
# per-threshold scores and the Fmax summary.
predictions = model.predict([X_test_ng,X_test_sg])
compute_metrics(predictions)



  0%|          | 0/101 [00:00<?, ?it/s]

668.5637642919964
Fscore: 0.024615668879219788, Precision: 0.01246120488626814, Recall: 1.0, threshold: 0.0
90.44591029023746
Fscore: 0.1392536272026431, Precision: 0.07631675779612021, Recall: 0.7942805672219185, threshold: 0.01
65.31310466138962
Fscore: 0.17544502977411952, Precision: 0.09917696267784197, Recall: 0.7595348762217585, threshold: 0.02
52.81618293755497
Fscore: 0.2011857537908022, Precision: 0.1165257519314751, Recall: 0.7356902374899461, threshold: 0.03
44.80650835532102
Fscore: 0.22256442488567377, Precision: 0.13174328637403748, Recall: 0.7165158794445, threshold: 0.04
38.96569920844327
Fscore: 0.2423893713041758, Precision: 0.14641005030194082, Recall: 0.7037026935382267, threshold: 0.05
34.49868073878628
Fscore: 0.2597601246625415, Precision: 0.15987416780985075, Recall: 0.6922849189528987, threshold: 0.06
30.940193491644678
Fscore: 0.2749873937602949, Precision: 0.1723172431569839, Recall: 0.6803596224663817, threshold: 0.07
28.00967458223395
Fscore: 0.290227652271

1.9876868953386104
Fscore: 0.4722753113377861, Precision: 0.6823265229483843, Recall: 0.36110938068029325, threshold: 0.68
1.9173262972735268
Fscore: 0.4700013554830123, Precision: 0.6886117639310421, Recall: 0.35674664079297347, threshold: 0.69
1.8583992963940192
Fscore: 0.46849360933790296, Precision: 0.6956263991232512, Recall: 0.3531761334844438, threshold: 0.7
1.7862796833773087
Fscore: 0.4663698489572931, Precision: 0.7039767467966808, Recall: 0.3486822768676181, threshold: 0.71
1.7361477572559367
Fscore: 0.4636321686171908, Precision: 0.7092581328889709, Recall: 0.3443715182171845, threshold: 0.72
1.6798592788038698
Fscore: 0.46175369501859037, Precision: 0.7159725858167026, Recall: 0.34076055603623195, threshold: 0.73
1.6191732629727353
Fscore: 0.4594602319395971, Precision: 0.724732630635796, Recall: 0.33634760703266237, threshold: 0.74
1.5549692172383465
Fscore: 0.45681192118783703, Precision: 0.7332308386293749, Recall: 0.3317473074741461, threshold: 0.75
1.5118733509234827


In [79]:
# Persist the trained model to an HDF5 file.
# NOTE(review): the model contains the custom `attention` layer and was compiled
# with `custom_loss` and custom metrics, so reloading presumably needs
# `custom_objects=` (or `compile=False`) — confirm before relying on it.
# NOTE(review): the '0.494' in the file name is presumably the Fmax obtained;
# the retained output above is truncated and does not show it.
model.save('mf_model_0.494.h5')

In [80]:
# Store the raw segment-level predictions; np.save appends '.npy' to the name.
np.save('mf_predictions',predictions)