In [1]:
from __future__ import annotations

In [2]:
# from IPython.core.display import display, HTML
# display(HTML('<style>.container { width:100% !important; }</style>'))

In [2]:
import datetime
import numpy as np
import tensorflow as tf
import tqdm.notebook as tqdm
import tensorflow_addons as tfa
import clang
from utils.cpp_tokens.tokens import tokens
from gensim.models import Word2Vec

In [3]:
def make_dataset(X, y, batch_size):
    X = tf.data.Dataset.from_tensor_slices(X)
    y = tf.data.Dataset.from_tensor_slices(y)
    
    return tf.data.Dataset.zip((X, y)).shuffle(len(X)).batch(batch_size)

In [4]:
def prepare_data(path, max_length=1000, labels=None, min_count=100):
    X_data, y_data, index = [], [], []
    for i in tqdm.tqdm(open(path)):
        ind, _, tokens, tags = i.split('\t')
        X_data.append(tokens.split())
        y_data.append(tags.split())
        index.append(ind)
        
    index = np.array(index)
        
    X_data = tf.keras.preprocessing.sequence.pad_sequences(X_data, maxlen=max_length, truncating='post', padding='post', value=0)
    
    if labels is None:
        labels, counts = np.unique([j for i in y_data for j in i], return_counts=True)
        labels = labels[counts >= min_count]
        
    label_to_id = {label: i for i, label in enumerate(labels)}
    
    for i in range(len(y_data)):
        value = [0] * len(labels)
        for j in y_data[i]:
            if j in label_to_id:
                value[label_to_id[j]] = 1
        y_data[i] = value

    y_data = np.array(y_data)
    
    return X_data, y_data, index, labels, label_to_id

In [5]:
X_data, y_data, index, labels, label_to_id = prepare_data('../data/cpp/train.txt')

0it [00:00, ?it/s]

In [6]:
counts = np.zeros(len(labels), dtype=int)
for i in y_data:
    counts += i

In [7]:
for i, j in zip(counts, labels):
    print(f'{i}:\t{j}')

2005:	*special
1946:	binary_search
3520:	bitmasks
7966:	brute_force
3253:	combinatorics
14233:	constructive_algorithms
6101:	data_structures
2705:	dfs_and_similar
738:	divide_and_conquer
8919:	dp
629:	dsu
362:	flows
918:	games
793:	geometry
338:	graph_matchings
1974:	graphs
15495:	greedy
1349:	hashing
7347:	implementation
455:	interactive
14264:	math
536:	matrices
7416:	number_theory
492:	probabilities
732:	shortest_paths
3841:	sortings
2017:	strings
3344:	trees
2092:	two_pointers


In [8]:
X_test_data, y_test_data, _, _, _ = prepare_data('../data/cpp/test.txt', labels=labels)

0it [00:00, ?it/s]

In [9]:
counts = np.zeros(len(labels), dtype=int)
for i in y_test_data:
    counts += i
for i, j in zip(counts, labels):
    print(f'{i}:\t{j}')

0:	*special
1029:	binary_search
347:	bitmasks
1290:	brute_force
381:	combinatorics
2949:	constructive_algorithms
1306:	data_structures
315:	dfs_and_similar
201:	divide_and_conquer
1508:	dp
157:	dsu
149:	flows
350:	games
197:	geometry
91:	graph_matchings
555:	graphs
4243:	greedy
52:	hashing
2436:	implementation
162:	interactive
3756:	math
4:	matrices
574:	number_theory
90:	probabilities
234:	shortest_paths
1768:	sortings
478:	strings
379:	trees
557:	two_pointers


In [None]:
#X_train, X_test, y_train, y_test = train_test_split(X_data, y_data)

In [None]:
# train_dataset = make_dataset(X_train, y_train, 32)
# test_dataset = make_dataset(X_test, y_test, 32)

In [17]:
train_dataset = make_dataset(X_data, y_data, 32)
test_dataset = make_dataset(X_test_data, y_test_data, 32)

In [None]:
# w2v_model = Word2Vec.load('w2v.model')

In [11]:
emb_size = 128
# weights = np.zeros((len(tokens) + 1, emb_size))

# for i, token in enumerate(tokens):
#     try:
#         weights[i + 1] = w2v_model.wv[token]
#     except:
#         pass

embedding = tf.keras.layers.Embedding(len(tokens) + 1, emb_size, name='token_embedding', mask_zero=True)
# embedding.build((None, ))
# embedding.set_weights([weights])
# embedding.trainable = False

In [12]:
inputs = tf.keras.layers.Input((None, ), dtype=tf.int32, name='token_input')
embedded = embedding(inputs)
dropout = tf.keras.layers.Dropout(0.2, name='embedding_dropout')(embedded)

n_layers = 4
kernels = [3, 5, 7]
layers = []

for k in kernels:
    x = dropout
    n = x.shape[-1]
    for i in range(n_layers):
        x = tf.keras.layers.Conv1D(n, k, activation=tf.keras.activations.swish, padding='same', name=f'conv1d_{k}_{n}')(x)
        x = tf.keras.layers.BatchNormalization(name=f'batch_norm_{k}_{n}')(x)
        n *= 2
        
    x = tf.keras.layers.GlobalMaxPooling1D(name=f'max_pool_{k}')(x)
    layers.append(x)

x = tf.keras.layers.Concatenate(axis=-1, name='pool_concatenate')(layers)
x = tf.keras.layers.Dropout(0.2, name='concatenate_dropout')(x)
x = tf.keras.layers.Dense(units=512, activation=tf.keras.activations.swish, name=f'dense_1')(x)
x = tf.keras.layers.Dense(units=len(labels), activation='sigmoid', name='prediction')(x)
model = tf.keras.models.Model(inputs=inputs, outputs=x, name='multilabel_model')

In [13]:
model.summary()

Model: "multilabel_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
token_input (InputLayer)        [(None, None)]       0                                            
__________________________________________________________________________________________________
token_embedding (Embedding)     (None, None, 128)    68608       token_input[0][0]                
__________________________________________________________________________________________________
embedding_dropout (Dropout)     (None, None, 128)    0           token_embedding[0][0]            
__________________________________________________________________________________________________
conv1d_3_128 (Conv1D)           (None, None, 128)    49280       embedding_dropout[0][0]          
___________________________________________________________________________________

In [16]:
tf.keras.utils.plot_model(model)

('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')


In [14]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', tfa.metrics.F1Score(len(labels))])

In [15]:
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(f'weights/{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}/{{epoch}}'),
]

In [18]:
model.fit(train_dataset, epochs=30, validation_data=test_dataset, callbacks=callbacks)



Epoch 1/30
 126/1233 [==>...........................] - ETA: 6:52 - loss: 0.5029 - accuracy: 0.1317 - f1_score: 0.0518 

KeyboardInterrupt: 

In [None]:
model.fit(train_dataset, epochs=40, validation_data=test_dataset, callbacks=callbacks, initial_epoch=30)

In [None]:
model.fit(train_dataset, epochs=50, validation_data=test_dataset, callbacks=callbacks, initial_epoch=40)

In [None]:
model.compile(optimizer=tf.keras.optimizers.Adam(1e-4), loss='binary_crossentropy', metrics=['accuracy', tfa.metrics.F1Score(len(labels))])

In [None]:
model.fit(train_dataset, epochs=80, validation_data=test_dataset, callbacks=callbacks, initial_epoch=50)

In [None]:
model.compile(optimizer=tf.keras.optimizers.Adam(1e-5), loss='binary_crossentropy', metrics=['accuracy', tfa.metrics.F1Score(len(labels))])

In [None]:
model.fit(train_dataset, epochs=90, validation_data=test_dataset, callbacks=callbacks, initial_epoch=80)

In [None]:
model.evaluate(train_dataset)

In [None]:
model.evaluate(test_dataset)