In [75]:
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, Input
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import Model

## Read Data

In [85]:
# Read the training sentences, their labels and the unlabeled test sentences.
# NOTE: allow_pickle=True lets np.load unpickle object arrays, which can run
# arbitrary code -- only use it on .npy files that come from a trusted source.
train_sentences, train_labels, test_sentences = (
    np.load(npy_path, allow_pickle=True)
    for npy_path in ('FS_train.npy', 'FS_labels.npy', 'FS_test.npy')
)

## Preprocessing Data

In [86]:
# Fit a tokenizer with default settings on the training sentences only, so
# the vocabulary and word counts come from the training data alone.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_sentences)
# Word -> integer id mapping learned by the tokenizer.
# NOTE(review): a later cell rebinds `word_index` to a different mapping --
# confirm which of the two mappings downstream cells are meant to use.
word_index = tokenizer.word_index

In [87]:
# Build the vocabulary: keep every word the tokenizer counted 5 or more times.
word_counts = tokenizer.word_counts
vocab = set()
vocab.update(token for token, occurrences in word_counts.items() if occurrences >= 5)

In [65]:
# Load pre-trained embeddings
# The file is read as space-separated text: one token followed by
# EMBEDDING_DIM floats per line. fastText .vec files start with a
# "<n_words> <dim>" header line, which must not be stored as a word vector.
EMBEDDING_DIM = 300

embedding_index = {}
with open('crawl-300d-2M.vec', encoding='utf-8') as f:
    for line in f:
        # Split from the right so a token that itself contains whitespace
        # stays intact in values[0] instead of corrupting the float columns.
        values = line.rstrip().rsplit(' ', EMBEDDING_DIM)
        if len(values) != EMBEDDING_DIM + 1:
            # Header line or malformed row: not a (token, vector) record.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

if not embedding_index:
    # Fail loudly rather than silently training on an all-zero embedding matrix.
    raise ValueError("No %d-dimensional vectors could be read from 'crawl-300d-2M.vec'" % EMBEDDING_DIM)

# Intersecting the vocabulary with embeddings and replacing unknown words
# Membership tests against the dict avoid materialising a second set holding
# every embedding key.
vocab = {token for token in vocab if token in embedding_index}
# Sort before numbering: iteration order of a set of strings changes between
# interpreter sessions, which would make ids (and therefore any saved model
# weights) irreproducible after a kernel restart.
word_index = {word: i for i, word in enumerate(sorted(vocab), 1)}
# Unknown words share id 0, which is also the value pad_sequences pads with
# by default.
word_index['UNKNOWN_PROXY'] = 0

In [88]:
# Tokenizing and padding
# `tokenizer.texts_to_sequences` emits ids from the tokenizer's own
# vocabulary, whereas the embedding matrix below is indexed by `word_index`.
# Translate every tokenizer id into the `word_index` id space so each token
# looks up its own embedding row; words absent from `word_index` fall back
# to the UNKNOWN_PROXY id (0 if that entry does not exist).
unknown_id = word_index.get('UNKNOWN_PROXY', 0)
token_id_remap = {
    token_id: word_index.get(word, unknown_id)
    for word, token_id in tokenizer.word_index.items()
}


def encode_and_pad(sentences, maxlen=41):
    """Encode sentences as `word_index` ids and pad/truncate them.

    Args:
        sentences: iterable of raw text strings.
        maxlen: fixed output length; shorter sequences are padded at the end,
            longer ones are cut at the end.

    Returns:
        A tuple `(sequences, padded)`: the list of per-sentence id lists and
        the 2-D integer array produced by `pad_sequences`.
    """
    # NOTE: the tokenizer was created without an oov_token, so words it never
    # saw during fitting are presumably dropped here rather than mapped to
    # unknown_id -- verify if test-set OOV handling matters.
    sequences = [
        [token_id_remap[token_id] for token_id in sequence]
        for sequence in tokenizer.texts_to_sequences(sentences)
    ]
    padded = pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='post')
    return sequences, padded


train_sequences, train_padded = encode_and_pad(train_sentences)
test_sequences, test_padded = encode_and_pad(test_sentences)

# Creating embedding matrix
# Row i holds the pre-trained vector of the word whose `word_index` id is i;
# rows without a usable vector stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    # Shape guard: skip entries that are not 300-d (e.g. a header line of the
    # vector file parsed as a word), which would otherwise broadcast a single
    # number across the whole row.
    if embedding_vector is not None and np.shape(embedding_vector) == (300,):
        embedding_matrix[i] = embedding_vector

# Fail fast if any encoded id would index past the embedding matrix.
highest_id = max(train_padded.max(initial=0), test_padded.max(initial=0))
assert highest_id < embedding_matrix.shape[0], (
    "token id %d is out of range for an embedding matrix with %d rows"
    % (highest_id, embedding_matrix.shape[0])
)

## Create Model

In [120]:
# Model definition
# Frozen pre-trained embeddings -> bidirectional LSTM. Its output sequence is
# summarised three ways (a second LSTM, average pooling, max pooling), the
# summaries are concatenated and passed through a stack of ReLU dense layers
# ending in a single sigmoid unit.
input_layer = Input(shape=(41,))
embed = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=300,
    weights=[embedding_matrix],
    trainable=False,
)(input_layer)
bi_lstm = Bidirectional(LSTM(units=512, return_sequences=True))(embed)
dropout_1 = Dropout(0)(bi_lstm)  # rate 0: the layer currently drops nothing

lstm = LSTM(units=256)(dropout_1)
global_avg_pool = GlobalAveragePooling1D()(dropout_1)
global_max_pool = GlobalMaxPooling1D()(dropout_1)
concat = concatenate([lstm, global_avg_pool, global_max_pool])

# Dense tower: each dense layer is followed by a (rate 0) dropout layer,
# created in the same order as listed here.
hidden = concat
for units in (1024, 512, 256, 128):
    hidden = Dense(units, activation='relu')(hidden)
    hidden = Dropout(0)(hidden)

output = Dense(1, activation='sigmoid')(hidden)
model = Model(inputs=input_layer, outputs=output)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

## Train Model

In [None]:
# Model training
# Checkpoint to best_model.h5 only when validation accuracy improves.
mc = ModelCheckpoint(
    filepath='best_model.h5',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)
# Stop once validation loss has gone 5 epochs without improving.
es = EarlyStopping(monitor='val_loss', mode='min', patience=5, verbose=1)
model.fit(
    x=train_padded,
    y=train_labels,
    epochs=20,
    validation_split=0.2,
    callbacks=[es, mc],
)

In [122]:
# Restore the checkpointed weights, score the test set and turn the sigmoid
# outputs into 0/1 labels (1 when the output is strictly above 0.5).
model.load_weights('best_model.h5')
test_predictions = (model.predict(test_padded) > 0.5).astype(int)



## Extract Output

In [123]:
# Write the predicted labels to submission.csv as comma-delimited integers.
np.savetxt(fname='submission.csv', X=test_predictions, delimiter=',', fmt='%d')

print("Predictions saved to submission.csv.")

Predictions saved to submission.csv.
