In [102]:
# imports
import gzip

import nltk
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed

# Fetch NLTK's 'punkt_tab' resource; the captured output shows this is a no-op
# when the package is already present locally.
# NOTE(review): presumably this is the tokenizer data word_tokenize needs in
# preprocess() — confirm against the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\eryoo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
def load_data(file_path):
    """Load a gzip-compressed, tab-separated training file into a DataFrame.

    Each line is stripped of surrounding whitespace and cut at its first tab:
    the part before the tab is the label, everything after it (further tabs
    included) is the text. A line that has no tab left after stripping is
    reported on stdout and skipped.

    Args:
        file_path: Path to the gzip file; it is read as UTF-8 text.

    Returns:
        pandas.DataFrame with columns 'label' and 'text', one row per
        well-formed line, in file order.
    """
    labels, texts = [], []

    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            label, tab, text = line.strip().partition('\t')
            if not tab:
                # No separator survived stripping, so there is no label/text pair.
                print(f"Skipping malformed line {line_num}: {line}")
                continue
            labels.append(label)
            texts.append(text)

    return pd.DataFrame({'label': labels, 'text': texts})

def read_tsv(file_path):
    """Read a UTF-8 text file into a one-column DataFrame, one row per line.

    Lines are taken verbatim apart from the trailing newline: tabs and quote
    characters are kept and empty lines become empty strings.

    Args:
        file_path: Path to the text file.

    Returns:
        pandas.DataFrame with a single 'text' column, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        rows = [raw.rstrip('\n') for raw in handle]
    return pd.DataFrame(rows, columns=['text'])

In [85]:
# Training data: gzipped "label<TAB>text" lines parsed by load_data.
train_df = load_data('train/train.tsv.gz')

# Dev inputs are read verbatim, one document per line, exactly like the test
# inputs. pd.read_csv would apply CSV rules to free text (quote handling,
# dropping blank lines, turning strings such as "NA" into NaN, splitting on
# embedded tabs), which can change documents or shift them against the labels.
dev_df = read_tsv('dev-0/in.tsv')
dev_labels = pd.read_csv('dev-0/expected.tsv', sep='\t', header=None, names=['label'])
test_df = read_tsv('test-A/in.tsv')

# Fail fast if dev documents and gold labels are not aligned one-to-one.
assert len(dev_df) == len(dev_labels), (
    f"dev-0 mismatch: {len(dev_df)} input lines vs {len(dev_labels)} labels"
)

In [86]:
def preprocess(text):
    """Tokenize a document into lower-cased, purely alphabetic tokens.

    Args:
        text: Raw document string. Non-string values (e.g. None, or the float
            NaN pandas uses for missing cells) are treated as an empty
            document.

    Returns:
        list[str]: the tokens of word_tokenize(text.lower()) for which
        str.isalpha() is true; an empty list for non-string input.
    """
    if not isinstance(text, str):
        # Missing cells have no .lower(); yielding no tokens keeps the row (and
        # its alignment with the labels) instead of raising AttributeError.
        return []
    tokens = word_tokenize(text.lower())
    # Drop punctuation, numbers and mixed alphanumeric tokens.
    return [word for word in tokens if word.isalpha()]

In [87]:
# Attach a 'tokens' column (output of preprocess for each document) to every split.
for frame in (train_df, dev_df, test_df):
    frame['tokens'] = frame['text'].apply(preprocess)

In [89]:
# Polish fastText embeddings. Download
#   https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.vec.gz
# and unpack it into the current working directory
# (warning: the unpacked file is about 4.5 GB).
fasttext_path = 'cc.pl.300.vec'
# binary=False: the .vec file is parsed as text in word2vec format.
word2vec = KeyedVectors.load_word2vec_format(fasttext_path, binary=False)
# Embedding dimensionality; reused below as the width of the zero fallback
# vector and as the classifier's input shape.
vector_size = word2vec.vector_size

def vectorize(tokens, model, vector_size):
    """Average the embeddings of the tokens that the model knows.

    Args:
        tokens: Iterable of token strings.
        model: Mapping-like embedding lookup supporting `token in model` and
            `model[token]`.
        vector_size: Length of the zero vector returned when no token is
            found in the model.

    Returns:
        numpy.ndarray: element-wise mean of the found vectors, or
        np.zeros(vector_size) when none of the tokens is in the model.
    """
    known = [model[token] for token in tokens if token in model]
    # Out-of-vocabulary-only (or empty) documents fall back to the zero vector.
    return np.mean(known, axis=0) if known else np.zeros(vector_size)

# One averaged embedding per document for each split, in train/dev/test order.
train_vectors, dev_vectors, test_vectors = (
    np.array([vectorize(doc, word2vec, vector_size) for doc in split['tokens']])
    for split in (train_df, dev_df, test_df)
)

In [92]:
# Cast the dev labels to str so they compare equal to the string labels that
# load_data produced for the training set.
dev_labels['label'] = dev_labels['label'].astype(str)

# Learn the label -> integer mapping from the training labels only, then apply
# that same mapping to both splits.
label_encoder = LabelEncoder().fit(train_df['label'])
y_train = label_encoder.transform(train_df['label'])
y_dev = label_encoder.transform(dev_labels['label'])

In [None]:
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling repeat when this cell is re-run.
# NOTE(review): some GPU kernels may still be nondeterministic.
set_random_seed(42)

# Feed-forward binary classifier over the averaged document embeddings.
model = Sequential([
    Input(shape=(vector_size,)),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),  # single output in [0, 1] for the label encoded as 1
])

model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# The dev split drives early stopping here. Keep the History object for later
# inspection instead of echoing its repr as the cell output.
history_dev = model.fit(train_vectors, y_train,
                        epochs=13,
                        batch_size=32,
                        validation_data=(dev_vectors, y_dev),
                        callbacks=[early_stop])

Epoch 1/13
[1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.8967 - loss: 0.2528 - val_accuracy: 0.9600 - val_loss: 0.1099
Epoch 2/13
[1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9496 - loss: 0.1393 - val_accuracy: 0.9356 - val_loss: 0.1694
Epoch 3/13
[1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9527 - loss: 0.1324 - val_accuracy: 0.9666 - val_loss: 0.0923
Epoch 4/13
[1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9561 - loss: 0.1240 - val_accuracy: 0.9666 - val_loss: 0.0929
Epoch 5/13
[1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9571 - loss: 0.1191 - val_accuracy: 0.9685 - val_loss: 0.0867
Epoch 6/13
[1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9584 - loss: 0.1150 - val_accuracy: 0.9710 - val_loss: 0.0805
Epoch 7/13
[1m3

<keras.src.callbacks.history.History at 0x276b0c02990>

In [101]:
# Score whichever model is currently bound to `model` on the dev split;
# evaluate returns the loss followed by the compiled metric (accuracy).
# NOTE(review): the training cell above passes (dev_vectors, y_dev) as
# validation_data with early stopping and restore_best_weights, so this
# figure comes from data that already influenced model selection.
loss, accuracy = model.evaluate(dev_vectors, y_dev)
print(f'Dev Accuracy: {accuracy:.4f}')

[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9746 - loss: 0.0765   
Dev Accuracy: 0.9741


In [103]:
# Hold out 20% of the training vectors; the fixed random_state makes the split repeatable.
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
    train_vectors, y_train, test_size=0.2, random_state=42)

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling repeat when this cell is re-run.
# NOTE(review): some GPU kernels may still be nondeterministic.
set_random_seed(42)

# Same architecture as the dev-validated model, rebuilt from scratch so no
# weights carry over from the earlier fit.
model = Sequential([
    Input(shape=(vector_size,)),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),  # single output in [0, 1] for the label encoded as 1
])

model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# The held-out 20% drives early stopping. Keep the History object for later
# inspection instead of echoing its repr as the cell output.
history_split = model.fit(X_train_split, y_train_split,
                          epochs=13,
                          batch_size=32,
                          validation_data=(X_test_split, y_test_split),
                          callbacks=[early_stop])

Epoch 1/13
[1m2454/2454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.8899 - loss: 0.2650 - val_accuracy: 0.9608 - val_loss: 0.1051
Epoch 2/13
[1m2454/2454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9484 - loss: 0.1432 - val_accuracy: 0.9653 - val_loss: 0.0971
Epoch 3/13
[1m2454/2454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9539 - loss: 0.1297 - val_accuracy: 0.9658 - val_loss: 0.0906
Epoch 4/13
[1m2454/2454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9539 - loss: 0.1262 - val_accuracy: 0.9615 - val_loss: 0.0961
Epoch 5/13
[1m2454/2454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9572 - loss: 0.1202 - val_accuracy: 0.9698 - val_loss: 0.0835
Epoch 6/13
[1m2454/2454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9605 - loss: 0.1108 - val_accuracy: 0.9689 - val_loss: 0.0828
Epoch 7/13
[1m2

<keras.src.callbacks.history.History at 0x276a09a3990>

In [104]:
# Score whichever model is currently bound to `model` on the 20% hold-out
# from train_test_split; evaluate returns the loss followed by the compiled
# metric (accuracy).
# NOTE(review): the training cell above passes this same split as
# validation_data with early stopping and restore_best_weights, so the
# "Test" figure comes from data that already influenced model selection.
loss, accuracy = model.evaluate(X_test_split, y_test_split)
print(f'Test Accuracy: {accuracy:.4f}')

[1m614/614[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 889us/step - accuracy: 0.9734 - loss: 0.0701
Test Accuracy: 0.9730
