In [None]:
import pandas as pd
from keras.src.layers import BatchNormalization
from sklearn.model_selection import train_test_split

# Read the gzip-compressed training CSV into a DataFrame.
data = pd.read_csv(
    "./data/kaggle/input/dga-domain-detection-challenge-i/train.csv.gz"
)
print(f'Full data shape: {data.shape}')

# Continue with a reproducible 30% random subsample. The name `data` is
# rebound here, so the full frame is no longer referenced afterwards.
data = data.sample(random_state=42, frac=0.3)
print(f'Data shape: {data.shape}')

def preprocess_domain(domain):
    """Return the part of ``domain`` that precedes its first dot.

    The argument is converted with ``str()`` first, so non-string values
    (for example a float NaN) are handled through their string form. A value
    that contains no dot is returned whole.
    """
    first_label, _dot, _rest = str(domain).partition('.')
    return first_label


# Replace every training domain by its first dot-separated label.
data["domain"] = data["domain"].map(preprocess_domain)


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_LEN = 64  # maximum domain length (in characters) fed to the model
MAX_WORDS = 64  # upper bound on the character vocabulary size

# Character-level tokenizer with a dedicated out-of-vocabulary token.
domain_texts = data['domain'].values
tokenizer = Tokenizer(
    num_words=MAX_WORDS,
    char_level=True,
    oov_token='<OOV>',
)
tokenizer.fit_on_texts(domain_texts)

# Embedding input size: one more than the number of known tokens
# (presumably to account for the padding id 0), capped at MAX_WORDS.
known_tokens = len(tokenizer.word_index)
vocab_size = min(MAX_WORDS, known_tokens + 1)

# Encode every domain as a list of integer character ids.
sequences = tokenizer.texts_to_sequences(domain_texts)

# Count out-of-vocabulary tokens; <OOV> has index 1.
total_tokens = sum(len(encoded) for encoded in sequences)
oov_tokens = sum(encoded.count(1) for encoded in sequences)

# Share of OOV tokens among all tokens, as a percentage.
oov_percentage = 100 * (oov_tokens / total_tokens)
print(f"vocab_size={MAX_WORDS}: {total_tokens} tokens, {oov_percentage:.1f}% OOV")


# Labels, taken from the same frame (and row order) as the encoded domains.
y = data['label'].values

# Pad or cut every sequence at its end so that all rows have length MAX_LEN.
X = pad_sequences(
    sequences,
    maxlen=MAX_LEN,
    truncating='post',
    padding='post',
)



In [None]:
# Stage 1: hold out half of the rows as the test set, stratified by label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.5,
    stratify=y,
    random_state=42,
)

# Stage 2: carve 15% of the remaining rows out as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.15,
    stratify=y_train,
    random_state=42,
)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("X_val shape:", X_val.shape)


In [None]:
import tensorflow as tf
from keras.src.backend import epsilon

# F-beta metric written as a plain function (no internal tf.Variable state).
def simple_fbeta_score(beta=0.5):
    """Build an F-beta metric function for a two-class softmax classifier.

    Args:
        beta: weight of recall relative to precision; values below 1 favour
            precision. Defaults to 0.5.

    Returns:
        A function ``fbeta(y_true, y_pred)`` usable in ``model.compile``'s
        ``metrics`` list. ``y_true`` holds integer class ids and ``y_pred``
        per-class scores; class 1 is treated as the positive class. The
        function's ``__name__`` is ``f_beta_<beta>_score`` with the decimal
        point removed (``f_beta_05_score`` for ``beta=0.5``), which is the
        key under which the metric appears in the training history.

    Note:
        The value is computed from the tensors of a single call, i.e. per
        batch when Keras invokes it during fit/evaluate, so the reported
        number is an average of per-batch scores rather than the F-beta of
        the whole dataset.
    """
    beta_squared = beta ** 2

    def fbeta(y_true, y_pred):
        # Predicted class = index of the largest score along the last axis.
        # Both tensors are flattened to 1-D: depending on the Keras version,
        # y_true may arrive as (batch, 1) while the argmax result is
        # (batch,), and comparing those shapes would broadcast to
        # (batch, batch) and corrupt the counts below.
        y_pred_class = tf.reshape(tf.argmax(y_pred, axis=-1), [-1])
        y_true_class = tf.reshape(tf.cast(y_true, tf.int64), [-1])

        # Confusion-matrix counts for the positive class (label 1).
        tp = tf.reduce_sum(tf.cast((y_true_class == 1) & (y_pred_class == 1), tf.float32))
        fp = tf.reduce_sum(tf.cast((y_true_class == 0) & (y_pred_class == 1), tf.float32))
        fn = tf.reduce_sum(tf.cast((y_true_class == 1) & (y_pred_class == 0), tf.float32))

        # epsilon() keeps the divisions defined when a count is zero.
        precision = tp / (tp + fp + epsilon())
        recall = tp / (tp + fn + epsilon())

        # F-beta = (1 + beta^2) * P * R / (beta^2 * P + R)
        fbeta_value = (1 + beta_squared) * (precision * recall) / (
            beta_squared * precision + recall + epsilon())

        return fbeta_value

    # Name shown in logs and used as the history key. The dot is stripped so
    # that beta=0.5 yields 'f_beta_05_score', the key the plotting code reads.
    beta_tag = str(beta).replace('.', '')
    fbeta.__name__ = f'f_beta_{beta_tag}_score'
    return fbeta


In [None]:
from keras import layers, models


# Variant 1: one convolutional block followed by two stacked bidirectional
# LSTMs and a dense classification head with a 2-way softmax output.
cnn_bilstm_stack = [
    layers.Input(shape=(MAX_LEN,)),
    layers.Embedding(input_dim=vocab_size, output_dim=128),

    # Convolutional block.
    layers.Conv1D(filters=128, kernel_size=3, activation='relu', padding='same'),
    layers.BatchNormalization(),
    layers.MaxPooling1D(pool_size=2),

    # Recurrent block: the first LSTM returns the whole sequence so that the
    # second one can consume it.
    layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
    layers.Dropout(0.2),
    layers.Bidirectional(layers.LSTM(32)),
    layers.Dropout(0.2),

    # Dense head.
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(2, activation='softmax'),
]
model = models.Sequential(cnn_bilstm_stack)

# Integer labels + softmax output -> sparse categorical cross-entropy.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=[simple_fbeta_score()],
)

model.summary()


In [None]:
from keras import layers, models
from keras.src.legacy.layers import ThresholdedReLU

# Variant 2 (functional API): two parallel convolutions with kernel sizes 2
# and 3 over a shared embedding, concatenated and fed to a dense head.
inputs = layers.Input(shape=(MAX_LEN,))

embedded = layers.Embedding(input_dim=vocab_size, output_dim=128)(inputs)

# Branch with kernel size 2.
bigram_branch = layers.Conv1D(
    filters=256, kernel_size=2, activation='relu', padding='same'
)(embedded)
bigram_branch = ThresholdedReLU(1e-5)(bigram_branch)

# Branch with kernel size 3.
trigram_branch = layers.Conv1D(
    filters=256, kernel_size=3, activation='relu', padding='same'
)(embedded)
trigram_branch = ThresholdedReLU(1e-5)(trigram_branch)

# Merge both branches and flatten for the dense layers.
merged = layers.Concatenate()([bigram_branch, trigram_branch])
merged = layers.Flatten()(merged)

hidden = layers.Dense(64, activation='relu')(merged)
hidden = ThresholdedReLU(1e-5)(hidden)
hidden = layers.Dropout(0.5)(hidden)
outputs = layers.Dense(2, activation='softmax')(hidden)

model = models.Model(inputs=inputs, outputs=outputs)

# Integer labels + softmax output -> sparse categorical cross-entropy.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=[simple_fbeta_score()],
)

model.summary()

In [None]:
from keras.src.legacy.layers import ThresholdedReLU
from keras import layers, models


# Variant 3: two Conv1D -> ThresholdedReLU -> MaxPooling1D blocks followed by
# a small dense head.
two_conv_stack = [
    layers.Input(shape=(MAX_LEN,)),
    layers.Embedding(input_dim=vocab_size, output_dim=128),

    # First convolutional block (kernel size 3).
    layers.Conv1D(filters=128, kernel_size=3, padding="same", strides=1),
    ThresholdedReLU(1e-6),
    layers.MaxPooling1D(pool_size=2, padding="same"),

    # Second convolutional block (kernel size 2).
    layers.Conv1D(filters=128, kernel_size=2, padding="same", strides=1),
    ThresholdedReLU(1e-6),
    layers.MaxPooling1D(pool_size=2, padding="same"),

    # Dense head.
    layers.Flatten(),
    layers.Dense(64),
    ThresholdedReLU(1e-6),
    layers.Dropout(0.5),
    layers.Dense(2, activation='softmax'),
]
model = models.Sequential(two_conv_stack)

# Integer labels + softmax output -> sparse categorical cross-entropy.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=[simple_fbeta_score()],
)

model.summary()

In [None]:
from keras.src.legacy.layers import ThresholdedReLU
from keras import layers, models

# Embedding input size, derived from the fitted tokenizer and capped at
# MAX_WORDS.
vocab_size = min(MAX_WORDS, len(tokenizer.word_index) + 1)

# Variant 4: a single wide convolution (kernel size 6) and two dense layers.
single_conv_stack = [
    layers.Input(shape=(MAX_LEN,)),
    layers.Embedding(input_dim=vocab_size, output_dim=128),
    layers.Conv1D(filters=128, kernel_size=6, activation='relu', padding='same'),
    layers.Flatten(),

    # Dense head.
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(2, activation='softmax'),
]
model = models.Sequential(single_conv_stack)

# Integer labels + softmax output -> sparse categorical cross-entropy.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=[simple_fbeta_score()],
)

model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Stop when the validation loss has not improved for 10 epochs and roll the
# model back to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss', patience=10, restore_best_weights=True, verbose=1,
)

# Halve the learning rate after 3 epochs without validation-loss improvement,
# never going below 1e-4.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss', patience=3, factor=0.5, min_lr=0.0001, verbose=1,
)

# %%
# Training hyper-parameters.
EPOCHS = 40
BATCH_SIZE = 256

# Fit the model currently bound to `model`, validating on the held-out
# validation split after every epoch.
print("Начинаем обучение модели...")
fit_options = dict(
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stopping, reduce_lr],
    verbose=1,
)
history = model.fit(X_train, y_train, **fit_options)


In [None]:
import matplotlib.pyplot as plt

def plot_training_history(history, metric_name=None):
    """Plot the training metric and the loss per epoch, side by side.

    Args:
        history: object exposing a ``history`` dict that maps log names to
            per-epoch value lists (e.g. what ``model.fit`` returns).
        metric_name: key of the training metric drawn in the left panel.
            When ``None`` the key is resolved from the logs: the first key
            starting with ``'f_beta'`` is used (so both ``f_beta_05_score``
            and ``f_beta_0.5_score`` work); if there is none, the first key
            that is not a loss, validation or learning-rate entry is used.

    Raises:
        KeyError: if the requested/derived metric or ``'loss'`` is missing
            from the logs.
    """
    logs = history.history

    if metric_name is None:
        # Training-side metrics only: skip validation copies, the loss and
        # the learning-rate entries that LR-scheduling callbacks may log.
        train_keys = [
            key for key in logs
            if not key.startswith('val_')
            and key not in ('loss', 'lr', 'learning_rate')
        ]
        fbeta_keys = [key for key in train_keys if key.startswith('f_beta')]
        candidates = fbeta_keys or train_keys
        if not candidates:
            raise KeyError(
                f'no metric found in history; available keys: {sorted(logs)}')
        metric_name = candidates[0]
    elif metric_name not in logs:
        raise KeyError(
            f'{metric_name!r} not found in history; available keys: {sorted(logs)}')

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: the metric. Validation curves are drawn only when present
    # (they are absent if fit() was called without validation data).
    ax1.plot(logs[metric_name], label=f'{metric_name} на обучении', linewidth=2)
    val_metric_name = f'val_{metric_name}'
    if val_metric_name in logs:
        ax1.plot(logs[val_metric_name], label=f'{metric_name} на валидации', linewidth=2)
    ax1.set_title(f'Метрика модели: {metric_name}', fontsize=14)
    ax1.set_xlabel('Эпоха', fontsize=12)
    ax1.set_ylabel(metric_name, fontsize=12)
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Right panel: the loss.
    ax2.plot(logs['loss'], label='Потери на обучении', linewidth=2)
    if 'val_loss' in logs:
        ax2.plot(logs['val_loss'], label='Потери на валидации', linewidth=2)
    ax2.set_title('Потери модели', fontsize=14)
    ax2.set_xlabel('Эпоха', fontsize=12)
    ax2.set_ylabel('Потери', fontsize=12)
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# Visualise the per-epoch curves recorded in `history` by model.fit.
plot_training_history(history)


In [None]:
# evaluate() returns the loss followed by the compiled metrics. The only
# compiled metric is the function built by simple_fbeta_score() (beta=0.5),
# so the second value is an F-beta score, not accuracy. NOTE(review): that
# function is computed per batch, so the reported value is presumably an
# average of per-batch scores rather than the F-beta of the whole test set.
test_loss, test_fbeta = model.evaluate(X_test, y_test)
test_accuracy = test_fbeta  # previous variable name, kept as an alias
print(f"F-beta (beta=0.5) на тестовой выборке: {test_fbeta:.4f}")
print(f"Потери на тестовой выборке: {test_loss:.4f}")

In [None]:
import pickle

# Persist the trained network in the native Keras format.
# NOTE(review): the model was compiled with a custom metric function, so
# loading it back may require `custom_objects` or `compile=False` - confirm.
model.save('dga_detector_cnn_lstm.keras')

# Store the fitted tokenizer next to the model so that inference can reuse
# exactly the same character-to-id mapping.
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)


In [None]:
# Read the gzip-compressed competition test set.
data_test = pd.read_csv(
    "./data/kaggle/input/dga-domain-detection-challenge-i/test.csv.gz"
)
print(f'Test shape: {data_test.shape}')

# Apply the same domain preprocessing that was used for the training data.
data_test["domain"] = data_test["domain"].map(preprocess_domain)


In [None]:
import numpy as np

# Encode the test domains with the tokenizer fitted on the training data.
test_domains = data_test['domain'].values
test_sequences = tokenizer.texts_to_sequences(test_domains)
print(f'test_sequences len: {len(test_sequences)}')

# Same padding/truncation settings as for the training matrix.
X_predict = pad_sequences(
    test_sequences,
    maxlen=MAX_LEN,
    truncating='post',
    padding='post',
)
print(f'X_predict len: {len(X_predict)}')

# Per-class scores from the softmax output -> index of the best class.
y_pred_proba = model.predict(X_predict)
y_pred = np.argmax(y_pred_proba, axis=1)
print(f'y_pred len: {len(y_pred)}')



In [None]:
# Attach the predicted class to each test row and export the two columns
# of the submission file (no index column).
data_test["label"] = y_pred
submission = data_test[["id", "label"]]
submission.to_csv("submission_cnnlstm2.csv", index=False)
