In [89]:
import keras
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM
from keras import ops
import tldextract
from sklearn.metrics import precision_recall_fscore_support as score

import pandas as pd
from sklearn.metrics import confusion_matrix, roc_auc_score
import numpy as np

In [107]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
eval = pd.read_csv('val.csv')
train = train.drop('gozi', axis=1)
train['target'] = 1 - pd.factorize(train['dga'])[0]
train = train.drop('dga', axis=1)
eval.head()

Unnamed: 0,domain,is_dga
0,r8s3-zzdxp.ru,1
1,mysolarfocus.com,0
2,m60oax.ru,1
3,kbgckwrax.nl,1
4,mostbetru32new.ru,0


In [108]:
train['tld'] = [tldextract.extract(d).domain for d in train['mortiscontrastatim.com']]
test['tld'] = [tldextract.extract(d).domain for d in test['domain']]
eval['tld'] = [tldextract.extract(d).domain for d in eval['domain']]

X, y = train['tld'], train['target']
X_eval, y_eval = eval['tld'], eval['is_dga']
X_test = test['tld']


(10000,)


In [109]:
validChars = { x: idx + 1 for idx, x in enumerate(set(''.join(X))) }
maxFeatures = len(validChars) + 1
maxlen = np.max([len(x) for x in X])
dfs = [X, X_eval, X_test]
X = [[validChars[y] for y in x] for x in X]
X = pad_sequences(X, maxlen=maxlen)
X_eval = [[validChars[y] for y in x] for x in X_eval]
X_eval = pad_sequences(X_eval, maxlen=maxlen)
X_test = [[validChars[y] for y in x] for x in X_test]
X_test = pad_sequences(X_test, maxlen=maxlen)


(10000, 63)


In [74]:
model = Sequential()
model.add(Embedding(maxFeatures, 128, input_length=maxlen))
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop')



In [75]:
model.fit(X, y, batch_size=16, epochs=15)
model.save('model.keras')


Epoch 1/15
[1m42182/42182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m776s[0m 18ms/step - loss: 0.3039
Epoch 2/15
[1m42182/42182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m764s[0m 18ms/step - loss: 0.1719
Epoch 3/15
[1m42182/42182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m735s[0m 17ms/step - loss: 0.1586
Epoch 4/15
[1m42182/42182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m745s[0m 18ms/step - loss: 0.1597
Epoch 5/15
[1m42182/42182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m738s[0m 17ms/step - loss: 0.1603
Epoch 6/15
[1m42182/42182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m739s[0m 18ms/step - loss: 0.1650
Epoch 7/15
[1m42182/42182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m742s[0m 18ms/step - loss: 0.1701
Epoch 8/15
[1m42182/42182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m741s[0m 18ms/step - loss: 0.1753
Epoch 9/15
[1m42182/42182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m741s[0m 18ms/step - loss: 0.1815
Epoch 10/15
[1m42182/42182

In [118]:
predictions = model.predict(X_eval)
print(predictions)
tn, fp, fn, tp = confusion_matrix(y_eval, predictions > 0.5).ravel().astype(int)
precision, recall, fscore, support = score(y_eval, predictions > 0.5)
print(tp, tn, fp, fn)
print((tp + tn) / (tp + tn + fp + fn))

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step
[[1.0680774e-16]
 [5.9416667e-03]
 [2.9623476e-10]
 ...
 [8.0209005e-01]
 [9.9998552e-01]
 [1.8005538e-05]]
3129 4758 224 1889
0.7887


In [120]:
f = open('validation.txt','w')
print('True positive: ' + str(tp), file=f)
print('False positive: ' + str(fp), file=f) 
print('False negative: ' + str(fn), file=f) 
print('True negative: ' + str(tn), file=f) 
print('Accuracy: ' + str((tp + tn) / (tp + tn + fp + fn)), file=f) 
print('Precision: ' + str(precision), file=f) 
print('Recall: ' + str(recall), file=f) 
print('F1: ' + str(fscore), file=f) 

predictions = model.predict(X_test)

predict = pd.DataFrame({'domain': test['domain'], 'is_dga': predictions.ravel() > 0.5})
predict['is_dga'] = predict['is_dga'].astype(int)
predict.to_csv('predict.csv', encoding='utf-8', index=False, header=True)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step


In [121]:
model.summary()

In [125]:
loaded_model = keras.saving.load_model("model.keras")

print(model.get_config() == loaded_model.get_config())
model.get_config()

False


{'name': 'sequential_3',
 'trainable': True,
 'dtype': {'module': 'keras',
  'class_name': 'DTypePolicy',
  'config': {'name': 'float32'},
  'registered_name': None},
 'layers': [{'module': 'keras.layers',
   'class_name': 'InputLayer',
   'config': {'batch_shape': (None, 63),
    'dtype': 'float32',
    'sparse': False,
    'name': 'input_layer_2'},
   'registered_name': None},
  {'module': 'keras.layers',
   'class_name': 'Embedding',
   'config': {'name': 'embedding_3',
    'trainable': True,
    'dtype': {'module': 'keras',
     'class_name': 'DTypePolicy',
     'config': {'name': 'float32'},
     'registered_name': None},
    'input_dim': 39,
    'output_dim': 128,
    'embeddings_initializer': {'module': 'keras.initializers',
     'class_name': 'RandomUniform',
     'config': {'minval': -0.05, 'maxval': 0.05, 'seed': None},
     'registered_name': None},
    'embeddings_regularizer': None,
    'activity_regularizer': None,
    'embeddings_constraint': None,
    'mask_zero': False

In [127]:
loaded_model.get_config()

{'name': 'sequential_3',
 'trainable': True,
 'dtype': {'module': 'keras',
  'class_name': 'DTypePolicy',
  'config': {'name': 'float32'},
  'registered_name': None},
 'layers': [{'module': 'keras.layers',
   'class_name': 'InputLayer',
   'config': {'batch_shape': (None, 63),
    'dtype': 'float32',
    'sparse': False,
    'name': 'input_layer_2'},
   'registered_name': None},
  {'module': 'keras.layers',
   'class_name': 'Embedding',
   'config': {'name': 'embedding_3',
    'trainable': True,
    'dtype': {'module': 'keras',
     'class_name': 'DTypePolicy',
     'config': {'name': 'float32'},
     'registered_name': None},
    'input_dim': 39,
    'output_dim': 128,
    'embeddings_initializer': {'module': 'keras.initializers',
     'class_name': 'RandomUniform',
     'config': {'minval': -0.05, 'maxval': 0.05, 'seed': None},
     'registered_name': None},
    'embeddings_regularizer': None,
    'activity_regularizer': None,
    'embeddings_constraint': None,
    'mask_zero': False

In [130]:
print(type(validChars))
import pickle 

with open('enumeration_dictionary.pkl', 'wb') as f:
    pickle.dump(validChars, f)


<class 'dict'>
