In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

full_data = pd.read_csv("./data/kaggle/input/dga-domain-detection-challenge-i/train.csv.gz")
full_train, full_test = train_test_split(full_data, test_size=0.25, random_state=42)




In [8]:
max(len(str(d).split('.')[0]) for d in full_data["domain"].values)

63

In [57]:
import re
from math import log2

vowels = set("aeiou")
consonants = set("bcdfghjklmnpqrstvwxz")

forbidden_bigrams = {
    'qf', 'qj', 'qk', 'qv', 'qz', 'qy',
    'fj', 'vj', 'dj', 'wj', 'tj', 'hj', 'bj', 'cj', 'gj', 'kj', 'lj', 'mj', 'nj', 'pj', 'rj', 'sj', 'xj', 'zj',
    'bv', 'dv', 'jv', 'mv', 'pv', 'sv', 'tv', 'vv', 'cv', 'fv', 'gv', 'hv', 'kv', 'lv', 'nv', 'qv', 'rv', 'wv', 'xv',
    'zv',
    'kk', 'vv', 'ww', 'xx', 'jj',
    'ckq', 'jw', 'qj', 'vf', 'vk', 'vp', 'vw', 'vz', 'wk', 'wq', 'wu', 'wz', 'xq', 'yw', 'yz',
    'fx', 'gq', 'gx', 'hq', 'hx', 'jq', 'jx', 'jz', 'kq', 'kx', 'pq', 'px', 'qa', 'qe', 'qg', 'qh', 'qi', 'qm', 'qn',
    'qo', 'qr', 'qs', 'qt', 'qu', 'qx',
    'qz', 'sx', 'vx', 'wx', 'xj', 'xr', 'xz', 'zq', 'zx'
}

common_bigrams = {
    'co', 'my', 'in', 're', 'go', 'to', 'on', 'we', 'hi', 'st',
    'te', 'ma', 'no', 'ne', 'ha', 'he', 'ho', 'do', 'sh', 'me',
    'er', 'ly', 'ng', 'ed', 'es', 'al', 'or', 'ty', 'ra', 'li',
    'an', 'ar', 'en', 'el', 'ch', 'ic', 'ck', 'rd', 'ss', 'tt',
    'fy', 'io', 'ti', 'ai', 'ro', 'mo', 've', 'ea', 'oo', 'ou',
    'ei', 'ie', 'ba', 'be', 'ca', 'ce', 'da', 'de', 'fa', 'fe',
    'ga', 'ge', 'at', 'et', 'it'
}



def shannon_entropy(s):
    probs = [s.count(c) / len(s) for c in set(s)]
    return -sum(p * log2(p) for p in probs)


def get_ngrams(s, n=2):
    return [s[i:i + n] for i in range(len(s) - n + 1)]


def extract_features(domain):
    domain = str(domain).split('.')[0]

    entropy = shannon_entropy(domain)
    length = len(domain)

    digit_count = sum(c.isdigit() for c in domain)
    special_count = sum(not c.isalnum() for c in domain)

    vowel_count = sum(c in vowels for c in domain)
    consonant_count = sum(c in consonants for c in domain)

    digit_sequences = re.findall(r'\d+', domain)
    vowel_sequences = re.findall(f'[{vowels}]+', domain)
    consonant_sequences = re.findall(f'[{consonants}]+', domain)

    max_digits = max([len(seq) for seq in digit_sequences]) if digit_sequences else 0
    max_vowels = max([len(seq) for seq in vowel_sequences]) if vowel_sequences else 0
    max_consonants = max([len(seq) for seq in consonant_sequences]) if consonant_sequences else 0

    bigrams = get_ngrams(domain, 2)
    bigram_entropy = shannon_entropy(bigrams)
    uniq_bigram_count = len(set(bigrams))

    forbidden_bigrams_count = sum(b in forbidden_bigrams for b in bigrams)
    common_bigrams_count = sum(b in common_bigrams for b in bigrams)


    return {
        "length": length,
        "entropy": entropy,

        "digit_ratio": digit_count / length,
        "special_ratio": special_count / length,
        "vowel_ratio": vowel_count / length,
        "consonant_ratio": consonant_count / length,

        "max_digits": max_digits,
        "max_vowels": max_vowels,
        "max_consonants": max_consonants,

        "bigram_entropy": bigram_entropy,
        "uniq_bigram_count": uniq_bigram_count,

        "forbidden_bigrams_count": forbidden_bigrams_count,
        "common_bigrams_count": common_bigrams_count,
    }

In [58]:
from tqdm import tqdm

train = full_train.sample(frac=0.1, random_state=42)
test = full_test.sample(frac=0.1, random_state=42)

X_train = pd.DataFrame([extract_features(str(d)) for d in tqdm(train["domain"], desc="Extracting train features")])
y_train = train["label"].values

X_test = pd.DataFrame([extract_features(str(d)) for d in tqdm(test["domain"], desc="Extracting test features")])
y_test = test["label"].values

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)


Extracting train features: 100%|██████████| 6644921/6644921 [01:53<00:00, 58759.48it/s]
Extracting test features: 100%|██████████| 2214974/2214974 [00:37<00:00, 59254.81it/s]


In [59]:
from sklearn.metrics import fbeta_score
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train_scaled, y_train)

y_test_pred = lr_model.predict(X_test_scaled)

lr_score = fbeta_score(y_test, y_test_pred, beta=0.5)
print(f"LogisticRegression score: {lr_score}")


LogisticRegression score: 0.8703194026684563


In [60]:
from sklearn.ensemble import RandomForestClassifier


rf_model = RandomForestClassifier(
    class_weight='balanced', random_state=42, n_jobs=-1,
    n_estimators=150,
    max_depth=20,
    max_features='sqrt',
    min_samples_split=5,
    min_samples_leaf=1,
)

rf_model.fit(X_train_scaled, y_train)

y_test_pred = rf_model.predict(X_test_scaled)

rf_score = fbeta_score(y_test, y_test_pred, beta=0.5)
print(f"RandomForestClassifier score: {rf_score}")


RandomForestClassifier score: 0.9097261764635975


In [None]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier


rf_model = RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=8)
param_grid = {
    'n_estimators': [150],
    'max_depth': [10, 20],
    'max_features': ['sqrt', 'log2', 0.3, 0.5],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}
scorer = make_scorer(fbeta_score, beta=0.5, greater_is_better=True)

search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    scoring=scorer,
    n_jobs=2,
    verbose=10
)

search.fit(X_train_scaled, y_train)

print("Best params:", search.best_params_)
print("Best score:", search.best_score_)

final_model = search.best_estimator_

y_test_pred = final_model.predict(X_test_scaled)

rf_score = fbeta_score(y_test, y_test_pred, beta=0.5)
print(f"RandomForestClassifier score: {rf_score}")


In [47]:
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)
print(feature_importance)

                    feature  importance
11  forbidden_bigrams_count    0.192985
8            max_consonants    0.178362
12     common_bigrams_count    0.134335
9            bigram_entropy    0.081980
4               vowel_ratio    0.076596
1                   entropy    0.065009
0                    length    0.060156
5           consonant_ratio    0.056084
10        uniq_bigram_count    0.054880
3             special_ratio    0.041534
7                max_vowels    0.029067
2               digit_ratio    0.016650
6                max_digits    0.012362


In [61]:
data_test = pd.read_csv("./data/kaggle/input/dga-domain-detection-challenge-i/test.csv.gz")

test_features = pd.DataFrame(
    [extract_features(str(d)) for d in tqdm(data_test["domain"], desc="Extracting test features")])
test_features_scaled = scaler.fit_transform(test_features)



Extracting test features: 100%|██████████| 7594197/7594197 [02:09<00:00, 58850.91it/s]


In [62]:
data_test["label"] = rf_model.predict(test_features_scaled)
data_test[["id", "label"]].to_csv("submission_rf3.csv", index=False)
