In [1]:
import numpy as np
import pandas as pd


ModuleNotFoundError: No module named 'numpy'

In [None]:
import re
from math import log2

# Lowercase ASCII letter classes consulted by the feature extractor.
# Note that "y" is a member of neither set.
VOWELS = {"a", "e", "i", "o", "u"}
CONSONANTS = set(
    "bcdfgh" "jklmn" "pqrst" "vwxz"
)

def shannon_entropy(s):
    """Return the Shannon entropy, in bits, of the characters in ``s``.

    Every distinct character contributes ``-p * log2(p)``, where ``p`` is
    that character's share of ``len(s)``. An empty ``s`` has no distinct
    characters, so the result is ``0``.
    """
    total = len(s)
    weighted_logs = 0
    for symbol in set(s):
        share = s.count(symbol) / total
        weighted_logs += share * log2(share)
    return -weighted_logs

def _longest_match(pattern, text):
    """Return the length of the longest match of ``pattern`` in ``text`` (0 if none)."""
    return max(map(len, re.findall(pattern, text)), default=0)


def extract_features(domain):
    """Compute lexical features for the first dot-separated label of a domain.

    Parameters
    ----------
    domain : object
        Domain name; it is converted with ``str()`` and only the part before
        the first ``'.'`` is analysed.

    Returns
    -------
    list
        Eleven values, in this order: length, Shannon entropy, number of
        distinct characters, letter count, digit count, non-alphanumeric
        count, vowel count, consonant count, longest digit run, longest vowel
        run, longest consonant run.

    Notes
    -----
    Vowel/consonant membership is tested against the module-level ``VOWELS``
    and ``CONSONANTS`` sets without case folding, so a character only counts
    if it appears in those sets exactly as written.
    """
    domain = str(domain).split('.')[0]

    entropy = shannon_entropy(domain)
    length = len(domain)

    unique_count = len(set(domain))

    letter_count = sum(c.isalpha() for c in domain)
    digit_count = sum(c.isdigit() for c in domain)
    special_count = sum(not c.isalnum() for c in domain)

    vowel_count = sum(c in VOWELS for c in domain)
    # Count against CONSONANTS; counting against VOWELS here would simply
    # duplicate vowel_count.
    consonant_count = sum(c in CONSONANTS for c in domain)

    # Build each character class from the set's *members*. Formatting the set
    # object itself would embed its repr ("{'a', 'e', ...}") and silently add
    # quotes, commas, spaces and braces to the class.
    vowel_run = '[' + re.escape(''.join(sorted(VOWELS))) + ']+'
    consonant_run = '[' + re.escape(''.join(sorted(CONSONANTS))) + ']+'

    max_digits = _longest_match(r'\d+', domain)
    max_vowels = _longest_match(vowel_run, domain)
    max_consonants = _longest_match(consonant_run, domain)

    return [
        length,
        entropy,

        unique_count,
        letter_count,
        digit_count,
        special_count,
        vowel_count,
        consonant_count,

        max_digits,
        max_vowels,
        max_consonants,
    ]

In [None]:
from sklearn.model_selection import train_test_split

# Read the labelled training set from the local copy of the competition data.
data = pd.read_csv(
    "./data/kaggle/input/dga-domain-detection-challenge-i/train.csv"
)

# Cell output: relative frequency of each label value.
data['label'].value_counts(normalize=True)


In [None]:
# Hold out a quarter of the labelled rows for evaluation; the seed is fixed
# so the split is the same on every run.
train, test = train_test_split(
    data,
    random_state=42,
    test_size=0.25,
)

In [None]:
from tqdm import tqdm

# Training split: one feature row per domain, plus the matching labels.
X_train = np.array(
    [
        extract_features(str(name))
        for name in tqdm(train["domain"], desc="Extracting train features")
    ]
)
y_train = train["label"].values

# Held-out split, processed the same way.
X_test = np.array(
    [
        extract_features(str(name))
        for name in tqdm(test["domain"], desc="Extracting test features")
    ]
)
y_test = test["label"].values

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the per-feature mean/variance from the training features only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
# Apply those same training statistics to the held-out features. Calling
# fit_transform here would re-fit the scaler on the test split, putting the
# two matrices on different scales and leaking test statistics into the
# preprocessing (and leaving `scaler` fitted on test data for later cells).
X_test_scaled = scaler.transform(X_test)


In [None]:
from sklearn.metrics import fbeta_score
from sklearn.linear_model import LogisticRegression

# Baseline classifier trained on the standardized training features.
lr_model = LogisticRegression(random_state=42, max_iter=1000).fit(
    X_train_scaled,
    y_train,
)

# Score the held-out predictions with F-beta at beta=0.5.
y_test_pred = lr_model.predict(X_test_scaled)
lr_score = fbeta_score(y_true=y_test, y_pred=y_test_pred, beta=0.5)

print(f"LogisticRegression score: {lr_score}")


In [None]:
#from sklearn.ensemble import RandomForestClassifier
from cuml.ensemble import RandomForestClassifier

# X_train_scaled is a NumPy array (scaler output) and has neither a
# `.sample` method nor a "label" column, so subsample by row position and
# take the matching labels from y_train. The sample size is capped at the
# number of available rows so the draw without replacement cannot fail.
RF_SAMPLE_SIZE = 3_000_000
sample_rng = np.random.default_rng(42)
sample_idx = sample_rng.choice(
    len(X_train_scaled),
    size=min(RF_SAMPLE_SIZE, len(X_train_scaled)),
    replace=False,
)
sample_X = X_train_scaled[sample_idx]
sample_y = y_train[sample_idx].astype(int)


# NOTE(review): `class_weight` and `n_jobs` are scikit-learn-only arguments;
# cuML's RandomForestClassifier raises TypeError when they are set, so they
# are not passed here. This means the forest is trained without class
# re-weighting. If the scikit-learn import is restored instead of cuML,
# re-add `class_weight='balanced'` and `n_jobs=-1`.
rf_model = RandomForestClassifier(
        n_estimators=300,
        max_depth=20,
        min_samples_split=5,
        min_samples_leaf=2,
        max_features='sqrt',
        random_state=42,
    )

rf_model.fit(sample_X, sample_y)

y_test_pred = rf_model.predict(X_test_scaled)

rf_score = fbeta_score(y_test, y_test_pred, beta=0.5)
print(f"RandomForestClassifier score: {rf_score}")


In [None]:
# Load the competition's test file and build its feature matrix with the
# same extractor used for the labelled data.
data_test = pd.read_csv("./data/kaggle/input/dga-domain-detection-challenge-i/test.csv")

test_features = np.array([extract_features(str(d)) for d in tqdm(data_test["domain"], desc="Extracting test features")])
# Apply the already-fitted scaler rather than re-fitting it here: re-fitting
# would standardize these features with their own mean/variance instead of
# the statistics the scaler learned earlier.
test_features_scaled = scaler.transform(test_features)



In [None]:
# Predict with the logistic-regression model and store the result as a new
# "label" column on the test frame.
predicted_labels = lr_model.predict(test_features_scaled)
data_test["label"] = predicted_labels

# Write only the id/label pair, without the DataFrame index.
submission = data_test[["id", "label"]]
submission.to_csv("submission.csv", index=False)
