In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
!ls

EDA and metrics.ipynb [34mmodel[m[m                 server.py
README.md             mvnw                  [34msrc[m[m
[34mdata[m[m                  mvnw.cmd
[34mdataset[m[m               pom.xml


In [2]:
# Load the raw host list into a one-column frame. Because `names` is given (and no header
# argument), pandas treats the file as having no header row and labels the column 'host'.
raw = pd.read_csv('data/host.csv', names=['host'])

In [3]:
# NOTE: drop_duplicates() returns a new DataFrame (inplace defaults to False). The result is
# only displayed here and is not assigned, so `raw` itself still contains the duplicate rows.
raw.drop_duplicates()

Unnamed: 0,host
0,api.youla.io
1,favicon.yandex.net
2,w-74721.fp.kaspersky-labs.com
3,questtime.net
4,passport-authproxy.taxi.yandex.net
...,...
999991,m39.hefxosi.com
999993,m42.ytoxcrj.com
999995,m5.fupppzz.net
999997,m5.ehgeqxn.me


In [4]:
def df_process(df):
    """Return a copy of ``df`` extended with simple lexical features of ``df['host']``.

    Added columns (appended in this order, or overwritten if already present):
        len    -- number of characters in the host name
        dots   -- number of '.' characters
        digits -- number of ASCII digits (0-9)
        dash   -- number of '-' characters

    The input frame is left untouched, so calling this function never alters a
    frame that an earlier notebook cell has already displayed.

    Parameters:
        df: DataFrame with a string column named 'host'.

    Returns:
        A new DataFrame holding the original columns plus the four features.
    """
    host = df['host'].str
    return df.assign(
        len=host.len(),
        # Series.str.count interprets its pattern as a regex, so the dot is escaped.
        dots=host.count(r'\.'),
        # Explicit ASCII class: \d would additionally match non-ASCII Unicode digits.
        digits=host.count(r'[0-9]'),
        dash=host.count('-'),
    )

In [5]:
raw = df_process(raw)

In [6]:
raw.describe()

Unnamed: 0,len,dots,digits,dash
count,1000000.0,1000000.0,1000000.0,1000000.0
mean,22.4854,2.297148,2.293751,0.712215
std,11.133069,1.093478,4.588589,1.218499
min,4.0,1.0,0.0,0.0
25%,15.0,2.0,0.0,0.0
50%,19.0,2.0,1.0,0.0
75%,27.0,3.0,2.0,1.0
max,84.0,32.0,41.0,11.0


In [7]:
dataset = pd.read_csv('dataset/data.csv', index_col=0)

In [8]:
# Replace the index read from the CSV with a fresh 0..n-1 positional index.
dataset.index = pd.RangeIndex(len(dataset))

In [9]:
dataset

Unnamed: 0,host,y
0,pcvrc.com,1
1,m25.dlfynky.net,0
2,coconutgrovekeywest.com,1
3,lisacerasoli.com,1
4,cal25.com,1
...,...,...
345657,sunmines.com.tw,1
345658,doksan.com,1
345659,9f398ace-e1c4-442a-9cb3-345e01db1c26.mitdmp.wh...,0
345660,m40.cnoyucn.com,0


In [10]:
dataset = df_process(dataset)

In [11]:
dataset.describe()

Unnamed: 0,y,len,dots,digits,dash
count,345662.0,345662.0,345662.0,345662.0,345662.0
mean,0.5,22.523734,1.836505,3.376003,0.465281
std,0.500001,14.686289,1.522946,6.592448,1.155138
min,0.0,2.0,0.0,0.0,0.0
25%,0.0,14.0,1.0,0.0,0.0
50%,0.5,17.0,2.0,0.0,0.0
75%,1.0,25.0,2.0,2.0,0.0
max,1.0,84.0,32.0,40.0,11.0


In [12]:
# Split the labelled data by class: y == 1 rows go to `user`, y == 0 rows go to `tech`.
user, tech = (dataset[dataset['y'] == label] for label in (1, 0))

In [13]:
user.describe()

Unnamed: 0,y,len,dots,digits,dash
count,172831.0,172831.0,172831.0,172831.0,172831.0
mean,1.0,16.24513,1.234674,0.06673,0.10987
std,0.0,5.089179,0.494328,0.404235,0.371298
min,1.0,2.0,0.0,0.0,0.0
25%,1.0,13.0,1.0,0.0,0.0
50%,1.0,16.0,1.0,0.0,0.0
75%,1.0,19.0,1.0,0.0,0.0
max,1.0,67.0,5.0,13.0,10.0


In [14]:
tech.describe()

Unnamed: 0,y,len,dots,digits,dash
count,172831.0,172831.0,172831.0,172831.0,172831.0
mean,0.0,28.802339,2.438336,6.685276,0.820692
std,0.0,18.073004,1.915719,8.053253,1.509371
min,0.0,4.0,1.0,0.0,0.0
25%,0.0,15.0,2.0,1.0,0.0
50%,0.0,22.0,2.0,2.0,0.0
75%,0.0,38.0,3.0,12.0,1.0
max,0.0,84.0,32.0,40.0,11.0


In [15]:
# Print the user/tech class ratio (the message text is Russian for "class ratio user/tech").
print(f'соотношение классов user/tech: {len(user) / len(tech)}')

соотношение классов user/tech: 1.0


In [16]:
from sklearn.utils import shuffle
# Fix the seed so the row order (and everything derived from it below) is reproducible
# after Restart Kernel -> Run All.
dataset = shuffle(dataset, random_state=42)

In [17]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

In [18]:
# shuffle=True without a random_state yields different folds on every run; pin the seed so
# the cross-validation metrics are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [19]:
from model.Model import Model

In [20]:
# Feature (host strings) and target (class label) arrays taken from the shuffled dataset.
X, y = (dataset[column].values for column in ('host', 'y'))

In [21]:
# Per-fold scores; each list receives one value per cross-validation split.
metrics = {
    'accuracy': [],
    'roc_auc': [],
    'f1_score': [],
}

from tqdm import tqdm
# kf.split() is a generator without a length, so tqdm needs `total` to show a real
# progress bar and ETA instead of a bare iteration counter.
for train_index, test_index in tqdm(kf.split(X), total=kf.get_n_splits()):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # A fresh model per fold so no state carries over between splits.
    model = Model(depth=8)
    model.fit(X_train, y_train, verbose=False)
    y_pred = model.predict(X_test)
    metrics['accuracy'].append(accuracy_score(y_test, y_pred))
    # NOTE(review): y_pred is also fed to accuracy_score/f1_score, so it appears to hold hard
    # class labels; ROC AUC computed from hard labels collapses to balanced accuracy. If Model
    # exposes class probabilities or scores, pass those here instead — TODO confirm.
    metrics['roc_auc'].append(roc_auc_score(y_test, y_pred))
    metrics['f1_score'].append(f1_score(y_test, y_pred))

5it [04:50, 58.11s/it]


In [22]:
# Report the mean of each metric across folds, rounded to four decimals.
for metric_name, fold_scores in metrics.items():
    mean_score = np.round(np.mean(fold_scores), 4)
    print(f'{metric_name}: {mean_score}')

accuracy: 0.9505
roc_auc: 0.9505
f1_score: 0.951


In [23]:
# Share of rows with y == 1, as a percentage rounded to two decimals.
print(f"{round(100 * len(dataset[dataset['y'] == 1]) / len(dataset), 2)}%")

50.0%


In [24]:
from model.Model import Model

In [25]:
# NOTE(review): the cross-validation loop earlier in this notebook built Model(depth=8), while
# this final model uses Model's default arguments — confirm the reported CV metrics describe
# this configuration.
model = Model()

In [26]:
# Fit on the same ndarray `X` that cross-validation used. `X` and `y` were both extracted
# from the shuffled `dataset` via .values, so they are plain positional arrays; passing the
# Series instead would carry the shuffled index alongside an index-less `y`.
model.fit(X, y)

In [28]:
model.model.best_score_

{'learn': {'Logloss': 0.05685225405639392}}

In [29]:
features = model.model.get_feature_importance()

In [30]:
# Indices of the three largest feature importances, most important first.
the_most_important = np.argsort(features)[::-1][:3]

In [33]:
np.array(model.vocab)[the_most_important]

array(['.', 'o', 'c'], dtype='<U1')

In [32]:
user.describe()

Unnamed: 0,y,len,dots,digits,dash
count,172831.0,172831.0,172831.0,172831.0,172831.0
mean,1.0,16.24513,1.234674,0.06673,0.10987
std,0.0,5.089179,0.494328,0.404235,0.371298
min,1.0,2.0,0.0,0.0,0.0
25%,1.0,13.0,1.0,0.0,0.0
50%,1.0,16.0,1.0,0.0,0.0
75%,1.0,19.0,1.0,0.0,0.0
max,1.0,67.0,5.0,13.0,10.0


In [34]:
# Persist the fitted underlying model to the file 'best_model' in "cbm" format;
# export_parameters and pool are passed explicitly as None.
model.model.save_model('best_model',
                       format="cbm",
                       export_parameters=None,
                       pool=None)