In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
!ls

EDA and metrics.ipynb [34mdata_utils[m[m            pom.xml
README.md             [34mdataset[m[m               server.py
best_model            [34mmodel[m[m                 [34msrc[m[m
[34mcatboost_info[m[m         mvnw
[34mdata[m[m                  mvnw.cmd


In [3]:
# Load the raw host list. Because names= is given, pandas labels the column
# 'host' and treats the first line of the file as data rather than a header.
raw = pd.read_csv('data/host.csv', names=['host'])

In [4]:
# Preview only: the de-duplicated frame is displayed but never assigned, so
# `raw` itself keeps any duplicate hosts for the cells that follow.
raw.drop_duplicates()

Unnamed: 0,host
0,api.youla.io
1,favicon.yandex.net
2,w-74721.fp.kaspersky-labs.com
3,questtime.net
4,passport-authproxy.taxi.yandex.net
...,...
999991,m39.hefxosi.com
999993,m42.ytoxcrj.com
999995,m5.fupppzz.net
999997,m5.ehgeqxn.me


In [5]:
def df_process(df):
    """Return a copy of ``df`` extended with character-count features of ``host``.

    Feature columns (each computed per row from the ``host`` string):
        len    -- total number of characters
        dots   -- number of '.' characters
        digits -- number of ASCII digits '0'-'9'
        dash   -- number of '-' characters

    The input frame is not modified, so the call is idempotent across cell
    re-runs and can be applied to a filtered slice without triggering
    pandas' SettingWithCopyWarning.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``host`` column of strings.

    Returns
    -------
    pandas.DataFrame
        New frame with the four feature columns added (existing columns of
        the same name are overwritten).
    """
    host = df['host']
    return df.assign(
        len=host.map(len),
        dots=host.map(lambda h: h.count('.')),
        # Count ASCII digits only, matching the characters '0'..'9'.
        digits=host.map(lambda h: sum(h.count(d) for d in '0123456789')),
        dash=host.map(lambda h: h.count('-')),
    )

In [6]:
# Add the len / dots / digits / dash count columns to the raw host list.
raw = df_process(raw)

In [7]:
# Summary statistics of the numeric feature columns over all raw hosts.
raw.describe()

Unnamed: 0,len,dots,digits,dash
count,1000000.0,1000000.0,1000000.0,1000000.0
mean,22.4854,2.297148,2.293751,0.712215
std,11.133069,1.093478,4.588589,1.218499
min,4.0,1.0,0.0,0.0
25%,15.0,2.0,0.0,0.0
50%,19.0,2.0,1.0,0.0
75%,27.0,3.0,2.0,1.0
max,84.0,32.0,41.0,11.0


In [8]:
# Load the labelled dataset; the first CSV column is consumed as the index.
dataset = pd.read_csv('dataset/data.csv', index_col=0)

In [9]:
# Replace the index read from the CSV with a fresh 0..n-1 positional index.
dataset.index = pd.RangeIndex(len(dataset))

In [10]:
# Display the labelled frame (pandas shows a truncated head/tail view).
dataset

Unnamed: 0,host,y
0,thevalleychronicle.com,1
1,ax1.porn-cdn.com,0
2,m25.xwtuizg.org,0
3,peta2.com,1
4,utsusemiaikikai.com,1
...,...,...
345749,greatrockiesshow.com,1
345750,m41.nnryeuh.me,0
345751,m28.amrjcad.com,0
345752,r5---sn-axq7sn7z.googlevideo.com,0


In [11]:
# Add the len / dots / digits / dash count columns to the labelled dataset.
dataset = df_process(dataset)

In [12]:
# Summary statistics of the label and the feature columns over both classes.
dataset.describe()

Unnamed: 0,y,len,dots,digits,dash
count,345754.0,345754.0,345754.0,345754.0,345754.0
mean,0.500133,22.521469,1.836456,3.375131,0.465175
std,0.500001,14.685246,1.522778,6.591792,1.15501
min,0.0,2.0,0.0,0.0,0.0
25%,0.0,14.0,1.0,0.0,0.0
50%,1.0,17.0,2.0,0.0,0.0
75%,1.0,25.0,2.0,2.0,0.0
max,1.0,84.0,32.0,40.0,11.0


In [13]:
# Split the labelled frame by class: rows with y == 1 go to `user`,
# rows with y == 0 go to `tech`.
is_user = dataset['y'].eq(1)
is_tech = dataset['y'].eq(0)
user = dataset.loc[is_user]
tech = dataset.loc[is_tech]

In [14]:
# Summary statistics restricted to the rows labelled y == 1.
user.describe()

Unnamed: 0,y,len,dots,digits,dash
count,172923.0,172923.0,172923.0,172923.0,172923.0
mean,1.0,16.243941,1.234896,0.066746,0.109847
std,0.0,5.089556,0.494483,0.404272,0.371244
min,1.0,2.0,0.0,0.0,0.0
25%,1.0,13.0,1.0,0.0,0.0
50%,1.0,16.0,1.0,0.0,0.0
75%,1.0,19.0,1.0,0.0,0.0
max,1.0,67.0,5.0,13.0,10.0


In [15]:
# Summary statistics restricted to the rows labelled y == 0.
tech.describe()

Unnamed: 0,y,len,dots,digits,dash
count,172831.0,172831.0,172831.0,172831.0,172831.0
mean,0.0,28.802339,2.438336,6.685276,0.820692
std,0.0,18.073004,1.915719,8.053253,1.509371
min,0.0,4.0,1.0,0.0,0.0
25%,0.0,15.0,2.0,1.0,0.0
50%,0.0,22.0,2.0,2.0,0.0
75%,0.0,38.0,3.0,12.0,1.0
max,0.0,84.0,32.0,40.0,11.0


In [23]:
# Print each class's share of the dataset as a percentage rounded to
# 3 decimals (the message text is Russian for "class ratio").
total_rows = len(dataset)
user_pct = round(len(user) * 100 / total_rows, 3)
tech_pct = round(len(tech) * 100 / total_rows, 3)
print(f'соотношение классов: user {user_pct}% tech {tech_pct}%')

соотношение классов: user 50.013% tech 49.987%


In [43]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score

In [25]:
# 5-fold splitter with shuffling. The seed is fixed so the fold assignment is
# the same on every kernel restart; without it each run scores the model on
# different splits and the reported metrics cannot be reproduced.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [26]:
from model.Model import Model

In [46]:
# Plain NumPy arrays of the inputs (host strings) and labels, so the
# cross-validation loop can slice them with integer index arrays.
X = dataset['host'].to_numpy()
y = dataset['y'].to_numpy()

In [47]:
# Cross-validate the host classifier over the folds produced by `kf`.
# One score per fold is appended under each metric name.
metrics = {
    'accuracy': [],
    'precision': [],
    'recall': [],
    'roc_auc': [],
}

from tqdm import tqdm
# kf.split() is a generator, so tqdm cannot infer its length on its own;
# passing the fold count gives a real progress bar with an ETA.
for train_index, test_index in tqdm(kf.split(X), total=kf.get_n_splits()):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # A fresh model per fold so nothing learned on one split leaks into the next.
    model = Model(depth=8)
    model.fit(X_train, y_train, verbose=False)
    y_pred = model.predict(X_test)
    metrics['accuracy'].append(accuracy_score(y_test, y_pred))
    # NOTE(review): roc_auc_score is fed the output of predict(). If that is
    # hard 0/1 labels (the reported roc_auc equals accuracy to 4 decimals,
    # which suggests so), this is not a ranking AUC — pass class-1
    # probabilities/scores instead if Model exposes them.
    metrics['roc_auc'].append(roc_auc_score(y_test, y_pred))
    metrics['precision'].append(precision_score(y_test, y_pred))
    metrics['recall'].append(recall_score(y_test, y_pred))

4it [03:38, 54.64s/it]


KeyboardInterrupt: 

In [48]:
# Report the mean of each metric over the folds scored so far,
# rounded to 4 decimal places.
for metric_name, fold_scores in metrics.items():
    mean_score = np.round(np.mean(fold_scores), 4)
    print(f'{metric_name}: {mean_score}')

accuracy: 0.9504
precision: 0.9415
recall: 0.9607
roc_auc: 0.9504


In [51]:
# Final model, built with Model's default arguments.
# NOTE(review): the cross-validation loop used Model(depth=8); confirm the
# defaults match, otherwise the CV metrics describe a different configuration.
model = Model()

In [52]:
# Fit on every labelled row (no hold-out). Note the hosts are passed as a
# pandas Series here, whereas the cross-validation loop passed NumPy arrays.
model.fit(dataset['host'], y)

In [53]:
# Best metric values recorded during the fit. The output only has a 'learn'
# entry, i.e. the Logloss shown is measured on the training data.
model.model.best_score_

{'learn': {'Logloss': 0.09926274481748229}}

In [54]:
# Per-feature importance scores reported by the fitted underlying model.
features = model.model.get_feature_importance()

In [58]:
# Indices of the 10 highest-importance features, most important first.
descending_order = np.argsort(features)[::-1]
the_most_important = descending_order[:10]

In [59]:
# Look up the vocabulary entries at the top-importance positions.
# Assumes feature i corresponds to model.vocab[i] — TODO confirm against Model.
np.array(model.vocab)[the_most_important]

array(['.', 'm', 'o', 'c', '1', '-', 'u', '2', 'r', 'e'], dtype='<U1')

In [34]:
# Persist the underlying model to the file 'best_model' in "cbm" format.
# NOTE(review): this cell's execution count (34) is lower than that of the
# cells that create and fit `model` (51-52), so the file on disk may come
# from an earlier fit — re-run this cell after training before relying on it.
model.model.save_model('best_model',
                       format="cbm",
                       export_parameters=None,
                       pool=None)