In [383]:
import json
import os
import pickle

import pandas as pd
import requests
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans, DBSCAN
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.feature_extraction.text import HashingVectorizer, FeatureHasher, TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import silhouette_score, homogeneity_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline, FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer



In [2]:
# Base address of the service; request paths are appended directly to it.
URL = "https://slot-ml.com/"
# NOTE(security): the API token should not live in the notebook. It can now be
# supplied through the SLOT_ML_TOKEN environment variable; the literal below is
# kept only as a fallback so existing runs keep working.
# TODO: rotate this token and remove the fallback.
token = os.environ.get("SLOT_ML_TOKEN", "f5d972e2a847e8fa8586a292579d972a07485796")

In [3]:
# Accumulator for the records fetched from the API; a later cell appends to it.
recs = list()

In [222]:
# Request 3000 records from the vectors endpoint, one HTTP call per record.
# NOTE: results are appended to the existing `recs` list, so re-running this
# cell accumulates records on top of whatever was fetched before.
for i in range(3000):
    # Named `response` rather than `input` so the built-in input() is not
    # shadowed for the rest of the session. The timeout (seconds) keeps a
    # single stalled connection from hanging the whole loop.
    response = requests.get(url=f'{URL}api/v1/users/{token}/vectors/?random', timeout=30)
    try:
        recs.append(response.json())
    except ValueError:
        # The body did not parse as JSON as-is; retry after replacing every
        # newline+tab sequence with a single space.
        recs.append(json.loads(response.text.replace('\n\t',' ')))

In [224]:
# Number of records accumulated in `recs` so far.
len(recs)

3200

In [225]:
# Turn the list of fetched records into a single DataFrame.
recs_df = pd.DataFrame.from_records(data=recs)

In [226]:
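# Disabled experiment (kept for reference): concatenate meta1 with the string form of meta3 into a new meta7 column.
# recs_df['meta7'] = recs_df['meta1'] + recs_df['meta3'].astype(str)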

In [227]:
# Print each column name followed by its number of distinct values.
for c in recs_df.columns:
    n_distinct = recs_df[c].nunique()
    print(c, n_distinct)

meta1 20
id 3200
meta2 596
vector 3196
meta3 17
meta6 1692
meta4 1536
meta5 940


In [230]:
# Write the collected records to a local Parquet file.
recs_df.to_parquet('data_3200.pq')

In [228]:
# Preview the first rows of the collected records.
recs_df.head()

Unnamed: 0,meta1,id,meta2,vector,meta3,meta6,meta4,meta5
0,REQUEST_METHOD\';\'REQUEST_CONTE,c2783e93ef19a9e152eb57e41cc8eb1d,,POST\';\'multipart/form-data; boundary=4986caa...,200,146.185.223.177,8893,Mozilla/5.0 (Windows NT 8.0; WOW64; rv:41.0) G...
1,REQUEST_ARGS,888c19e9a60688f0c2b87ad18904e5fd,section,44{${print(chr(49).chr(55).chr(73).chr(53).chr...,200,2a01:4f8:212:284c::2,404,{${print(chr(49).chr(55).chr(73).chr(53).chr(5...
2,REQUEST_PATH,4042685b4edfd933d71e8c61a2ee21c7,,/.gitmodules,404,185.5.140.253,291,Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.3...
3,REQUEST_ARGS,68e5756671163069c0d3c869bcf3f1de,,\n--boundaryemae\nContent-Disposition: form-da...,200,185.70.104.11,968,Mozilla/4.0 (compatible; Win32; WinHttp.WinHtt...
4,REQUEST_POST_ARGS,cd37c886e974148cd5de091b0ae8db36,a,http://w2pxzw12.b.wlrm.tl/a.dtd,200,139.162.144.202,471,Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7....


In [231]:
# Column groups handed to the preprocessing ColumnTransformer.
# Free-text columns, each vectorised with its own TF-IDF step:
text_features = [
    'meta1',
    'meta2',
    'vector',
    'meta5',
    'meta6',
]
# Column that is one-hot encoded:
categ_features = [
    'meta3',
]
# Column that is standard-scaled:
float_features = [
    'meta4',
]

In [373]:
def ColumnPicker(dataframe, column_name):
    """Return ``dataframe[column_name]``.

    Parameters
    ----------
    dataframe
        Any object supporting ``[]`` lookup.
    column_name
        Key used for the lookup.
    """
    picked = dataframe[column_name]
    return picked

In [394]:

class FeatureSelector(BaseEstimator, TransformerMixin):
    """Transformer that indexes its input with a fixed key.

    Parameters
    ----------
    columns
        Key passed to ``X[...]`` in :meth:`transform`.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing from ``X`` / ``y``; return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.columns]``; ``y`` is ignored."""
        selected = X[self.columns]
        return selected

In [395]:
# Numeric columns: standard scaling only.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical columns: dense one-hot encoding that ignores categories unseen
# during fit.
# NOTE(review): the `sparse` keyword was renamed `sparse_output` in
# scikit-learn 1.2 and the old name was later removed - confirm against the
# installed version before upgrading.
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))])

# (column, min_df) pairs: the document-frequency cut-off used by the TF-IDF
# vectoriser of each text column. Order here is the order of the resulting
# feature blocks in the FeatureUnion output.
TEXT_MIN_DF = (
    ('meta1', 0.01),
    ('meta2', 0.005),
    ('vector', 0.01),
    ('meta5', 0.05),
    ('meta6', 0.05),
)

# One "select column -> TF-IDF" sub-pipeline per text column, built from the
# table above instead of five hand-copied blocks.
text_preprocessing = FeatureUnion([
    (column,
     Pipeline([('extract_field', FeatureSelector(columns=column)),
               ('tfidf', TfidfVectorizer(min_df=min_df))]))
    for column, min_df in TEXT_MIN_DF
])

# Route each column group to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, float_features),
        ('cat', categorical_transformer, categ_features),
        ('text', text_preprocessing, text_features),
    ])

# `clf` currently wraps the preprocessing step only; no estimator is attached.
clf = Pipeline(steps=[('preprocessor', preprocessor)])

In [379]:
# Hold out 20% of the records. The seed is fixed so the split (and every
# number derived from it below) is reproducible across kernel restarts.
x_train, x_test = train_test_split(recs_df, test_size=0.2, random_state=42)

In [291]:
# Preview the first rows of the training split.
x_train.head()

Unnamed: 0,meta1,id,meta2,vector,meta3,meta6,meta4,meta5
639,REQUEST_HEADERS,74be2f2a5b38f18d4c111895eae6cb24,Referer,file://localhost/mtd_down/widgets/user/IPTVGro...,200,78.62.183.18,528,Mozilla/5.0 (Windows; U; en-US; rv:1.8.1.11; G...
2874,REQUEST_COOKIES,8450af1251875542a94125c6d7451517,REQUEST_COOKIES.LtpaToken2,ckhnPdEMvNUfKGMyV0/52msVUtSwgQiSuTo89xU3wxPjNs...,404,217.66.154.231,1481,Mozilla/5.0 (iPhone; CPU iPhone OS 11_1_2 like...
714,REQUEST_HEADERS,a38f1fda186fa8f8f4d2113e61e883ee,User-Agent,Mozilla/5.00 (Nikto/2.1.6) (Evasions:None) (Te...,404,213.108.201.228,328,Mozilla/5.00 (Nikto/2.1.6) (Evasions:None) (Te...
1016,REQUEST_ARGS,ca2ecd18079ef359094b9117b491894b,,\n--boundaryemae\nContent-Disposition: form-da...,200,185.70.104.11,973,Mozilla/4.0 (compatible; Win32; WinHttp.WinHtt...
2227,CLIENT_USERAGENT,aeb0edd2cfacaf91cca3008e55704e61,,Mozilla/4.0 (compatible; MSIE 7.0; Windows NT ...,200,93.189.248.14,640,Mozilla/4.0 (compatible; MSIE 7.0; Windows NT ...


In [355]:
# Stand-alone TF-IDF fits on three text columns of the training split, one
# vectoriser per column, each with its own min_df cut-off.
m1tf, m2tf, vectf = (TfidfVectorizer(min_df=cutoff) for cutoff in (0.05, 0.01, 0.005))

m1 = m1tf.fit_transform(raw_documents=x_train['meta1'])
m2 = m2tf.fit_transform(raw_documents=x_train['meta2'])
vec = vectf.fit_transform(raw_documents=x_train['vector'])

In [396]:
# Fit the preprocessing pipeline on the training split, then apply the fitted
# pipeline to the held-out split. The `id` column is dropped from both first.
x_train_trn = clf.fit_transform(x_train.drop(columns='id'))
x_test_trn = clf.transform(x_test.drop(columns='id'))

In [397]:
# Shape of the transformed training features.
x_train_trn.shape

(2560, 262)

In [398]:
# 50-cluster k-means. random_state is pinned so centroid initialisation, and
# therefore the cluster labels and silhouette scores, are repeatable.
km = KMeans(n_clusters=50, random_state=42)

In [399]:
# Fit k-means on the transformed training features and keep the cluster label of each training row.
y_train = km.fit_predict(x_train_trn)

In [400]:
# Assign each held-out row to a cluster using the model fitted on the training features.
y_test = km.predict(x_test_trn)

In [401]:
# Silhouette score of the cluster labels on the training features.
silhouette_score(x_train_trn, y_train)

0.28450805831218

In [402]:
# Silhouette score of the predicted cluster labels on the held-out features.
silhouette_score(x_test_trn, y_test)

0.28385764215358994

In [403]:
# Refit the preprocessing pipeline on every record (train and test together),
# minus the `id` column. This replaces the train-only fit held by `clf`.
x_all = clf.fit_transform(recs_df.drop('id',axis=1))

In [404]:
# Refit k-means on the features of the full dataset; `km` now holds this fit
# rather than the train-only one scored above.
y = km.fit_predict(x_all)

In [405]:
# Persist the fitted KMeans model. The context manager guarantees the file is
# flushed and closed, which the bare open() call did not.
with open('km.p', 'wb') as model_file:
    pickle.dump(km, model_file)

In [406]:
# Persist the fitted preprocessing pipeline. The context manager guarantees
# the file is flushed and closed, which the bare open() call did not.
# NOTE: `clf` contains FeatureSelector instances, and pickle stores classes by
# reference - the class must be importable under the same module path in
# whatever process loads this file.
with open('prep.p', 'wb') as pipeline_file:
    pickle.dump(clf, pipeline_file)