In [1]:
from __future__ import print_function

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

NGRAMS = 2
FEATURE_LEN = 128
EPOCHS = 15

# Blacklists
df = pd.read_csv('./data/blacklists.csv.bz2')
df


Unnamed: 0,domain,blacklists_cat
0,59.52.114.68,phishing
1,eroticalee1.blogspot.ru,adult
2,jstanothergrly.blogspot.com.br,adult
3,youmightbest.tk,redirector|strict_redirector|strong_redirector
4,jstanothergrly.blogspot.com.by,adult
5,edwardmax1.blogspot.com.au,adult
6,prisonpussy.co.uk,adult
7,fenhentai.blogspot.co.za,adult
8,seksfilmpjes.web-log.nl,adult
9,spaconservatoryvillages.co.uk,shopping


In [2]:
df['cat_count'] = df.blacklists_cat.apply(lambda c: len(c.split('|')))
sdf = df[df.cat_count == 1]
sdf

Unnamed: 0,domain,blacklists_cat,cat_count
0,59.52.114.68,phishing,1
1,eroticalee1.blogspot.ru,adult,1
2,jstanothergrly.blogspot.com.br,adult,1
4,jstanothergrly.blogspot.com.by,adult,1
5,edwardmax1.blogspot.com.au,adult,1
6,prisonpussy.co.uk,adult,1
7,fenhentai.blogspot.co.za,adult,1
8,seksfilmpjes.web-log.nl,adult,1
9,spaconservatoryvillages.co.uk,shopping,1
10,nice-fuck.com,adult,1


In [3]:
dom_group = sdf.groupby('blacklists_cat').agg({'domain': 'count'})
dom_group

Unnamed: 0_level_0,domain
blacklists_cat,Unnamed: 1_level_1
adult,1870741
agressif,304
arjel,11
associations_religieuses,1
astrology,27
audio-video,2977
bank,1689
bitcoin,251
blog,1425
celebrity,537


### Take out categories that have less than 1000 domains.

In [4]:
filter_cat = list(dom_group[dom_group.domain > 1000].index)

### Take out categories that have recall less than 0.3

In [5]:
cmat = """
      adult       0.97      0.99      0.98    374149
audio-video       0.37      0.10      0.16       595
       bank       0.59      0.55      0.57       338
       blog       0.67      0.26      0.37       285
     dating       0.70      0.19      0.29       660
   gambling       0.46      0.31      0.37       202
      games       0.82      0.52      0.63      1872
   liste_bu       0.41      0.15      0.22       483
    malware       0.92      0.47      0.62       893
     others       0.66      0.21      0.32      1594
   phishing       0.75      0.61      0.67     12543
      press       0.71      0.60      0.65       882
  publicite       0.75      0.40      0.52       218
   shopping       0.65      0.50      0.57      7266
     sports       0.53      0.12      0.19       452
"""
import re

excat = []
for l in cmat.split('\n'):
    a = re.split('\s+', l)
    if len(a) > 1:
        cat = a[1]
        recall = float(a[3])
        if recall < 0.3:
            excat.append(cat)
excat

['0.37', 'blog', 'dating', 'liste_bu', 'others', 'sports']

In [6]:
filter_cat = [x for x in filter_cat if x not in excat]
filter_cat

['adult',
 'audio-video',
 'bank',
 'gambling',
 'games',
 'malware',
 'phishing',
 'press',
 'publicite',
 'shopping']

In [7]:
sdf.loc[sdf.blacklists_cat.isin(filter_cat) == False, 'blacklists_cat'] = 'others'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [8]:
sdf.groupby('blacklists_cat').agg({'domain': 'count'})

Unnamed: 0_level_0,domain
blacklists_cat,Unnamed: 1_level_1
adult,1870741
audio-video,2977
bank,1689
gambling,1012
games,9357
malware,4463
others,17373
phishing,62712
press,4410
publicite,1091


## Preprocessing the input data

In [9]:
# build n-gram list
#vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=3, ngram_range=(NGRAMS, NGRAMS), lowercase=False) 
vect = CountVectorizer(analyzer='char', ngram_range=(NGRAMS, NGRAMS), lowercase=False) 
a = vect.fit_transform(sdf.domain)
vocab = vect.vocabulary_

# sort n-gram by freq (highest -> lowest)
words = []
for b in vocab:
    c = vocab[b]
    #print(b, c, a[:, c].sum())
    words.append((a[:, c].sum(), b))
    #break
words = sorted(words, reverse=True)
words_list = [w[1] for w in words]
num_words = len(words_list)
print("num_words = %d" % num_words)


def find_ngrams(text, n):
    a = zip(*[text[i:] for i in range(n)])
    wi = []
    for i in a:
        w = ''.join(i)
        try:
            idx = words_list.index(w)
        except:
            idx = 0
        wi.append(idx)
    return wi

# build X from index of n-gram sequence
X = np.array(sdf.domain.apply(lambda c: find_ngrams(c, NGRAMS)))

# check max/avg feature
X_len = []
for x in X:
    X_len.append(len(x))

max_feature_len = max(X_len)
avg_feature_len = int(np.mean(X_len))

num_words = 1498


In [10]:
print("Max feature len = %d, Avg. feature len = %d" % (max_feature_len, avg_feature_len))
y = np.array(sdf.blacklists_cat.astype('category').cat.codes)

# Split train and test dataset
X_train,  X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)
#X_train,  X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Max feature len = 137, Avg. feature len = 23


In [11]:
unique, counts = np.unique(y_test, return_counts=True)
dict(zip(unique, counts))

{0: 374149,
 1: 595,
 2: 338,
 3: 202,
 4: 1871,
 5: 893,
 6: 3475,
 7: 12543,
 8: 882,
 9: 218,
 10: 7266}

In [12]:
unique, counts = np.unique(y_train, return_counts=True)
dict(zip(unique, counts))

{0: 1496592,
 1: 2382,
 2: 1351,
 3: 810,
 4: 7486,
 5: 3570,
 6: 13898,
 7: 50169,
 8: 3528,
 9: 873,
 10: 29065}

## Train a LSTM model

In [13]:
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Activation
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import load_model

max_features = num_words # 20000
feature_len = FEATURE_LEN # avg_feature_len # cut texts after this number of words (among top max_features most common words)
batch_size = 32

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=feature_len)
X_test = sequence.pad_sequences(X_test, maxlen=feature_len)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

num_classes = np.max(y_train) + 1
print(num_classes, 'classes')

print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


1609724 train sequences
402432 test sequences
Pad sequences (samples x time)
X_train shape: (1609724, 128)
X_test shape: (402432, 128)
11 classes
Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (1609724, 11)
y_test shape: (402432, 11)


In [14]:
print('Build model...')

model = Sequential()
model.add(Embedding(num_words, 32, input_length=feature_len))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(num_classes, activation='softmax'))

# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print(model.summary())

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 128, 32)           47936     
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               82432     
_________________________________________________________________
dense_1 (Dense)              (None, 11)                1419      
Total params: 131,787
Trainable params: 131,787
Non-trainable params: 0
_________________________________________________________________
None


In [21]:
print('Train...')
model.fit(X_train, y_train, batch_size=batch_size, epochs=EPOCHS,
          validation_split=0.1, verbose=2)
score, acc = model.evaluate(X_test, y_test,
                            batch_size=batch_size, verbose=2)
print('Test score:', score)
print('Test accuracy:', acc)

Train...
Train on 1448751 samples, validate on 160973 samples
Epoch 1/15
 - 8312s - loss: 0.1316 - acc: 0.9575 - val_loss: 0.1305 - val_acc: 0.9579
Epoch 2/15
 - 8148s - loss: 0.1315 - acc: 0.9576 - val_loss: 0.1297 - val_acc: 0.9584
Epoch 3/15
 - 4895s - loss: 0.1314 - acc: 0.9574 - val_loss: 0.1300 - val_acc: 0.9583
Epoch 4/15
 - 4928s - loss: 0.1311 - acc: 0.9575 - val_loss: 0.1302 - val_acc: 0.9582
Epoch 5/15
 - 5027s - loss: 0.1312 - acc: 0.9576 - val_loss: 0.1295 - val_acc: 0.9586
Epoch 6/15
 - 4986s - loss: 0.1311 - acc: 0.9576 - val_loss: 0.1295 - val_acc: 0.9582
Epoch 7/15
 - 4944s - loss: 0.1312 - acc: 0.9576 - val_loss: 0.1294 - val_acc: 0.9584
Epoch 8/15
 - 4979s - loss: 0.1314 - acc: 0.9576 - val_loss: 0.1314 - val_acc: 0.9578
Epoch 9/15
 - 4924s - loss: 0.1329 - acc: 0.9569 - val_loss: 0.1296 - val_acc: 0.9584
Epoch 10/15
 - 4939s - loss: 0.1316 - acc: 0.9575 - val_loss: 0.1309 - val_acc: 0.9580
Epoch 11/15
 - 4896s - loss: 0.1315 - acc: 0.9575 - val_loss: 0.1300 - val_ac

## Confusion Matrix

In [22]:
y_pred = model.predict_classes(X_test, verbose=2)
p = model.predict_proba(X_test, verbose=2) # to predict probability
target_names = list(sdf.blacklists_cat.astype('category').cat.categories)
print(classification_report(np.argmax(y_test, axis=1), y_pred, target_names=target_names))
print(confusion_matrix(np.argmax(y_test, axis=1), y_pred))

             precision    recall  f1-score   support

      adult       0.98      0.99      0.98    374149
audio-video       0.41      0.10      0.16       595
       bank       0.65      0.56      0.60       338
   gambling       0.40      0.47      0.43       202
      games       0.79      0.52      0.63      1871
    malware       0.89      0.48      0.62       893
     others       0.57      0.23      0.33      3475
   phishing       0.72      0.63      0.67     12543
      press       0.75      0.56      0.64       882
  publicite       0.70      0.42      0.53       218
   shopping       0.60      0.60      0.60      7266

avg / total       0.95      0.96      0.96    402432

[[370429     17     27     85    120     20    359   1552     56     10
    1474]
 [   362     61      0      0      5      0     34     67      2      0
      64]
 [    58      1    190      0      0      0     14     45      2      0
      28]
 [    60      0      0     95      6      0      2     26     

## Save model

In [23]:
model.save('./models/toulouse_cat_lstm_others_2017.h5')

In [24]:
words_df = pd.DataFrame(words_list, columns=['vocab'])
words_df.to_csv('./models/toulouse_cat_vocab_others_2017.csv', index=False, encoding='utf-8')

In [25]:
pd.DataFrame(target_names, columns=['toulouse_cat']).to_csv('./models/toulouse_cat_names_others_2017.csv', index=False)