In [1]:
from __future__ import print_function

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

NGRAMS = 2
FEATURE_LEN = 128
EPOCHS = 15

# Shalla
df = pd.read_csv('./data/shalla_2017.csv.bz2')
df


Unnamed: 0,domain,shalla_cat
0,nicoleaniston.org,porn
1,gadiz.hotfire.net,porn
2,turismorural.com,recreation/travel
3,esfollandodondesea.blogspot.de,porn
4,oliebollen.com,shopping
5,elpanasex.blogspot.gr,porn
6,pelawgrange.co.uk,recreation/sports
7,www.accesoplugin.com,spyware
8,globalamateurs.com,porn
9,seksfilmpjes.web-log.nl,porn


In [2]:
df['cat_count'] = df.shalla_cat.apply(lambda c: len(c.split('|')))
sdf = df[df.cat_count == 1]
sdf

Unnamed: 0,domain,shalla_cat,cat_count
0,nicoleaniston.org,porn,1
1,gadiz.hotfire.net,porn,1
2,turismorural.com,recreation/travel,1
3,esfollandodondesea.blogspot.de,porn,1
4,oliebollen.com,shopping,1
5,elpanasex.blogspot.gr,porn,1
6,pelawgrange.co.uk,recreation/sports,1
7,www.accesoplugin.com,spyware,1
8,globalamateurs.com,porn,1
9,seksfilmpjes.web-log.nl,porn,1


In [3]:
dom_group = sdf.groupby('shalla_cat').agg({'domain': 'count'})
dom_group

Unnamed: 0_level_0,domain
shalla_cat,Unnamed: 1_level_1
adv,12712
aggressive,483
alcohol,638
anonvpn,6981
automobile/bikes,893
automobile/boats,252
automobile/cars,3133
automobile/planes,674
chat,13428
costtraps,901


### Take out categories that have less than 1000 domains.

In [4]:
filter_cat = list(dom_group[dom_group.domain > 1000].index)

### Take out categories that have recall less than 0.3

In [5]:
cmat = """
                   adv       0.77      0.42      0.54      2542
               anonvpn       0.77      0.72      0.75      1396
       automobile/cars       0.48      0.19      0.27       627
                  chat       0.59      0.10      0.17      2686
                dating       0.63      0.11      0.19       519
             downloads       0.58      0.41      0.48       835
                 drugs       0.60      0.24      0.34      2227
               dynamic       0.66      0.51      0.58       213
     education/schools       0.86      0.79      0.82      2014
       finance/banking       0.69      0.56      0.62       998
     finance/insurance       0.90      0.83      0.87       616
  finance/moneylending       0.82      0.80      0.81       760
    finance/realestate       0.61      0.38      0.47       276
        fortunetelling       0.79      0.39      0.52       215
                 forum       0.79      0.75      0.77      1612
                gamble       0.79      0.74      0.76      2765
      hobby/games-misc       0.80      0.04      0.08       378
    hobby/games-online       0.67      0.48      0.56      2772
            hobby/pets       0.65      0.36      0.46      3233
             hospitals       0.87      0.67      0.76       327
             jobsearch       0.84      0.46      0.59       859
                models       0.00      0.00      0.00       345
                movies       0.64      0.54      0.58      1112
                 music       0.83      0.81      0.82      1784
                  news       0.52      0.24      0.33      7668
                others       0.50      0.10      0.16      3011
              politics       0.79      0.22      0.35       258
                  porn       0.86      0.92      0.89    165489
               radiotv       0.58      0.50      0.54       712
recreation/restaurants       0.70      0.26      0.38       282
     recreation/sports       0.62      0.64      0.63     24085
     recreation/travel       0.69      0.66      0.68     27789
            redirector       0.81      0.67      0.73      5873
              religion       0.89      0.81      0.84      1838
     science/astronomy       0.69      0.77      0.73       207
         searchengines       0.57      0.23      0.32       220
          sex/lingerie       0.50      0.35      0.41       211
              shopping       0.50      0.67      0.57     33452
               spyware       0.51      0.22      0.31      3666
               tracker       0.58      0.16      0.25       220
                 warez       0.47      0.16      0.23       374
               webmail       0.80      0.55      0.65       705
              webradio       0.54      0.37      0.44       451
"""

import re

excat = []
for l in cmat.split('\n'):
    a = re.split('\s+', l)
    if len(a) > 1:
        cat = a[1]
        recall = float(a[3])
        if recall < 0.3:
            excat.append(cat)
excat

['automobile/cars',
 'chat',
 'dating',
 'drugs',
 'hobby/games-misc',
 'models',
 'news',
 'others',
 'politics',
 'searchengines',
 'spyware',
 'tracker',
 'warez']

In [6]:
filter_cat = [x for x in filter_cat if x not in excat]
filter_cat

['adv',
 'anonvpn',
 'downloads',
 'dynamic',
 'education/schools',
 'finance/banking',
 'finance/insurance',
 'finance/moneylending',
 'finance/realestate',
 'fortunetelling',
 'forum',
 'gamble',
 'hobby/games-online',
 'hobby/pets',
 'hospitals',
 'jobsearch',
 'movies',
 'music',
 'porn',
 'radiotv',
 'recreation/restaurants',
 'recreation/sports',
 'recreation/travel',
 'redirector',
 'religion',
 'science/astronomy',
 'sex/lingerie',
 'shopping',
 'webmail',
 'webradio']

In [7]:
sdf.loc[sdf.shalla_cat.isin(filter_cat) == False, 'shalla_cat'] = 'others'
sdf

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,domain,shalla_cat,cat_count
0,nicoleaniston.org,porn,1
1,gadiz.hotfire.net,porn,1
2,turismorural.com,recreation/travel,1
3,esfollandodondesea.blogspot.de,porn,1
4,oliebollen.com,shopping,1
5,elpanasex.blogspot.gr,porn,1
6,pelawgrange.co.uk,recreation/sports,1
7,www.accesoplugin.com,others,1
8,globalamateurs.com,porn,1
9,seksfilmpjes.web-log.nl,porn,1


In [8]:
sdf.groupby('shalla_cat').agg({'domain': 'count'})

Unnamed: 0_level_0,domain
shalla_cat,Unnamed: 1_level_1
adv,12712
anonvpn,6981
downloads,4177
dynamic,1066
education/schools,10068
finance/banking,4989
finance/insurance,3081
finance/moneylending,3802
finance/realestate,1379
fortunetelling,1077


## Preprocessing the input data

In [9]:
# build n-gram list
#vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=3, ngram_range=(NGRAMS, NGRAMS), lowercase=False) 
vect = CountVectorizer(analyzer='char', ngram_range=(NGRAMS, NGRAMS), lowercase=False) 
a = vect.fit_transform(sdf.domain)
vocab = vect.vocabulary_

# sort n-gram by freq (highest -> lowest)
words = []
for b in vocab:
    c = vocab[b]
    #print(b, c, a[:, c].sum())
    words.append((a[:, c].sum(), b))
    #break
words = sorted(words, reverse=True)
words_list = [w[1] for w in words]
num_words = len(words_list)
print("num_words = %d" % num_words)


def find_ngrams(text, n):
    a = zip(*[text[i:] for i in range(n)])
    wi = []
    for i in a:
        w = ''.join(i)
        try:
            idx = words_list.index(w)
        except:
            idx = 0
        wi.append(idx)
    return wi

# build X from index of n-gram sequence
X = np.array(sdf.domain.apply(lambda c: find_ngrams(c, NGRAMS)))

# check max/avg feature
X_len = []
for x in X:
    X_len.append(len(x))

max_feature_len = max(X_len)
avg_feature_len = int(np.mean(X_len))



In [10]:
print("Max feature len = %d, Avg. feature len = %d" % (max_feature_len, avg_feature_len))
y = np.array(sdf.shalla_cat.astype('category').cat.codes)

# Split train and test dataset
X_train,  X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)
#X_train,  X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Max feature len = 119, Avg. feature len = 15


In [11]:
unique, counts = np.unique(y_test, return_counts=True)
dict(zip(unique, counts))

{0: 2542,
 1: 1396,
 2: 835,
 3: 213,
 4: 2014,
 5: 998,
 6: 616,
 7: 760,
 8: 276,
 9: 215,
 10: 1612,
 11: 2765,
 12: 2772,
 13: 3233,
 14: 327,
 15: 859,
 16: 1112,
 17: 1784,
 18: 22198,
 19: 165489,
 20: 712,
 21: 282,
 22: 24085,
 23: 27789,
 24: 5873,
 25: 1838,
 26: 207,
 27: 211,
 28: 33453,
 29: 705,
 30: 451}

In [12]:
unique, counts = np.unique(y_train, return_counts=True)
dict(zip(unique, counts))

{0: 10170,
 1: 5585,
 2: 3342,
 3: 853,
 4: 8054,
 5: 3991,
 6: 2465,
 7: 3042,
 8: 1103,
 9: 862,
 10: 6446,
 11: 11062,
 12: 11089,
 13: 12931,
 14: 1310,
 15: 3435,
 16: 4446,
 17: 7134,
 18: 88793,
 19: 661955,
 20: 2848,
 21: 1126,
 22: 96341,
 23: 111154,
 24: 23493,
 25: 7351,
 26: 828,
 27: 845,
 28: 133809,
 29: 2820,
 30: 1803}

## Train a LSTM model

In [13]:
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Activation
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import load_model

max_features = num_words # 20000
feature_len = FEATURE_LEN # avg_feature_len # cut texts after this number of words (among top max_features most common words)
batch_size = 32

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=feature_len)
X_test = sequence.pad_sequences(X_test, maxlen=feature_len)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

num_classes = np.max(y_train) + 1
print(num_classes, 'classes')

print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


1230486 train sequences
307622 test sequences
Pad sequences (samples x time)
X_train shape: (1230486, 128)
X_test shape: (307622, 128)
31 classes
Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (1230486, 31)
y_test shape: (307622, 31)


In [14]:
print('Build model...')

model = Sequential()
model.add(Embedding(num_words, 32, input_length=feature_len))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(num_classes, activation='softmax'))

# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print(model.summary())

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 128, 32)           61952     
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               82432     
_________________________________________________________________
dense_1 (Dense)              (None, 31)                3999      
Total params: 148,383
Trainable params: 148,383
Non-trainable params: 0
_________________________________________________________________
None


In [15]:
print('Train...')
model.fit(X_train, y_train, batch_size=batch_size, epochs=EPOCHS,
          validation_split=0.1, verbose=2)
score, acc = model.evaluate(X_test, y_test,
                            batch_size=batch_size, verbose=2)
print('Test score:', score)
print('Test accuracy:', acc)

Train...
Train on 1107437 samples, validate on 123049 samples
Epoch 1/15
 - 3874s - loss: 1.0598 - acc: 0.6711 - val_loss: 0.9050 - val_acc: 0.7163
Epoch 2/15
 - 3891s - loss: 0.8973 - acc: 0.7157 - val_loss: 0.8411 - val_acc: 0.7342
Epoch 3/15
 - 3765s - loss: 0.8578 - acc: 0.7279 - val_loss: 0.8084 - val_acc: 0.7453
Epoch 4/15
 - 3731s - loss: 0.8379 - acc: 0.7338 - val_loss: 0.7922 - val_acc: 0.7491
Epoch 5/15
 - 3736s - loss: 0.8261 - acc: 0.7370 - val_loss: 0.7793 - val_acc: 0.7532
Epoch 6/15
 - 3733s - loss: 0.8162 - acc: 0.7405 - val_loss: 0.7726 - val_acc: 0.7565
Epoch 7/15
 - 3728s - loss: 0.8107 - acc: 0.7425 - val_loss: 0.7667 - val_acc: 0.7579
Epoch 8/15
 - 3737s - loss: 0.8051 - acc: 0.7441 - val_loss: 0.7606 - val_acc: 0.7594
Epoch 9/15
 - 3734s - loss: 0.7997 - acc: 0.7458 - val_loss: 0.7568 - val_acc: 0.7602
Epoch 10/15
 - 3708s - loss: 0.7962 - acc: 0.7462 - val_loss: 0.7548 - val_acc: 0.7605
Epoch 11/15
 - 3751s - loss: 0.7939 - acc: 0.7471 - val_loss: 0.7493 - val_ac

## Confusion Matrix

In [16]:
y_pred = model.predict_classes(X_test, verbose=2)
p = model.predict_proba(X_test, verbose=2) # to predict probability
target_names = list(sdf.shalla_cat.astype('category').cat.categories)
print(classification_report(np.argmax(y_test, axis=1), y_pred, target_names=target_names))
print(confusion_matrix(np.argmax(y_test, axis=1), y_pred))

                        precision    recall  f1-score   support

                   adv       0.83      0.41      0.55      2542
               anonvpn       0.79      0.70      0.75      1396
             downloads       0.56      0.42      0.48       835
               dynamic       0.80      0.54      0.64       213
     education/schools       0.84      0.79      0.82      2014
       finance/banking       0.77      0.54      0.63       998
     finance/insurance       0.93      0.82      0.87       616
  finance/moneylending       0.92      0.79      0.85       760
    finance/realestate       0.61      0.44      0.51       276
        fortunetelling       0.68      0.36      0.47       215
                 forum       0.76      0.77      0.77      1612
                gamble       0.79      0.76      0.78      2765
    hobby/games-online       0.66      0.49      0.56      2772
            hobby/pets       0.62      0.39      0.48      3233
             hospitals       0.83      

## Save model

In [17]:
model.save('./models/shalla_cat_lstm_others_2017.h5')

In [18]:
words_df = pd.DataFrame(words_list, columns=['vocab'])
words_df.to_csv('./models/shalla_cat_vocab_others_2017.csv', index=False, encoding='utf-8')

In [19]:
pd.DataFrame(target_names, columns=['shalla_cat']).to_csv('./models/shalla_cat_names_others_2017.csv', index=False)