In [1]:
import numpy as np
import pickle
np.set_printoptions(precision=4,suppress=True)
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [34]:
# Load the pickled corpus and its labels. The context managers guarantee the
# file handles are closed again (a bare open(...) inside the call leaks them).
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('documents.pl', 'rb') as documents_file:
    documents = pickle.load(documents_file)
with open('labels.pl', 'rb') as labels_file:
    labels = pickle.load(labels_file)

# Cast every label to str; entries without a label presumably end up as the
# literal string 'nan', which is what the later cells filter on.
labels = np.array(labels, dtype=str)

In [35]:
from collections import Counter
# Display how many documents carry each label (class distribution).
Counter(labels)

Counter({'A-Team': 62,
         'Abuse/Missbrauch': 106,
         'Anfragen in Arabisch': 1,
         'Dienste & Abos': 2,
         'Endgeräte': 1,
         'Entertain': 5,
         'Festnetz': 12524,
         'Festnetz (Mark./T-Shop)': 243,
         'Festnetz COM': 1565,
         'GK': 1604,
         'Hosting': 1,
         'Hosting FB/TW': 28,
         'Hotspot': 1963,
         'Hybrid & LTE': 8,
         'Hybrid FB/TW': 137,
         'MMS': 6819,
         'MMS (Mark./T-Shop)': 101,
         'MMS COM': 538,
         'Meinungsbildner': 80,
         'Mobile': 33,
         'Mobilfunk': 7984,
         'Mobilfunk (Mark./T-Shop)': 98,
         'Mobilfunk COM': 713,
         'Neue Themen': 3,
         'Smart Home FB/TW': 27,
         'Wechsler': 480,
         'nan': 73961})

# FEATURES

In [36]:
# Bag-of-words features: 1- to 3-gram counts, re-weighted with TF-IDF.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# The corpus is only passed to fit_transform. The first constructor parameter
# of CountVectorizer is `input` ('filename' | 'file' | 'content'), so handing
# `documents` to the constructor is wrong and is rejected by current
# scikit-learn releases.
# Terms must occur in at least 3 documents (min_df=3) to enter the vocabulary.
BoW = CountVectorizer(strip_accents='unicode', ngram_range=(1, 3), min_df=3)
X_counts = BoW.fit_transform(documents)

# fit_transform already fits the IDF weights; a separate .fit() beforehand
# only repeats the same work.
tf_transformer = TfidfTransformer()
X_bag = tf_transformer.fit_transform(X_counts)

In [37]:
# Keep only the rows whose label is not the placeholder string 'nan'.
# This cell overwrites X_bag with the filtered matrix while `labels` keeps its
# full length, so re-running it requires re-running the feature cell first.
has_label = labels != 'nan'
Y = labels[has_label]
X_bag = X_bag[has_label]

In [38]:
from sklearn import preprocessing

# Replace the string class names in Y by integer class ids. The fitted encoder
# `le` is kept so the ids can be mapped back to names later on.
le = preprocessing.LabelEncoder()
le.fit(Y)
Y = le.transform(Y)

# CLASSIFIERS

In [39]:
import xgboost as xgb
# Code for cross-validation
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold

In [40]:
from imblearn.over_sampling import RandomOverSampler

In [41]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

## XGBOOST

In [53]:
# Hyper-parameters handed to xgb.train in the cross-validation cell.
params = {
    'objective': 'multi:softmax',
    'eval_metric': 'mlogloss',
    'eta': 0.07,
    'max_depth': 40,
    'min_child_weight': 5,
    'lambda': 1,
    'nthread': 4,
    # one output class per distinct encoded label
    'num_class': len(np.unique(Y)),
}

In [54]:
# Stratified k-fold cross-validation of the XGBoost classifier.
#
# StratifiedKFold is imported here from sklearn.model_selection because the
# sklearn.cross_validation module (whose StratifiedKFold took the labels in the
# constructor and was itself iterable) was removed in scikit-learn 0.20.
from sklearn.model_selection import StratifiedKFold

# Create k stratified folds
k = 3
kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=0)

# One row per fold: [macro F1, accuracy]
results = np.zeros((k, 2))
models = []

# Iterate over the k train/validation splits
for i, (train_ind, test_ind) in enumerate(kfold.split(X_bag, Y)):
    X_train, X_val = X_bag[train_ind], X_bag[test_ind]
    Y_train, Y_val = Y[train_ind], Y[test_ind]

    print(X_train.shape, X_val.shape, Y_train.shape, Y_val.shape)
    D_train = xgb.DMatrix(X_train, label=Y_train)
    D_val = xgb.DMatrix(X_val, label=Y_val)

    # At most 400 boosting rounds; training stops early once the validation
    # loss has not improved for 50 rounds. The same validation fold is also
    # used for the scores below, so they are slightly optimistic.
    watchlist = [(D_train, 'train'), (D_val, 'valid')]
    model = xgb.train(params, D_train, 400, watchlist,
                      early_stopping_rounds=50, verbose_eval=10)
    models.append(model)

    # NOTE(review): depending on the xgboost version, predict() may use all
    # trained rounds rather than the best early-stopping iteration -- confirm.
    # The predictions are cast to int so they have the same dtype as the
    # encoded labels.
    predictions = model.predict(data=D_val).round().astype(int)
    f1 = f1_score(Y_val, predictions, average='macro')
    acc = accuracy_score(Y_val, predictions)
    results[i] = [f1, acc]
    print('f1 score: {}'.format(f1))
    # accuracy_score returns a fraction; format it as a real percentage
    # (the former '{}\%' printed the raw fraction followed by a literal '\%').
    print('accuracy: {:.2%}'.format(acc))
    print(confusion_matrix(Y_val, predictions))



(23410, 321767) (11716, 321767) (23410,) (11716,)
[0]	train-mlogloss:2.93537	valid-mlogloss:2.94323
Multiple eval metrics have been passed: 'valid-mlogloss' will be used for early stopping.

Will train until valid-mlogloss hasn't improved in 50 rounds.
[10]	train-mlogloss:1.88268	valid-mlogloss:1.95454
[20]	train-mlogloss:1.50185	valid-mlogloss:1.62844
[30]	train-mlogloss:1.29145	valid-mlogloss:1.46404
[40]	train-mlogloss:1.16056	valid-mlogloss:1.37141
[50]	train-mlogloss:1.0719	valid-mlogloss:1.31574
[60]	train-mlogloss:1.00872	valid-mlogloss:1.28193
[70]	train-mlogloss:0.960907	valid-mlogloss:1.26022
[80]	train-mlogloss:0.922655	valid-mlogloss:1.24604
[90]	train-mlogloss:0.891983	valid-mlogloss:1.2368
[100]	train-mlogloss:0.866196	valid-mlogloss:1.23089
[110]	train-mlogloss:0.844156	valid-mlogloss:1.22733
[120]	train-mlogloss:0.825074	valid-mlogloss:1.22593
[130]	train-mlogloss:0.807997	valid-mlogloss:1.22497
[140]	train-mlogloss:0.792289	valid-mlogloss:1.22547
[150]	train-mlogloss:0

  'precision', 'predicted', average, warn_for)


f1 score: 0.16383232505603076
accuracy: 0.6185558210993514\%
[[   0    0    0    0    0   16    0    1    0    0    0    0    0    2
     0    0    0    0    2    0    0    0    0    0]
 [   0    0    0    0    0   29    0    0    0    0    1    0    0    4
     0    0    0    0    2    0    0    0    0    0]
 [   0    0    0    0    0    1    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    1    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    1    0    0    0    0    0    0    0    1
     0    0    0    0    0    0    0    0    0    0]
 [   0    4    0    0    0 3320    2   17   39    0   14    0    9  493
     0    1    0    0  250    0    0    0    0   26]
 [   0    0    0    0    0   65    0    2    1    0    1    0    0    9
     2    0    0    0    1    0    0    0    0    0]
 [   0    0    0    0    0  387    2   17    6    0    3    0   

In [51]:
# Mean and variance over the folds, per column of `results` (macro F1, accuracy)
results.mean(axis=0), results.var(axis=0)

(array([0.1638, 0.6127]), array([0.0001, 0.    ]))

In [46]:
# Keep references (not copies) to the current boosters and fold scores so they
# stay reachable after `models` / `results` are rebound by another run.
models_cache = models
results_cache = results