In [1]:
%matplotlib inline
from __future__ import division
import os
from sklearn.datasets import load_files
from tqdm import tqdm
import numpy as np
import random
import scipy

In [2]:
train = load_files(os.path.join(os.getcwd(), '..', 'data', 'raw', '20news-bydate', '20news-bydate-train'), encoding = 'utf-8', decode_error= 'replace')
print train.keys()
test = load_files(os.path.join(os.getcwd(), '..', 'data', 'raw', '20news-bydate', '20news-bydate-test'), encoding = 'utf-8', decode_error= 'replace')
print train.keys()

['target_names', 'data', 'target', 'DESCR', 'filenames']
['target_names', 'data', 'target', 'DESCR', 'filenames']


In [3]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
import re

In [4]:
def preprocess(text):
    text = text.lower()
    tokenizer = RegexpTokenizer(r'[a-z]+')
    tokens = tokenizer.tokenize(text)
    filtered_words = [w for w in tokens if not w in stopwords]
    stemmed_words = [stemmer.stem(w) for w in filtered_words]
    return " ".join(stemmed_words)

In [5]:
for i in tqdm(range(len(train['data']))):
    train['data'][i] = preprocess(train['data'][i])
for i in tqdm(range(len(test['data']))):
    test['data'][i] = preprocess(test['data'][i])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11314/11314 [00:15<00:00, 725.30it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7532/7532 [00:09<00:00, 764.90it/s]


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [7]:
vect = CountVectorizer()
tfidf = TfidfTransformer()
train_vect = vect.fit_transform(train['data'])
test_vect = vect.transform(test['data'])
train_tfidf = tfidf.fit_transform(train_vect)
test_tfidf = tfidf.transform(test_vect)

In [8]:
train_target = train['target']
test_target = test['target']

In [9]:
from sklearn.svm import SVC
from sklearn.metrics import f1_score

In [10]:
clf = SVC(kernel='linear', probability= True, verbose = True, decision_function_shape= 'ovr')
%time clf.fit(train_tfidf, train_target)

[LibSVM]Wall time: 9min 7s


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=True)

In [11]:
predicted = clf.predict(test_tfidf)
f1_score(test_target, predicted, average= 'macro')

0.82622209840653338

In [12]:
print train_tfidf.shape
print len(train_target)

(11314, 69637)
11314


In [13]:
def sample_data_target(L_data, L_target, data, target, idxs):
    for idx, idxdel in enumerate(idxs):
        L_data = scipy.sparse.vstack((L_data, data.getrow(idxdel)))
    idxs = list(idxs)
    mask = np.ones(data.shape[0], bool)
    mask[idxs] = False
    data = data[mask]
    L_target = np.append(L_target, np.take(target, idxs))
    target = np.delete(target, idxs)
    return (L_data, L_target, data, target)

In [14]:
idxs = np.random.randint(len(train_target), size=100)
L_data, L_target, train_tfidf, train_target = sample_data_target(scipy.sparse.csr_matrix((0, train_tfidf.shape[1])), [], train_tfidf, train_target, idxs)

In [15]:
print train_tfidf.shape
print len(train_target)
print L_data.shape
print len(L_target)
import pandas as pd

(11215, 69637)
11215
(100, 69637)
100


In [16]:
clf = SVC(kernel='linear', probability= True, decision_function_shape= 'ovr')
for t in range(100):
    print t,'iter'
    %time clf.fit(L_data, L_target)
    predicted = clf.predict(test_tfidf)
    print 'train size', L_data.shape[0], 'f1 score on test', f1_score(test_target, predicted, average= 'macro')
    #choose 5 nearest to margin elements for each of 20 classes
    #choose 100 nearest to margin examples
    probs = clf.predict_proba(train_tfidf)
    #print probs.shape
    #idx_probs = enumerate(probs)
    #print idx_probs
    #print probs[0]
    probs = np.apply_along_axis(sorted, 1, probs)
    #print probs
    sorted_probs = []
    for arr in probs:
        sorted_probs = np.append(sorted_probs, arr[-1])
    #print sorted_probs
    idx_sorted_probs = np.argsort(sorted_probs)
    #print idx_sorted_probs
    idxs = idx_sorted_probs[:100]
    #idxs = np.random.randint(len(train_target), size=100)
    L_data, L_target, train_tfidf, train_target = sample_data_target(L_data, L_target, train_tfidf, train_target, idxs)

0 iter
Wall time: 83 ms
train size 100 f1 score on test

  'precision', 'predicted', average, warn_for)


 0.0836142428556
1 iter
Wall time: 841 ms
train size 200 f1 score on test 0.0641843660113
2 iter
Wall time: 1.42 s
train size 300 f1 score on test 0.09332023783
3 iter
Wall time: 2.01 s
train size 400 f1 score on test 0.267680497235
4 iter
Wall time: 2.63 s
train size 500 f1 score on test 0.349506113467
5 iter
Wall time: 3.36 s
train size 600 f1 score on test 0.465529141126
6 iter
Wall time: 4.95 s
train size 700 f1 score on test 0.534151854823
7 iter
Wall time: 6.27 s
train size 800 f1 score on test 0.583272900409
8 iter
Wall time: 6.69 s
train size 900 f1 score on test 0.60952853748
9 iter
Wall time: 7.17 s
train size 1000 f1 score on test 0.630735800882
10 iter
Wall time: 8.21 s
train size 1100 f1 score on test 0.644022087435
11 iter
Wall time: 9.76 s
train size 1200 f1 score on test 0.654454662065
12 iter
Wall time: 11.1 s
train size 1300 f1 score on test 0.660166096288
13 iter
Wall time: 12.7 s
train size 1400 f1 score on test 0.666518239872
14 iter
Wall time: 14.4 s
train size 15