In [62]:
%matplotlib inline
from __future__ import division
import os
from tqdm import tqdm
from sklearn.datasets import load_files
import numpy as np
import random
import scipy
from scipy.sparse import csr_matrix
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import chi2
from sklearn.feature_selection import VarianceThreshold
from sklearn.cross_validation import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from joblib import Parallel, delayed
import multiprocessing

In [18]:
# Load the pre-split 20 Newsgroups corpus from disk. Bytes that cannot be
# decoded as UTF-8 are replaced instead of raising.
data_root = os.path.join(os.getcwd(), '..', 'data', 'raw', '20news-bydate')
train = load_files(os.path.join(data_root, '20news-bydate-train'), encoding = 'utf-8', decode_error= 'replace')
print(train.keys())
test = load_files(os.path.join(data_root, '20news-bydate-test'), encoding = 'utf-8', decode_error= 'replace')
# Inspect the test bunch itself here, not the train bunch a second time.
print(test.keys())

['target_names', 'data', 'target', 'DESCR', 'filenames']
['target_names', 'data', 'target', 'DESCR', 'filenames']


In [19]:
# NOTE: this rebinds the name `stopwords` from the imported NLTK corpus reader
# to a plain set of English stop words. The guard keeps the cell re-runnable:
# once `stopwords` is already a set it has no `.words` attribute, and calling
# it again would raise AttributeError.
if hasattr(stopwords, 'words'):
    stopwords = set(stopwords.words('english'))
stemmer = PorterStemmer()
wnl = WordNetLemmatizer()

In [20]:
def preprocess(text):
    """Normalise one document into a space-separated string of lemmas.

    The text is lower-cased and split into runs of the letters a-z. Tokens
    found in the module-level ``stopwords`` collection are dropped, and every
    remaining token is passed through ``wnl.lemmatize``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The lemmatised tokens joined by single spaces.
    """
    words = RegexpTokenizer(r'[a-z]+').tokenize(text.lower())
    lemmas = []
    for word in words:
        if word in stopwords:
            continue
        lemmas.append(wnl.lemmatize(word))
    return " ".join(lemmas)

In [21]:
# Overwrite every raw document, in place, with its preprocessed form.
# The train split is processed first, then the test split; tqdm reports progress.
for split in (train, test):
    docs = split['data']
    for i in tqdm(range(len(docs))):
        docs[i] = preprocess(docs[i])

100%|██████████| 11314/11314 [00:09<00:00, 1152.44it/s]
100%|██████████| 7532/7532 [00:05<00:00, 1343.97it/s]


In [22]:
# Term counts: the vectorizer is fitted on the training documents only and
# then applied unchanged to the test documents.
vect = CountVectorizer()
train_vect = vect.fit_transform(train['data'])
test_vect = vect.transform(test['data'])

# TF-IDF re-weighting: fitted on the training counts, then applied to the
# test counts.
tfidf = TfidfTransformer()
train_tfidf = tfidf.fit_transform(train_vect)
test_tfidf = tfidf.transform(test_vect)

In [23]:
# Label arrays of the two splits, pulled out of the loaded bunches.
train_target, test_target = train['target'], test['target']

In [24]:
# Baseline: a LinearSVC fitted on every TF-IDF feature and scored with
# macro-averaged F1 on the test split. `baseline` is the reference value the
# later feature-selection experiments are compared against.
clf = LinearSVC().fit(train_tfidf, train_target)
predicted = clf.predict(test_tfidf)
baseline = f1_score(test_target, predicted, average= 'macro')
baseline

0.84427513752902894

In [25]:
def delete_rows_csr(mat, indices):
    """Return a copy of ``mat`` without the rows listed in ``indices``.

    Parameters
    ----------
    mat : scipy.sparse.csr_matrix
        Matrix to take rows from.
    indices : iterable of int
        Row positions to remove.

    Returns
    -------
    scipy.sparse.csr_matrix
        The remaining rows, in their original order.

    Raises
    ------
    ValueError
        If ``mat`` is not a ``csr_matrix``.
    """
    if isinstance(mat, scipy.sparse.csr_matrix):
        # Boolean row selector: start with every row kept, then switch off
        # the rows that were asked to be removed.
        keep_row = np.ones(mat.shape[0], dtype=bool)
        keep_row[list(indices)] = False
        return mat[keep_row]
    raise ValueError("works only for CSR format -- use .tocsr() first")
def dropcols_fancy(M, idx_to_drop):
    """Return ``M`` without the columns whose positions appear in ``idx_to_drop``.

    Parameters
    ----------
    M : 2-D array or sparse matrix
        Anything with ``shape[1]`` that accepts ``M[:, integer_array]``.
    idx_to_drop : array-like
        Column positions to remove. Matching is by value, so positions that
        do not occur in ``range(M.shape[1])`` are ignored.

    Returns
    -------
    Same kind of object as ``M[:, index_array]`` yields
        The remaining columns in ascending column order.
    """
    # np.in1d is deprecated since NumPy 2.0 in favour of np.isin; prefer the
    # latter and only fall back on NumPy versions that predate it.
    membership = np.isin if hasattr(np, 'isin') else np.in1d
    dropped = membership(np.arange(M.shape[1]), idx_to_drop)
    return M[:, np.flatnonzero(~dropped)]

In [26]:
# Rank the features of each row of the baseline classifier's coefficient
# matrix by absolute weight, largest first. Each row of `sorted_feat_weights`
# holds column indices.
feat_weights = np.absolute(clf.coef_)
sorted_feat_weights = np.argsort(feat_weights)[:, ::-1]
print(sorted_feat_weights.shape)

(20, 82706)


In [35]:
# cut_off: number of top-ranked features taken from each row of
# sorted_feat_weights (from 1 up to the number of features, 82706).
def experiment(cut_off):
    """Retrain LinearSVC on the union of each row's top ``cut_off`` features.

    Takes the first ``cut_off`` column indices from every row of
    ``sorted_feat_weights``, keeps only those TF-IDF columns in the train and
    test matrices, fits a fresh ``LinearSVC`` on the reduced training matrix
    and scores it on the reduced test matrix.

    Two diagnostics are printed: the number of picked indices before and
    after de-duplication.

    Parameters
    ----------
    cut_off : int
        How many leading indices to take from each ranking row.

    Returns
    -------
    float
        Macro-averaged F1 score on the test split.
    """
    top_per_row = sorted_feat_weights[:, :cut_off]
    print(top_per_row.size)
    # np.unique flattens, de-duplicates and sorts in one pass. Unlike growing
    # an array with np.append from an empty list (which produces float64), it
    # keeps the integer dtype that column indexing requires.
    selected_words = np.unique(top_per_row)
    print(len(selected_words))

    # Selecting the wanted columns directly is equivalent to computing their
    # complement and dropping it.
    new_train_tfidf = csr_matrix(train_tfidf)[:, selected_words]
    new_test_tfidf = csr_matrix(test_tfidf)[:, selected_words]

    clf = LinearSVC()
    clf.fit(new_train_tfidf, train_target)
    predicted = clf.predict(new_test_tfidf)
    return f1_score(test_target, predicted, average= 'macro')

In [28]:
#inputs = range(1, 82706)
inputs = np.linspace(1, 82706, num=1000)
num_cores = multiprocessing.cpu_count()
%time results = Parallel(n_jobs=4)(delayed(experiment)(int(i)) for i in inputs)
sorted_results = np.flipud(np.sort(results))
sorted_permutation = np.flipud(np.argsort(results))
nums = inputs.astype(int)
for i in range(10):
    print nums[sorted_permutation[i]], sorted_results[i], sorted_results[i] - baseline

CPU times: user 1.14 s, sys: 232 ms, total: 1.37 s
Wall time: 11min 16s
5133 0.845077651804 0.000802514274982
4802 0.845074192667 0.000799055138382
4057 0.845071231038 0.000796093509315
2898 0.845062923083 0.000787785554279
4885 0.845061343285 0.000786205756063
4554 0.84504004397 0.00076490644065
4637 0.845039646469 0.000764508939528
4223 0.845036520927 0.000761383397696
4140 0.845031724703 0.000756587173762
4471 0.844959931241 0.000684793711534


In [30]:
# Full ranking of the sweep: cut-off, macro-F1, and difference to `baseline`,
# best score first.
for rank in range(1000):
    cut_off_at_rank = nums[sorted_permutation[rank]]
    score_at_rank = sorted_results[rank]
    print('%s %s %s' % (cut_off_at_rank, score_at_rank, score_at_rank - baseline))

5133 0.845077651804 0.000802514274982
4802 0.845074192667 0.000799055138382
4057 0.845071231038 0.000796093509315
2898 0.845062923083 0.000787785554279
4885 0.845061343285 0.000786205756063
4554 0.84504004397 0.00076490644065
4637 0.845039646469 0.000764508939528
4223 0.845036520927 0.000761383397696
4140 0.845031724703 0.000756587173762
4471 0.844959931241 0.000684793711534
5630 0.84492641422 0.000651276691399
5796 0.844918047336 0.000642909807403
5713 0.844917605334 0.000642467804867
5547 0.844906900875 0.000631763345843
4719 0.844904208263 0.000629070733809
5216 0.844867737944 0.000592600415247
2981 0.84486689856 0.00059176103115
5051 0.844827846751 0.000552709221647
4968 0.844801756966 0.000526619437316
4388 0.844790461203 0.000515323673706
6375 0.844782921272 0.000507783742871
5464 0.844775783605 0.00050064607642
4305 0.844763870464 0.000488732934597
5382 0.844763851443 0.000488713913497
3974 0.844697345454 0.000422207925011
5299 0.844690353347 0.000415215818126
3229 0.84465436757

In [36]:
# Single run at cut_off=5000; the function's own diagnostics print first,
# followed by the returned macro-F1.
print(experiment(5000))

100000
34436
0.84482129586


In [38]:
# Chi-squared statistic and p-value of every TF-IDF feature against the
# training labels.
(chi2_stat, pval_stat) = chi2(X=train_tfidf, y=train_target)

In [43]:
# Feature indices ordered by chi-squared statistic, largest first.
sorted_chi2_stat = np.argsort(chi2_stat)[::-1]
print(len(sorted_chi2_stat))

82706


In [48]:
# cut_off: number of leading entries of sorted_chi2_stat to keep
# (from 1 up to the number of features, 82706).
def experiment(cut_off):
    """Retrain LinearSVC on the ``cut_off`` features ranked first by chi-squared.

    Keeps only the TF-IDF columns named by the first ``cut_off`` entries of
    ``sorted_chi2_stat`` (in ascending column order), fits a fresh
    ``LinearSVC`` on the reduced training matrix and scores it on the
    reduced test matrix.

    Parameters
    ----------
    cut_off : int
        How many leading entries of the ranking to keep.

    Returns
    -------
    float
        Macro-averaged F1 score on the test split.
    """
    # Ascending order matches what dropping the complement would leave behind.
    kept_columns = np.sort(sorted_chi2_stat[:cut_off])

    reduced_train = csr_matrix(train_tfidf)[:, kept_columns]
    reduced_test = csr_matrix(test_tfidf)[:, kept_columns]

    model = LinearSVC()
    model.fit(reduced_train, train_target)
    return f1_score(test_target, model.predict(reduced_test), average= 'macro')

In [51]:
#inputs = range(1, 82706)
inputs = np.linspace(1, 82706, num=1000)
num_cores = multiprocessing.cpu_count()
%time results = Parallel(n_jobs=4)(delayed(experiment)(int(i)) for i in inputs)
sorted_results = np.flipud(np.sort(results))
sorted_permutation = np.flipud(np.argsort(results))
nums = inputs.astype(int)
for i in range(10):
    print nums[sorted_permutation[i]], sorted_results[i], sorted_results[i] - baseline

  'precision', 'predicted', average, warn_for)


CPU times: user 856 ms, sys: 272 ms, total: 1.13 s
Wall time: 7min 56s
25002 0.845551654618 0.00127651708898
25830 0.845403251549 0.00112811401997
28562 0.84538421521 0.00110907768106
25748 0.845338688855 0.00106355132608
24920 0.845312024734 0.00103688720524
25168 0.845278259717 0.00100312218779
28645 0.845244742408 0.00096960487857
28479 0.845242521761 0.000967384231727
28976 0.845237580284 0.00096244275496
28893 0.845237302441 0.000962164912301


In [53]:
# Fit a VarianceThreshold on the training TF-IDF matrix only to read the
# per-feature variances it computes.
vth = VarianceThreshold().fit(train_tfidf)
vths = vth.variances_

In [54]:
# Feature indices ordered by variance, largest first.
sorted_vths = np.argsort(vths)[::-1]
print(len(sorted_vths))

82706


In [55]:
# cut_off: number of leading entries of sorted_vths to keep
# (from 1 up to the number of features, 82706).
def experiment(cut_off):
    """Retrain LinearSVC on the ``cut_off`` features ranked first by variance.

    Keeps only the TF-IDF columns named by the first ``cut_off`` entries of
    ``sorted_vths`` (in ascending column order), fits a fresh ``LinearSVC``
    on the reduced training matrix and scores it on the reduced test matrix.

    Parameters
    ----------
    cut_off : int
        How many leading entries of the ranking to keep.

    Returns
    -------
    float
        Macro-averaged F1 score on the test split.
    """
    # Ascending order matches what dropping the complement would leave behind.
    kept_columns = np.sort(sorted_vths[:cut_off])

    reduced_train = csr_matrix(train_tfidf)[:, kept_columns]
    reduced_test = csr_matrix(test_tfidf)[:, kept_columns]

    model = LinearSVC()
    model.fit(reduced_train, train_target)
    return f1_score(test_target, model.predict(reduced_test), average= 'macro')

In [57]:
#inputs = range(1, 82706)
inputs = np.linspace(1, 82706, num=100)
num_cores = multiprocessing.cpu_count()
%time results = Parallel(n_jobs=4)(delayed(experiment)(int(i)) for i in inputs)
sorted_results = np.flipud(np.sort(results))
sorted_permutation = np.flipud(np.argsort(results))
nums = inputs.astype(int)
for i in range(10):
    print nums[sorted_permutation[i]], sorted_results[i], sorted_results[i] - baseline

  'precision', 'predicted', average, warn_for)


CPU times: user 176 ms, sys: 96 ms, total: 272 ms
Wall time: 52.2 s
45948 0.845233292849 0.000958155320402
27569 0.845215900352 0.000940762823013
49289 0.845203424001 0.00092828647192
50125 0.845203096661 0.000927959132212
48454 0.845201588551 0.000926451022393
47619 0.845201261212 0.000926123682684
30075 0.845106480352 0.000831342823362
45112 0.845097201104 0.000822063575152
46783 0.845073315677 0.000798178147991
35923 0.845071104059 0.00079596652998


In [69]:
# cut_off: number of leading entries of sorted_vths to keep
# (from 1 up to the number of features, 82706).
def experiment2(cut_off):
    """Print test macro-F1 and mean cross-validated macro-F1 for one cut-off.

    Keeps only the TF-IDF columns named by the first ``cut_off`` entries of
    ``sorted_vths`` (in ascending column order). It first prints the
    macro-averaged F1 of a ``LinearSVC`` fitted on the reduced training
    matrix and evaluated on the reduced test matrix. It then prints the mean
    of ``cross_val_score`` with ``scoring='f1_macro'`` for a fresh
    ``LinearSVC`` on the reduced training matrix.

    Parameters
    ----------
    cut_off : int
        How many leading entries of the ranking to keep.

    Returns
    -------
    None
        Both numbers are printed, not returned.
    """
    # Ascending order matches what dropping the complement would leave behind.
    kept_columns = np.sort(sorted_vths[:cut_off])

    reduced_train = csr_matrix(train_tfidf)[:, kept_columns]
    reduced_test = csr_matrix(test_tfidf)[:, kept_columns]

    model = LinearSVC()
    model.fit(reduced_train, train_target)
    test_f1 = f1_score(test_target, model.predict(reduced_test), average= 'macro')
    print(test_f1)

    cv_scores = cross_val_score(LinearSVC(), reduced_train, train_target, scoring='f1_macro')
    print(np.mean(cv_scores))

In [70]:
# Compare test-set and cross-validated macro-F1 at two variance cut-offs.
for cut_off in (45000, 20000):
    experiment2(cut_off)

0.845097201104
0.911937191379
0.84360180173
0.910835246139
