In [12]:
%matplotlib inline
from __future__ import division
import os
from tqdm import tqdm
from sklearn.datasets import load_files
import numpy as np
import random
import scipy
from scipy.sparse import csr_matrix
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import chi2
from sklearn.feature_selection import VarianceThreshold
from sklearn.cross_validation import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from joblib import Parallel, delayed
import multiprocessing

In [13]:
train = load_files(os.path.join(os.getcwd(), '..', 'data', 'raw', '20news-bydate', '20news-bydate-train'), encoding = 'utf-8', decode_error= 'replace')
print train.keys()
test = load_files(os.path.join(os.getcwd(), '..', 'data', 'raw', '20news-bydate', '20news-bydate-test'), encoding = 'utf-8', decode_error= 'replace')
print train.keys()

['target_names', 'data', 'target', 'DESCR', 'filenames']
['target_names', 'data', 'target', 'DESCR', 'filenames']


In [14]:
# Read the stop-word list through an aliased import so this cell can be re-run.
# The original rebound the imported `stopwords` name to the resulting set, so a
# second execution failed with AttributeError (a set has no `.words`).
# The name `stopwords` is kept as the set because `preprocess` reads it.
from nltk.corpus import stopwords as nltk_stopwords
stopwords = set(nltk_stopwords.words('english'))
stemmer = PorterStemmer()
wnl = WordNetLemmatizer()

In [15]:
def preprocess(text):
    """Normalise one raw document into a space-separated string of lemmas.

    The text is lower-cased and split into runs of the letters a-z; tokens
    found in the module-level ``stopwords`` set are discarded and the rest are
    lemmatised with the module-level ``wnl`` lemmatizer.
    """
    words = RegexpTokenizer(r'[a-z]+').tokenize(text.lower())
    return " ".join(wnl.lemmatize(word) for word in words if word not in stopwords)

In [16]:
# Replace every document of both splits, in place, by its preprocessed form.
for corpus in (train, test):
    docs = corpus['data']
    for idx in tqdm(range(len(docs))):
        docs[idx] = preprocess(docs[idx])

100%|██████████| 11314/11314 [00:09<00:00, 1210.72it/s]
100%|██████████| 7532/7532 [00:05<00:00, 1368.55it/s]


In [17]:
# Term counts: the vectorizer is fitted on the training documents only and
# then applied unchanged to the test documents.
vect = CountVectorizer()
train_vect = vect.fit_transform(train['data'])
test_vect = vect.transform(test['data'])

# TF-IDF re-weighting of the counts, again fitted on the training split only.
tfidf = TfidfTransformer()
train_tfidf = tfidf.fit_transform(train_vect)
test_tfidf = tfidf.transform(test_vect)

In [18]:
# Class labels of the two splits.
train_target, test_target = train['target'], test['target']

In [19]:
# Reference point: a linear SVM trained on every TF-IDF feature, scored by
# macro-averaged F1 on the test split.
clf = LinearSVC().fit(train_tfidf, train_target)
predicted = clf.predict(test_tfidf)
baseline = f1_score(test_target, predicted, average= 'macro')
baseline

0.84427513752902894

In [20]:
def delete_rows_csr(mat, indices):
    """Return ``mat`` with the rows listed in ``indices`` removed.

    mat     -- a scipy.sparse.csr_matrix; anything else raises ValueError.
    indices -- iterable of row positions to drop.
    """
    if isinstance(mat, scipy.sparse.csr_matrix):
        # Start from "keep everything" and switch off the unwanted rows.
        keep_rows = np.ones(mat.shape[0], dtype=bool)
        keep_rows[list(indices)] = False
        return mat[keep_rows]
    raise ValueError("works only for CSR format -- use .tocsr() first")
def delete_columns_csr(M, idx_to_drop):
    """Return ``M`` restricted to the columns whose index is not in ``idx_to_drop``.

    The surviving columns keep their ascending original order.
    """
    surviving = np.setdiff1d(np.arange(M.shape[1]), idx_to_drop)
    return M[:, surviving]

In [21]:
# Magnitude of the baseline SVM coefficients; within each row of clf.coef_ the
# feature indices are then ordered from the largest to the smallest magnitude.
feat_weights = np.absolute(clf.coef_)
sorted_feat_weights = np.argsort(feat_weights)[:, ::-1]

#cut_off from 1 to 82706
def experiment(cut_off):
    """Score a LinearSVC trained on the highest-weighted features only.

    From every row of ``sorted_feat_weights`` the first ``cut_off`` feature
    indices are taken; the union over all rows is the retained vocabulary and
    every other TF-IDF column is dropped from both splits.

    cut_off -- number of leading feature indices taken from each row.

    Returns the macro-averaged F1 score on the test split.
    """
    # Slice all rows at once and de-duplicate.  This keeps the indices integer
    # typed: growing an empty list with np.append yielded a float64 array,
    # which np.delete only tolerates with a deprecation warning on older NumPy
    # and rejects as an index on newer releases.  It also avoids re-copying the
    # accumulated array once per row.
    selected_words = np.unique(sorted_feat_weights[:, :cut_off])

    # Indices of every column that was NOT selected.
    all_range = np.arange(sorted_feat_weights.shape[1])
    mask = np.delete(all_range, selected_words)

    new_train_tfidf = delete_columns_csr(csr_matrix(train_tfidf), mask)
    new_test_tfidf = delete_columns_csr(csr_matrix(test_tfidf), mask)

    clf = LinearSVC()
    clf.fit(new_train_tfidf, train_target)
    predicted = clf.predict(new_test_tfidf)
    return f1_score(test_target, predicted, average= 'macro')

#inputs = range(1, 82706)
inputs = np.linspace(1, sorted_feat_weights.shape[1], num=100)
num_cores = multiprocessing.cpu_count()
%time results = Parallel(n_jobs=4)(delayed(experiment)(int(i)) for i in inputs)
sorted_results = np.flipud(np.sort(results))
sorted_permutation = np.flipud(np.argsort(results))
nums = inputs.astype(int)
for i in range(10):
    print nums[sorted_permutation[i]], sorted_results[i], sorted_results[i] - baseline

CPU times: user 184 ms, sys: 40 ms, total: 224 ms
Wall time: 1min 5s
4178 0.845163696052 0.000888558522885
5013 0.84482129586 0.000546158331172
5848 0.844727720845 0.000452583316319
12532 0.844416957986 0.000141820456874
16709 0.844406965122 0.000131827592775
15873 0.844406965122 0.000131827592775
17544 0.844406965122 0.000131827592775
18379 0.844399687452 0.000124549922601
3342 0.844354332317 7.91947874914e-05
6684 0.844310686665 3.55491360136e-05


In [22]:
# Chi-squared statistic (and p-value) of every TF-IDF feature against the labels.
chi2_stat, pval_stat = chi2(train_tfidf, train_target)

# Feature indices ordered from the highest to the lowest statistic.
sorted_chi2_stat = np.argsort(chi2_stat)[::-1]

#cut_off from 1 to 82706
def experiment(cut_off):
    """Return the test macro-F1 of a LinearSVC limited to the ``cut_off``
    features that come first in ``sorted_chi2_stat``."""
    kept = sorted_chi2_stat[:cut_off]
    # Every feature index outside the kept set.
    mask = np.delete(range(len(sorted_chi2_stat)), kept)

    reduced_train = delete_columns_csr(csr_matrix(train_tfidf), mask)
    reduced_test = delete_columns_csr(csr_matrix(test_tfidf), mask)

    model = LinearSVC()
    model.fit(reduced_train, train_target)
    return f1_score(test_target, model.predict(reduced_test), average= 'macro')

#inputs = range(1, 82706)
inputs = np.linspace(1, len(sorted_chi2_stat), num=100)
num_cores = multiprocessing.cpu_count()
%time results = Parallel(n_jobs=4)(delayed(experiment)(int(i)) for i in inputs)
sorted_results = np.flipud(np.sort(results))
sorted_permutation = np.flipud(np.argsort(results))
nums = inputs.astype(int)
for i in range(10):
    print nums[sorted_permutation[i]], sorted_results[i], sorted_results[i] - baseline

  'precision', 'predicted', average, warn_for)


CPU times: user 164 ms, sys: 48 ms, total: 212 ms
Wall time: 51.3 s
25898 0.845403251549 0.00112811401997
25063 0.84535744091 0.00108230338104
30075 0.845127006065 0.000851868536233
28404 0.845114341534 0.000839204004848
38429 0.845078515479 0.000803377949597
52631 0.845057878286 0.000782740756973
53466 0.84504366304 0.000768525510919
40100 0.845013053704 0.000737916175461
46783 0.845005138938 0.000730001409358
49289 0.844910629664 0.000635492134663


In [23]:
# Fit a VarianceThreshold to obtain the variance of every TF-IDF feature.
vth = VarianceThreshold().fit(train_tfidf)
vths = vth.variances_

# Feature indices ordered from the most to the least variable.
sorted_vths = np.argsort(vths)[::-1]

#cut_off from 1 to 82706
def experiment(cut_off):
    """Return the test macro-F1 of a LinearSVC limited to the ``cut_off``
    features that come first in ``sorted_vths``."""
    kept = sorted_vths[:cut_off]
    # Every feature index outside the kept set.
    mask = np.delete(range(len(sorted_vths)), kept)

    reduced_train = delete_columns_csr(csr_matrix(train_tfidf), mask)
    reduced_test = delete_columns_csr(csr_matrix(test_tfidf), mask)

    model = LinearSVC()
    model.fit(reduced_train, train_target)
    return f1_score(test_target, model.predict(reduced_test), average= 'macro')

#inputs = range(1, 82706)
inputs = np.linspace(1, len(sorted_vths), num=100)
num_cores = multiprocessing.cpu_count()
%time results = Parallel(n_jobs=4)(delayed(experiment)(int(i)) for i in inputs)
sorted_results = np.flipud(np.sort(results))
sorted_permutation = np.flipud(np.argsort(results))
nums = inputs.astype(int)
for i in range(10):
    print nums[sorted_permutation[i]], sorted_results[i], sorted_results[i] - baseline

  'precision', 'predicted', average, warn_for)


CPU times: user 160 ms, sys: 60 ms, total: 220 ms
Wall time: 55.6 s
45948 0.845233292849 0.000958155320402
27569 0.845215900352 0.000940762823013
49289 0.845203424001 0.00092828647192
50125 0.845203096661 0.000927959132212
48454 0.845201588551 0.000926451022393
47619 0.845201261212 0.000926123682684
30075 0.845106480352 0.000831342823362
45112 0.845097201104 0.000822063575152
46783 0.845073315677 0.000798178147991
35923 0.845071104059 0.00079596652998


In [None]:
# Per-feature variances of the training TF-IDF matrix.
vth = VarianceThreshold().fit(train_tfidf)
vths = vth.variances_
#importances array vths
def experiment(importance_array):
    sorted_vths = np.flipud(np.argsort(vths))

    def get_scores(cut_off):
        selected_words = []
        selected_words = sorted_vths[:cut_off]
        
        all_range = range(len(sorted_vths))

        mask = np.delete(all_range, selected_words)

        new_train_tfidf = delete_columns_csr(csr_matrix(train_tfidf), mask)
        new_test_tfidf = delete_columns_csr(csr_matrix(test_tfidf), mask)

        clf = LinearSVC()
        clf.fit(new_train_tfidf, train_target)
        predicted = clf.predict(new_test_tfidf)
        return np.mean(cross_val_score(LinearSVC(), new_train_tfidf, train_target, scoring='f1_macro')), f1_score(test_target, predicted, average= 'macro')

    inputs = np.linspace(1, len(sorted_vths), num=100)
    num_cores = multiprocessing.cpu_count()
    %time results = Parallel(n_jobs=4)(delayed(experiment)(int(i)) for i in inputs)
    sorted_results = np.flipud(np.sort(results))
    sorted_permutation = np.flipud(np.argsort(results))
    nums = inputs.astype(int)
    for i in range(10):
        print nums[sorted_permutation[i]], sorted_results[i], sorted_results[i] - baseline

In [24]:
#cut_off from 1 to 82706
def experiment2(cut_off):
    selected_words = []
    selected_words = sorted_vths[:cut_off]
    #print len(selected_words)
    all_range = range(len(sorted_vths))

    mask = np.delete(all_range, selected_words)

    new_train_tfidf = delete_columns_csr(csr_matrix(train_tfidf), mask)
    new_test_tfidf = delete_columns_csr(csr_matrix(test_tfidf), mask)

    clf = LinearSVC()
    clf.fit(new_train_tfidf, train_target)
    predicted = clf.predict(new_test_tfidf)
    #predicted2 = clf.predict(new_train_tfidf)
    #print f1_score(test_target, predicted, average= 'macro'), f1_score(train_target, predicted2, average= 'macro')
    print f1_score(test_target, predicted, average= 'macro')
    print np.mean(cross_val_score(LinearSVC(), new_train_tfidf, train_target, scoring='f1_macro'))

In [25]:
# Report test and cross-validated macro-F1 for two variance-ranked cut-offs.
for _cut_off in (45000, 20000):
    experiment2(_cut_off)

0.845097201104
0.911937191379
0.84360180173
0.910835246139
