In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix
import numpy as np

In [3]:
# fetch train and test datasets
train_data = fetch_20newsgroups(subset='train')
news_group_train_y = train_data['target']
test_data = fetch_20newsgroups(subset='test')
news_group_test_y = test_data['target']
# convert into unigram matrix
vectorizer = TfidfVectorizer()
news_group_train_X = vectorizer.fit_transform(train_data['data'])
news_group_test_X = vectorizer.transform(test_data['data'])
# convert into a sparse matrix
news_group_train_X = csr_matrix(news_group_train_X)
news_group_test_X = csr_matrix(news_group_test_X)

I run Logistic Regression on 20NG since it works better than Decision Trees. Using Chi squared works better than Mutual Information.

## Chi-squared

In [4]:
nn_chi2, _ = chi2(news_group_train_X, news_group_train_y)

In [5]:
print("200 Features with best Chi-squared score")
indices = np.argsort(nn_chi2)[-200:]
print(indices)

200 Features with best Chi-squared score
[ 74944 103273  55328  53491 107529  39924  92868  32556  92893  61742
  95455  27442  74953  34726  86963 126615  94323  44222  73126 121840
  48395  79831  32754 109035  37820 106447  34048 129693 109067  89396
  38493  55307  58076 108726  99267 122997  32390 113763  67296  47664
 124478  43969  43710  68628  39601  34137  27576  43135 115062  89005
  88240 101947  72234  40422  68541 105791  46777 105760 103588  68696
  34341  48665  30431 110966  54240  77295  56466  34842  62688 103554
  75081  52533  68620  89612 123603  27366 105785 106243  65577  94343
  74917  58306  94342  12661  59910 101960  49463  46846  62410  94862
  51588  95989  54590 101944 124573  69448  93536  68617 106869 107159
  46509  38662  49047  27739  30128  30090 117015  40286  30827  94021
 105696  29114  74753  93400 125155  35760 125671  37469 117029 105155
  86099  69411  42390  49717  67773  83898 127115  98213  32046  33306
  33199  39583  39596  60724 108764 

In [6]:
vocabulary = {}
for k, v in vectorizer.vocabulary_.items():
    vocabulary[v] = k

In [13]:
indices = np.argsort(nn_chi2)[-200:]
str = ""
for i in range(200):
    #print(train_data.target_names[i])
    str += vocabulary[indices[i]] + " "
print(str)

lciii rsa files expo sin circuit penguins behanna pens hamburg prb allan lcs br newton xlib play cview kmr4 vlb doctor mathew benedikt sox card sgi bmw zoo spacecraft offer cdt file games soldiers rangers waco beauchaine teams ini disease wings cubs cryptography islanders christianity bobbe alomar cpr tiff objective nsmca rider kent clinton isa security detroit secret rutgers israelis bontchev dos aurora stratus faith lunar forsale braves helmet rushdie lebanese espn islamic okcforum weapons algorithm secure server ico playoffs lc gay playoff 3d government riding duo devils he polygon encrypted privacy fbi ride wiretap jake phillies islam shipping shuttle des centris drive amanda athos atf turkey clayton ax pitching season arabs launch pgp wpd bus x11r5 caltech turks schneider nasa jaeger controller dyer intercon motorcycle xterm quadra batf bikes bible christ christian gtoal solntze players leafs argic firearms church kaldis widget serdar orbit morality keys apple game moon alaska arm

In [40]:
reduced_news_group_train_X = news_group_train_X[:, indices]
reduced_news_group_test_X = news_group_test_X[:, indices]

In [41]:
%%time
nn_clf = LogisticRegression(penalty='l2', n_jobs=-1, solver='sag')
nn_clf.fit(reduced_news_group_train_X, news_group_train_y.ravel())

CPU times: user 10.6 s, sys: 25.3 s, total: 35.9 s
Wall time: 25.5 s


In [42]:
print("Train Accuracy score: ", nn_clf.score(reduced_news_group_train_X, news_group_train_y))
print("Test Accuracy score: ", nn_clf.score(reduced_news_group_test_X, news_group_test_y))

Train Accuracy score:  0.6741205586
Test Accuracy score:  0.6107275624


## Mutual Information

In [14]:
nn_mutual_info = mutual_info_classif(news_group_train_X, news_group_train_y)

In [46]:
print("200 Features with best Mutual Information score")
indices = np.argsort(nn_mutual_info)[-200:]
print(indices)

200 Features with best Mutual Information score
[ 63246  64200  96047   6475 114882  32988 128022  59195  28621   8266
  76211  76715   4605  48753  78784  76683  52641  99755  55011  85032
  34523  48450  75390 114688 124640  90763  91190  38019 108809 114692
  59860  32976  76377  28421 119781  86839  94725  86914  32651   2336
 112031 124061 128026  74675  98356  59686 104361 110355  81998  62696
  78955 112674  32517  55411  96064  89377 113279  31414  43740 110697
  68857 107539  47139  41614 125017  90946 119714 108821 119740  99822
  32596 115663  59590 101990 104494  26605  86493 120941  94362 119701
 114428 111533 123196 104830  79371  37219  55525 117211  80005 124031
  83836 114800  63333  62821  68003 125053 105818  32422 124198  84681
  79055 121265 123422  28615 107022  52907  88034 114625  86864 119737
 123759  27721 101034  59779  42876 114520  61546 123796  62410 115133
 114494  32491  27618 114508  92923  47982  48448 114418 114696  90686
  58830  89919 119451 124055 

In [15]:
indices = np.argsort(nn_mutual_info)[-200:]
str = ""
for i in range(200):
    str += vocabulary[indices[i]] + " "
print(str)

him however probably 1993 through between year give anything 20 little look 15 down made long etc read few must both doesn let thing without our own case someone things got better ll another using never point news believe 10 sure while years last question going said state might help mail system before find problem off take back cs still its since did computer work over usa something used really being too go right same after need ve please us thanks such want say many ca first two may where most those his here into world see because why much make very way anyone should even now these new use well am reply good could then had were he time their been also them people distribution does than think other get only up which we how when more who know your don just some like out will no me has so any do about would by one university all my what there they was an nntp com host at posting can as article or but if writes are not with be you have this on edu re that it for is and in of to the organi

In [47]:
reduced_news_group_train_X = news_group_train_X[:, indices]
reduced_news_group_test_X = news_group_test_X[:, indices]

In [48]:
%%time
nn_clf = LogisticRegression(penalty='l2', n_jobs=-1, solver='sag')
nn_clf.fit(reduced_news_group_train_X, news_group_train_y.ravel())

CPU times: user 14.9 s, sys: 28.1 s, total: 42.9 s
Wall time: 31 s


In [49]:
print("Train Accuracy score: ", nn_clf.score(reduced_news_group_train_X, news_group_train_y))
print("Test Accuracy score: ", nn_clf.score(reduced_news_group_test_X, news_group_test_y))

Train Accuracy score:  0.362471274527
Test Accuracy score:  0.29580456718


### L1 Regularization

In [65]:
# Coarse Parameter Search
Cs = [10, 5, 1, 0.1]
for c in Cs:
    nn_clf = LogisticRegression(penalty='l1', C=c)
    nn_clf.fit(news_group_train_X, news_group_train_y)
    non_zero_coef = nn_clf.coef_[nn_clf.coef_ > 0.0].shape
    print("Non zeros coefficients: ", non_zero_coef)

Non zeros coefficients:  (5887,)


Non zeros coefficients:  (5046,)


Non zeros coefficients:  (1378,)


Non zeros coefficients:  (67,)


In [66]:
# Fine parameter search between 1 and 0.1

Cs = [0.8, 0.6, 0.4, 0.2]
for c in Cs:
    nn_clf = LogisticRegression(penalty='l1', C=c)
    nn_clf.fit(news_group_train_X, news_group_train_y)
    non_zero_coef = nn_clf.coef_[nn_clf.coef_ > 0.0].shape
    print("Non zeros coefficients: ", non_zero_coef)

Non zeros coefficients:  (1116,)


Non zeros coefficients:  (798,)


Non zeros coefficients:  (476,)


Non zeros coefficients:  (198,)


In [70]:
nn_clf = LogisticRegression(penalty='l1', C=0.25)
nn_clf.fit(news_group_train_X, news_group_train_y)
non_zero_coef = nn_clf.coef_[nn_clf.coef_ > 0.0].shape
print("Non zeros coefficients: ", non_zero_coef)

Non zeros coefficients:  (259,)


In [83]:
feature_indices = np.unique(np.argwhere(nn_clf.coef_ > 0.0)[:, 1])[:200]
feature_indices

array([     0,  12661,  14058,  27237,  27576,  27929,  28856,  29401,
        29403,  30090,  30101,  30105,  30128,  30336,  30827,  31657,
        31767,  31954,  32046,  32754,  33199,  33301,  33306,  33457,
        34046,  34048,  34110,  34726,  34842,  35477,  35760,  37037,
        37219,  37442,  37598,  37780,  37820,  37955,  38662,  39415,
        39583,  39596,  39601,  39603,  39705,  39924,  40422,  40446,
        41105,  42390,  42514,  42817,  43230,  43969,  44067,  44222,
        46777,  46846,  47246,  47664,  47721,  48395,  48421,  48665,
        49047,  49055,  49057,  49122,  49463,  49717,  50527,  50868,
        51051,  51591,  51730,  54240,  54590,  55006,  55307,  55328,
        55489,  56010,  56283,  56420,  56466,  58063,  58245,  58306,
        58487,  58776,  59351,  59590,  59626,  59833,  59910,  60150,
        60492,  60724,  60892,  60915,  62410,  62466,  62661,  62688,
        62784,  63365,  63638,  65577,  65675,  66012,  66208,  66221,
      

In [84]:
reduced_news_group_train_X = news_group_train_X[:, indices]
reduced_news_group_test_X = news_group_test_X[:, indices]

In [87]:
%%time
nn_clf = LogisticRegression(penalty='l2', n_jobs=-1, solver='sag')
nn_clf.fit(reduced_news_group_train_X, news_group_train_y.ravel())

CPU times: user 14.5 s, sys: 26.5 s, total: 41 s
Wall time: 29.9 s


In [87]:
print("Train Accuracy score: ", nn_clf.score(reduced_news_group_train_X, news_group_train_y))
print("Test Accuracy score: ", nn_clf.score(reduced_news_group_test_X, news_group_test_y))

Train Accuracy score:  0.362382888457
Test Accuracy score:  0.29580456718
