In [41]:
import numpy as np
from sklearn import datasets
from sklearn import svm
from sklearn import feature_extraction
from sklearn import grid_search

In [42]:
# Fetch the 20-newsgroups posts for the two categories this notebook separates;
# subset='all' is requested rather than a single split.
newsgroups = datasets.fetch_20newsgroups(
    categories=['alt.atheism', 'sci.space'],
    subset='all',
)

In [43]:
# TF-IDF text vectorizer with default settings; it is fitted on the corpus
# in a later cell and also supplies the vocabulary (feature names) used when
# interpreting the model weights.
vectorizer = feature_extraction.text.TfidfVectorizer()

In [44]:
# Target labels of the loaded posts; used as the supervised target for the SVM.
y = newsgroups.target

In [45]:
# Learn the TF-IDF vocabulary/weights and vectorize every loaded post in one step.
# NOTE: the vectorizer is fitted on the whole corpus, so its statistics are
# shared across the cross-validation folds used by the grid search below.
X_train = vectorizer.fit_transform(newsgroups.data)

In [50]:
# Search space for the SVM regularization parameter C: powers of ten
# from 1e-5 up to 1e4 (ten candidates).
parameters = dict(
    C=[
        1e-5, 1e-4, 1e-3, 1e-2, 1e-1,
        1, 10, 100, 1000, 10000,
    ],
)

In [51]:
# Tune C for a linear-kernel SVM with 5-fold cross-validated grid search.
#
# The number of folds is an argument of GridSearchCV, not of the estimator:
# SVC's constructor has no `cv` parameter, so passing it there raises a
# TypeError and the cell cannot run on a fresh kernel.
svr = svm.SVC(kernel='linear', random_state=241)
clf = grid_search.GridSearchCV(svr, parameters, cv=5)
# fit() returns the fitted search object, which the notebook echoes as the
# cell output.
clf.fit(X_train, y)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [61]:
# C value that achieved the best cross-validation score in the grid search.
best_c = clf.best_params_['C']

In [62]:
# Build the final linear SVM with the C value picked by the grid search.
# NOTE: this rebinds `clf`, which until now referred to the GridSearchCV object.
clf = svm.SVC(
    kernel='linear',
    C=best_c,
    random_state=241,
)

In [63]:
# Train the final model on all vectorized posts; the trailing expression
# echoes the fitted estimator as the cell output.
clf.fit(X_train, y)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=241, shrinking=True,
  tol=0.001, verbose=False)

In [65]:
# Preview the learned per-feature weights as a dense array.
# (coef_ is presumably sparse here because X_train is sparse — confirm.)
clf.coef_.toarray()

array([[ 0.27365782, -0.10705497,  0.        , ...,  0.01994628,
         0.05950164, -0.00301381]])

In [94]:
# Vocabulary terms as a NumPy array so they can be selected with an index
# array later; the cell displays the vocabulary size.
feature_names = np.array(vectorizer.get_feature_names())
feature_names.shape[0]

28382

In [95]:
# Take the first (and, per the preview above, only) row of the dense
# coefficient matrix: a 1-D vector aligned with feature_names.
weights = clf.coef_.toarray()[0]

In [96]:
# Indices of the ten features with the largest absolute weight.
#
# np.abs is the native element-wise ufunc; it replaces np.vectorize(abs),
# which loops in Python and fails on empty input.
weights_abs = np.abs(weights)
# argsort is ascending, so the last ten positions are the largest magnitudes
# (ordered from smallest to largest among them).
top10i = np.argsort(weights_abs)[-10:]
top10i

array([18430, 15606,  5776, 21850, 23673, 17802,  5093,  5088, 12871, 24019])

In [98]:
# Look up the ten highest-|weight| terms and display them in sorted
# (alphabetical) order.
np.sort(feature_names[top10i])

array(['atheism', 'atheists', 'bible', 'god', 'keith', 'moon', 'nick',
       'religion', 'sky', 'space'], 
      dtype='<U80')