In [1]:
import pandas as pd
from sklearn import datasets
from sklearn import svm
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV


In [2]:
newsgroups = datasets.fetch_20newsgroups(
                    subset='all', 
                    categories=['alt.atheism', 'sci.space']
             )

In [3]:
target = newsgroups.target
X = newsgroups.data

In [4]:
vectorizer = TfidfVectorizer()

In [5]:
X_vec_text = vectorizer.fit_transform(X)

In [6]:
feature_mapping = vectorizer.get_feature_names()

In [7]:
for i in range(0,20):
    print(feature_mapping[i])

00
000
0000
00000
000000
000021
000050
000062david42
000406
000410
00041032
0004136
00041555
0004244402
0004246
00043819
0004422
00044513
00044808
00044939


In [10]:
grid = {'C': np.power(10.0, np.arange(-5, 6))}
cv = KFold( n_splits=5, shuffle=True, random_state=241)
clf = svm.SVC(kernel='linear', random_state=241)
gs = GridSearchCV(clf, grid, scoring='accuracy', cv=cv)
gs.fit(X_vec_text, target)

GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=True),
       error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=241,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [11]:
for a in gs.grid_scores_:
    # a.mean_validation_score — оценка качества по кросс-валидации
    # a.parameters — значения параметров
    print(a.mean, a.parameters)

AttributeError: 'GridSearchCV' object has no attribute 'grid_scores_'

In [14]:
gs.best_params_

{'C': 1.0}

In [17]:
clf = svm.SVC(kernel='linear', random_state=241, C = 1)

In [18]:
clf.fit(X_vec_text, target)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=241,
  shrinking=True, tol=0.001, verbose=False)

In [31]:
clf.coef_

<1x28382 sparse matrix of type '<class 'numpy.float64'>'
	with 18404 stored elements in Compressed Sparse Row format>

In [29]:
X_vec_text.size

303138

In [32]:
svm_coef = clf.coef_.toarray() 


In [36]:
X_vec_text

<2x28382 sparse matrix of type '<class 'numpy.float64'>'
	with 533 stored elements in Compressed Sparse Row format>

In [38]:
row = clf.coef_.getrow(0).toarray()[0].ravel()
top_ten_indicies = np.argsort(abs(row))[-10:]
top_ten_values = row[top_ten_indicies]

In [39]:
top_ten_values

array([ 1.02930693, -1.09709365, -1.13061234, -1.13908084,  1.1801316 ,
        1.20161118, -1.24918001, -1.25468995, -1.9203794 ,  2.66316479])

In [42]:
answer = list()
for a in top_ten_indicies:
    print(a, ' ', feature_mapping[a])
    answer = np.append(answer, feature_mapping[a])

answer.sort()
print(answer)

22936   sci
15606   keith
5776   bible
21850   religion
23673   sky
17802   moon
5093   atheists
5088   atheism
12871   god
24019   space
['atheism' 'atheists' 'bible' 'god' 'keith' 'moon' 'religion' 'sci' 'sky'
 'space']
