In [4]:
import pandas
import numpy as np
from sklearn import datasets, grid_search
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import KFold
from sklearn.svm import SVC

# Load the two newsgroups that the classifier has to tell apart.
newsgroups = datasets.fetch_20newsgroups(subset='all', categories=['alt.atheism', 'sci.space'])

# Convert the raw texts to a TF-IDF matrix; the targets are the class labels.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(newsgroups.data)
y = np.array(newsgroups.target)

# Grid-search the regularisation strength C of a linear SVM over the powers
# of ten 10^-5 ... 10^5, scored with 5-fold cross-validation.
# NOTE(review): random_state is passed to KFold without shuffle=True, so the
# seed presumably has no effect on the folds - confirm whether shuffling was
# intended before relying on the selected C.
kf = KFold(len(y), n_folds=5, random_state=241)
steps = [10**power for power in range(-5, 6)]
tuned_parameters = [{'kernel': ['linear'], 'C': steps}]
search = grid_search.GridSearchCV(SVC(random_state=241), tuned_parameters, cv=kf)
search.fit(X, y)
print(search.best_params_)

# The recorded run of this cell reported the best C = 10.
C_best = search.best_params_['C']

# GridSearchCV (refit=True by default) has already re-trained an SVC with the
# best parameters on the whole of X, y, so reuse it instead of vectorizing
# the corpus and fitting the same model a second time.
clf = search.best_estimator_

# Module-level accumulator: word -> absolute weight, filled in by
# most_informative_feature_for_class_svm so the words can be reported later.
result = {}


def most_informative_feature_for_class_svm(vectorizer, classifier, n=10):
    '''
    Print the n features with the largest absolute weight and store them.

    vectorizer - fitted vectorizer (e.g. TfidfVectorizer) that provides
                 get_feature_names_out() or get_feature_names()
    classifier - fitted linear model (e.g. SVC(kernel='linear')) whose coef_
                 is either a dense array or a sparse matrix
    n - count of words with the biggest absolute weight (default 10);
        n <= 0 selects nothing

    Each selected word is printed as "word<TAB>weight" in ascending order of
    absolute weight and is also written into the module-level dict `result`.
    Returns None.
    '''
    # A slice [-0:] would select every feature, so handle "nothing" explicitly.
    if n <= 0:
        return

    # Newer scikit-learn releases only offer get_feature_names_out(); fall
    # back to the older accessor when it is not available.
    names_getter = getattr(vectorizer, 'get_feature_names_out', None)
    if names_getter is None:
        names_getter = vectorizer.get_feature_names
    feature_names = names_getter()

    # coef_ is sparse when the model was fitted on sparse input and dense
    # otherwise; densify only when needed.
    weights = classifier.coef_
    if hasattr(weights, 'toarray'):
        weights = weights.toarray()

    labelid = 0  # row of coef_ we are interested in
    topn = sorted(zip(abs(weights[labelid]), feature_names))[-n:]
    for coef, feat in topn:
        print(feat, end='\t')
        print(coef)
        result[feat] = coef

# Print the top-weighted words of the trained model (this also fills `result`).
most_informative_feature_for_class_svm(vectorizer, clf)

print('_______________________________________________')

# The answer is the collected words in alphabetical order. Sorting the dict
# yields its keys, so from here on `result` is a list of words.
result = sorted(result)
print('Result: ')
# Every word is followed by a single space, with no newline at the end.
print(''.join(str(word) + ' ' for word in result), end='')

{'C': 10, 'kernel': 'linear'}
nick	1.0899040656
keith	1.10719566752
bible	1.126690662
religion	1.15585139495
sky	1.19599660331
moon	1.24846413748
atheists	1.2572771582
atheism	1.29997058932
god	1.97206528531
space	2.72025270605
_______________________________________________
Result: 
atheism atheists bible god keith moon nick religion sky space 