## SVM for text classification

In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn import datasets
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm_notebook

In [14]:
# Load the two-topic slice of 20 Newsgroups: atheism posts versus space posts.
news = datasets.fetch_20newsgroups(
    subset='all',
    categories=['alt.atheism', 'sci.space'],
)
X, y = news.data, news.target

# Learn the TF-IDF vocabulary and weights on the whole corpus. The resulting
# matrix is not stored; it is only shown as this cell's output.
vectorizer = TfidfVectorizer()
vectorizer.fit_transform(X)

<1786x28382 sparse matrix of type '<class 'numpy.float64'>'
	with 303138 stored elements in Compressed Sparse Row format>

In [15]:
# Candidate regularisation strengths: 10^-5, 10^-4, ..., 10^5.
grid = {'C': 10.0 ** np.arange(-5, 6)}

# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=241)
clf = SVC(kernel='linear', random_state=241)

# Try every C in the grid, scoring each by cross-validated accuracy.
gs = GridSearchCV(estimator=clf, param_grid=grid, scoring='accuracy', cv=cv)
gs.fit(vectorizer.transform(X), y)


GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=True),
       error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=241,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [19]:
# Regularisation strength that won the grid search. Index the dict directly:
# a missing key should fail here, not surface later as SVC(C=None).
C = gs.best_params_['C']
print(C)

1.0


In [20]:
# Train a single linear SVM on every document, using the C selected above.
tfidf_all = vectorizer.transform(X)
model = SVC(kernel='linear', C=C, random_state=241)
model.fit(tfidf_all, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=241,
  shrinking=True, tol=0.001, verbose=False)

In [42]:
# Vocabulary lookup: position i holds the token behind TF-IDF column i.
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`, so use whichever this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    ids = vectorizer.get_feature_names_out()
else:
    ids = vectorizer.get_feature_names()

# `model.coef_` is read as a sparse row here: `.data` holds the stored weights
# and `.indices` the matching feature columns, which become the frame's index.
coef = pd.DataFrame(model.coef_.data, model.coef_.indices)

# Ten features with the largest absolute weight, translated back to tokens.
top_words = coef[0].abs().sort_values(ascending=False).head(10).index.map(lambda i: ids[i])
top_words

Index(['space', 'god', 'atheism', 'atheists', 'moon', 'sky', 'religion',
       'bible', 'keith', 'sci'],
      dtype='object')

In [25]:
# Save the selected words as one comma-separated line. The context manager
# closes the file even if the write raises.
with open('svm-text-answer.txt', 'w') as f:
    f.write(','.join(top_words))

In [28]:
# Show only a small sample: the full vocabulary has tens of thousands of
# tokens and would flood the notebook output.
print(ids[:10])




In [29]:
# Inspect the weights stored in the fitted model's sparse coefficient matrix.
stored_weights = model.coef_.data
print(stored_weights)

[ 0.11331532  0.05134321  0.05445196 ... -0.2372855   0.18461124
 -0.2410018 ]


In [30]:
# Preview the first five rows: feature column id (index) against weight (column 0).
coef.head(5)

Unnamed: 0,0
11098,0.113315
6775,0.051343
5107,0.054452
98,0.059766
27042,0.104719
