6주차 맨 마지막 강의: SVM topic classification

In [2]:
from sklearn.datasets import fetch_20newsgroups

# Load the train and test splits of the 20 Newsgroups corpus (downloaded on
# first use). Both splits use identical options: every category
# (categories=None) and the same random_state.
newsgroups_options = dict(categories=None, random_state=42)
data_train = fetch_20newsgroups(subset='train', **newsgroups_options)
data_test = fetch_20newsgroups(subset='test', **newsgroups_options)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [3]:
import nltk
# Fetch the NLTK resources used by the text-cleaning step further down:
# the 'names' corpus (read through nltk.corpus.names) and 'wordnet'
# (needed by WordNetLemmatizer).
nltk.download('names')
nltk.download('wordnet')

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [4]:
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer

# Set of entries from the NLTK names corpus; a set gives fast membership
# tests inside clean_text.
all_names = set(names.words())
# Single shared lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()

def clean_text(docs):
  """Turn raw documents into space-joined, lowercased, lemmatized tokens.

  Each document is split on whitespace. A token survives only if it is
  purely alphabetic and is not present (case-sensitive lookup) in the
  module-level ``all_names`` set. Surviving tokens are lowercased and passed
  through the module-level ``lemmatizer``.

  Args:
    docs: iterable of document strings.

  Returns:
    A list containing one cleaned string per input document, in input order.
  """
  def _clean_one(doc):
    kept_tokens = []
    for token in doc.split():
      # Skip anything with digits/punctuation, and skip known names.
      if not token.isalpha() or token in all_names:
        continue
      kept_tokens.append(lemmatizer.lemmatize(token.lower()))
    return ' '.join(kept_tokens)

  return [_clean_one(doc) for doc in docs]

In [5]:
# Apply the same cleaning function to both splits; labels come straight from
# each split's `target` attribute.
cleaned_train, label_train = clean_text(data_train.data), data_train.target
cleaned_test, label_test = clean_text(data_test.data), data_test.target

# Sanity check: number of labels in the train and test splits.
(len(label_train), len(label_test))

(11314, 7532)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(
    sublinear_tf=True,      # sublinear (logarithmic) term-frequency scaling
    max_df=0.5,             # drop terms present in more than half the documents
    stop_words='english',   # built-in English stop-word list
    max_features=8000,      # cap the vocabulary size
)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is transformed with that already-fitted vectorizer.
term_docs_train = tfidf_vectorizer.fit_transform(cleaned_train)
term_docs_test = tfidf_vectorizer.transform(cleaned_test)

In [12]:
from sklearn.svm import SVC
# Support vector classifier with a linear kernel; this is the base estimator
# handed to the grid search over C in the next cell.
svc_libsvm = SVC(kernel='linear')

In [13]:
from sklearn.model_selection import GridSearchCV
# Candidate values for the SVM penalty parameter C.
parameters = {'C': (0.1, 1, 10, 100)}

# Exhaustive search over `parameters` with 2-fold cross-validation;
# n_jobs=-1 requests all available processors.
grid_search = GridSearchCV(
    estimator=svc_libsvm,
    param_grid=parameters,
    n_jobs=-1,
    cv=2,
)

In [14]:
import timeit
# Time the whole grid search (all cross-validation fits plus the final refit).
start_time = timeit.default_timer()
grid_search.fit(term_docs_train, label_train)
elapsed_seconds = timeit.default_timer() - start_time
print('--- %0.3fs seconds ---' % elapsed_seconds)

--- 265.988s seconds ---


In [16]:
# Parameter setting that gave the best mean cross-validated score.
grid_search.best_params_

{'C': 1}

In [17]:
# Mean cross-validated score obtained with the best parameter setting.
grid_search.best_score_

0.8475340286370868

In [19]:
# Evaluate the estimator selected by the grid search on the held-out test
# split, which played no part in fitting or model selection.
svc_libsvm_best = grid_search.best_estimator_
accuracy = svc_libsvm_best.score(term_docs_test, label_test)

accuracy_report = 'Test accuracy is {0:.1f}%'.format(accuracy*100)
print(accuracy_report)

Test accuracy is 77.4%


In [20]:
from sklearn.pipeline import Pipeline
# Chain vectorization and classification so that TF-IDF settings can be tuned
# together with the SVM's C. The step names ('tfidf', 'svc') are the prefixes
# used in the parameter grid keys.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('svc', SVC(kernel='linear')),
])

In [21]:
# Keys follow the '<step name>__<parameter>' convention of the pipeline.
tfidf_param_grid = {
    'tfidf__max_df': (0.25, 0.5),
    'tfidf__max_features': (40000, 50000),
    'tfidf__sublinear_tf': (True, False),
    'tfidf__smooth_idf': (True, False),
}
svc_param_grid = {
    'svc__C': (0.1, 1, 10, 100),
}

# Combined search space: 2 * 2 * 2 * 2 * 4 = 64 parameter combinations.
parameters_pipeline = {**tfidf_param_grid, **svc_param_grid}

In [None]:
# Grid search over the whole pipeline with 3-fold cross-validation. It is fit
# on the cleaned raw text because vectorization now happens inside the
# pipeline. Note: this rebinds `grid_search` from the earlier SVC-only search.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters_pipeline,
    n_jobs=-1,
    cv=3,
)

start_time = timeit.default_timer()
grid_search.fit(cleaned_train, label_train)
elapsed_seconds = timeit.default_timer() - start_time
print('--- %0.3fs seconds ---' % elapsed_seconds)

In [None]:
# Best parameter combination found by the pipeline grid search.
# (`best_params_` is an attribute of the fitted GridSearchCV object; the
# previous underscore-joined name was undefined and raised NameError.)
grid_search.best_params_

In [None]:
# Mean cross-validated score of the best pipeline configuration.
# (`best_score_` is an attribute of the fitted GridSearchCV object; the
# previous underscore-joined name was undefined and raised NameError.)
grid_search.best_score_

In [None]:
# `pipeline_best` was never defined; bind it to the pipeline selected by the
# grid search (with the default refit=True, GridSearchCV refits the best
# configuration on the full training data).
pipeline_best = grid_search.best_estimator_

# Score on the cleaned raw test text: the pipeline applies its own fitted
# TF-IDF step before classification.
accuracy = pipeline_best.score(cleaned_test, label_test)
print('Test accuracy is {0:.1f}%'.format(accuracy*100))