SVM의 학습 파라미터 C 고려

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Load the predefined train and test subsets of the 20 Newsgroups corpus.
# categories=None keeps every newsgroup; random_state is fixed at 42.
data_train = fetch_20newsgroups(
    subset='train',
    categories=None,
    random_state=42,
)
data_test = fetch_20newsgroups(
    subset='test',
    categories=None,
    random_state=42,
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [None]:
import nltk
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources used below: the "names" corpus and WordNet.
nltk.download('names')
nltk.download('wordnet')

# Words of the NLTK names corpus as a set, so clean_text can do fast
# membership tests against it.
all_names = set(names.words())

# Shared lemmatizer instance used by clean_text.
lemmatizer = WordNetLemmatizer()

def clean_text(docs):
    """Turn raw documents into space-joined, lowercased, lemmatized tokens.

    Each document is split on whitespace. A token is kept only when it is
    purely alphabetic and is not a member of ``all_names`` (the membership
    test uses the token's original casing). Kept tokens are lowercased and
    passed through ``lemmatizer.lemmatize``.

    Args:
        docs: Iterable of document strings.

    Returns:
        A list with one cleaned string per input document, in input order.
    """
    def _normalize(doc):
        # Filter first (on the original-cased token), then lowercase/lemmatize.
        kept_tokens = (
            token for token in doc.split()
            if token.isalpha() and token not in all_names
        )
        return ' '.join(
            lemmatizer.lemmatize(token.lower()) for token in kept_tokens
        )

    return [_normalize(doc) for doc in docs]

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Run the same cleaning routine over the raw text of both splits.
cleaned_train = clean_text(data_train.data)
cleaned_test = clean_text(data_test.data)

# Keep the target arrays that accompany each split.
label_train = data_train.target
label_test = data_test.target

from sklearn.feature_extraction.text import TfidfVectorizer

# Build TF-IDF features: fit on the training documents only, then apply the
# already-fitted vectorizer to the test documents.
tfidf_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    max_df=0.5,
    stop_words='english',
    max_features=8000,
)
term_docs_train = tfidf_vectorizer.fit_transform(cleaned_train)
term_docs_test = tfidf_vectorizer.transform(cleaned_test)

In [None]:
from sklearn.svm import SVC

# Base SVM classifier with a linear kernel; C is left at its default here and
# tuned by the grid search that follows.
svc_libsvm = SVC(kernel = 'linear')

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the SVM parameter C.
parameters = {'C': (0.1, 1, 10, 100)}

grid_search = GridSearchCV(
    estimator=svc_libsvm,   # model object to validate
    param_grid=parameters,  # hyperparameters to test
    n_jobs=-1,              # number of cores for parallel runs (-1 uses all available)
    cv=3,                   # k for k-fold cross-validation
)

In [None]:
import timeit

# Record the start time so the duration of the grid search can be reported.
start_time = timeit.default_timer()

# Fit the grid search on the TF-IDF training features and their labels.
grid_search.fit(term_docs_train, label_train)

# Report the elapsed time in seconds.
print("--- %0.3fs seconds ---" %(timeit.default_timer() - start_time))

--- 465.097s seconds ---


In [None]:
# Display the parameter setting selected by the grid search.
grid_search.best_params_

{'C': 10}

In [None]:
# Display the cross-validation score of the selected parameter setting.
grid_search.best_score_

0.8666260504741258

In [None]:
# Take the best estimator found by the grid search and score it on the
# TF-IDF test features; the score is printed as a percentage.
svc_libsvm_best = grid_search.best_estimator_
accuracy = svc_libsvm_best.score(term_docs_test, label_test)
print("The accuracy on testing set is :", accuracy*100)

The accuracy on testing set is : 76.22145512480085


TF-IDF 벡터화 할 때 사용되는 파라미터들 고려

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Chain TF-IDF vectorization and a linear-kernel SVM so the vectorizer's
# parameters can be searched together with the classifier's.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('svc', SVC(kernel='linear')),
])

# Search grid for the pipeline; keys take the form "<step name>__<parameter>".
parameters_pipline = {
    # max_df: maximum document frequency (fraction of all documents that
    # contain the term), used to exclude terms that are common across documents
    'tfidf__max_df': (0.25, 0.5),
    # max_features: number of most important features to keep
    'tfidf__max_features': (40000, 50000),
    # sublinear_tf: rescale the term frequency with a logarithm or a
    # similar function
    'tfidf__sublinear_tf': (True, False),
    'svc__C': (0.1, 1, 10, 100),
}

In [None]:
# Grid search over the whole pipeline (vectorizer + SVM) with 3-fold
# cross-validation, using all available cores.
grid_search = GridSearchCV(pipeline, parameters_pipline, n_jobs=-1, cv=3)

# Time the fit. The pipeline vectorizes internally, so it is given the
# cleaned text rather than precomputed TF-IDF features.
start_time=timeit.default_timer()
grid_search.fit(cleaned_train, label_train)

# Report the elapsed time in seconds.
print("--- %0.3fs seconds ---" %(timeit.default_timer() - start_time))

--- 4318.046s seconds ---


In [None]:
# Display the parameter combination selected by the pipeline grid search.
grid_search.best_params_

{'svc__C': 10,
 'tfidf__max_df': 0.25,
 'tfidf__max_features': 40000,
 'tfidf__sublinear_tf': True}

In [None]:
# Display the cross-validation score of the selected parameter combination.
grid_search.best_score_

0.8834192478758519

In [None]:
# Take the best pipeline found by the grid search and score it on the cleaned
# test documents (the pipeline applies its own TF-IDF step).
# The name must match between assignment and use: assigning to a misspelled
# variable here makes the score call below fail with NameError.
pipeline_best = grid_search.best_estimator_

accuracy = pipeline_best.score(cleaned_test, label_test)
print("The accuracy on testing set is : ", accuracy*100)

NameError: ignored