# Get Data (Use all 20 categories)

In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the train split first, then the test split; categories=None requests every newsgroup.
data_train, data_test = (
    fetch_20newsgroups(subset=split, categories=None, random_state=42)
    for split in ('train', 'test')
)

In [3]:
import nltk
# Corpus of first names; read further down through nltk.corpus.names.
nltk.download('names')
# WordNet data; presumably what the WordNetLemmatizer used further down relies on — confirm.
nltk.download('wordnet')

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\KIST\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\KIST\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer

# One lemmatizer instance, reused for every token processed by clean_text.
lemmatizer = WordNetLemmatizer()
# Entries of the NLTK names corpus as a set, so membership tests are cheap.
all_names = set(names.words())

def clean_text(docs):
    """Normalize each document into a space-joined string of lemmatized tokens.

    A document is split on whitespace. A token is kept only if it is purely
    alphabetic and is not an entry of ``all_names`` (checked with its original
    casing). Kept tokens are lower-cased and then lemmatized.

    Args:
        docs: Iterable of raw document strings.

    Returns:
        A list with one cleaned string per input document, in input order.
    """
    def _normalize(doc):
        # Filter first (on the original casing), lower-case and lemmatize afterwards.
        kept = (tok for tok in doc.split() if tok.isalpha() and tok not in all_names)
        return ' '.join(lemmatizer.lemmatize(tok.lower()) for tok in kept)

    return [_normalize(doc) for doc in docs]

In [12]:
# Normalize the raw posts of each split.
cleaned_train = clean_text(data_train.data)
cleaned_test = clean_text(data_test.data)

# Class labels come straight from each split's target attribute.
label_train = data_train.target
label_test = data_test.target

# Sanity check: number of labels per split.
len(label_train), len(label_test)

(11314, 7532)

TF-IDF로 특징 추출

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Sublinear (log-scaled) term frequency, ignore terms found in more than half of the
# documents, drop English stop words, and cap the vocabulary at 8000 terms.
tfidf_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    max_df=0.5,
    stop_words='english',
    max_features=8000,
)
# Fit on the training split only; the test split is transformed with the fitted vectorizer.
term_docs_train = tfidf_vectorizer.fit_transform(cleaned_train)
term_docs_test = tfidf_vectorizer.transform(cleaned_test)

In [15]:
from sklearn.svm import SVC
# Linear-kernel SVM; C is not set here and is searched over by the grid search below.
svc_libsvbm = SVC(kernel='linear')

## 하이퍼파라미터 찾기

GridSearchCV 사용

In [19]:
from sklearn.model_selection import GridSearchCV
# Candidate values for the SVM parameter C.
parameters = {'C': (0.1, 1, 10, 100)}
# n_jobs=-1: use every available core in parallel; cv=3: k for k-fold cross-validation.
grid_search = GridSearchCV(
    estimator=svc_libsvbm,
    param_grid=parameters,
    n_jobs=-1,
    cv=3,
)
grid_search

GridSearchCV(cv=3, estimator=SVC(kernel='linear'), n_jobs=-1,
             param_grid={'C': (0.1, 1, 10, 100)})

In [20]:
import timeit
# Time the cross-validated search over C on the TF-IDF training matrix (this is slow).
start_time = timeit.default_timer()
grid_search.fit(term_docs_train, label_train)
print('--- %0.3fs seconds ---' % (timeit.default_timer() - start_time))

--- 136.210s seconds ---


In [21]:
# Value of C selected by the cross-validated search.
grid_search.best_params_

{'C': 10}

In [25]:
# Mean cross-validated score of the selected C (no scoring was passed, so presumably accuracy — confirm).
grid_search.best_score_

0.8666260504741258

## 테스트 데이터에 대한 최종 성능 평가

In [26]:
# Keep the estimator that the search selected as best for the final evaluation.
svc_libsvm_best = grid_search.best_estimator_

In [31]:
# Held-out evaluation: score the tuned SVC on the TF-IDF features of the test split.
accuracy = svc_libsvm_best.score(term_docs_test, label_test)
print("The accuracy on testing set: {0:.1f}%".format(accuracy * 100))

The accuracy on testing set: 76.2%


## 더 다양한 파이프라인 고려하기

In [32]:
from sklearn.pipeline import Pipeline
# Chain vectorization and classification so TF-IDF options can be tuned together with C.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('svc', SVC(kernel='linear')),
])

In [33]:
# Search space keyed as <step name>__<parameter>: 2 * 2 * 2 * 2 * 4 = 64 combinations.
parameters_pipeline = dict(
    # TF-IDF vectorizer options
    tfidf__max_df=(0.25, 0.5),
    tfidf__max_features=(40000, 50000),
    tfidf__sublinear_tf=(True, False),
    tfidf__smooth_idf=(True, False),
    # SVM parameter C
    svc__C=(0.1, 1, 10, 100),
)

In [None]:
import timeit
# Rebinds grid_search to a search over the whole pipeline. The cleaned raw text is
# passed in because the pipeline's first step performs the TF-IDF vectorization.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters_pipeline,
    n_jobs=-1,
    cv=3,
)
start_time = timeit.default_timer()
grid_search.fit(cleaned_train, label_train)
# This fit takes a long time.
print('--- %0.3fs seconds ---' % (timeit.default_timer() - start_time))

In [None]:
# Selected combination of TF-IDF and SVC parameters from the pipeline search.
grid_search.best_params_

In [None]:
# Mean cross-validated score of the selected pipeline configuration.
grid_search.best_score_

In [None]:
# Keep the best pipeline (vectorizer + classifier) for the final evaluation.
pipeline_best = grid_search.best_estimator_

In [None]:
# The pipeline vectorizes text itself, so it has to be scored on the cleaned raw
# documents. The matrix produced by the earlier standalone vectorizer
# (term_docs_test) is not valid input for the pipeline's TF-IDF step.
accuracy = pipeline_best.score(cleaned_test, label_test)
print("The accuracy on testing set: {0:.1f}%".format(accuracy*100))