# 20 뉴스그룹 분류

In [2]:
import numpy as np
import pandas as pd
import warnings
# Suppress every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this presumably also hides scikit-learn convergence warnings raised by
# later model fits — confirm that is intended.
warnings.filterwarnings('ignore')

In [3]:
from sklearn.datasets import fetch_20newsgroups
# Load the complete corpus (subset='all') with a fixed random_state; used below for exploration.
news = fetch_20newsgroups(subset='all', random_state=2022)

- 데이터 탐색

In [4]:
# List the fields available on the loaded dataset object.
news.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [5]:
# Print the description text bundled with the dataset.
print(news.DESCR)

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`~sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    Classes                     20
    Samples total            18846
    Dimensionality               1
    Features      

In [6]:
# Print the names of the newsgroup categories.
print(news.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [7]:
# np.unique(..., return_counts=True) returns the distinct label values together with
# how many times each one occurs, i.e. the number of posts per class.
# References: https://bskyvision.com/1175, https://rfriend.tistory.com/621
np.unique(news.target, return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19]),
 array([799, 973, 985, 982, 963, 988, 975, 990, 996, 994, 999, 991, 984,
        990, 987, 997, 910, 940, 775, 628], dtype=int64))

In [8]:
# A raw post is laid out as: header - quotes (lines starting with '>') - body - footer
print(news.data[2])

From: astein@nysernet.org (Alan Stein)
Subject: Re: Hamza Salah, the Humanist
Organization: NYSERNet, Inc.
Lines: 16

dzk@cs.brown.edu (Danny Keren) writes:

>cl056@cleveland.Freenet.Edu (Hamaza H. Salah) writes:

># Well said Mr. Beyer :)

>He-he. The great humanist speaks. One has to read Mr. Salah's posters,
>in which he decribes Jews as "sons of pigs and monkeys", keeps
>promising the "final battle" between Muslims and Jews (in which the
>stons and the trees will "cry for the Muslims to come and kill the
>Jews hiding behind them"), makes jokes about Jews dying from heart
>attacks etc, to realize his objective stance on the matters involved.

Humanist, or sub-humanist? :-)
-- 
Alan H. Stein                     astein@israel.nysernet.org



- Train/test data 추출

In [9]:
# Keep only the body text: headers, quoted replies and footers are stripped.
train_news = fetch_20newsgroups(subset='train',
                                remove=('headers', 'quotes', 'footers'),
                                random_state=2022)
X_train, y_train = train_news.data, train_news.target

In [10]:
# Show one training post as it looks after the header/quote/footer removal.
print(X_train[1])

I have the local bus card also, and don't have any such problems with it
now, but this is the second card I've gotten - the first card didn't work
in VGA mode correctly.  Maybe they still have some quality control problems.
I would suggest checking with ATI (I went through the vendor I bought the
card from since the problem showed up immediately).  I never was able to
get through to ATI's technical support number.  

I sure like the way the card performs though.  I have the 2MB ATI ultra
pro - local bus, and it is fast even in 1024x768x16bpp mode.


Cheers,
Phil




In [11]:
# Label id of training post 1 and the category name that id maps to.
y_train[1], train_news.target_names[y_train[1]]

(2, 'comp.os.ms-windows.misc')

In [12]:
# Test split, stripped of headers, quoted replies and footers.
test_news = fetch_20newsgroups(subset='test',
                               remove=('headers', 'quotes', 'footers'),
                               random_state=2022)
X_test, y_test = test_news.data, test_news.target

In [13]:
# Number of training and test documents.
len(X_train), len(X_test)

(11314, 7532)

### 피처 벡터화 변환 및 머신러닝 모델 학습 / 평가

- Case 1) CountVectorizer + LogisticRegression

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training posts and build their term-count matrix
# in a single pass (fit_transform) instead of tokenising the corpus twice with
# fit() followed by transform(). The test posts are only transformed, so they are
# encoded with the vocabulary learned from the training split.
cvect = CountVectorizer()
X_train_cv = cvect.fit_transform(X_train)
X_test_cv = cvect.transform(X_test)
# Both matrices must have the same number of columns (the vocabulary size).
X_train_cv.shape, X_test_cv.shape

((11314, 101631), (7532, 101631))

In [15]:
from sklearn.linear_model import LogisticRegression
# Logistic regression on the raw term-count features, with a fixed random_state.
# NOTE(review): max_iter is left at its default and all warnings are silenced earlier in
# this notebook, so a fit that stops before converging would go unnoticed — confirm.
lrc = LogisticRegression(random_state=2022)
# %time reports how long the fit takes.
%time lrc.fit(X_train_cv, y_train)
# Score the fitted model on the test split.
lrc.score(X_test_cv, y_test)

Wall time: 43.7 s


0.6066117896972916

In [16]:
# Inspect the data format before building a DataFrame. news.data is already a
# list of raw post strings, so a two-item slice is enough to see its structure;
# echoing the entire list would dump all ~18.8k posts into the cell output.
news.data[:2]

['From: positron@quip.eecs.umich.edu (Jonathan Haas)\nSubject: Quotes requested\nOrganization: University of Michigan EECS Dept., Ann Arbor\nLines: 11\n\nI need quotes from Jefferson, Hamilton, Madison, or any of the other founders,\nthat support the idea that the Second Amendment was written into the\nConstitution so that the populace could protect itself it the government\nbegan to degenerate into tyrrany. If you have any (with sources), please\nmail them to me. Thanks.\n\n-- \n__/\\__  Jonathan S. Haas         | Jake liked his women the way he liked\n\\    /  University of Michigan   | his kiwi fruit: sweet yet tart, firm-\n/_  _\\  positron@eecs.umich.edu  | fleshed yet yielding to the touch, and\n  \\/    Finger for PGP 2.2 key   | covered with short brown fuzzy hair.\n',
 "From: ecsd@well.sf.ca.us (Eric C. S. Dynamic)\nSubject: KAWAI K-4 way el cheapo - buy or be sorry, etc. etc.\nNntp-Posting-Host: well.sf.ca.us\nOrganization: The Whole Earth 'Lectronic Link, Sausalito, CA\nDist

In [17]:
# The data is one-dimensional (one text per post), so the DataFrame layouts that can be
# built from it are limited: here, a text column plus its label.
# The posts are long, which makes a DataFrame an inefficient container for them.
df = pd.DataFrame({'data':news.data, 'target':news.target})
df.head()

Unnamed: 0,data,target
0,From: positron@quip.eecs.umich.edu (Jonathan H...,16
1,From: ecsd@well.sf.ca.us (Eric C. S. Dynamic)\...,6
2,From: astein@nysernet.org (Alan Stein)\nSubjec...,17
3,From: cramer@optilink.COM (Clayton Cramer)\nSu...,18
4,From: tod@cco.caltech.edu (Tod Edward Kurt)\nS...,12


- Case 2) TfidfVectorizer + SVC

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the training posts and transform them in a single
# pass (fit_transform) instead of processing the corpus twice with fit() followed
# by transform(). The test posts are only transformed with the fitted vectorizer.
tvect = TfidfVectorizer()
X_train_tv = tvect.fit_transform(X_train)
X_test_tv = tvect.transform(X_test)
# Check that train and test end up with the same number of columns.
X_train_tv.shape, X_test_tv.shape

((11314, 101631), (7532, 101631))

In [19]:
# Case 2 model (SVC on the TF-IDF features) is left disabled: nothing below is executed.
# NOTE(review): presumably commented out because of training time — confirm, then either
# restore the cell or remove it.
# from sklearn.svm import SVC 
# svc = SVC()
# %time svc.fit(X_train_tv, y_train)
# svc.score(X_test_tv, y_test)

### Pipeline/GridSearchCV로 최적 하이퍼파라미터 도출

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [21]:
# A Pipeline is built from a list of (step name, estimator) tuples.
pipeline = Pipeline(steps=[
    ('tvect', TfidfVectorizer(stop_words='english')),
    ('lrc', LogisticRegression(random_state=2022)),
])

# Search grid: each key is '<step name>__<parameter name>' for a step defined above.
params = dict(
    tvect__max_df=[300, 700],
    tvect__ngram_range=[(1, 1), (1, 2)],
    lrc__C=[1, 10],
)

In [22]:
# The estimator being searched is the whole pipeline (vectorizer + classifier).
grid_pipe = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)

In [23]:
# Run the grid search on the raw training texts: 8 parameter combinations x 3 folds.
# %time reports the wall time (about 45 minutes in the recorded run).
%time grid_pipe.fit(X_train, y_train)

Wall time: 45min 41s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('tvect',
                                        TfidfVectorizer(stop_words='english')),
                                       ('lrc',
                                        LogisticRegression(random_state=2022))]),
             n_jobs=-1,
             param_grid={'lrc__C': [1, 10], 'tvect__max_df': [300, 700],
                         'tvect__ngram_range': [(1, 1), (1, 2)]},
             scoring='accuracy')

In [24]:
# Parameter combination selected by the search (scored by cross-validated accuracy).
grid_pipe.best_params_

{'lrc__C': 10, 'tvect__max_df': 700, 'tvect__ngram_range': (1, 1)}

In [25]:
# Score the best pipeline found by the search on the test split.
grid_pipe.best_estimator_.score(X_test, y_test)

0.6935740839086564