# 20 뉴스 그룹 분류

In [1]:
import numpy as np
import pandas as pd

## 데이터 다운로드 (fetch)

In [2]:
from sklearn.datasets import fetch_20newsgroups
# Full corpus (train and test together), used only for the exploration cells
# below; random_state pins the shuffling of the returned samples.
dataset = fetch_20newsgroups(
    subset='all',
    random_state=156,
)

## 데이터 탐색

In [3]:
dataset.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [5]:
# Number of documents per target label, ordered by label id.
class_counts = pd.Series(dataset.target).value_counts().sort_index()

print('target 클래스의 값과 분포도')
print(class_counts)

target 클래스의 값과 분포도
0     799
1     973
2     985
3     982
4     963
5     988
6     975
7     990
8     996
9     994
10    999
11    991
12    984
13    990
14    987
15    997
16    910
17    940
18    775
19    628
dtype: int64


In [6]:
print('target 클래스의 이름들 \n', dataset.target_names)

target 클래스의 이름들 
 ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [7]:
print(dataset.data[0])

From: egreen@east.sun.com (Ed Green - Pixel Cruncher)
Subject: Re: Observation re: helmets
Organization: Sun Microsystems, RTP, NC
Lines: 21
Distribution: world
Reply-To: egreen@east.sun.com
NNTP-Posting-Host: laser.east.sun.com

In article 211353@mavenry.altcit.eskimo.com, maven@mavenry.altcit.eskimo.com (Norman Hamer) writes:
> 
> The question for the day is re: passenger helmets, if you don't know for 
>certain who's gonna ride with you (like say you meet them at a .... church 
>meeting, yeah, that's the ticket)... What are some guidelines? Should I just 
>pick up another shoei in my size to have a backup helmet (XL), or should I 
>maybe get an inexpensive one of a smaller size to accomodate my likely 
>passenger? 

If your primary concern is protecting the passenger in the event of a
crash, have him or her fitted for a helmet that is their size.  If your
primary concern is complying with stupid helmet laws, carry a real big
spare (you can put a big or small head in a big helmet, bu

## 훈련/테스트용 데이터 추출

In [8]:
# Training split with headers, footers and quoted replies stripped, so the
# model only sees the message body.
train_dataset = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), random_state=156)
X_train, y_train = train_dataset.data, train_dataset.target

In [10]:
print(train_dataset.data[0])



What I did NOT get with my drive (CD300i) is the System Install CD you
listed as #1.  Any ideas about how I can get one?  I bought my IIvx 8/120
from Direct Express in Chicago (no complaints at all -- good price & good
service).

BTW, I've heard that the System Install CD can be used to boot the mac;
however, my drive will NOT accept a CD caddy is the machine is off.  How can
you boot with it then?

--Dave



In [14]:
train_dataset.target[0]

4

In [15]:
# Test split, fetched with the same stripping options and seed as the training split.
test_dataset = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), random_state=156)
X_test, y_test = test_dataset.data, test_dataset.target

In [16]:
len(X_train), len(X_test)

(11314, 7532)

In [18]:
from sklearn.datasets import fetch_20newsgroups

def fetch_20newsgroups_train_test_split(random_state=156, remove=('headers', 'footers', 'quotes')):
    """Fetch the 20 newsgroups 'train' and 'test' subsets with identical settings.

    Args:
        random_state: Seed forwarded to ``fetch_20newsgroups`` for both subsets.
        remove: Parts of each post to strip, forwarded unchanged to
            ``fetch_20newsgroups`` for both subsets. The default,
            ``('headers', 'footers', 'quotes')``, is the value that used to be
            hard-coded, so existing callers see no change.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``. The ``X_*`` items are the
        ``data`` attribute and the ``y_*`` items the ``target`` attribute of the
        corresponding fetched subset.
    """
    # One call site for both subsets guarantees they are fetched with the same
    # options; 'train' is requested first, then 'test'.
    train, test = (
        fetch_20newsgroups(subset=subset, remove=remove, random_state=random_state)
        for subset in ('train', 'test')
    )
    return (train.data, test.data, train.target, test.target)

X_train, X_test, y_train, y_test = fetch_20newsgroups_train_test_split(random_state=156)

In [20]:
len(X_train), len(X_test), len(y_train), len(y_test)

(11314, 7532, 11314, 7532)

## 피처 벡터화 변환과 머신러닝 모델 학습/예측/평가
### Case 1. CountVectorizer

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training documents only, then encode both
# splits with that same vocabulary (train first, then test).
cv = CountVectorizer().fit(X_train)
X_train_vector, X_test_vector = (cv.transform(docs) for docs in (X_train, X_test))

In [25]:
X_train_vector.shape

(11314, 101631)

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Default logistic regression on the count features.
# NOTE(review): the default iteration budget may be too small for a feature
# space this wide — confirm the fit does not emit a ConvergenceWarning.
lr = LogisticRegression().fit(X_train_vector, y_train)
pred = lr.predict(X_test_vector)

# Trailing expression so the notebook displays the test-set accuracy.
accuracy_score(y_true=y_test, y_pred=pred)

0.6076739245884227

In [28]:
print(y_test[:5])
print(pred[:5])

[ 4 11  1  7  8]
[ 4 11  6  7  8]


In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF case: same flow as the count-based cell, but with TF-IDF weighting.
# The result is stored under the same X_train_vector / X_test_vector names,
# replacing the count matrices.
tv = TfidfVectorizer()
X_train_vector = tv.fit(X_train).transform(X_train)
X_test_vector = tv.transform(X_test)

In [30]:
X_train_vector.shape

(11314, 101631)

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Retrain the default logistic regression, this time on the TF-IDF features.
lr = LogisticRegression()
pred = lr.fit(X_train_vector, y_train).predict(X_test_vector)

# Displayed as the cell result: accuracy on the test split.
accuracy_score(y_test, pred)

0.6736590546999469

In [33]:
print(y_test[:5])
print(pred[:5])

[ 4 11  1  7  8]
[5 1 1 7 8]


## 학습에 사용된 파라미터

In [35]:
print(tv.get_params())

{'analyzer': 'word', 'binary': False, 'decode_error': 'strict', 'dtype': <class 'numpy.float64'>, 'encoding': 'utf-8', 'input': 'content', 'lowercase': True, 'max_df': 1.0, 'max_features': None, 'min_df': 1, 'ngram_range': (1, 1), 'norm': 'l2', 'preprocessor': None, 'smooth_idf': True, 'stop_words': None, 'strip_accents': None, 'sublinear_tf': False, 'token_pattern': '(?u)\\b\\w\\w+\\b', 'tokenizer': None, 'use_idf': True, 'vocabulary': None}


In [36]:
print(lr.get_params())

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


### Case 3. stop_words filtering, ngram(1,2), max_df=300

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF with English stop words removed, unigrams plus bigrams, and terms
# that occur in more than 300 documents ignored (integer max_df is a
# document count, not a proportion).
tv = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=300).fit(X_train)

# Encode both splits with the vocabulary fitted on the training documents.
X_train_vec, X_test_vec = tv.transform(X_train), tv.transform(X_test)

In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Default logistic regression trained on the X_train_vec features.
lr = LogisticRegression()
lr = lr.fit(X_train_vec, y_train)  # fit returns the estimator itself
pred = lr.predict(X_test_vec)

# Test accuracy, shown as the cell output.
accuracy_score(y_pred=pred, y_true=y_test)

0.6922464152947424

### Case 4. Case 3에서 LogisticRegression C값을 10으로 변경

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Same X_train_vec / X_test_vec features, but with weaker regularisation
# (C raised from the default 1.0 to 10).
lr = LogisticRegression(C=10).fit(X_train_vec, y_train)
pred = lr.predict(X_test_vec)

# Test accuracy, shown as the cell output.
accuracy_score(y_true=y_test, y_pred=pred)

## Pipeline 과 GridSearchCV를 통한 하이퍼 파라미터 튜닝

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Vectoriser and classifier chained into one estimator. The step names 'tv'
# and 'lr' are the prefixes used to address their hyper-parameters
# (e.g. 'tv__max_df', 'lr__C').
tfidf_step = ('tv', TfidfVectorizer(stop_words='english'))
classifier_step = ('lr', LogisticRegression())
pipeline = Pipeline(steps=[tfidf_step, classifier_step])

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: 2 n-gram ranges x 2 max_df cut-offs x 2 regularisation
# strengths = 8 candidate settings, each prefixed with its pipeline step name.
param_grid = dict(
    tv__ngram_range=[(1, 1), (1, 2)],
    tv__max_df=[300, 700],
    lr__C=[1, 10],
)

# 3-fold cross-validated accuracy for every candidate, with n_jobs=-1 for
# parallel execution; fit() returns the fitted search object.
pipe_grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

print(pipe_grid.best_params_, pipe_grid.best_score_)