# 문서 분류

## 데이터

In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
categories = [
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey'
]

In [3]:
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


## TF-IDF

자주 나오는 단어의 비중을 줄이고 자주 안나오는 단어의 비중은 키운다.

이유는 분류문제를  해결하기 위해 문장에 비슷한 단어 보다는 가끔 나오는 단어가 더 중요하다 생각하기 때문 

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)

In [6]:
x_train = tfidf.fit_transform(newsgroups_train.data)

In [7]:
y_train = newsgroups_train.target

In [8]:
x_test = tfidf.transform(newsgroups_test.data)

In [9]:
y_test = newsgroups_test.target

## 소프트맥스와 로지스틱 함수

In [10]:
import tensorflow as tf

In [11]:
x = [1.0, 2.0, 3.0]

In [12]:
tf.nn.sigmoid(x)

<tf.Tensor: id=1, shape=(3,), dtype=float32, numpy=array([0.7310586 , 0.880797  , 0.95257413], dtype=float32)>

In [13]:
tf.nn.softmax(x)

<tf.Tensor: id=3, shape=(3,), dtype=float32, numpy=array([0.09003057, 0.24472848, 0.66524094], dtype=float32)>

## 다항 분류 모형

In [64]:
model = tf.keras.models.Sequential()

In [65]:
model.add(tf.keras.layers.Dense(
    3,
    kernel_regularizer=tf.keras.regularizers.l1_l2(0.1, 0.1),
    input_shape=(1000,),
    activation='softmax'))

In [66]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 3)                 3003      
Total params: 3,003
Trainable params: 3,003
Non-trainable params: 0
_________________________________________________________________


## 학습

In [67]:
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy'])

In [68]:
model.fit(x_train.toarray(), y_train,
          epochs=30, validation_split=.1,
          callbacks=[tf.keras.callbacks.EarlyStopping()])

Train on 1615 samples, validate on 180 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30


<tensorflow.python.keras.callbacks.History at 0x21c9265b788>

## 평가

In [69]:
model.evaluate(x_test.toarray(), y_test, verbose=0)

[1.134507371752306, 0.3324958]

## 계수

In [70]:
w, _ = model.weights

In [71]:
w.shape

TensorShape([1000, 3])

In [72]:
import pandas

In [73]:
weights = pandas.DataFrame(w.numpy())

In [74]:
weights.columns = ['motorcycle', 'baseball', 'hockey']

In [75]:
weights['word'] = tfidf.get_feature_names()

In [76]:
weights.sort_values('motorcycle', ascending=False).head()
# 계수가 -인 의미는 해당 단어가 많이 나올수록 해당 게시판일 가능성이 줄어든다는 의미임. 

Unnamed: 0,motorcycle,baseball,hockey,word
936,0.000436,0.000161,6e-06,unless
498,0.000423,-8.6e-05,2.3e-05,jeff
310,0.00041,-0.00017,-0.00015,didn
528,0.000406,5.4e-05,-0.000116,laurentian
27,0.0004,0.000111,-0.000155,1993apr6


In [None]:
# 다항 분류일 때는 +항을 많이 본다는 사실에 유의하기 