# 문서 분류

## 데이터

In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
# The three newsgroups to load; passed as `categories` to the dataset loader.
categories = ['rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey']

In [3]:
# Load the 'train' and 'test' subsets, both restricted to the selected categories.
# NOTE(review): no `remove=` argument is passed, so message headers/footers/quotes
# presumably stay in the text -- confirm that is intended for this evaluation.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split_name, categories=categories)
    for split_name in ('train', 'test')
)

## TF-IDF

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [43]:
# TF-IDF vectorizer using the built-in English stop-word list.
# max_features=1000 caps the feature count; the model's input_shape=(1000,)
# further down must stay in sync with this value.
tfidf = TfidfVectorizer(
    max_features=1000,
    stop_words='english',
)

In [44]:
# Fit the vectorizer on the training documents and vectorize them in the same call.
x_train = tfidf.fit_transform(raw_documents=newsgroups_train.data)

In [45]:
# Class labels for the training documents (presumably integer class indices,
# since the model is later trained with a sparse categorical loss -- TODO confirm).
y_train = newsgroups_train.target

In [46]:
# Vectorize the test documents with the already-fitted vectorizer (transform only, no refit).
x_test = tfidf.transform(raw_documents=newsgroups_test.data)

In [47]:
# Class labels for the test documents, used only for the final evaluation.
y_test = newsgroups_test.target

## 소프트맥스와 로지스틱 함수

In [8]:
import tensorflow as tf

In [15]:
# Small example vector fed to the sigmoid and softmax demos in the next cells.
x = [float(value) for value in (1, 2, 3)]

In [13]:
# Logistic (sigmoid) function applied to the example vector; as the output shows,
# each value lands in (0, 1) independently and the results do not sum to 1.
tf.nn.sigmoid(x)

<tf.Tensor: id=8, shape=(3,), dtype=float32, numpy=array([0.7310586 , 0.880797  , 0.95257413], dtype=float32)>

In [14]:
# Softmax over the example vector (last axis, which is the default); as the output
# shows, the results are normalized so that they sum to 1.
tf.nn.softmax(logits=x, axis=-1)

<tf.Tensor: id=10, shape=(3,), dtype=float32, numpy=array([0.09003057, 0.24472848, 0.66524094], dtype=float32)>

## 다항 분류 모형

In [102]:
# Start from an empty Sequential model; layers are appended in the next cell.
model = tf.keras.models.Sequential()

In [103]:
# Single softmax layer: 1000 TF-IDF features in, 3 class probabilities out,
# with combined L1 + L2 penalties (0.1 each) on the kernel.
# Caution: add() appends a layer each time it runs, so re-create `model`
# before re-running this cell.
model.add(
    tf.keras.layers.Dense(
        units=3,
        activation='softmax',
        input_shape=(1000,),
        kernel_regularizer=tf.keras.regularizers.l1_l2(l1=0.1, l2=0.1),
    )
)

In [104]:
# Print the layer table: one Dense layer mapping 1000 inputs to 3 outputs
# (1000 * 3 weights + 3 biases = 3003 parameters).
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 3)                 3003      
Total params: 3,003
Trainable params: 3,003
Non-trainable params: 0
_________________________________________________________________


## 학습

In [105]:
# Configure training: Adam with its default settings, sparse categorical
# cross-entropy as the loss (labels are passed as-is, not one-hot encoded),
# and accuracy reported alongside the loss.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [106]:
# Train for up to 30 epochs. The TF-IDF matrix is converted with toarray()
# before being handed to Keras, and 10% of the training rows are held out
# for validation. EarlyStopping is used with its default settings.
# NOTE(review): no random seed is set in the visible cells, so results
# presumably vary from run to run -- confirm whether that matters here.
model.fit(
    x=x_train.toarray(),
    y=y_train,
    epochs=30,
    validation_split=.1,
    callbacks=[tf.keras.callbacks.EarlyStopping()],
)

Train on 1615 samples, validate on 180 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x2874dddb0f0>

## 평가

In [107]:
# Score the trained model on the held-out test set without progress output.
# The result is [loss, accuracy], matching the loss and metric given to compile().
model.evaluate(
    x=x_test.toarray(),
    y=y_test,
    verbose=0,
)

[0.38135143750077316, 0.9396985]

## 계수

In [108]:
# Grab the kernel (weight matrix) of the single Dense layer.
# Reading it from the layer directly, instead of unpacking `model.weights`
# into `w, _`, avoids rebinding `_` (which IPython uses for the last cell
# output) and does not depend on `model.weights` holding exactly two entries.
w = model.layers[0].kernel

In [109]:
# Kernel shape is (input features, output classes) -- 1000 x 3 for this model.
w.shape

TensorShape([1000, 3])

In [110]:
import pandas

In [111]:
# Copy the kernel into a DataFrame: one row per input feature, one column per output unit.
weights = pandas.DataFrame(data=w.numpy())

In [112]:
# Label the three output-unit columns with readable class names.
# rename() with an explicit position -> name mapping (rather than assigning a
# 3-item list to .columns) keeps this cell safe to re-run, even after the
# 'word' column has been added to the frame.
# Assumes class indices 0/1/2 correspond to motorcycles/baseball/hockey --
# TODO confirm against newsgroups_train.target_names.
weights = weights.rename(columns={0: 'motorcycle', 1: 'baseball', 2: 'hockey'})

In [113]:
# Attach the vocabulary term for each row so the coefficients can be read by word.
# Newer scikit-learn releases replaced get_feature_names() with
# get_feature_names_out(); prefer the new method and fall back to the old
# one so the cell runs on both old and new installations.
if hasattr(tfidf, 'get_feature_names_out'):
    weights['word'] = tfidf.get_feature_names_out()
else:
    weights['word'] = tfidf.get_feature_names()

In [116]:
# Show the five terms with the largest coefficient for the motorcycle class.
(
    weights
    .sort_values(by='motorcycle', ascending=False)
    .head(n=5)
)

Unnamed: 0,motorcycle,baseball,hockey,word
318,0.97139,-1.099698,-1.007686,dod
181,0.910237,-1.055848,-0.974728,bike
767,0.905154,-0.892666,-0.862833,ride
770,0.859134,-0.825414,-0.841403,riding
610,0.856247,-0.92251,-0.861315,motorcycle
