## 텍스트 분류 - 뉴스
---
- scikit-learn의 dataset인 20대 뉴스 데이터 분류

In [29]:
from sklearn.datasets import fetch_20newsgroups
from nltk.tokenize import sent_tokenize, word_tokenize
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
import numpy as np
import nltk

## [1] 데이터 준비
---

In [14]:
newsData=fetch_20newsgroups(subset='all', remove=('headers', 'footers'))

In [15]:
newsData.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [18]:
print(f'data => {len(newsData["data"])}')

data => 18846


In [20]:
newsData20=newsData["data"]
type(newsData20)

list

In [22]:
# 타겟 데이터 확인
target=newsData['target']
type(target)

numpy.ndarray

In [24]:
print(f'target => {newsData["target_names"]}')
print(f'target => {target}')

target => ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
target => [10  3 17 ...  3  1  7]


In [2]:
train_news=fetch_20newsgroups(subset='train', remove=('headers', 'footers'))

In [3]:
x_train=train_news.data
y_train=train_news.target

In [5]:
len(x_train), len(y_train)

(11314, 11314)

In [6]:
test_news=fetch_20newsgroups(subset='test', remove=('headers', 'footers'))

In [7]:
x_test=test_news.data
y_test=test_news.target

In [8]:
len(x_test), len(y_test)

(7532, 7532)

In [12]:
np.unique(y_train)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [13]:
x_train[0]

'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.'

In [25]:
words=[]

for sent in newsData20:
    words.append(word_tokenize(sent))

In [26]:
len(words)

18846

In [27]:
len(words[0])

152

In [30]:
en_stopwords=nltk.corpus.stopwords.words('english')

In [31]:
# 불용어 제거
  
for i in range(len(words)):
    for w in words[i]:
        if w not in en_stopwords:
            words[i].remove(w)

In [32]:
len(words[0])

92

In [33]:
myToken=Tokenizer()

In [34]:
myToken.fit_on_texts(words)

In [35]:
myToken.word_index

{'the': 1,
 ',': 2,
 '>': 3,
 '.': 4,
 'to': 5,
 'of': 6,
 'a': 7,
 'and': 8,
 'is': 9,
 'in': 10,
 'that': 11,
 '--': 12,
 'it': 13,
 ')': 14,
 '(': 15,
 'for': 16,
 'you': 17,
 ':': 18,
 "'ax": 19,
 'i': 20,
 'on': 21,
 'have': 22,
 'be': 23,
 'are': 24,
 'not': 25,
 'this': 26,
 'with': 27,
 '*': 28,
 '?': 29,
 'as': 30,
 '@': 31,
 'or': 32,
 'was': 33,
 'do': 34,
 "''": 35,
 'but': 36,
 'they': 37,
 'if': 38,
 'from': 39,
 'by': 40,
 'can': 41,
 'at': 42,
 'an': 43,
 '!': 44,
 'will': 45,
 'there': 46,
 'all': 47,
 'about': 48,
 'what': 49,
 'my': 50,
 '<': 51,
 '``': 52,
 'we': 53,
 'has': 54,
 '$': 55,
 ';': 56,
 'he': 57,
 "'s": 58,
 '#': 59,
 'your': 60,
 'so': 61,
 '%': 62,
 'any': 63,
 'me': 64,
 'who': 65,
 'some': 66,
 'which': 67,
 ']': 68,
 'no': 69,
 '-': 70,
 'out': 71,
 '|': 72,
 "'": 73,
 'article': 74,
 'were': 75,
 '[': 76,
 'more': 77,
 'their': 78,
 '&': 79,
 'just': 80,
 'other': 81,
 'up': 82,
 'does': 83,
 'when': 84,
 'had': 85,
 'them': 86,
 'than': 87,
 'bee

In [36]:
myToken.word_counts

OrderedDict([('am', 5013),
             ('some', 9857),
             ('of', 98674),
             ('fans', 199),
             ('are', 28150),
             ('confused', 64),
             ('about', 12794),
             ('the', 213530),
             ('any', 9952),
             ('puzzled', 7),
             ('too', 3282),
             ('and', 86058),
             ('a', 92220),
             ('relieved', 2),
             ('however', 1073),
             (',', 125026),
             ('to', 113078),
             ('an', 15003),
             ("'", 9023),
             ('with', 26682),
             ('for', 39764),
             ('they', 18522),
             ('those', 4100),
             ('worse', 206),
             ('than', 7459),
             ('i', 30405),
             ('thought', 642),
             ('jagr', 37),
             ('just', 8228),
             ('you', 38607),
             ('why', 3791),
             ('he', 10919),
             ('is', 65835),
             ('better', 960),
             ('his'

In [None]:
result=myToken.texts_to_sequences(words)