## 아마존 핸드폰 리뷰 데이터의 감성 분석

데이터 다운로드

In [None]:
import requests

# Download the UCI "sentiment labelled sentences" archive and store it next to the notebook.
# A timeout keeps the cell from hanging forever on a stalled connection.
res = requests.get(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip',
    timeout=60,
)
# Fail loudly on 4xx/5xx instead of silently saving an error page as the zip file.
res.raise_for_status()
with open('sentiment labelled sentences.zip', 'wb') as f:
    f.write(res.content)

압축 풀기

In [1]:
import io
from zipfile import ZipFile

# Read the Amazon reviews file fully into memory so the archive can be closed right away.
# `data` remains a readable binary file-like object, which is what pd.read_csv needs.
with ZipFile('sentiment labelled sentences.zip') as z:
    data = io.BytesIO(z.read('sentiment labelled sentences/amazon_cells_labelled.txt'))

파일 열기

In [2]:
import pandas as pd

# The file is tab-delimited and has no header row, so the columns are
# auto-numbered: 0 holds the review text and 1 holds the 0/1 label.
df = pd.read_csv(data, delimiter="\t", header=None)
df.head(5)

Unnamed: 0,0,1
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


## 텐서플로

colab을 쓸 경우 텐서플로 2.x를 쓰도록 설정. (개인 PC의 경우 `!pip install -U tensorflow`로 텐서플로를 2.x로 버전업)

In [3]:
try:
    %tensorflow_version 2.x
except:
    pass

In [4]:
import tensorflow as tf

## 토큰화

In [5]:
# Tokenizer configuration for the review texts.
tok = tf.keras.preprocessing.text.Tokenizer(
    num_words=1500,    # total number of words; None keeps every word
    lower=True,        # convert text to lowercase
    char_level=False,  # True would tokenize at the character level
    oov_token='<OOV>')    # placeholder for out-of-vocabulary words

텍스트의 토큰에 번호를 붙인다.

In [6]:
# Build the vocabulary: assign an integer index to every token in the review texts (column 0).
tok.fit_on_texts(df[0])

토큰별 번호

In [8]:
tok.word_index

{'<OOV>': 1,
 'the': 2,
 'i': 3,
 'and': 4,
 'it': 5,
 'is': 6,
 'a': 7,
 'this': 8,
 'to': 9,
 'phone': 10,
 'my': 11,
 'for': 12,
 'of': 13,
 'not': 14,
 'with': 15,
 'very': 16,
 'great': 17,
 'was': 18,
 'on': 19,
 'in': 20,
 'that': 21,
 'good': 22,
 'have': 23,
 'you': 24,
 'product': 25,
 'quality': 26,
 'had': 27,
 'headset': 28,
 'works': 29,
 'battery': 30,
 'but': 31,
 'as': 32,
 'sound': 33,
 'so': 34,
 'are': 35,
 'well': 36,
 'one': 37,
 'all': 38,
 'use': 39,
 'ear': 40,
 'has': 41,
 'would': 42,
 'work': 43,
 'from': 44,
 'your': 45,
 'like': 46,
 'be': 47,
 'me': 48,
 'case': 49,
 'if': 50,
 'than': 51,
 "i've": 52,
 "don't": 53,
 'no': 54,
 'excellent': 55,
 'up': 56,
 'time': 57,
 "it's": 58,
 'after': 59,
 'price': 60,
 'recommend': 61,
 'does': 62,
 'really': 63,
 '2': 64,
 'at': 65,
 'or': 66,
 'best': 67,
 'out': 68,
 'only': 69,
 'service': 70,
 'get': 71,
 'when': 72,
 'nice': 73,
 "i'm": 74,
 'also': 75,
 'too': 76,
 'just': 77,
 'any': 78,
 'new': 79,
 'love'

번호별 토큰

In [9]:
tok.index_word

{1: '<OOV>',
 2: 'the',
 3: 'i',
 4: 'and',
 5: 'it',
 6: 'is',
 7: 'a',
 8: 'this',
 9: 'to',
 10: 'phone',
 11: 'my',
 12: 'for',
 13: 'of',
 14: 'not',
 15: 'with',
 16: 'very',
 17: 'great',
 18: 'was',
 19: 'on',
 20: 'in',
 21: 'that',
 22: 'good',
 23: 'have',
 24: 'you',
 25: 'product',
 26: 'quality',
 27: 'had',
 28: 'headset',
 29: 'works',
 30: 'battery',
 31: 'but',
 32: 'as',
 33: 'sound',
 34: 'so',
 35: 'are',
 36: 'well',
 37: 'one',
 38: 'all',
 39: 'use',
 40: 'ear',
 41: 'has',
 42: 'would',
 43: 'work',
 44: 'from',
 45: 'your',
 46: 'like',
 47: 'be',
 48: 'me',
 49: 'case',
 50: 'if',
 51: 'than',
 52: "i've",
 53: "don't",
 54: 'no',
 55: 'excellent',
 56: 'up',
 57: 'time',
 58: "it's",
 59: 'after',
 60: 'price',
 61: 'recommend',
 62: 'does',
 63: 'really',
 64: '2',
 65: 'at',
 66: 'or',
 67: 'best',
 68: 'out',
 69: 'only',
 70: 'service',
 71: 'get',
 72: 'when',
 73: 'nice',
 74: "i'm",
 75: 'also',
 76: 'too',
 77: 'just',
 78: 'any',
 79: 'new',
 80: 'l

텍스트를 번호로 모두 변환

In [10]:
# Convert every review into a list of token ids.
# NOTE: this rebinds `data`, which previously held the file object passed to pd.read_csv;
# re-running the read_csv cell after this one will therefore not work.
data = tok.texts_to_sequences(df[0])

660번 텍스트

In [11]:
df.loc[660, 0]

'This does not fit the Palm Tungsten E2 and it broke the first time I tried to plug it in.'

번호로 변환된 660번 텍스트

In [12]:
data[660]

[8, 62, 14, 155, 2, 529, 1, 1, 4, 5, 201, 2, 89, 57, 3, 203, 9, 156, 5, 20]

번호를 단어로 표시(자주 나오지 않는 단어들은 `<OOV>`로 변환됨)

In [13]:
[tok.index_word[i] for i in data[660]]

['this',
 'does',
 'not',
 'fit',
 'the',
 'palm',
 '<OOV>',
 '<OOV>',
 'and',
 'it',
 'broke',
 'the',
 'first',
 'time',
 'i',
 'tried',
 'to',
 'plug',
 'it',
 'in']

## n-gram

In [15]:
from collections import Counter

In [16]:
# Frequency table for n-grams: key is the space-joined words, value is the occurrence count.
ngram = Counter()

In [17]:
n = 2  # n-gram size (2 = bigrams)

# Start from an empty counter so re-running this cell does not double the counts.
ngram = Counter()
for datum in data:
    # Slide a window of n token ids over the sequence; sequences shorter
    # than n yield an empty range and contribute nothing.
    for start in range(len(datum) - n + 1):
        # Distinct names for the window position and the token id avoid
        # shadowing the loop index inside the generator expression.
        pair = ' '.join(tok.index_word[idx] for idx in datum[start:start + n])
        ngram[pair] += 1

In [24]:
# Adding 1 to every count is called add-one smoothing; this bigram never occurs, so its raw count is 0.
ngram['there are']

0

In [18]:
# Tabulate the counter: one row per n-gram, with its text and its frequency.
ngram_df = pd.DataFrame(list(ngram.items()), columns=['pair', 'n'])

In [19]:
ngram_df.sort_values('n', ascending=False).head()

Unnamed: 0,pair,n
4359,<OOV> <OOV>,54
40,i have,39
289,the phone,38
315,it is,35
94,and the,33


## 패딩

In [20]:
# Pad every sequence with zeros to a common length so they form one 2-D array.
x = tf.keras.preprocessing.sequence.pad_sequences(data, padding='pre') # 'pre' pads at the front; 'post' would pad at the end

In [21]:
x[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,  34, 118,   6,  54,
       215,  12,  48,   9, 156,   5,  20, 338,  20,   2, 547, 417,   3,
       242, 191,   7, 813])

In [22]:
x.shape

(1000, 30)

## 데이터 분할

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# Target vector: column 1 holds the 0/1 sentiment label of each review.
y = df[1].values

In [27]:
# Hold out 20% of the reviews as a test set; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=2019)

In [28]:
x_train.shape

(800, 30)

In [29]:
x_test.shape

(200, 30)

In [30]:
y_train.shape

(800,)

In [31]:
y_test.shape

(200,)

## 순환신경망

In [32]:
# Length of each padded sequence (second dimension of x).
INPUT_LEN = x.shape[1]

In [33]:
# Embedding vocabulary size: the largest token id present in x, plus one because id 0 (padding) also needs a row.
NUM_WORDS = x.max() + 1

In [38]:
# Variant for a per-token prediction task (e.g. a word translation model):
# return_sequences=True makes the LSTM emit one output per timestep, and the
# softmax layer turns each of them into a distribution over the vocabulary.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=NUM_WORDS, output_dim=8, input_length=INPUT_LEN, mask_zero=True),
    tf.keras.layers.LSTM(8, return_sequences=True),
    # One output unit per vocabulary id. NUM_WORDS replaces the hard-coded 1500
    # so this layer stays in sync with the embedding's input_dim.
    tf.keras.layers.Dense(NUM_WORDS, activation='softmax')
])
model.summary()

In [41]:
# Seed TensorFlow (same value as the train/test split) so that weight
# initialisation is repeatable and separate runs are comparable.
tf.random.set_seed(2019)

# Sentiment classifier: token ids -> 8-dim embeddings -> LSTM -> P(positive).
model = tf.keras.Sequential([
    # mask_zero=True marks id 0 (the padding value) as "no token".
    tf.keras.layers.Embedding(input_dim=NUM_WORDS, output_dim=8, input_length=INPUT_LEN, mask_zero=True),
    # return_sequences=False: return only the output of the last timestep, i.e. a single
    # vector for the whole review, instead of one output per input token.
    tf.keras.layers.LSTM(8, return_sequences=False),
    # Single sigmoid unit: probability that the review is positive (label 1).
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 30, 8)             12000     
_________________________________________________________________
lstm_3 (LSTM)                (None, 8)                 544       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 9         
Total params: 12,553
Trainable params: 12,553
Non-trainable params: 0
_________________________________________________________________


In [42]:
# Binary classification setup. The optimizer is spelled out explicitly;
# 'rmsprop' is the value Keras uses when compile() is given no optimizer.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [43]:
# Train for at most 30 epochs, holding out 20% of the training set for validation
# (640 training / 160 validation samples). EarlyStopping() runs with its default
# settings and ended this run after epoch 10.
# NOTE(review): presumably the defaults monitor val_loss with patience 0 — confirm in the Keras docs.
model.fit(x_train, y_train, epochs=30, validation_split=.2, callbacks=[tf.keras.callbacks.EarlyStopping()])

Train on 640 samples, validate on 160 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30


<tensorflow.python.keras.callbacks.History at 0x1901d849108>

## 평가

테스트 데이터에서 긍정(1)의 확률

In [44]:
# Each row is the sigmoid output for one test review: the predicted probability of the positive class (label 1).
model.predict(x_test)

array([[0.27654603],
       [0.6104963 ],
       [0.27151853],
       [0.19954202],
       [0.35759252],
       [0.6191312 ],
       [0.75335586],
       [0.32014644],
       [0.6397531 ],
       [0.08222243],
       [0.06108847],
       [0.85561955],
       [0.68500733],
       [0.44348952],
       [0.6869586 ],
       [0.31752923],
       [0.0835917 ],
       [0.39116722],
       [0.44477716],
       [0.4437891 ],
       [0.40703204],
       [0.0552516 ],
       [0.8511601 ],
       [0.2288315 ],
       [0.62337387],
       [0.5779424 ],
       [0.3484649 ],
       [0.31255996],
       [0.06145191],
       [0.8206489 ],
       [0.66612476],
       [0.0787386 ],
       [0.7381305 ],
       [0.5046533 ],
       [0.3727484 ],
       [0.83944154],
       [0.55832386],
       [0.49105877],
       [0.34787637],
       [0.62701684],
       [0.76027006],
       [0.08035383],
       [0.24189508],
       [0.05818924],
       [0.8182224 ],
       [0.11961398],
       [0.09498271],
       [0.630

테스트 데이터로 계산한 손실과 정확도

In [45]:
# Returns [loss, accuracy] on the held-out test set (binary cross-entropy and the 'accuracy' metric set in compile()).
model.evaluate(x_test, y_test)



[0.5826774597167969, 0.765]