In [1]:
import urllib.request
# Download the Naver shopping review corpus and save it next to the notebook as
# 'shopping.txt'. urlretrieve returns a (local_filename, headers) tuple, which is
# what the cell echoes as its output.
urllib.request.urlretrieve(
    url='https://raw.githubusercontent.com/bab2min/corpus/master/sentiment/naver_shopping.txt',
    filename='shopping.txt',
)

('shopping.txt', <http.client.HTTPMessage at 0x7f49b420ea30>)

In [None]:
import pandas as pd
import numpy as np

# Read the downloaded corpus with read_table (tab-separated by default). Because
# column names are passed explicitly, the first line of the file is treated as data.
# 'label' is the binary sentiment target: 1 when rating > 3, otherwise 0.
raw = pd.read_table('shopping.txt', names=['rating', 'review']).assign(
    label=lambda frame: np.where(frame['rating'] > 3, 1, 0)
)
print(raw)

In [None]:
# Clean the review text.
# Delete every character that is not Korean (jamo ㄱ-ㅎ / ㅏ-ㅣ or a syllable 가-힣),
# a digit, or a space. Matches are removed outright, not replaced by a space.
# regex=True is required: from pandas 2.0 onward Series.str.replace treats the
# pattern as a literal string by default, which would silently leave the text as-is.
raw['review'] = raw['review'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣0-9 ]', '', regex=True)

# Optional sanity check for missing values: raw.isnull().sum()

# Keep only the first occurrence of each cleaned review. Reassigning instead of
# using inplace=True keeps the step explicit and safe to re-run.
raw = raw.drop_duplicates(subset=['review'])
print(raw)

# Character inventory (not a bag of words): every distinct character that occurs
# in any review, in sorted order.
unique_letter = sorted(set(''.join(raw['review'].tolist())))
print(unique_letter[0:100])

In [21]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Character-level tokenizer: every distinct character is mapped to an integer id.
# oov_token reserves an id for characters that were never seen while fitting.
tokenizer = Tokenizer(oov_token='<OOV>', char_level=True)

letter_list = raw['review'].to_list()
tokenizer.fit_on_texts(letter_list)

# The character -> id table. Roughly 2000 Korean characters are in common use; the
# original run reported 3211 entries here because typos are included as well.
print(tokenizer.word_index)
print(letter_list[:10])

# Encode every review as a list of integer ids (one id per character).
train_seq = tokenizer.texts_to_sequences(letter_list)
print(train_seq[:10])

# Targets, in the same row order as letter_list.
Y = raw['label'].to_list()
print(Y[:10])

{'<OOV>': 1, ' ': 2, '요': 3, '이': 4, '고': 5, '다': 6, '아': 7, '는': 8, '어': 9, '하': 10, '니': 11, '가': 12, '서': 13, '도': 14, '지': 15, '네': 16, '에': 17, '데': 18, '좋': 19, '구': 20, '사': 21, '기': 22, '해': 23, '은': 24, '로': 25, '만': 26, '나': 27, '무': 28, '보': 29, '있': 30, '매': 31, '게': 32, '리': 33, '그': 34, '잘': 35, '안': 36, '한': 37, '주': 38, '라': 39, '배': 40, '거': 41, '너': 42, '했': 43, '습': 44, '시': 45, '송': 46, '용': 47, '품': 48, '으': 49, '제': 50, '같': 51, '을': 52, '입': 53, '상': 54, 'ㅠ': 55, '들': 56, '인': 57, '자': 58, '면': 59, '재': 60, '스': 61, '정': 62, '먹': 63, '비': 64, '합': 65, '부': 66, '대': 67, '없': 68, '여': 69, '마': 70, '려': 71, '음': 72, '맛': 73, '되': 74, '전': 75, '더': 76, '장': 77, '쓰': 78, '문': 79, '일': 80, '것': 81, 'ㅎ': 82, '건': 83, '저': 84, '않': 85, '수': 86, '생': 87, '르': 88, '빠': 89, '세': 90, '분': 91, '편': 92, '진': 93, '조': 94, '오': 95, '감': 96, '았': 97, '물': 98, '번': 99, '드': 100, '었': 101, '할': 102, '개': 103, '많': 104, '각': 105, '냥': 106, '겠': 107, '족': 108, '러': 109, '두': 110, 'ㅜ'

In [25]:
# Number of characters in each cleaned review.
raw['length'] = raw['review'].str.len()

# In the recorded run the longest review is 140 characters, and the mean of
# 'label' is about 0.5, i.e. the two classes (0 = bad, 1 = not bad) are balanced.
print(raw.head(), raw.describe(), sep='\n')

# How many reviews are shorter than 100 characters (190496 of 199425 in the recorded run).
raw.loc[raw['length'] < 100, 'length'].count()


   rating                                             review  label  length
0       5                                            배공빠르고 굿      1       7
1       2                      택배가 엉망이네용 저희집 밑에층에 말도없이 놔두고가고      0      29
2       5  아주좋아요 바지 정말 좋아서2개 더 구매했어요 이가격에 대박입니다 바느질이 조금 엉...      1      66
3       2  선물용으로 빨리 받아서 전달했어야 하는 상품이었는데 머그컵만 와서 당황했습니다 전화...      0     129
4       5                   민트색상 예뻐요 옆 손잡이는 거는 용도로도 사용되네요 ㅎㅎ      1      32
              rating          label         length
count  199425.000000  199425.000000  199425.000000
mean        3.227088       0.500404      38.061432
std         1.645598       0.500001      28.049906
min         1.000000       0.000000       1.000000
25%         2.000000       0.000000      17.000000
50%         4.000000       1.000000      28.000000
75%         5.000000       1.000000      53.000000
max         5.000000       1.000000     140.000000


190496

In [26]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

# Force every encoded review to exactly 100 ids. Shorter reviews are filled with 0
# and longer ones are cut down to 100; with the Keras defaults both the padding and
# the truncation happen at the front of the sequence.
X = pad_sequences(sequences=train_seq, maxlen=100)

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
trainX, valX, trainY, valY = train_test_split(X, Y, random_state=42, test_size=0.2)

print(len(trainX))  # 80% of the 199425 rows -> 159540
print(len(valX))  # remaining 20% -> 39885

159540
39885


In [None]:
import tensorflow as tf

model = tf.keras.models.Sequential([
    # An Embedding layer suits inputs drawn from a large set of distinct characters.
    # Vocabulary size is len(tokenizer.word_index) + 1: the ids in word_index start
    # at 1 ('<OOV>' is already entry 1), and the extra slot covers id 0, which
    # pad_sequences uses as the padding value.
    # Each character id is mapped to a 16-dimensional vector that is randomly
    # initialised and then optimised during training.
    tf.keras.layers.Embedding(len(tokenizer.word_index) + 1, 16),
    # TODO: LSTM or GRU layer
])