In [2]:
!pip install konlpy

Collecting konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 1.2MB/s 
Collecting JPype1>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/8b/f7/a368401e630f0e390dd0e62c39fb928e5b23741b53c2360ee7d376660927/JPype1-1.0.2-cp36-cp36m-manylinux2010_x86_64.whl (3.8MB)
[K     |████████████████████████████████| 3.8MB 51.3MB/s 
[?25hCollecting tweepy>=3.7.0
  Downloading https://files.pythonhosted.org/packages/bb/7c/99d51f80f3b77b107ebae2634108717362c059a41384a1810d13e2429a81/tweepy-3.9.0-py2.py3-none-any.whl
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/c9/dc/45cdef1b4d119eb96316b3117e6d5708a08029992b2fee2c143c7a0a5cc5/colorama-0.4.3-py2.py3-none-any.whl
Collecting beautifulsoup4==4.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237

In [115]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.regularizers import l2, l1_l2
from tensorflow.keras.layers import Dense, LSTM, Flatten, Embedding, Conv1D, GlobalMaxPooling1D, GRU, Concatenate, Input, Dropout, GlobalAveragePooling1D, BatchNormalization
from tensorflow.keras.optimizers import RMSprop, Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import layers
from keras.models import load_model
#from keras import layers
from tensorflow import keras 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from konlpy.tag import Okt


class Modeling:
    def __init__(self, Data):
        self.Data = Data
        #self.batchsize = batchsize
        #self.epochs = epochs

    def pre_processing(self):
        data = self.Data

        data.review = data.review.str.replace("[^\w]|br", " ")
        data.review = data.review.replace("", np.nan)
        data.score = data.score.replace('', np.nan)
        data = data.dropna(how="any")
        print("Data pre-processing is done!")
        print(data.review.isnull().sum())
        
        return data


    def data_split(self, data):
        x_train, x_test, y_train, y_test = train_test_split(data.review, data.score, test_size=0.3, shuffle=True)
        print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
        return x_train, x_test, y_train, y_test


    def stop_wordsfiltering(self, x_train, x_test, stopwords_list):
        X_train = []
        for row in x_train:
            token = []
            words = Okt().morphs(row, stem=True)
            for word in words:
                if word not in stopwords_list:
                    token.append(word)
            X_train.append(token)

        X_test = []
        for row in x_test:
            token = []
            words = Okt().morphs(row, stem=True)
            for word in words:
                if word not in stopwords_list:
                    token.append(word)
            X_test.append(token)

        print("Tokenize is done")
        return X_train, X_test

 
    def Tokenizing_data(self, max_num, X_train, X_test):
        tokenize = Tokenizer(max_num)
        tokenize.fit_on_texts(X_train)
        print(tokenize.word_index, len(tokenize.word_index))

        X_train = tokenize.texts_to_sequences(X_train)
        X_test = tokenize.texts_to_sequences(X_test)

        print("Integer formating is done")

        return X_train, X_test, tokenize, len(tokenize.word_index)


    def check_max_length(self, X_train, X_test):
        maxlen = 0
        for row in X_train:
            if maxlen < len(row):
                maxlen = len(row)

        maxlen2 = 0
        for row in X_test:
            if maxlen2 < len(row):
                maxlen2 = len(row)

        return max(maxlen, maxlen2)



    def train_1Dcnn_ELU(self, X_train, X_test, y_train, y_test, max_len, epochs, batchsize, degree):
        X_train = pad_sequences(X_train, maxlen=max_len)
        X_test = pad_sequences(X_test, maxlen=max_len)

        inputs = Input((max_len,))
        embed = Embedding(degree, 64)(inputs)

        conv = BatchNormalization()(embed)
        conv = Conv1D(32, 3, kernel_regularizer=l1_l2(0.0001), padding="same")(conv)
        conv = layers.ELU(alpha=1.0)(conv)
        conv = GlobalMaxPooling1D()(conv)
        conv = Dropout(0.6)(conv)

        conv = Conv1D(64, 3, kernel_regularizer=l1_l2(0.0001), padding="same")(embed)
        conv = layers.ELU(alpha=1.0)(conv)
        conv = GlobalMaxPooling1D()(conv)
        conv = Dropout(0.6)(conv)

        conv = BatchNormalization()(embed)
        conv = Conv1D(128, 3, kernel_regularizer=l1_l2(0.0001), padding="same")(conv)
        conv = layers.ELU(alpha=1.0)(conv)
        conv = GlobalMaxPooling1D()(conv)
        conv = Dropout(0.6)(conv)

        conv = Flatten()(conv)
        conv = Dense(32, kernel_regularizer=l1_l2(0.0001))(conv)
        conv = layers.ELU(alpha=1.0)(conv)
        conv = Dropout(0.7)(conv)
        outputs = Dense(1, activation="sigmoid")(conv)

        model = Model(inputs, outputs)
        model.summary()

        early_stop = EarlyStopping(monitor="val_loss", mode="min", verbose=1, patience=5)
        model_check = ModelCheckpoint('1dcnn_elu_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

        opt = keras.optimizers.RMSprop(learning_rate=0.0001)
        model.compile(loss="binary_crossentropy", optimizer=opt, metrics=["acc"])
        model.fit(X_train, y_train, 
                  batch_size=batchsize, 
                  epochs=epochs, 
                  validation_data=(X_test, y_test), 
                  verbose=2)
        
        model.save("1dcnn_elu_model.h5")
        return model

   ## GRU 모델 , ELU 활성화함수,  
    def train_GRU_model(self, X_train, X_test, y_train, y_test, max_len, epochs, batchsize, degree):
        X_train = pad_sequences(X_train, maxlen=max_len)
        X_test = pad_sequences(X_test, maxlen=max_len)

        inputs = Input((max_len,))
        embed = Embedding(degree, 64)(inputs)

        gru = layers.GRU(max_len)(embed)
        gru = layers.ELU(alpha=1.0)(gru)
        outputs = Dense(1, activation="sigmoid")(gru)
        model = Model(inputs, outputs)
        model.summary()

        early_stop = EarlyStopping(monitor="val_loss", mode="min", verbose=1, patience=5)
        model_check = ModelCheckpoint('lstm_best2.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)
        #opt = keras.optimizers.RMSprop(learning_rate=0.00002)
        opt = keras.optimizers.Adam(learning_rate=0.00002)
        model.compile(loss="binary_crossentropy", optimizer= opt, metrics=["acc"])
        model.fit(X_train, y_train, 
                  batch_size=batchsize, 
                  epochs=epochs, 
                  validation_data=(X_test, y_test), 
                  verbose=2)
        model.save("gru_model.h5")
        return model


    def train_lstm(self, X_train, X_test, y_train, y_test, max_len, epochs, batchsize, degree):
        X_train = pad_sequences(X_train, maxlen=max_len)
        X_test = pad_sequences(X_test, maxlen=max_len)

        model = Sequential()
        model.add(Embedding(degree, 64))
        model.add(LSTM(max_len))
        model.add(Dense(64, activation="relu"))
        model.add(Dropout(0.5))
        model.add(Dense(1, activation="sigmoid"))

        early_stop = EarlyStopping(monitor="val_loss", mode="min", verbose=1, patience=5)
        model_check = ModelCheckpoint('lstm_best2.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=epochs, batch_size=batchsize, callbacks=[early_stop, model_check])
        model.save("lstm_model.h5")
        return model


    def sentiment_predict_module(self, sentence, max_len, tokenize, model, stopwords_list):
        token_stc = Okt().morphs(sentence, stem=True)
        token_stc = [word for word in token_stc if not word in stopwords_list]
        encode_stc = tokenize.texts_to_sequences([token_stc])
        pad_stc = pad_sequences(encode_stc, maxlen=max_len)
        score = model.predict(pad_stc)
        return score



#from google.colab import drive

#drive.mount("/content/gdrive")
#Data = pd.read_csv("/content/gdrive/My Drive/워밍업프로젝트2/equalratio_data.csv")

Data = pd.read_csv("final_review_data.csv")

m = Modeling(Data)
data = m.pre_processing()
x_train, x_test, y_train, y_test = m.data_split(data)


stopwords_list = ["좀", "배송", "그냥", "너무", "제품", "더", "다", "하다", "에", "도", "을", "되다", "를", "로",
                  "면", "서", "택배", "해보다", "거", "제", "부분", '제품', '택배', '배송', '배송도', '배송이', '배송은', '빠른배송', '빠른', '빠르게', 
                  '빠르고', '빨라서', '포장도', '포장', '도', '안전하게', '별', '할', '근데', '그냥', '그리고', '그래서', '로', '때문에', '있어요', '합니다', 
                  '해서', '없는', '아직', '같습니다', '하나', '듯', '다른', '했는데', '번에', '있어서', '같아요', '일단', '를', '을', 
                  '있습', '그래', '같네', '였습', '니다', '은', '는', '이', '가']


X_train, X_test = m.stop_wordsfiltering(x_train, x_test, stopwords_list)
mX_train, mX_test, tokenize, degree = m.Tokenizing_data(4000, X_train, X_test)
max_len = m.check_max_length(mX_train, mX_test)


epochs = 50

#model = m.train_GRU_model(mX_train, mX_test, y_train, y_test, max_len, epochs, 16, degree)
model = m.train_1Dcnn_ELU(mX_train, mX_test, y_train, y_test, max_len, epochs, 16, degree)
#model = m.train_lstm(mX_train, mX_test, y_train, y_test, max_len, epochs, 16, degree)


# sentence = input()
# score = m.sentiment_predict_module(sentence, max_len, tokenize, model, stopwords_list)
# print(score)

Data pre-processing is done!
0
(14905,) (6389,) (14905,) (6389,)
Tokenize is done
{'좋다': 1, '자다': 2, '빠르다': 3, '있다': 4, '없다': 5, '받다': 6, '오다': 7, '같다': 8, '으로': 9, '보다': 10, '사용': 11, '이다': 12, '구매': 13, '조립': 14, '가격': 15, '안': 16, '잘': 17, '요': 18, '쓰다': 19, '케이스': 20, '않다': 21, '한': 22, '하고': 23, '만족하다': 24, '쿨러': 25, '것': 26, '만': 27, '감사하다': 28, '에서': 29, '아니다': 30, '사다': 31, '성능': 32, '상품': 33, '보내다': 34, '문제': 35, '되어다': 36, '비': 37, '생각': 38, '의': 39, '개': 40, '팬': 41, '고': 42, '가성': 43, '많이': 44, '소음': 45, '2': 46, '없이': 47, '설치': 48, '괜찮다': 49, '크다': 50, '중': 51, '파워': 52, '박스': 53, '가다': 54, '들': 55, '주문': 56, '메인보드': 57, '아주': 58, '안되다': 59, '컴퓨터': 60, '그렇다': 61, 'cpu': 62, '물건': 63, '저렴하다': 64, '램': 65, '불량': 66, '못': 67, '때': 68, '들다': 69, '해주다': 70, '적': 71, '모르다': 72, '이쁘다': 73, '써다': 74, '싸다': 75, '상태': 76, '네': 77, '구입': 78, '하드': 79, '정도': 80, '조금': 81, '작동': 82, '쓸다': 83, '저': 84, '정말': 85, '3': 86, '알다': 87, '이라': 88, '이상': 89, '후': 90, '인': 91, '늦다': 92, '인데': 93

In [116]:
sentence = input()
score = m.sentiment_predict_module(sentence, max_len, tokenize, model, stopwords_list)
print(score)

가성비 갑입니다 제품 적극 추천드립니다 최고!
[[0.997338]]


In [117]:
sentence = input()
score = m.sentiment_predict_module(sentence, max_len, tokenize, model, stopwords_list)
print(score)

최고의 제품입니다 만족!
[[0.98011553]]


In [118]:
sentence = input()
score = m.sentiment_predict_module(sentence, max_len, tokenize, model, stopwords_list)
print(score)

ㅅ1발
[[0.09985204]]


In [119]:
sentence = input()
score = m.sentiment_predict_module(sentence, max_len, tokenize, model, stopwords_list)
print(score)

ㅅ1발 이따구로 만들고 시장에 내놓은거냐 ? 나가 뒤져라
[[0.09757335]]


In [122]:
from tensorflow.keras.models import load_model

saved_model = load_model("1dcnn_elu_model.h5")

# 병욱님 -> ssdhdd, ram
# 현달님 -> power, mainboard
# 상민님 -> GPU, cpu
# 영석 -> 쿨러 , 케이스 

cpu_element_list = ["ram",
                    "case"]


for name in cpu_element_list:
    #file_url = f"/content/gdrive/My Drive/워밍업프로젝트2/review_data/{name}_review.csv"
    #data = pd.read_csv(file_url)
    data = pd.read_csv(f"{name}_review_data.csv")

    score_list = []
    for sentence in data.review:
        score = float(m.sentiment_predict_module(sentence, max_len, tokenize, saved_model, stopwords_list))
        score_list.append(score)

    #print(score_list)
    data.score = pd.Series(score_list)
    print(data.score)
    save_path = f"{name}_proba.csv"
    data.to_csv(save_path)

0       0.948485
1       0.398832
2       0.037328
3       0.955053
4       0.917489
          ...   
5100    0.036617
5101    0.054330
5102    0.324837
5103    0.000874
5104    0.859576
Name: score, Length: 5105, dtype: float64
0       0.925696
1       0.414319
2       0.014722
3       0.035396
4       0.010550
          ...   
7186    0.068779
7187    0.303567
7188    0.049803
7189    0.217347
7190    0.383046
Name: score, Length: 7191, dtype: float64
