### RNN-LSTM 기반의 영어 감정 분석기
##### FRIENDS 시트콤 데이터 - 모델 실행 및 평가

In [1]:
import pandas as pd
import numpy as np

import nltk
nltk.download('stopwords')
nltk.download('wordnet')

import re # 축약문 치환
from soynlp.normalizer import * # 반복문자 교정
from tensorflow.keras.preprocessing.text import text_to_word_sequence # 토큰화
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()

from nltk.corpus import stopwords # StopWord(불용어)
stopwords.words('english')

new_stopwords = stopwords.words('english')
new_stopwords.append("phoebe")
new_stopwords.append("monica")
new_stopwords.append("ross")
new_stopwords.append("chandler")
new_stopwords.append("joey")
new_stopwords.append("rachel")
new_stopwords.append("dr")
new_stopwords.append("ms")
new_stopwords.append("mr")
new_stopwords.append("mrs")

## Lemma
# Lemmatization
from nltk.stem import WordNetLemmatizer
n = WordNetLemmatizer()

## Stem
# Stemming_Porter
from nltk.stem import PorterStemmer
p = PorterStemmer()

# Stemming_Lancaster
from nltk.stem import LancasterStemmer
l=LancasterStemmer()

# Stemming_SnowballStemmer
from nltk.stem import SnowballStemmer
s = SnowballStemmer('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\server\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\server\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# rule-based 축약문 치환
def char_change(sentence):
    sentence = sentence.lower()
    
    sentence = re.sub(r"ain't", "have not", sentence)
    sentence = re.sub(r"can't", "can not", sentence)
    sentence = re.sub(r"won't", "will not", sentence)
    sentence = re.sub(r"n't", " not", sentence)
    sentence = re.sub(r"'re", " are", sentence)
    sentence = re.sub(r"y'", "you ", sentence)
    sentence = re.sub(r"'ll", " will", sentence)
    sentence = re.sub(r"'ve", " have", sentence)
    sentence = re.sub(r"'d", " would", sentence)
    sentence = re.sub(r"i'm", "i am", sentence)
    sentence = re.sub(r"it's", "it is", sentence)
    sentence = re.sub(r"that's", "that is", sentence)
    sentence = re.sub(r"he's", "he is", sentence)
    sentence = re.sub(r"she's", "she is", sentence)
    sentence = re.sub(r"there's", "there is", sentence)
    sentence = re.sub(r"what's", "what is", sentence)
    sentence = re.sub(r"who's", "who is", sentence)
    sentence = re.sub(r"how's", "how is", sentence)
    sentence = re.sub(r"where's", "where is", sentence)
    sentence = re.sub(r"let's", "let us", sentence)
    sentence = re.sub(r"c'mon", "come on", sentence)
    sentence = re.sub(r"c'mere", "come here", sentence)
    
    return sentence

In [3]:
max_len = 10

In [4]:
from tensorflow.keras.datasets import reuters
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model

In [5]:
def sentiment_predict(new_sentence):
    
    #전처리(영어, 공백을 제외한 문자 제거)
    new_sentence = re.sub(r"[^A-Za-z ']", "", new_sentence)
    
    # 축약문 처리
    temp_X2 = char_change(new_sentence)
    
    # 반복문자 교정
    temp_X2 = repeat_normalize(temp_X2, num_repeats=1)
    
    #토큰화
    new_sentence = text_to_word_sequence(new_sentence)  
    
    #Lemmatization
    new_sentence = [n.lemmatize(word, 'v') for word in new_sentence] 
    
    # Stemming_Porter
    #new_sentence = [p.stem(word) for word in new_sentence]
    
    # Stemming_Lancaster
    #new_sentence = [l.stem(word) for word in new_sentence]
    
    # Stemming_SnowballStemmer
    #new_sentence = [s.stem(word) for word in new_sentence]
    
    #Stopword(불용어) 제거
    new_sentence = [word for word in new_sentence if not word in new_stopwords]
    #new_sentence = [word for word in new_sentence if not word in stopwords.words('english')]
    
    #정수 인코딩
    encoded = tokenizer.texts_to_sequences([new_sentence])
    
    #패딩
    pad_new = pad_sequences(encoded, maxlen = max_len)
    
    score = loaded_model2.predict(pad_new)

    #val = np.max(score)
    val_index = np.argmax(score)

    if(val_index==0):
        return "anger"
    elif(val_index==1):
        return "disgust"
    elif(val_index==2):
        return "fear"
    elif(val_index==3):
        return "joy"
    elif(val_index==4):
        return "neutral"
    elif(val_index==5):
        return "non-neutral"
    elif(val_index==6):
        return "sadness"
    elif(val_index==7):
        return "surprise"

In [6]:
loaded_model2 = load_model('friends_best_model.h5')

eval_data = pd.read_csv('./[전처리]en_data.csv', engine='python', encoding='utf-8')
eval_data['Predicted'] = 0

for i in range(len(eval_data)):
    Predicted = sentiment_predict(eval_data['utterance'][i])
    eval_data['Predicted'][i] = Predicted
    
eval_data.drop(['i_dialog'], axis='columns', inplace=True)
eval_data.drop(['i_utterance'], axis='columns', inplace=True)
eval_data.drop(['speaker'], axis='columns', inplace=True)
eval_data.drop(['utterance'], axis='columns', inplace=True)

eval_data.to_csv("result_2019512014_이동환.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eval_data['Predicted'][i] = Predicted
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
