# 비지도학습 감성분석 - Lexicon 기반

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

### Wordnet Synset 및 Sentiwordnet SentiSynset 클래스
    - wordnet: 단어간에 어떤 관계를 가지고 있는지를 정리한 온톨로지
        - 온톨로지 의미: https://ko.wikipedia.org/wiki/%EC%98%A8%ED%86%A8%EB%A1%9C%EC%A7%80
    - wordnet 튜토리얼 참고 : https://www.nltk.org/howto/wordnet.html
    - 참고: https://frhyme.github.io/python-lib/nltk-wordnet/
    - 과학백과사전: https://www.scienceall.com/category/science_learning/scidictionary/
    - senti_wordnet: 비지도학습을 바탕으로 wordnet의 synset에 감성스코어를 매긴 어휘사전
        - 참고: https://bab2min.tistory.com/573

In [3]:
# WordNet에는 'present'에 대한 synset(의미 단위)이 18개 등록되어 있다.
from nltk.corpus import wordnet

term = 'present' # 단어의 뜻이 여러 개(n: 선물, 현재, v: 보여주다, etc...)
synsets = wordnet.synsets(term)

In [4]:
type(synsets), len(synsets)

(list, 18)

In [5]:
# synsets == 객체들의 리스트
print(synsets)

[Synset('present.n.01'), Synset('present.n.02'), Synset('present.n.03'), Synset('show.v.01'), Synset('present.v.02'), Synset('stage.v.01'), Synset('present.v.04'), Synset('present.v.05'), Synset('award.v.01'), Synset('give.v.08'), Synset('deliver.v.01'), Synset('introduce.v.01'), Synset('portray.v.04'), Synset('confront.v.03'), Synset('present.v.12'), Synset('salute.v.06'), Synset('present.a.01'), Synset('present.a.02')]


In [8]:
for synset in synsets[:5]:
    print(f'#### name: {synset.name()} ####')
    print('POS:', synset.lexname())
    print('정의:', synset.definition())
    print('표제어:', synset.lemma_names())
    print()

#### name: present.n.01 ####
POS: noun.time
정의: the period of time that is happening now; any continuous stretch of time including the moment of speech
표제어: ['present', 'nowadays']

#### name: present.n.02 ####
POS: noun.possession
정의: something presented as a gift
표제어: ['present']

#### name: present.n.03 ####
POS: noun.communication
정의: a verb tense that expresses actions or states at the time of speaking
표제어: ['present', 'present_tense']

#### name: show.v.01 ####
POS: verb.perception
정의: give an exhibition of to an interested audience
표제어: ['show', 'demo', 'exhibit', 'present', 'demonstrate']

#### name: present.v.02 ####
POS: verb.communication
정의: bring forward and present to the mind
표제어: ['present', 'represent', 'lay_out']



- 어휘간의 유사도

In [10]:
for synset in wordnet.synsets('tiger'):
    print(synset.name(), synset.definition())

tiger.n.01 a fierce or audacious person
tiger.n.02 large feline of forests in most of Asia having a tawny coat with black stripes; endangered


In [16]:
# 단어, 품사를 아는 경우에는 synset()
tiger = wordnet.synset('tiger.n.02')
tree = wordnet.synset('tree.n.01')
lion = wordnet.synset('lion.n.01')
cat = wordnet.synset('cat.n.01')
dog = wordnet.synset('dog.n.01')

In [17]:
# 단어간의 유사도 (의미의 유사성 0 ~ 1 사이의 값으로 알 수 있다.)
tiger.path_similarity(lion), tiger.path_similarity(dog), tiger.path_similarity(tree)

(0.3333333333333333, 0.16666666666666666, 0.07142857142857142)

In [18]:
# Pairwise path similarity between the five synsets:
# one row per synset, one column per synset it is compared against.
entities = [tree, lion, tiger, cat, dog]
similarities = [
    [row_synset.path_similarity(col_synset) for col_synset in entities]
    for row_synset in entities
]

In [20]:
df = pd.DataFrame(similarities, columns=['tree', 'lion', 'tiger', 'cat', 'dog'])
df

Unnamed: 0,tree,lion,tiger,cat,dog
0,1.0,0.071429,0.071429,0.076923,0.125
1,0.071429,1.0,0.333333,0.25,0.166667
2,0.071429,0.333333,1.0,0.25,0.166667
3,0.076923,0.25,0.25,1.0,0.2
4,0.125,0.166667,0.166667,0.2,1.0


- SentiSynset 객체

In [23]:
# senti_synsets()는 이터레이터를 반환하므로 list()로 감싸서 사용한다.
from nltk.corpus import sentiwordnet

senti_synsets = list(sentiwordnet.senti_synsets('slow'))

In [24]:
print(type(senti_synsets))
print(len(senti_synsets))
print(senti_synsets)

<class 'list'>
11
[SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'), SentiSynset('slow.v.03'), SentiSynset('slow.a.01'), SentiSynset('slow.a.02'), SentiSynset('dense.s.04'), SentiSynset('slow.a.04'), SentiSynset('boring.s.01'), SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'), SentiSynset('behind.r.03')]


In [25]:
senti_synsets = list(sentiwordnet.senti_synsets('father'))
print(type(senti_synsets))
print(len(senti_synsets))
print(senti_synsets)

<class 'list'>
9
[SentiSynset('father.n.01'), SentiSynset('forefather.n.01'), SentiSynset('father.n.03'), SentiSynset('church_father.n.01'), SentiSynset('father.n.05'), SentiSynset('father.n.06'), SentiSynset('founder.n.02'), SentiSynset('don.n.03'), SentiSynset('beget.v.01')]


In [27]:
# 명사: father 단어의 긍정감성 지수, 부정감성 지수, 객관성 지수 
# 감정분석으로 활용가능 하다.
# 한 단어의 정해진 의미를 사용할 때는 senti_synset('단어.품사.01번째 의미')
father = sentiwordnet.senti_synset('father.n.01')
father.pos_score(), father.neg_score(), father.obj_score()  # 긍정적이지도 않고, 부정적이지도 않고, 객관적이다.
                                                            # obj == 중립지수
                                                            # 세 개 항목의 합이 1이 되어야 한다.

(0.0, 0.0, 1.0)

In [28]:
# 명사: mother 단어의 긍정감성 지수, 부정감성 지수, 객관성 지수
# 감정분석으로 활용가능 하다.
mother = sentiwordnet.senti_synset('mother.n.01')
mother.pos_score(), mother.neg_score(), mother.obj_score()  # 긍정적이지도 않고, 부정적이지도 않고, 객관적이다.

(0.0, 0.0, 1.0)

In [31]:
# 형용사: fabulous 단어의 긍정감성 지수, 부정감성 지수, 객관성 지수
# 감정분석으로 활용가능 하다.
fabulous = sentiwordnet.senti_synset('fabulous.a.01')
fabulous.pos_score(), fabulous.neg_score(), fabulous.obj_score()  # pos - neg = 0.75 로 긍정적
                                                                  

(0.875, 0.125, 0.0)

In [32]:
# 하나의 단어의 여러가지 사전적 정의를 사용할 때: senti_synsets()
list(sentiwordnet.senti_synsets('just'))

[SentiSynset('just.a.01'),
 SentiSynset('equitable.a.01'),
 SentiSynset('fair.a.01'),
 SentiSynset('good.s.07'),
 SentiSynset('merely.r.01'),
 SentiSynset('precisely.r.01'),
 SentiSynset('just.r.03'),
 SentiSynset('just.r.04'),
 SentiSynset('barely.r.01'),
 SentiSynset('just.r.06')]

In [35]:
# 동사: 단어의 긍정감성 지수, 부정감성 지수, 객관성 지수 있을 수 있다.
love = sentiwordnet.senti_synset('love.v.01')
love.pos_score(), love.neg_score(), love.obj_score()  

(0.5, 0.0, 0.5)

In [34]:
wordnet.NOUN, wordnet.ADJ, wordnet.ADV, wordnet.VERB

('n', 'a', 'r', 'v')

- 감성지수 계산

In [38]:
# wordnet은 문장이 아니라 단어를 줘야 한다.
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
# from nltk import word_tokenize, pos_tag  ===> 책에서 사용한 방식
sentence = "It's good to see you again."
word_list = word_tokenize(sentence)
word_list

['It', "'s", 'good', 'to', 'see', 'you', 'again', '.']

In [39]:
pos_tag(word_list)

[('It', 'PRP'),
 ("'s", 'VBZ'),
 ('good', 'JJ'),
 ('to', 'TO'),
 ('see', 'VB'),
 ('you', 'PRP'),
 ('again', 'RB'),
 ('.', '.')]

In [41]:
tag = ('good', 'JJ')
tag[1].startswith('J')

True

In [54]:
def penn_to_wn(tag):
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Tags starting with 'J', 'N', 'R' or 'V' map to ``wordnet.ADJ``,
    ``wordnet.NOUN``, ``wordnet.ADV`` and ``wordnet.VERB`` respectively.
    Any other tag yields ``None``.
    """
    # Attribute names are resolved lazily so `wordnet` is only touched on a match.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
        ('V', 'VERB'),
    )
    for prefix, attr_name in prefix_to_attr:
        if tag.startswith(prefix):
            return getattr(wordnet, attr_name)
    return None

In [55]:
for word, tag in pos_tag(word_list):
    print(word, penn_to_wn(tag))

watched v
this None
video n
friend n
house n


In [56]:
# Sentence로부터 Senti_Synset 객체를 만드는 과정
sentence = "It's good to see you again."
word_list = [word for word in word_tokenize(sentence) if len(word) > 2]
word_list

['good', 'see', 'you', 'again']

In [57]:
# Print the first SentiSynset of every token whose tag maps to a WordNet POS.
for word, tag in pos_tag(word_list):
    wn_tag = penn_to_wn(tag)
    if not wn_tag:
        continue
    synsets = list(sentiwordnet.senti_synsets(word, wn_tag))
    if not synsets:
        # No SentiWordNet entry for this word/POS: skip instead of failing on synsets[0].
        continue
    synset = synsets[0]
    print(synset)

<good.a.01: PosScore=0.75 NegScore=0.0>
<see.n.01: PosScore=0.0 NegScore=0.0>
<again.r.01: PosScore=0.0 NegScore=0.0>


In [58]:
# Sum (positive - negative) scores of the first SentiSynset of each token.
sentiment = 0
for word, tag in pos_tag(word_list):
    wn_tag = penn_to_wn(tag)
    if not wn_tag:
        continue
    synsets = list(sentiwordnet.senti_synsets(word, wn_tag))
    if not synsets:
        # No SentiWordNet entry for this word/POS: skip instead of failing on synsets[0].
        continue
    synset = synsets[0]
    sentiment += synset.pos_score() - synset.neg_score()
sentiment

0.75

In [59]:
from nltk import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [62]:
# Same sum as before, but each token is lemmatized before the SentiWordNet lookup.
sentiment = 0
for word, tag in pos_tag(word_list):
    wn_tag = penn_to_wn(tag)
    if not wn_tag:
        continue
    lemma = lemmatizer.lemmatize(word, wn_tag)
    synsets = list(sentiwordnet.senti_synsets(lemma, wn_tag))
    if not synsets:
        # No SentiWordNet entry for this lemma/POS: skip instead of failing on synsets[0].
        continue
    synset = synsets[0]
    sentiment += synset.pos_score() - synset.neg_score()

In [63]:
from nltk import sent_tokenize
document = "I watched this video at a friend's house. I'm glad I did not waste money buying this one. The video cover has a scene from the 1975 movie Capricorn One. The movie starts out with several clips of rocket blow-ups, most not related to manned flight. Sibrel's smoking gun is a short video clip of the astronauts preparing a video broadcast. He edits in his own voice-over instead of letting us listen to what the crew had to say. The video curiously ends with a showing of the Zapruder film. His claims about radiation, shielding, star photography, and others lead me to believe is he extremely ignorant or has some sort of ax to grind against NASA, the astronauts, or American in general. His science is bad, and so is this video."

In [66]:
# Accumulate (positive - negative) scores over every sentence of `document`.
# Words with no SentiSynset are printed and skipped.
sentiment = 0.0
for sentence in sent_tokenize(document):
    # Keep only tokens longer than two characters.
    word_list = [word for word in word_tokenize(sentence) if len(word) > 2]
    for word, tag in pos_tag(word_list):
        wn_tag = penn_to_wn(tag)
        if not wn_tag:
            continue
        lemma = lemmatizer.lemmatize(word, wn_tag)
        synsets = list(sentiwordnet.senti_synsets(lemma, wn_tag))
        if synsets:
            synset = synsets[0]
            sentiment += synset.pos_score() - synset.neg_score()
        else:
            print(word)
if sentiment >= 0:
    print('긍정')
else:
    print('부정')

scene
blow-ups
Sibrel
voice-over
Zapruder
others
부정


- 감성을 계산해주는 함수

In [90]:
def swn_polarity(text):
    """Classify *text* as positive (1) or negative (0) using SentiWordNet.

    The text is split into sentences, each sentence into word tokens
    (tokens of two characters or fewer are dropped), and the tokens are
    POS-tagged. Every token whose tag maps to a WordNet noun, adjective,
    adverb or verb is lemmatized, and the first WordNet synset of the lemma
    is looked up in SentiWordNet. Its ``pos_score() - neg_score()`` is added
    to a running total.

    Args:
        text: The document to classify.

    Returns:
        1 if the summed score is >= 0, otherwise 0. Also 0 when no token
        could be scored at all.
    """
    scorable_pos = (wordnet.NOUN, wordnet.ADJ, wordnet.ADV, wordnet.VERB)
    sentiment = 0.0
    tokens_count = 0

    lemmatizer = WordNetLemmatizer()
    for raw_sentence in sent_tokenize(text):
        # Tokenize, drop very short tokens, then POS-tag the remainder.
        word_list = [word for word in word_tokenize(raw_sentence) if len(word) > 2]
        for word, tag in pos_tag(word_list):
            # Only tags that map to a WordNet POS can be looked up.
            wn_tag = penn_to_wn(tag)
            if wn_tag not in scorable_pos:
                continue
            lemma = lemmatizer.lemmatize(word, pos=wn_tag)
            if not lemma:
                continue
            synsets = wordnet.synsets(lemma, pos=wn_tag)
            if not synsets:
                continue
            # Score only the first synset of the lemma.
            swn_synset = sentiwordnet.senti_synset(synsets[0].name())
            if swn_synset is None:
                # senti_synset() returns None when the synset has no
                # SentiWordNet entry; skip it instead of raising AttributeError.
                continue
            sentiment += swn_synset.pos_score() - swn_synset.neg_score()
            tokens_count += 1

    # Nothing could be scored: fall back to the negative label.
    if not tokens_count:
        return 0

    return 1 if sentiment >= 0 else 0

- IMDB 영화평 감성분석

In [91]:
# Forward slashes avoid the invalid '\l' escape sequences of the backslash
# path and are accepted on Windows as well as on other platforms.
# quoting=3 is csv.QUOTE_NONE: quote characters are kept as literal text.
df = pd.read_csv('data/labeledTrainData.tsv/labeledTrainData.tsv', sep='\t', quoting=3)
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [92]:
# Replace HTML line-break tags with a space.
# regex=False makes the literal match explicit, independent of the pandas default.
df['review'] = df['review'].str.replace('<br />', ' ', regex=False)

In [93]:
# Replace every non-letter character (punctuation, digits, ...) with a space.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string
# by default, which would leave the reviews unchanged.
df['review'] = df['review'].str.replace('[^A-Za-z]', ' ', regex=True).str.strip()

In [94]:
df.shape

(25000, 3)

In [95]:
# df = df.iloc[:1000, :]
df.shape

(25000, 3)

In [96]:
%time df['pred'] = df.review.apply(lambda x: swn_polarity(x))

Wall time: 5min 18s


In [98]:
# 정확도 약 62.6% => 정확도는 좋지 않은 편
from sklearn.metrics import accuracy_score
accuracy_score(df.sentiment, df.pred)

0.62568

### VADER Lexicon을 이용한 감성분석
    - cf. nltk.tokenize : Low-level API
    - High-level API
    - 클래스 사용만으로 여러 수식없이 document 그대로 분석해준다.
    - cf. 딥러닝에서, Tensorflow(Low-level)         Keras(High-level API)
    - 참고: https://m.blog.naver.com/PostView.naver?isHttpsRedirect=true&blogId=ossiriand&logNo=220607426789, https://nicola-ml.tistory.com/45, https://jeonsm9575.tistory.com/62
        - https://kopio.tistory.com/8

In [99]:
# compound 값이 0.1보다 크면 긍정(1), 0.1 이하이면 부정(0)

from nltk.sentiment.vader import SentimentIntensityAnalyzer

senti_analyzer = SentimentIntensityAnalyzer()
senti_score = senti_analyzer.polarity_scores(df.review[0])
senti_score

{'neg': 0.13, 'neu': 0.743, 'pos': 0.127, 'compound': -0.7943}

In [103]:
def vader_polarity(document, threshold=0.1):
    """Label *document* as 1 (positive) or 0 (negative) from its VADER score.

    The text is scored with the module-level ``senti_analyzer``. The label is
    1 only when the ``'compound'`` score is strictly greater than *threshold*.
    """
    compound = senti_analyzer.polarity_scores(document)['compound']
    if compound > threshold:
        return 1
    return 0

In [104]:
%time df['vader pred'] = df.review.apply(lambda x: vader_polarity(x, 0.1))

Wall time: 56 s


In [105]:
accuracy_score(df.sentiment, df['vader pred'])

0.69556

- 예측 비교

In [107]:
# pred는 senti_wordnet에서 예측한 값
# vader pred는 vader에서 예측한 값
cdf = df[['sentiment', 'pred', 'vader pred']]
cdf.head(10)

Unnamed: 0,sentiment,pred,vader pred
0,1,1,0
1,1,1,1
2,0,0,0
3,0,0,1
4,1,0,1
5,1,0,1
6,0,1,0
7,0,0,0
8,0,0,1
9,1,1,1
