# 비지도학습 감성분석 - Lexicon 기반

In [1]:
import numpy as np
import pandas as pd

In [3]:
import warnings
warnings.filterwarnings('ignore')

### Wordnet Synset 및및 Sentiwordnet Sentisynset 클래스

In [4]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [7]:
from nltk.corpus import wordnet


In [8]:
term = 'present'
synsets = wordnet.synsets(term)

In [9]:
type(synsets), len(synsets)

(list, 18)

In [10]:
for synset in synsets:
  print(f'##### name: {synset.name()} #####')
  print('POS: ', synset.lexname())
  print('정의:' , synset.definition())
  print('표제어: ', synset.lemma_names())

##### name: present.n.01 #####
POS:  noun.time
정의: the period of time that is happening now; any continuous stretch of time including the moment of speech
표제어어:  ['present', 'nowadays']
##### name: present.n.02 #####
POS:  noun.possession
정의: something presented as a gift
표제어어:  ['present']
##### name: present.n.03 #####
POS:  noun.communication
정의: a verb tense that expresses actions or states at the time of speaking
표제어어:  ['present', 'present_tense']
##### name: show.v.01 #####
POS:  verb.perception
정의: give an exhibition of to an interested audience
표제어어:  ['show', 'demo', 'exhibit', 'present', 'demonstrate']
##### name: present.v.02 #####
POS:  verb.communication
정의: bring forward and present to the mind
표제어어:  ['present', 'represent', 'lay_out']
##### name: stage.v.01 #####
POS:  verb.creation
정의: perform (a play), especially on a stage
표제어어:  ['stage', 'present', 'represent']
##### name: present.v.04 #####
POS:  verb.possession
정의: hand over formally
표제어어:  ['present', 'submit']

- 어휘간의 유사도

In [12]:
# 단어, 품사를 모를 경우에는 synsets()으로 알아냄
for synset in wordnet.synsets('tiger'):
  print(synset.name(), synset.definition())

tiger.n.01 a fierce or audacious person
tiger.n.02 large feline of forests in most of Asia having a tawny coat with black stripes; endangered


In [13]:
# 단어, 품사를 아는 경우에는 synset()
tiger = wordnet.synset('tiger.n.02')
tree = wordnet.synset('tree.n.01')
lion = wordnet.synset('lion.n.01')
cat = wordnet.synset('cat.n.01')
dog = wordnet.synset('dog.n.01')

In [15]:
# 단어간의의 유사도도
tiger.path_similarity(lion), tiger.path_similarity(dog), tiger.path_similarity(tree)

(0.3333333333333333, 0.16666666666666666, 0.07142857142857142)

In [16]:
# 5개 단어간의 유사도
similarities = []
entities = [tree, lion, tiger, cat, dog]
for entity in entities:
  similarity = [entity.path_similarity(another) for another in entities]
  similarities.append(similarity)

In [18]:
df = pd.DataFrame(similarities, columns = ['tree', 'lion', 'tiger', 'cat', 'dog'],
                  index = ['tree', 'lion', 'tiger', 'cat', 'dog'])
df

Unnamed: 0,tree,lion,tiger,cat,dog
tree,1.0,0.071429,0.071429,0.076923,0.125
lion,0.071429,1.0,0.333333,0.25,0.166667
tiger,0.071429,0.333333,1.0,0.25,0.166667
cat,0.076923,0.25,0.25,1.0,0.2
dog,0.125,0.166667,0.166667,0.2,1.0


- SentiSynset 클래스

In [19]:
nltk.download('sentiwordnet')

[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.


True

In [20]:
from nltk.corpus import sentiwordnet
senti_synsets = list(sentiwordnet.senti_synsets('slow'))

In [21]:
senti_synsets

[SentiSynset('decelerate.v.01'),
 SentiSynset('slow.v.02'),
 SentiSynset('slow.v.03'),
 SentiSynset('slow.a.01'),
 SentiSynset('slow.a.02'),
 SentiSynset('dense.s.04'),
 SentiSynset('slow.a.04'),
 SentiSynset('boring.s.01'),
 SentiSynset('dull.s.08'),
 SentiSynset('slowly.r.01'),
 SentiSynset('behind.r.03')]

In [23]:
# father 단어의 긍정/부정/객관성 지수
father = sentiwordnet.senti_synset('father.n.01')
father.pos_score(), father.neg_score(), father.obj_score()

(0.0, 0.0, 1.0)

In [24]:
# mother 단어의 긍정/부정/객관성 지수
mother = sentiwordnet.senti_synset('mother.n.01')
mother.pos_score(), mother.neg_score(), mother.obj_score()

(0.0, 0.0, 1.0)

In [30]:
# fabulous 단어의 긍정/부정/객관성 지수
fabulous = sentiwordnet.senti_synset('fabulous.a.01')
fabulous.pos_score(), fabulous.neg_score(), fabulous.obj_score()

(0.875, 0.125, 0.0)

In [31]:
# love 단어의 긍정/부정/객관성 지수
love = sentiwordnet.senti_synset('love.v.01')
love.pos_score(), love.neg_score(), love.obj_score()

(0.5, 0.0, 0.5)

In [36]:
wordnet.NOUN, wordnet.ADJ, wordnet.ADV, wordnet.VERB

('n', 'a', 'r', 'v')

- 감성지수 계산

In [44]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [45]:
from nltk import word_tokenize, pos_tag
sentence = "It's good to see you again."
word_list = word_tokenize(sentence)
word_list

['It', "'s", 'good', 'to', 'see', 'you', 'again', '.']

In [47]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [48]:
pos_tag(word_list)

[('It', 'PRP'),
 ("'s", 'VBZ'),
 ('good', 'JJ'),
 ('to', 'TO'),
 ('see', 'VB'),
 ('you', 'PRP'),
 ('again', 'RB'),
 ('.', '.')]

In [49]:
def penn_to_wordnet(tag):
  if tag.startswith('N'):
    return wordnet.NOUN
  if tag.startswith('J'):
    return wordnet.ADJ
  if tag.startswith('R'):
    return wordnet.ADV
  if tag.startswith('V'):
    return wordnet.VERB

In [50]:
for word, pos in pos_tag(word_list):
  print(word, penn_to_wordnet(pos))

It None
's v
good a
to None
see v
you None
again r
. None


- Sentence로부터 Senti_Synset 객체를 만드는 과정

In [51]:
sentence = "It's good to see you again."
word_list = [ word for word in word_tokenize(sentence) if len(word) > 2]
word_list

['good', 'see', 'you', 'again']

In [53]:
for word, pos in pos_tag(word_list):
  wn_tag = penn_to_wordnet(pos)
  if wn_tag:                # None이 아닌 'n', 'a', 'r', 'v'
    synsets = list(sentiwordnet.senti_synsets(word, wn_tag))
    synset = synsets[0]
    print(synset)

<good.a.01: PosScore=0.75 NegScore=0.0>
<see.n.01: PosScore=0.0 NegScore=0.0>
<again.r.01: PosScore=0.0 NegScore=0.0>


In [54]:
sentiment = 0
for word, pos in pos_tag(word_list):
  wn_tag = penn_to_wordnet(pos)
  if wn_tag:                # None이 아닌 'n', 'a', 'r', 'v'
    synsets = list(sentiwordnet.senti_synsets(word, wn_tag))
    synset = synsets[0]
    sentiment += synset.pos_score() - synset.neg_score()
sentiment

0.75

In [55]:
from nltk import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [56]:
# 표제어어 추출까지 고려려
sentiment = 0
for word, pos in pos_tag(word_list):
  wn_tag = penn_to_wordnet(pos)
  if wn_tag:                # None이 아닌 'n', 'a', 'r', 'v'
    lemma = lemmatizer.lemmatize(word, wn_tag)
    synsets = list(sentiwordnet.senti_synsets(lemma, wn_tag))
    synset = synsets[0]
    sentiment += synset.pos_score() - synset.neg_score()
sentiment

0.75

- 도큐먼트에서 감성지수를 계산하는 과정 및 함수

In [58]:
from nltk import sent_tokenize
document = '''
This is a movie made purely to satisfy the fans and there should be no doubt about that. 
No Way Home, in my opinion, is even better than Homecoming and Far From Home, and pretty much one of the best MCU movies of all time. 
It's a simple story, but the execution is fantastic. 
Even the smallest of surprises have a huge impact, and I could feel that in the theatre as I joined several other Spider-Man fans cheer out for both heroes and villains. 
The action sequences were brilliant; seeing them in 3D is totally worth the price of admission. Every actor delivered a believable, realistic performance, and especially our lead actor Tom Holland. 
The visual effects too were top notch and the editing was stupendous. 
Two and a half hours flew by real quick while watching this popcorn action entertainer. 
It won't be fair to reveal anything, so here I conclude my review, and recommend you to check out this new world of Spidey-ness on the big screen and in 3D. 
And once you've seen it, please don't spoil it for others, just like you won't want it spoiled for yourself.
'''

In [61]:
sentiment = 0.0
for sentence in sent_tokenize(document):
  word_list = [ word for word in word_tokenize(sentence) if len(word) > 2]
  for word, pos in pos_tag(word_list):
    wn_tag = penn_to_wordnet(pos)
    if wn_tag:                # None이 아닌 'n', 'a', 'r', 'v'
      lemma = lemmatizer.lemmatize(word, wn_tag)
      synsets = list(sentiwordnet.senti_synsets(lemma, wn_tag))
      if not synsets:
        print(word)
        continue
      synset = synsets[0]
      sentiment += synset.pos_score() - synset.neg_score()
print('긍정' if sentiment >= 0 else '부정')

Homecoming
From
MCU
Spider-Man
lead
popcorn
n't
anything
Spidey-ness
've
n't
others
n't
긍정


In [63]:
def swn_polarity(text):
  lemmatizer = WordNetLemmatizer()
  sentiment = 0.0
  for sentence in sent_tokenize(text):
    word_list = [ word for word in word_tokenize(sentence) if len(word) > 2]
    for word, pos in pos_tag(word_list):
      wn_tag = penn_to_wordnet(pos)
      if wn_tag:                # None이 아닌 'n', 'a', 'r', 'v'
        lemma = lemmatizer.lemmatize(word, wn_tag)
        synsets = list(sentiwordnet.senti_synsets(lemma, wn_tag))
        if not synsets:
          print(word)
          continue
        synset = synsets[0]
        sentiment += synset.pos_score() - synset.neg_score()
  return 1 if sentiment >= 0  else 0

- IMDB 영화평 감성분석

In [65]:
df = pd.read_csv('labeledTrainData.tsv', sep='\t', quoting = 3)   # 3: QUOTE_NONE
df.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [66]:
# <br /> 태그는 공백으로 변환
df.review = df.review.str.replace('<br />', ' ')

In [67]:
# 구둣점, 숫자 제거 - 영문자가 아닌 글자는 공백으로 변환
df.review = df.review.str.replace('[^A-Za-z]', ' ').str.strip()

In [70]:
df.shape

(25000, 3)

In [71]:
df = df.head(10000)

In [72]:
%time df['pred'] = df.review.apply(lambda x: swn_polarity(x))

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
Amidst
Her
Jindabyne
outcast
Claire
breed
Jindabyne
Everything
everyone
Jindabyne
beneath
none
dive
Jindabyne
Claire
exploring
doesn
grainy
sloppiness
enjoy
NORTHFORK
favorites
Jindabyne
turbid
didn
Susan
Sarandon
therefor
strange
facts
aircraft
fail
admit
Fallon
Catwomanly
didn
mesmerize
cheesy
Fallon
steeeeee
twoooooooo
SNL
don
buy
hey
couldn
Sox
whole
Pedro
Martinez
Anyway
Sox
admit
Sox
Anybody
Fallon
til
mantra
backdrop
Sox
don
marry
humor
silly
Ben
Sox
hasn
everybody
sorry
know
dinner
Lindsey
Sox
avoid
Been
Weeeeeell
Lindsey
Yeah
weird
don
deny
Farrelly
Lindsey
bros
ooze
Farrellys
Lindsey
Fallon
Particularly
Sox
NEVER
NCAA
declare
yours
THE
Fallon
Sox
matin
suicide
earth
briefly
Crappy
cute
lead
unappealing
yuck
fails
smarter
Stale
laughtrack
punctuate
nauseatingly
resembling
truly
brutal
tiresome
Visually
creepy
verisimilitude
something
insulting
Goldie
Hawn
celeb
Nicky
Hawn
JOE
WUSA
EASY
others
accomplishment
neither
New
audience

In [74]:
# 정확도 계산
from sklearn.metrics import accuracy_score
accuracy_score(df.sentiment, df.pred)

0.6309

### VADER Lexicon을 이용한 감성 분석

In [77]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [80]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

senti_analyzer = SentimentIntensityAnalyzer()
senti_analyzer.polarity_scores(df.review[0])

{'compound': -0.7943, 'neg': 0.13, 'neu': 0.743, 'pos': 0.127}

In [81]:
def vader_polarity(document, threshold = 0.1):
  score = senti_analyzer.polarity_scores(document)
  return 1 if score['compound'] >= threshold else 0

In [82]:
%time df['vader'] = df.review.apply(lambda x: vader_polarity(x, 0.1))

CPU times: user 31.7 s, sys: 87.2 ms, total: 31.8 s
Wall time: 32.6 s


In [83]:
accuracy_score(df.sentiment, df.vader)

0.6997