In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
%matplotlib inline

# 영화 댓글을 이용한 감성 분석

> ## 데이터 다운로드

- windows

In [None]:
# !bitsadmin /transfer get https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt %cd%\ratings_train.txt
# !bitsadmin /transfer get https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt %cd%\ratings_test.txt    

-  mac / linux

In [2]:
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt

--2018-09-15 17:32:20--  https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt
Resolving raw.githubusercontent.com... 151.101.72.133
Connecting to raw.githubusercontent.com|151.101.72.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14628807 (14M) [text/plain]
Saving to: ‘ratings_train.txt.1’

ratings_train.txt.1  13%[=>                  ]   1.93M   406KB/s    eta 49s    ^C
--2018-09-15 17:32:30--  https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt
Resolving raw.githubusercontent.com... 151.101.72.133
Connecting to raw.githubusercontent.com|151.101.72.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4893335 (4.7M) [text/plain]
Saving to: ‘ratings_test.txt.1’

ratings_test.txt.1    8%[>                   ] 407.17K   269KB/s               ^C


-------

In [3]:
import codecs
import numpy as np
# Read the tab-separated training file; every row becomes the list of its fields
# (the code below reads the review text from index 1 and the label from index 2).
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("ratings_train.txt", encoding='utf-8') as f:
    train = [line.split('\t') for line in f.read().splitlines()][1:]  # skip the header row

In [4]:
# Read the tab-separated test file; every row becomes the list of its fields
# (the code below reads the review text from index 1 and the label from index 2).
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("ratings_test.txt", encoding='utf-8') as f:
    test = [line.split('\t') for line in f.read().splitlines()][1:]  # skip the header row

# 나이브베이즈 모델을 이용한 감성 분석 (Sentiment Analysis)
- 네이버 영화 리뷰
    - 평점 9~10점이면 긍정(1) / 1~4점이면 부정(0) (NSMC 데이터셋 기준, 5~8점의 중립 리뷰는 제외됨)
- 사전확률 계산
- likelihood 계산
- 모델 실행

In [5]:
train[:3]

[['9976970', '아 더빙.. 진짜 짜증나네요 목소리', '0'],
 ['3819312', '흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', '1'],
 ['10265843', '너무재밓었다그래서보는것을추천한다', '0']]

In [6]:
test[:3]

[['6270596', '굳 ㅋ', '1'],
 ['9274899', 'GDNTOPCLASSINTHECLUB', '0'],
 ['8544678', '뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아', '0']]

## 1. 사전확률 계산
- ## $P(y = C_k)$
    - 각 클래스 (C) 의 확률 계산
        - 클래스별 비율을 계산하는 것임.

In [8]:
def count_target(data):
    """Tally how many rows carry each label.

    Args:
        data: iterable of indexable rows; the label is read from index 2.

    Returns:
        A ``defaultdict(int)`` mapping each label value to its row count
        (labels that never occur read as 0).
    """
    from collections import defaultdict

    label_counts = defaultdict(int)
    for record in data:
        label = record[2]  # '1' / '0' sentiment flag in the loaded data
        label_counts[label] += 1

    return label_counts

In [9]:
count_target(train)

defaultdict(int, {'0': 75173, '1': 74827})

In [10]:
def prob_target(data):
    """Estimate the class priors as the share of rows carrying each label.

    Args:
        data: iterable of indexable rows; the label is read from index 2
            by ``count_target``.

    Returns:
        Tuple ``(pos_prob, neg_prob)``: the fraction of rows labelled '1'
        and the fraction labelled '0'.

    Raises:
        ZeroDivisionError: if ``data`` contains no rows.
    """
    # Count once and reuse the result instead of re-scanning the data per term.
    counts = count_target(data)
    total = sum(counts.values())
    return counts['1'] / total, counts['0'] / total

In [11]:
prob_target(train)

(0.49884666666666666, 0.5011533333333333)

## 2. likelihood 계산
- ## $P(x|y=C_k)$
- Laplace Smoothing
    - ### $\frac {N_i + \alpha}{N+\alpha K}$
    - 매우 작은 값을 추가하여 값이 0이 되지 않도록 한다. 
    - 여러값의 곱을 취할 경우 하나만 0이되면 전체가 0이 되는 문제가 있기 때문이다.
        - ex. '메가박스'라는 단어가 긍정을 표현한 리뷰에만 포함된 경우
            - P(메가박스 | C_neg) = 0 이 되므로 P(C_neg | 메가박스) 도 0이 되어버린다.
            - 라플라스 스무딩 적용 시, P(메가박스 | C_neg) = k / (n + k) 가 된다.

### (1) 단어별 카운트

In [12]:
def count_words(data):
    """Count, per whitespace-separated word, how often it occurs in each class.

    Args:
        data: iterable of indexable rows; the review text is read from
            index 1 and the label from index 2.

    Returns:
        A ``defaultdict`` mapping word -> ``[positive_count, negative_count]``.
        Occurrences in rows labelled '1' go to slot 0; occurrences in rows
        with any other label go to slot 1. Unknown words read as ``[0, 0]``.
    """
    from collections import defaultdict

    tallies = defaultdict(lambda: [0, 0])
    for record in data:
        tokens = record[1].split()
        label = record[2]
        slot = 0 if label == '1' else 1
        for token in tokens:
            tallies[token][slot] += 1
    return tallies

In [13]:
# 정우성이라는 단어는 긍정에서 몇번, 부정에서 몇번 나왔는지

count_words(train)['정우성']

[16, 12]

In [14]:
## count_words(train)의 결과는 아래와 같은 형태를 가지고 있다. [긍정, 부정]

{'정우성' : [16, 12]}

{'정우성': [16, 12]}

### (2) 단어별 확률계산

In [15]:
def word_probabilities(counts, target, k=1):
    """Convert per-word class counts into smoothed per-class ratios.

    Each count is smoothed as ``(count + k) / (class_total + k)``.

    Args:
        counts: mapping word -> ``(positive_count, negative_count)``.
        target: mapping with the number of rows per label under the keys
            '1' (positive) and '0' (negative).
        k: smoothing constant added to numerator and denominator.

    Returns:
        A ``defaultdict(dict)`` mapping word ->
        ``(ratio given positive, ratio given negative)``.
    """
    from collections import defaultdict

    pos_denominator = target['1'] + k  # rows labelled '1', smoothed
    neg_denominator = target['0'] + k  # rows labelled '0', smoothed

    smoothed = defaultdict(dict)
    for word, (n_pos, n_neg) in counts.items():
        smoothed[word] = (
            (n_pos + k) / pos_denominator,
            (n_neg + k) / neg_denominator,
        )
    return smoothed

In [16]:
word_probabilities(count_words(train), count_target(train))['정우성']
# (P(정우성 | y=긍정), P(정우성 | y=부정))

(0.0002271876837547442, 0.00017293213078990076)

## 3. 사후확률 계산
- ## $P(y=C_k|x)$
- 언더플로우 방지를 위한 log 연산
    - 확률 계산 시 확률을 계속 곱해주게되면 매우 낮은 값들이 나와 언더플로우 현상이 발생할 수 있다.
    - 이를 방지하기 위해 $P(x∣y=C_{ k })$를 $log(P(x∣y=C_{ k }))$ 로 바꿔서 연산한다. 
    - ## $\log P(x \mid y=C_k) = \sum_{j=1}^{P} \log P(x_j \mid y=C_k)$

In [17]:
def sentiment_probability(word_probs, prob_target, review):
    """Score a review under each class in log space.

    For every whitespace-separated word of ``review`` that is a key of
    ``word_probs``, the log of its per-class ratio is accumulated; words that
    are not in ``word_probs`` are skipped. The log prior is added at the end.

    Args:
        word_probs: mapping word -> ``(ratio given positive, ratio given negative)``.
        prob_target: pair ``(positive prior, negative prior)``.
        review: review text.

    Returns:
        Tuple ``(log score if positive, log score if negative)``.
    """
    import math

    prior_pos = prob_target[0]
    prior_neg = prob_target[1]

    loglik_pos = 0.0
    loglik_neg = 0.0
    for token in review.split():
        if token not in word_probs:
            continue  # word never seen in training: contributes nothing
        likelihoods = word_probs[token]
        loglik_pos += math.log(likelihoods[0])
        loglik_neg += math.log(likelihoods[1])

    # log(P(x|C) * P(C)) = log P(x|C) + log P(C)
    score_pos = loglik_pos + math.log(prior_pos)
    score_neg = loglik_neg + math.log(prior_neg)
    return score_pos, score_neg

In [18]:
word_probs = word_probabilities(count_words(train), count_target(train))

In [19]:
target_prob = prob_target(train)

In [20]:
test[2]

['8544678', '뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아', '0']

In [21]:
import math
pos_proba, neg_proba = sentiment_probability(word_probs, target_prob, test[2][1])
# NOTE: sentiment_probability returns log(P(x|C) * P(C)) for each class, so math.exp(...)
# below is the unnormalized joint probability, not a posterior: the two printed values
# do not sum to 1 and only their relative order is meaningful.
print('긍정 리뷰 확률 : ', pos_proba, math.exp(pos_proba))
print('부정 리뷰 확률 : ', neg_proba, math.exp(neg_proba))

긍정 리뷰 확률 :  -62.31275723227796 8.66788981205466e-28
부정 리뷰 확률 :  -59.9225108890304 9.462026915642333e-27


In [22]:
# pos_proba

## 3. 모델링

In [25]:
class NaiveBayesClassifier:
    """Naive Bayes sentiment classifier over whitespace-separated words.

    Training rows are indexable records: index 1 holds the review text and
    index 2 the label ('1' = positive, '0' = negative).

    Attributes:
        k: smoothing constant.
        pos_prob, neg_prob: class priors, set by ``train``.
        total_pos, total_neg: number of training rows per class, set by
            ``word_probabilities`` (and therefore by ``train``).
        word_prob: mapping word -> (ratio given positive, ratio given negative),
            set by ``train``.
    """

    def __init__(self, k=1):
        self.k = k

    def count_target(self, data):
        """Return a ``defaultdict(int)`` mapping each label to its row count."""
        from collections import defaultdict

        counts = defaultdict(int)
        for row in data:
            counts[row[2]] += 1
        return counts

    def prob_target(self, data):
        """Return the class priors ``(pos_prob, neg_prob)``.

        Raises:
            ZeroDivisionError: if ``data`` contains no rows.
        """
        # Count once and reuse the result instead of re-scanning the data per term.
        counts = self.count_target(data)
        total = sum(counts.values())
        return counts['1'] / total, counts['0'] / total

    def count_words(self, data):
        """Return a mapping word -> ``[positive_count, negative_count]``.

        Occurrences in rows labelled '1' go to slot 0; occurrences in rows
        with any other label go to slot 1.
        """
        from collections import defaultdict

        counts = defaultdict(lambda: [0, 0])
        for row in data:
            slot = 0 if row[2] == '1' else 1
            for word in row[1].split():
                counts[word][slot] += 1
        return counts

    def word_probabilities(self, counts, target, k=1):
        """Convert word counts into smoothed per-class ratios.

        Each count is smoothed as ``(count + k) / (class_total + k)``. As a side
        effect the class totals are stored on the instance (``total_pos``,
        ``total_neg``) so that ``sentiment_probability`` can smooth unseen words.

        Args:
            counts: mapping word -> ``(positive_count, negative_count)``.
            target: mapping with the row count per label under '1' and '0'.
            k: smoothing constant.

        Returns:
            A ``defaultdict(dict)`` mapping word ->
            ``(ratio given positive, ratio given negative)``.
        """
        from collections import defaultdict

        probabilities = defaultdict(dict)
        self.total_pos = target['1']
        self.total_neg = target['0']
        for w, (positive, negative) in counts.items():
            probabilities[w] = ((positive + k) / (self.total_pos + k),
                                (negative + k) / (self.total_neg + k))

        return probabilities

    def sentiment_probability(self, word_prob, review):
        """Score ``review`` under each class in log space.

        Args:
            word_prob: mapping word -> (ratio given positive, ratio given negative).
            review: review text; it is split on whitespace.

        Returns:
            Tuple ``(log score if positive, log score if negative)``, each being
            the sum of the per-word log ratios plus the log prior.
        """
        import math

        log_prob_if_pos = log_prob_if_neg = 0.0

        for word in review.split():
            if word in word_prob:
                # Word seen during training: use its stored ratios.
                log_prob_if_pos += math.log(word_prob[word][0])
                log_prob_if_neg += math.log(word_prob[word][1])
            else:
                # Unseen word: a zero count smoothed with the same formula as
                # word_probabilities, k / (total + k). The denominator must be
                # parenthesised; `k / total + k` evaluates to roughly k and would
                # give unseen words a non-negative log-likelihood.
                log_prob_if_pos += math.log(self.k / (self.total_pos + self.k))
                log_prob_if_neg += math.log(self.k / (self.total_neg + self.k))

        prob_if_pos = log_prob_if_pos + math.log(self.pos_prob)
        prob_if_neg = log_prob_if_neg + math.log(self.neg_prob)
        return prob_if_pos, prob_if_neg

    def train(self, data):
        """Fit the priors and per-word ratios from labelled rows."""
        self.pos_prob, self.neg_prob = self.prob_target(data)
        word_counts = self.count_words(data)
        target_counts = self.count_target(data)
        self.word_prob = self.word_probabilities(word_counts, target_counts, k=self.k)

    def predict(self, review):
        """Return '1' if the positive score is strictly larger, else '0'."""
        prob_if_pos, prob_if_neg = self.sentiment_probability(self.word_prob, review)
        if prob_if_pos > prob_if_neg:
            return '1'
        else:
            return '0'

## 4. Training

In [26]:
classifier = NaiveBayesClassifier()

In [27]:
classifier.train(train)

## 5. 결과

In [28]:
# Gold labels (index 2 of each test row) and the classifier's prediction for
# the corresponding review text (index 1), in the same row order.
true_target = [row[2] for row in test]
predicted_target = [classifier.predict(row[1]) for row in test]

In [29]:
from sklearn.metrics import classification_report

print(classification_report(true_target, predicted_target))

             precision    recall  f1-score   support

          0       0.79      0.83      0.81     24827
          1       0.82      0.78      0.80     25173

avg / total       0.81      0.80      0.80     50000



## Scikit-learn

In [30]:
# Transpose the training rows once: column 1 holds the review texts,
# column 2 the labels (converted to an integer array).
train_columns = list(zip(*train))
X = train_columns[1]
y = np.array(train_columns[2], dtype=int)

In [31]:
# Transpose the test rows once: column 1 holds the review texts,
# column 2 the labels (converted to an integer array).
test_columns = list(zip(*test))
X_test = test_columns[1]
y_test = np.array(test_columns[2], dtype=int)

In [32]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
## http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

# Two-step pipeline: 'vect' turns the raw review strings into word-count vectors,
# 'mb' fits a multinomial Naive Bayes model on those counts.
# NOTE(review): with default arguments CountVectorizer lowercases the text and uses
# token_pattern '(?u)\b\w\w+\b' (see the repr printed by model.fit below), so
# one-character tokens are dropped — unlike the plain whitespace split used by the
# hand-written NaiveBayesClassifier above.
model = Pipeline([
            ('vect', CountVectorizer()), 
            ('mb', MultinomialNB()),
        ])

In [33]:
model.fit(X, y)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('mb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [34]:
print(classification_report(y_test, model.predict(X_test)))

             precision    recall  f1-score   support

          0       0.81      0.84      0.83     24827
          1       0.84      0.81      0.82     25173

avg / total       0.83      0.83      0.83     50000



In [35]:
## Same model without Pipeline: vectorize explicitly, then fit the classifier.
# NOTE: this rebinds `model` from the Pipeline defined earlier to a bare MultinomialNB,
# so from here on inputs must go through vectorizer.transform(...) before model.predict(...).

vectorizer = CountVectorizer().fit(X)
x_vec = vectorizer.transform(X)
model = MultinomialNB().fit(x_vec, y)

In [38]:
print(classification_report(y_test, model.predict(vectorizer.transform(X_test))))

             precision    recall  f1-score   support

          0       0.81      0.84      0.83     24827
          1       0.84      0.81      0.82     25173

avg / total       0.83      0.83      0.83     50000

