# 지도학습 기반

In [8]:
import pandas as pd
import re

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

import warnings
warnings.filterwarnings('ignore')

In [24]:
# Load the labeled review data: tab-separated, column names in the first row.
# quoting = 3 is csv.QUOTE_NONE, so double quotes stay in the text as literal characters.
review_df = pd.read_csv(
    "labeledTrainData.tsv",
    sep = '\t',
    header = 0,
    quoting = 3,
)
review_df.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


- **id**: 각 데이터의 id
- **sentiment**: 영화평(review)의 Sentiment 결과값(Target Label)
    - 긍정적 평가(1), 부정적 평가(0)
- **review**: 영화평의 텍스트

In [4]:
# Show the full text of the review stored under row label 0.
print(review_df.loc[0, 'review'])

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [5]:
# Replace HTML line-break tags with a space. regex = False forces a literal
# substitution, independent of the pandas version's default for `regex`.
review_df['review'] = review_df['review'].str.replace('<br />', ' ', regex = False)

# Replace every character that is not an ASCII letter or an apostrophe with a
# space. The vectorized string accessor applies the same pattern that a per-row
# re.sub call would, and leaves missing values untouched instead of raising.
review_df['review'] = review_df['review'].str.replace("[^a-zA-Z']", " ", regex = True)


In [6]:
# Features: whatever columns remain once the id and the label are removed.
feature_df = review_df.drop(columns = ['id', 'sentiment'])
# Target: the sentiment label column.
class_df = review_df['sentiment']

In [7]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    feature_df,
    class_df,
    test_size = 0.3,
    random_state = 156,
)
# Display the shape of each of the four pieces.
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((17500, 1), (7500, 1), (17500,), (7500,))

- 학습용 데이터는 17500개의 리뷰
- 테스트용 데이터는 7500개의 리뷰

In [13]:
# Count vectorization: unigram and bigram counts (English stop words removed)
# feeding a logistic-regression classifier.
pipeline = Pipeline(steps = [
    (
        'cnt_vect',
        CountVectorizer(ngram_range = (1, 2), stop_words = 'english'),
    ),
    (
        'lr_clf',
        LogisticRegression(C = 10),
    ),
])

In [17]:
# Fit on the training reviews, then predict class labels for the test reviews.
pred = pipeline.fit(X_train['review'], y_train).predict(X_test['review'])
# Column 1 of predict_proba is kept because roc_auc_score needs a score, not a label.
pred_probs = pipeline.predict_proba(X_test['review'])[:, 1]

In [19]:
# Accuracy uses the predicted labels; ROC-AUC uses the predicted probabilities.
accuracy = accuracy_score(y_test, pred)
roc_auc = roc_auc_score(y_test, pred_probs)
print('예측 정확도는 {0:.4f}, ROC-AUC는 {1:.4f}'.format(accuracy, roc_auc))

예측 정확도는 0.8860, ROC-AUC는 0.9503


In [20]:
# TF-IDF vectorization: TF-IDF weighted unigrams and bigrams (English stop words
# removed) feeding a logistic-regression classifier.
pipeline = Pipeline(steps = [
    (
        'tfidf_vect',
        TfidfVectorizer(ngram_range = (1, 2), stop_words = 'english'),
    ),
    (
        'lr_clf',
        LogisticRegression(C = 10),
    ),
])

In [21]:
# Fit on the training reviews, then predict class labels for the test reviews.
pred = pipeline.fit(X_train['review'], y_train).predict(X_test['review'])
# Column 1 of predict_proba is kept because roc_auc_score needs a score, not a label.
pred_probs = pipeline.predict_proba(X_test['review'])[:, 1]

In [22]:
# Accuracy uses the predicted labels; ROC-AUC uses the predicted probabilities.
accuracy = accuracy_score(y_test, pred)
roc_auc = roc_auc_score(y_test, pred_probs)
print('예측 정확도는 {0:.4f}, ROC-AUC는 {1:.4f}'.format(accuracy, roc_auc))

예측 정확도는 0.8936, ROC-AUC는 0.9598


Count 기반 피처 벡터화와 비교하면 TF-IDF 기반 피처 벡터화의 예측 성능이 조금 더 좋다(정확도 0.8860 → 0.8936, ROC-AUC 0.9503 → 0.9598).

# 비지도학습 기반

## 1. SentiWordNet

In [18]:
import numpy as np
from nltk.corpus import wordnet as wn

from nltk.stem import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
from sklearn.metrics import recall_score, f1_score, roc_auc_score

In [14]:
# Translate an NLTK Penn Treebank POS tag into the matching WordNet POS constant.
def penn_to_wn(tag):
    """Map a Penn Treebank tag to a WordNet part-of-speech constant.

    The decision is made on the tag's leading letter: 'J' -> wn.ADJ,
    'N' -> wn.NOUN, 'R' -> wn.ADV, 'V' -> wn.VERB.

    Returns:
        The WordNet constant, or None when the tag starts with anything else.
    """
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
        ('V', 'VERB'),
    )
    for prefix, attr_name in prefix_to_attr:
        if tag.startswith(prefix):
            return getattr(wn, attr_name)
    return None

In [10]:
def swn_polarity(text):
    """Label a text as positive (1) or negative (0) using SentiWordNet scores.

    Steps:
        1. Split the text into sentences, then each sentence into POS-tagged words.
        2. Keep only words whose WordNet POS is noun, adjective or adverb.
        3. Lemmatize each kept word and look up its WordNet synsets.
        4. For the first synset, add (positive score - negative score) to a running total.

    Returns:
        1 when at least one word was scored and the total is >= 0; otherwise 0
        (including the case where no word could be scored).
    """
    lemmatizer = WordNetLemmatizer()
    total_score = 0.0
    scored_words = 0

    for sentence in sent_tokenize(text):
        # Word-tokenize the sentence and attach Penn Treebank POS tags.
        for token, penn_tag in pos_tag(word_tokenize(sentence)):
            pos = penn_to_wn(penn_tag)
            # Verbs and unmapped tags do not contribute to the score.
            if pos not in (wn.NOUN, wn.ADJ, wn.ADV):
                continue

            lemma = lemmatizer.lemmatize(token, pos = pos)
            # An empty lemma is treated the same as a word with no synsets.
            candidates = wn.synsets(lemma, pos = pos) if lemma else []
            if not candidates:
                continue

            # Only the first synset for the word is scored.
            senti = swn.senti_synset(candidates[0].name())
            total_score += (senti.pos_score() - senti.neg_score())
            scored_words += 1

    # A total of exactly 0 counts as positive, but only if something was scored.
    return 1 if scored_words and total_score >= 0 else 0

In [15]:
# Score every review with swn_polarity and store the labels in a new 'preds' column.
review_df['preds'] = review_df['review'].apply(swn_polarity)
# Pull the predictions and the true labels out as arrays for the metric functions.
preds = review_df['preds'].values
y_target = review_df['sentiment'].values

In [19]:
# Prediction quality of the SentiWordNet labels: confusion matrix first,
# then accuracy, precision and recall rounded to four decimals.
print(confusion_matrix(y_target, preds))
for label, metric in (
    ('정확도: ', accuracy_score),
    ('정밀도: ', precision_score),
    ('재현율: ', recall_score),
):
    print(label, np.round(metric(y_target, preds), 4))

[[7649 4851]
 [3578 8922]]
정확도:  0.6628
정밀도:  0.6478
재현율:  0.7138


## 2. VADER

In [22]:
# Download NLTK data packages so the corpus-backed tools used in this notebook can load them.
# NOTE(review): 'all' requests the entire NLTK collection. The cells visible here
# appear to need only a few resources (tokenizer, POS tagger, WordNet,
# SentiWordNet, VADER lexicon) — consider downloading just those; confirm that
# no other cell depends on additional corpora first.
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    | 

[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package mte_teip5 to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package mte_teip5 is already up-to-date!
[nltk_data]    | Downloading package mwa_ppdb to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package mwa_ppdb is already up-to-date!
[nltk_data]    | Downloading package names to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Downloading package nombank.1.0 to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package nombank.1.0 is already up-to-date!
[nltk_data]    | Downloading package nonbreaking_prefixes to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package nonbreaking_prefixes is already up-to-date!
[nltk_data]    | Downloading package nps_chat

[nltk_data]    |   Package twitter_samples is already up-to-date!
[nltk_data]    | Downloading package udhr to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package udhr is already up-to-date!
[nltk_data]    | Downloading package udhr2 to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package udhr2 is already up-to-date!
[nltk_data]    | Downloading package unicode_samples to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package unicode_samples is already up-to-date!
[nltk_data]    | Downloading package universal_tagset to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package universal_tagset is already up-to-date!
[nltk_data]    | Downloading package universal_treebanks_v20 to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package universal_treebanks_v20 is already up-to-
[nltk_data]    |       date!

In [23]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [25]:
# Score the review at row label 0 with VADER and show the resulting dict of
# 'neg', 'neu', 'pos' and 'compound' scores.
senti_analyzer = SentimentIntensityAnalyzer()
senti_score = senti_analyzer.polarity_scores(review_df.loc[0, 'review'])
print(senti_score)

{'neg': 0.13, 'neu': 0.744, 'pos': 0.126, 'compound': -0.8278}


In [26]:
def vader_polarity(review, threshold = 0.1, analyzer = None):
    """Classify a review as positive (1) or negative (0) from its VADER compound score.

    Args:
        review: Text to score.
        threshold: Minimum compound score that counts as positive.
        analyzer: Optional object exposing ``polarity_scores(text)`` that returns
            a mapping with a 'compound' entry. When omitted, a single
            SentimentIntensityAnalyzer is created on first use and reused by
            every later call.

    Returns:
        1 if the compound score is greater than or equal to ``threshold``, else 0.
    """
    if analyzer is None:
        # Constructing the analyzer loads its lexicon, so build it once and keep
        # it on the function object instead of rebuilding it for every review.
        analyzer = getattr(vader_polarity, '_shared_analyzer', None)
        if analyzer is None:
            analyzer = SentimentIntensityAnalyzer()
            vader_polarity._shared_analyzer = analyzer

    agg_score = analyzer.polarity_scores(review)['compound']
    return 1 if agg_score >= threshold else 0

In [28]:
# Run vader_polarity() on every record with a threshold of 0.1 and keep the
# labels in the 'vader_preds' column.
review_df['vader_preds'] = review_df['review'].apply(vader_polarity, args = (0.1,))
# Pull the predictions and the true labels out as arrays for the metric functions.
vader_preds = review_df['vader_preds'].values
y_target = review_df['sentiment'].values

In [29]:
# Prediction quality of the VADER labels: confusion matrix first,
# then accuracy, precision and recall rounded to four decimals.
print(confusion_matrix(y_target, vader_preds))
for label, metric in (
    ('정확도: ', accuracy_score),
    ('정밀도: ', precision_score),
    ('재현율: ', recall_score),
):
    print(label, np.round(metric(y_target, vader_preds), 4))

[[ 6819  5681]
 [ 1936 10564]]
정확도:  0.6953
정밀도:  0.6503
재현율:  0.8451
