# 지도학습 기반 감성 분석

In [2]:
import pandas as pd
import numpy as np
import warnings
import os
# Silence every warning so the notebook output stays uncluttered.
warnings.filterwarnings(action='ignore')
# Work from the data directory so the relative file name used by read_csv
# below resolves. NOTE(review): this absolute path is machine-specific.
os.chdir('/Users/younghun/Desktop/gitrepo/data')

In [7]:
# read_csv also reads tab-separated files once sep='\t' is given.
# header=0 takes the column names from the first line of the file.
# csv.QUOTE_NONE (the integer 3) switches quote handling off: double quotes
# are kept as ordinary characters of the field instead of being parsed as
# field delimiters, which is why the loaded values still carry their quotes.
import csv

review_df = pd.read_csv('labeledTrainData.tsv', encoding='utf-8',
                        header=0, sep='\t', quoting=csv.QUOTE_NONE)
review_df.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [8]:
# Look at the full text of a single review (the first row).
first_review = review_df.loc[0, 'review']
print(first_review)

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [9]:
# Remove the HTML line-break tags from the reviews.
# `re` is imported here for the regex-based clean-up that follows.
import re
# '<br />' is a fixed string, so match it literally (regex=False) instead of
# relying on the default of Series.str.replace, which differs between
# pandas versions.
review_df['review'] = review_df['review'].str.replace('<br />', ' ', regex=False)

In [14]:
# re.sub(pattern, replacement, string): every character that is not an ASCII
# letter (digits, punctuation, ...) is replaced by a single space.
def _letters_only(text):
    return re.sub('[^a-zA-Z]', ' ', text)


review_df['review'] = review_df['review'].apply(_letters_only)

In [23]:
# Split the reviews into a training set and a test set (70% / 30%).
from sklearn.model_selection import train_test_split

feature, target = review_df['review'], review_df['sentiment']

# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    feature, target, test_size=0.3, random_state=42)
X_train.shape, X_test.shape

((17500,), (7500,))

In [24]:
# Vectorize the text and train the classifier in a single Pipeline.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score

# Word counts over unigrams and bigrams, with English stop words removed.
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2))
pipeline = Pipeline(steps=[('cnt_vect', count_vectorizer),
                           ('lr_clf', LogisticRegression(C=10))])

# The pipeline vectorizes internally, so it is fitted on the raw text.
pipeline.fit(X_train, y_train)
# Probability of label 1 (second column), which roc_auc_score needs.
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
y_pred = pipeline.predict(X_test)
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)
print(f"정확도:{acc : .3f}\nAUC score:{auc : .3f}")

정확도: 0.885
AUC score: 0.951


In [25]:
# Same experiment, but with TF-IDF weights instead of raw counts.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score

# TF-IDF over unigrams and bigrams, with English stop words removed.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
pipeline = Pipeline(steps=[('tf_idf', tfidf_vectorizer),
                           ('lr_clf', LogisticRegression(C=10))])

# Fit on the raw training text; the pipeline handles the vectorization.
pipeline.fit(X_train, y_train)
# Probability of label 1 (second column), used for the AUC.
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
y_pred = pipeline.predict(X_test)
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)
print(f"정확도:{acc : .3f}\nAUC score:{auc : .3f}")

정확도: 0.892
AUC score: 0.959


# 비지도 학습기반 감성분석

## VADER lexicon을 이용한 Sentiment Analysis

- nltk의 vader에서 SentimentIntensityAnalyzer 클래스 제공

In [27]:
import nltk
# Download NLTK's 'vader_lexicon' package, the lexicon used by the VADER
# sentiment analyzer in the cells below.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/younghun/nltk_data...


True

In [28]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# polarity_scores returns the negative / neutral / positive proportions of one
# text plus 'compound', the overall sentiment score between -1 and 1.
senti_analyzer = SentimentIntensityAnalyzer()
senti_score = senti_analyzer.polarity_scores(review_df.loc[0, 'review'])
print(senti_score)

{'neg': 0.13, 'neu': 0.743, 'pos': 0.127, 'compound': -0.7943}


In [29]:
# Turn VADER's compound score into a binary label: a review scoring at or
# above the threshold (0.1 is a common choice) is positive, otherwise negative.
from functools import lru_cache


@lru_cache(maxsize=1)
def _default_analyzer():
    """Create the VADER analyzer once and reuse it on every later call."""
    return SentimentIntensityAnalyzer()


def get_sentiment(review, threshold, analyzer=None):
    """Classify one review as positive (1) or negative (0) with VADER.

    Args:
        review: Text to score.
        threshold: Cut-off applied to the compound score; a score greater
            than or equal to it yields 1.
        analyzer: Optional object exposing ``polarity_scores(text)`` that
            returns a mapping with a ``'compound'`` entry. When omitted, one
            shared ``SentimentIntensityAnalyzer`` is used, so the analyzer is
            built once instead of once per review.

    Returns:
        1 if the compound score is >= ``threshold``, otherwise 0.
    """
    if analyzer is None:
        analyzer = _default_analyzer()
    compound_score = analyzer.polarity_scores(review)['compound']
    return 1 if compound_score >= threshold else 0

In [30]:
# Label every review with VADER, using 0.1 as the compound-score threshold.
# NOTE(review): review_df['review'] has already had every non-letter character
# replaced by a space in an earlier cell, so VADER scores text without
# punctuation or apostrophes (e.g. "isn't" arrives as "isn t"); this likely
# weakens its emphasis/negation handling - consider scoring the raw text.
review_df['vader_pred'] = review_df['review'].map(
    lambda text: get_sentiment(text, 0.1))
# Ground-truth labels from the data set vs. the labels predicted by VADER.
y_target, y_pred = review_df['sentiment'], review_df['vader_pred']

In [31]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
from sklearn.metrics import recall_score, f1_score

# Score the VADER predictions against the ground-truth labels: confusion
# matrix first, then accuracy, precision, recall and F1.
print(confusion_matrix(y_target, y_pred))
for label, metric in (("정확도 :", accuracy_score),
                      ("정밀도 :", precision_score),
                      ("재현율 :", recall_score),
                      ("F1 score :", f1_score)):
    print(label, metric(y_target, y_pred))

[[ 6730  5770]
 [ 1857 10643]]
정확도 : 0.69492
정밀도 : 0.64844939986596
재현율 : 0.85144
F1 score : 0.7362086258776328
