## Sentiment Analysis: Supervised learning

In [1]:
# Load the labeled review data (tab-separated, first row is the header).
# csv.QUOTE_NONE (the named constant for the value 3) makes the parser treat quote
# characters as ordinary text, which matters for free-text reviews that contain
# unbalanced double quotes.
import csv

import pandas as pd

review_df = pd.read_csv('./labeledTrainData.tsv', header=0, sep="\t", quoting=csv.QUOTE_NONE)
review_df.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [2]:
# Peek at the text of the first review (row label 0).
first_review = review_df.loc[0, 'review']
print(first_review)

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [3]:
import re

# Replace the HTML line-break tag with a space. regex=False matches the tag as a
# literal string, so the result does not depend on the installed pandas version's
# default for the `regex` argument.
review_df['review'] = review_df['review'].str.replace('<br />', ' ', regex=False)

# Replace every character that is not an ASCII letter with a space, using the
# vectorized string accessor instead of a per-row Python lambda.
# NOTE: this also strips digits and all punctuation (including sentence-ending
# punctuation) from the text that later cells tokenize and score.
review_df['review'] = review_df['review'].str.replace('[^a-zA-Z]', ' ', regex=True)


In [4]:
# as usual, divide train/test data
from sklearn.model_selection import train_test_split

# Labels come from the sentiment column; features are everything except id and sentiment.
target = review_df['sentiment']
feature = review_df.drop(columns=['id', 'sentiment'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    feature,
    target,
    test_size=0.3,
    random_state=156,
)

X_train.shape, X_test.shape

((17500, 1), (7500, 1))

In [5]:
# Author's note: a single column has to be pulled out of X_train (much like using
# .values) before it is handed to the pipeline.
# Author's note: CountVectorizer is still used even though TF-IDF was said to be
# better, because counts are the basis when applying probability distributions
# (explained later).
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

train_reviews = X_train['review']
test_reviews = X_test['review']

# Bag-of-words counts over unigrams and bigrams, followed by logistic regression.
cnt_vect = CountVectorizer(stop_words='english', ngram_range=(1, 2))
lr_clf = LogisticRegression(C=10, max_iter=500)
pipeline = Pipeline(steps=[('cnt_vect', cnt_vect), ('lr_clf', lr_clf)])
pipeline.fit(train_reviews, y_train)

# Hard labels for accuracy; probability of the positive class (column 1) for ROC AUC.
pred = pipeline.predict(test_reviews)
pred_proba = pipeline.predict_proba(test_reviews)[:, 1]

accuracy = accuracy_score(y_test, pred)
roc_auc = roc_auc_score(y_test, pred_proba)
print('Accuracy:{}, ROC_AUC:{}'.format(accuracy, roc_auc))

Accuracy:0.886, ROC_AUC:0.9502710989073525


In [6]:
# With TF-IDF features instead of raw counts:

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

train_reviews = X_train['review']
test_reviews = X_test['review']

# TF-IDF weights over unigrams and bigrams, followed by logistic regression.
tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
lr_clf = LogisticRegression(C=10, max_iter=500)
pipeline = Pipeline(steps=[('tfidf_vect', tfidf_vect), ('lr_clf', lr_clf)])
pipeline.fit(train_reviews, y_train)

# Hard labels for accuracy; probability of the positive class (column 1) for ROC AUC.
pred = pipeline.predict(test_reviews)
pred_proba = pipeline.predict_proba(test_reviews)[:, 1]

accuracy = accuracy_score(y_test, pred)
roc_auc = roc_auc_score(y_test, pred_proba)
print('Accuracy:{}, ROC_AUC:{}'.format(accuracy, roc_auc))

Accuracy:0.8936, ROC_AUC:0.959799823582973


## Sentiment Analysis: Unsupervised learning

In [7]:
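# One-time setup (uncomment and run once): downloads the NLTK data used by the
# cells below (WordNet, SentiWordNet, tokenizer/tagger models, VADER lexicon).
# import nltk
# nltk.download('all')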

In [8]:
# Example: for the word 'present', NLTK's WordNet returns about 18 senses (synsets).
# This is simply a lookup of every dictionary sense of the term.
from nltk.corpus import wordnet as wn

term = 'present'
synsets = wn.synsets(term)

# Container type and number of senses, then the synsets themselves.
print(f"{type(synsets)} {len(synsets)}")
print(synsets)

<class 'list'> 18
[Synset('present.n.01'), Synset('present.n.02'), Synset('present.n.03'), Synset('show.v.01'), Synset('present.v.02'), Synset('stage.v.01'), Synset('present.v.04'), Synset('present.v.05'), Synset('award.v.01'), Synset('give.v.08'), Synset('deliver.v.01'), Synset('introduce.v.01'), Synset('portray.v.04'), Synset('confront.v.03'), Synset('present.v.12'), Synset('salute.v.06'), Synset('present.a.01'), Synset('present.a.02')]


In [9]:
# Per sense, one item per line: separator, synset name, lexname (part of speech and
# category), definition, and lemma names (base forms).
for sense in synsets:
    print(
        '#####',
        sense.name(),
        sense.lexname(),
        sense.definition(),
        sense.lemma_names(),
        sep='\n',
    )

#####
present.n.01
noun.time
the period of time that is happening now; any continuous stretch of time including the moment of speech
['present', 'nowadays']
#####
present.n.02
noun.possession
something presented as a gift
['present']
#####
present.n.03
noun.communication
a verb tense that expresses actions or states at the time of speaking
['present', 'present_tense']
#####
show.v.01
verb.perception
give an exhibition of to an interested audience
['show', 'demo', 'exhibit', 'present', 'demonstrate']
#####
present.v.02
verb.communication
bring forward and present to the mind
['present', 'represent', 'lay_out']
#####
stage.v.01
verb.creation
perform (a play), especially on a stage
['stage', 'present', 'represent']
#####
present.v.04
verb.possession
hand over formally
['present', 'submit']
#####
present.v.05
verb.stative
introduce
['present', 'pose']
#####
award.v.01
verb.possession
give, especially as an honor or reward
['award', 'present']
#####
give.v.08
verb.possession
give as a prese

In [24]:
# Build one Synset object per word.
synset_keys = ['tree.n.01', 'lion.n.01', 'tiger.n.02', 'cat.n.01', 'dog.n.01']
entities = [wn.synset(key) for key in synset_keys]
tree, lion, tiger, cat, dog = entities

# Short word labels (the text before the first '.'), used only as row/column labels.
entity_names = [entity.name().partition('.')[0] for entity in entities]

# Pairwise path similarity: each row compares one entity against every entity.
similarities = [
    [left.path_similarity(right) for right in entities]
    for left in entities
]

similarity_df = pd.DataFrame(similarities, index=entity_names, columns=entity_names)
similarity_df


Unnamed: 0,tree,lion,tiger,cat,dog
tree,1.0,0.071429,0.071429,0.076923,0.125
lion,0.071429,1.0,0.333333,0.25,0.166667
tiger,0.071429,0.333333,1.0,0.25,0.166667
cat,0.076923,0.25,0.25,1.0,0.2
dog,0.125,0.166667,0.166667,0.2,1.0


In [34]:
from nltk.corpus import sentiwordnet as swn

# Wrapping the result in a list materializes it so the entries can be counted and shown
# (author's note: the same trick may be worth trying elsewhere).
senti_synsets = [*swn.senti_synsets('slow')]
print(f"{type(senti_synsets)} {len(senti_synsets)}")
print(senti_synsets)

<class 'list'> 11
[SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'), SentiSynset('slow.v.03'), SentiSynset('slow.a.01'), SentiSynset('slow.a.02'), SentiSynset('dense.s.04'), SentiSynset('slow.a.04'), SentiSynset('boring.s.01'), SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'), SentiSynset('behind.r.03')]


In [46]:
# 'slow' would work here as well, but other words are tried for comparison.
from nltk.corpus import sentiwordnet as swn

# Positive / negative / objective scores of one specific sense of each word.
father = swn.senti_synset('father.n.01')
father_scores = (father.pos_score(), father.neg_score(), father.obj_score())
print('Father: positive:{}, negative:{}, objective:{}'.format(*father_scores))
print()

fabulous = swn.senti_synset('fabulous.a.01')
fabulous_scores = (fabulous.pos_score(), fabulous.neg_score(), fabulous.obj_score())
print('Fabulous: positive:{}, negative:{}, objective:{}'.format(*fabulous_scores))


Father: positive:0.0, negative:0.0, objective:1.0

Fabulous: positive:0.875, negative:0.125, objective:0.0


In [47]:
from nltk.corpus import wordnet as wn

# Convert an NLTK Penn Treebank tag into the corresponding WordNet part-of-speech tag.
def penn_to_wn(tag):
    """Map a Penn Treebank tag to a WordNet POS constant by its first letter.

    Tags starting with 'J', 'N', 'R' or 'V' map to wn.ADJ, wn.NOUN, wn.ADV and
    wn.VERB respectively. Any other tag yields None.
    """
    prefix_to_attr = (('J', 'ADJ'), ('N', 'NOUN'), ('R', 'ADV'), ('V', 'VERB'))
    for prefix, attr in prefix_to_attr:
        if tag.startswith(prefix):
            return getattr(wn, attr)
    return None

In [48]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag

def swn_polarity(text):
    """Classify `text` as positive (1) or negative (0) using SentiWordNet scores.

    The text is split into sentences; each sentence is word-tokenized and
    POS-tagged. Every noun, adjective or adverb for which WordNet returns at
    least one synset contributes (positive score - negative score) of its
    first synset to a running total.

    Returns 1 when at least one word was scored and the total is >= 0;
    returns 0 otherwise, including when no word could be scored.
    """
    lemmatizer = WordNetLemmatizer()
    total_score = 0.0
    scored_words = 0

    for sentence in sent_tokenize(text):
        for word, tag in pos_tag(word_tokenize(sentence)):
            # Only nouns, adjectives and adverbs contribute; verbs and tags
            # without a WordNet mapping are skipped.
            wn_tag = penn_to_wn(tag)
            if wn_tag in (wn.NOUN, wn.ADJ, wn.ADV):
                lemma = lemmatizer.lemmatize(word, pos=wn_tag)
                senses = wn.synsets(lemma, pos=wn_tag) if lemma else None
                if senses:
                    # Score the first synset returned for this lemma and POS.
                    senti = swn.senti_synset(senses[0].name())
                    total_score += (senti.pos_score() - senti.neg_score())
                    scored_words += 1

    return 1 if scored_words and total_score >= 0 else 0

In [49]:
# Predict a 0/1 label for every review with the SentiWordNet scorer.
review_df['preds'] = review_df['review'].apply(swn_polarity)

# Plain arrays of true and predicted labels for the metric functions.
y_target = review_df['sentiment'].values
preds = review_df['preds'].values

In [50]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score 
from sklearn.metrics import recall_score, f1_score, roc_auc_score

# Confusion matrix followed by accuracy, precision and recall
# (the printed labels are Korean for those three metrics, in that order).
print(confusion_matrix(y_target, preds))
for label, metric in (
    ("정확도:", accuracy_score),
    ("정밀도:", precision_score),
    ("재현율:", recall_score),
):
    print(label, metric(y_target, preds))

[[7668 4832]
 [3636 8864]]
정확도: 0.66128
정밀도: 0.647196261682243
재현율: 0.70912


## Vader

In [71]:
## Vader
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Score the first review with VADER and print the resulting score dictionary.
senti_analyzer = SentimentIntensityAnalyzer()
first_review = review_df.loc[0, 'review']
senti_scores = senti_analyzer.polarity_scores(first_review)
print(senti_scores)

{'neg': 0.13, 'neu': 0.743, 'pos': 0.127, 'compound': -0.7943}


In [72]:
## Vader
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Score the first review with VADER; the bare last expression displays the
# score dictionary as the cell output instead of printing it.
senti_analyzer = SentimentIntensityAnalyzer()
first_review = review_df.loc[0, 'review']
senti_scores = senti_analyzer.polarity_scores(first_review)
senti_scores

{'neg': 0.13, 'neu': 0.743, 'pos': 0.127, 'compound': -0.7943}

In [76]:
# Shared analyzer, created on first use so that the construction cost is paid once
# rather than once per scored review.
_VADER_ANALYZER = None


def vader_polarity(review, threshold=0.1, analyzer=None):
    """Classify `review` as positive (1) or negative (0) from its VADER compound score.

    Parameters
    ----------
    review : str
        Text to score.
    threshold : float, default 0.1
        Minimum compound score for the review to count as positive.
    analyzer : object, optional
        Any object exposing ``polarity_scores(text)`` that returns a mapping with
        a ``'compound'`` key. When omitted, a single shared
        ``SentimentIntensityAnalyzer`` is created on first use and reused.

    Returns
    -------
    int
        1 if the compound score is >= `threshold`, otherwise 0.
    """
    global _VADER_ANALYZER
    if analyzer is None:
        if _VADER_ANALYZER is None:
            _VADER_ANALYZER = SentimentIntensityAnalyzer()
        analyzer = _VADER_ANALYZER

    agg_score = analyzer.polarity_scores(review)['compound']
    return 1 if agg_score >= threshold else 0

# Label every review with VADER, using 0.1 as the compound-score threshold.
review_df['vader_preds'] = review_df['review'].apply(vader_polarity, args=(0.1,))

# Plain arrays of true and predicted labels for the metric functions.
y_target = review_df['sentiment'].values
vader_pred = review_df['vader_preds'].values


In [78]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score

# Confusion matrix, then accuracy / precision / recall for the VADER labels.
print('confusion_matrix:\n', confusion_matrix(y_target, vader_pred))
for label, metric in (
    ('accuracy:', accuracy_score),
    ('precision_score:', precision_score),
    ('recall_score:', recall_score),
):
    print(label, metric(y_target, vader_pred))
print()

# NOTE: vader_pred holds hard 0/1 labels, so this ROC AUC is computed from
# thresholded predictions rather than from the continuous compound scores.
for label, metric in (
    ('f1_score:', f1_score),
    ('roc_auc_score:', roc_auc_score),
):
    print(label, metric(y_target, vader_pred))

confusion_matrix:
 [[ 6747  5753]
 [ 1858 10642]]
accuracy: 0.69556
precision_score: 0.6491003354681305
recall_score: 0.85136

f1_score: 0.7365980273403703
roc_auc_score: 0.6955600000000001
