In [1]:
# 머신러닝 기반 감정분석

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [4]:
import nltk
from nltk.corpus import movie_reviews
from textblob import  TextBlob
from afinn import Afinn
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score
import pandas as pd

# nltk 데이터 다운로드
nltk.download('movie_reviews', quiet=True)
nltk.download('vader_lexicon', quiet=True)

# 영화 리뷰 데이터 로드
fileids = movie_reviews.fileids()
reviews = [movie_reviews.raw(fileid) for fileid in fileids[:50]] + [movie_reviews.raw(fileid) for fileid in fileids[-50:]]
categories = [movie_reviews.categories(fileid)[0] for fileid in fileids[:50]] +[movie_reviews.categories(fileid)[0] for fileid in fileids[-50:]]
len(reviews), categories.count('pos'), categories.count('neg')

(100, 50, 50)

<span style="color: Gold"> 1. 나이브 베이즈

In [None]:
# 나이브 베이즈
# 베이즈 정리
# '좋다' 단어를 본 후 이 리뷰가 긍정일 확률
# P( 긍정 | 좋다 ) = P( 좋다 | 긍정) X P(긍정) / P(좋다)
# -> 전체 리뷰에서 좋다가 나올 확률

In [14]:
# 데이터 분할
#x_train, x_test, y_train, y_test = train_test_split(reviews, categories, test_size=0.2,random_state=42, stratify=categories) # 아래 절차이나 다른 방식으로 표현
dataset = train_test_split(reviews, categories, test_size=0.2,random_state=42, stratify=categories)

len(dataset[0])

80

In [None]:
# tf-idf 벡터화
vectorizer = TfidfVectorizer(max_features=1000)
x_train = vectorizer.fit_transform(dataset[0])
x_test = vectorizer.transform(dataset[1])

y_train = dataset[2]
y_test = dataset[3]



vectorizer = TfidfVectorizer(max_features=1000)
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [16]:
# 1. mnb
mnb_clf = MultinomialNB()
mnb_clf.fit(x_train, y_train)
predict = mnb_clf.predict(x_test)
print(classification_report(y_test, predict))

              precision    recall  f1-score   support

         neg       0.54      0.70      0.61        10
         pos       0.57      0.40      0.47        10

    accuracy                           0.55        20
   macro avg       0.55      0.55      0.54        20
weighted avg       0.55      0.55      0.54        20



<span style="color: Gold">  2. Logiscticregression

In [24]:
# 2. Logiscticregression

logi = LogisticRegression()
logi.fit(x_train,y_train)
predict_logi = logi.predict(x_test)
print(classification_report(y_test, predict))

              precision    recall  f1-score   support

         neg       0.54      0.70      0.61        10
         pos       0.57      0.40      0.47        10

    accuracy                           0.55        20
   macro avg       0.55      0.55      0.54        20
weighted avg       0.55      0.55      0.54        20



---

In [35]:
# 성능향상
    # 소문자변환 - 연속된 문자열 중에 3글자 이상인 것만 가져오기 - 어간추출(형태소분석) - 불용어제거
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer

In [61]:
# 전처리
def custom_tokenizer(text):
    text = text.lower()     # 소문자 변환
    tokenizer = RegexpTokenizer(r"[\w']{3,}")   # 단어 문자(알파벳, 숫자, _)와 작은따옴표(')가 섞여서 3글자 이상 연속되는 모든 문자열을 찾아라
    tokens = tokenizer.tokenize(text)
    porter = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    return [porter.stem(token) for token in tokens if token not in stop_words]
vector = TfidfVectorizer(
    tokenizer  = custom_tokenizer
    ,max_features=1000
    ,min_df=5
    ,max_df=0.5
    ,token_pattern = r"[\w']{3,}"
)
x_train = vector.fit_transform(dataset[0])
x_test = vector.transform(dataset[1])



In [None]:
def evaluate_model(model):    
    model.fit(x_train,y_train)
    predict = model.predict(x_test)
    print(classification_report(y_test, predict))


In [63]:
evaluate_model(LogisticRegression())

              precision    recall  f1-score   support

         neg       0.80      0.80      0.80        10
         pos       0.80      0.80      0.80        10

    accuracy                           0.80        20
   macro avg       0.80      0.80      0.80        20
weighted avg       0.80      0.80      0.80        20



---