# 분류기의 정확도 확인

In [1]:
import urllib
urllib.request.urlretrieve('https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt', filename='ratings_train.txt')
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")


('ratings_test.txt', <http.client.HTTPMessage at 0x2b4c9d69390>)

In [2]:
from nltk.tokenize import word_tokenize
import nltk
from konlpy.tag import Okt
okt = Okt()

In [52]:
def load_data(filename):
    train = []
    count = 0
    with open(filename, 'r', encoding='utf-8') as f:      
        for line in f.readlines():
            if count == 1000: break
            
            line = line.strip()
            id, document, label = line.split('\t')
            if label == '1': label = 'pos'
            else: label = 'neg'    
            train_tup = (document, label)
            train.append(train_tup)
            count += 1
    return train

def pos_tokenize(raw_sent):
    pos_sent = []
    
    sent = okt.pos(raw_sent, norm=True, stem=True)
    for tup in sent:
        word, tag = tup
        word_tag = word + '/' + tag
        pos_sent.append(word_tag)
       
    return ' '.join(pos_sent)



In [53]:
def make_word_dict(train, use_morph = False):
    all_words = set()
    for tup in train:
        sent, label = tup
        if use_morph: sent = pos_tokenize(sent)
        word_list = word_tokenize(sent)
        for word in word_list:
            all_words.add(word)
    return all_words

In [54]:
def make_train_feats(train, all_words, use_morph = False):
    train_features = []
    for tup in train:
        sent, label = tup # sent:'I like you' label: 'pos'
        if use_morph: sent = pos_tokenize(sent)
        word_list = word_tokenize(sent) # ['I', like, you]
        tmp = {set_word: set_word in word_list for set_word in all_words}
        train_feature = (tmp, label)
        train_features.append(train_feature)
        
    return train_features

In [55]:
train = load_data('ratings_train.txt')[1:]
#use_morph= True
all_words = make_word_dict(train,use_morph=True)
train_features = make_train_feats(train,all_words,use_morph=True)

In [56]:
classifier = nltk.NaiveBayesClassifier.train(train_features)
classifier.show_most_informative_features(n=10)

Most Informative Features
                쓰레기/Noun = True              neg : pos    =     11.9 : 1.0
                 인생/Noun = True              pos : neg    =     10.0 : 1.0
                 최고/Noun = True              pos : neg    =      9.5 : 1.0
           괜찮다/Adjective = True              pos : neg    =      8.6 : 1.0
                       ; = True              neg : pos    =      8.3 : 1.0
          재미없다/Adjective = True              neg : pos    =      8.1 : 1.0
       ㅡㅡ/KoreanParticle = True              neg : pos    =      8.1 : 1.0
           아깝다/Adjective = True              neg : pos    =      7.6 : 1.0
          지루하다/Adjective = True              neg : pos    =      7.5 : 1.0
           재밌다/Adjective = True              pos : neg    =      7.5 : 1.0


In [57]:
classifier.classify(test_features[0][0])

'pos'

# 정확도 계산하기

##  use_morph = False (count == 500)

In [25]:
test = load_data('ratings_test.txt')[1:]
test_features = make_train_feats(test,all_words,use_morph=False)
nltk.classify.accuracy(classifier,test_features)

0.6432865731462926

##  use_morph = True (count == 500)

In [51]:
test_feature = make_train_feats(test,all_words,use_morph=True)
nltk.classify.accuracy(classifier,test_feature)

0.7097097097097097

##  use_morph = False (count == 1000)

In [38]:
test = load_data('ratings_test.txt')[1:]
test_features = make_train_feats(test,all_words,use_morph=False)
nltk.classify.accuracy(classifier,test_features)

0.6656656656656657

##  use_morph = True (count == 1000)

In [58]:
test_feature = make_train_feats(test,all_words,use_morph=True)
nltk.classify.accuracy(classifier,test_feature)

0.7747747747747747

## 결과 비교

- use_morph = True를 했을 때의 값이 False 했을 때의 값보다 더 크다
- count 값을 올리면 결과도 그에 따라 증가되어 나타난다. 데이터 양이 많을 수록 더 정확하게 동작할 수 있다