In [1]:
import numpy as np
import pickle

from konlpy.tag import Okt
from scipy.sparse import lil_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

In [2]:

"""
Req 1-1-1. 데이터 읽기
read_data(): 데이터를 읽어서 저장하는 함수
"""

def read_data(filename):
    """Read a UTF-8, tab-separated text file and return its data rows.

    The first line of the file is treated as a header and discarded.
    Completely empty lines are skipped so that callers indexing the
    returned rows by column do not hit an IndexError on them.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one entry per data line; each entry is the list of
        tab-separated string fields of that line.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        # Drop the header line; the default keeps an empty file from raising.
        next(f, None)
        # Iterate over the file instead of using read().splitlines():
        # str.splitlines() also breaks on characters such as U+2028, U+2029,
        # \x0b, \x0c and \x85, which may appear inside a text field and would
        # split a single record into several malformed rows. File iteration
        # only splits on real newlines.
        datas = [
            line.rstrip('\n').split('\t')
            for line in f
            if line.rstrip('\n')
        ]
    return datas

In [3]:

"""
Req 1-1-2. 토큰화 함수
tokenize(): 텍스트 데이터를 받아 KoNLPy의 okt 형태소 분석기로 토크나이징
"""
# Single shared analyzer instance, created once and reused by tokenize().
okt = Okt()


def tokenize(doc):
    """Tokenize a text with the Okt analyzer and return joined tag strings.

    Calls okt.pos() with normalization and stemming enabled and joins the
    parts of every returned item with '/'.
    (KoNLPy documents these items as (morpheme, POS tag) pairs -- confirm
    against the installed version.)

    Args:
        doc: The text to analyze.

    Returns:
        A list of strings, one per item returned by okt.pos().
    """
    tagged = okt.pos(doc, norm=True, stem=True)
    joined = []
    for parts in tagged:
        joined.append('/'.join(parts))
    return joined


In [4]:

"""
데이터 전 처리
"""

# Load the train and test data sets
train_data = read_data('ratings_train.txt')
test_data = read_data('ratings_test.txt')


# Req 1-1-2. Tokenize the sentence data
# train_docs, test_docs: lists of (tokens, label) tuples, where tokens is the
# tokenized text from field 1 of a row and label is field 2 of the same row


train_docs = []
for row in train_data:
    train_docs.append((tokenize(row[1]), row[2]))

test_docs = []
for row in test_data:
    test_docs.append((tokenize(row[1]), row[2]))



In [5]:


# Req 1-1-3. Initialize word_indices (maps a word to its column index)
word_indices = {}

# Req 1-1-3. Fill word_indices
# Every distinct word of the training documents receives the next free index,
# in order of first appearance.
# NOTE: the word is the part before the FIRST '/' of each string produced by
# tokenize(), so a token that itself contains '/' is cut at that character.
for doc in train_docs:
    for tagged in doc[0]:
        word = tagged.split('/')[0]
        # setdefault only inserts when the word is not registered yet.
        word_indices.setdefault(word, len(word_indices))

In [7]:
# Empty sparse feature matrices: one row per document and one column per
# entry of word_indices.
train_shape = (len(train_docs), len(word_indices))
test_shape = (len(test_docs), len(word_indices))
X = lil_matrix(train_shape)
X_test = lil_matrix(test_shape)

In [8]:
# Zero-filled label vectors with one entry per document.
Y = np.zeros(shape=len(train_docs))
Y_test = np.zeros(shape=len(test_docs))

In [9]:
def one_hot_encoding(word, word2index):
    """Return the one-hot vector of `word` for the given vocabulary.

    Args:
        word: The word to encode.
        word2index: Mapping from word to its position in the vector.

    Returns:
        A list of len(word2index) integers that is all zeros except for a 1
        at the position of `word`. If `word` is not in the mapping the list
        is all zeros.
    """
    vector = [0 for _ in range(len(word2index))]
    position = word2index.get(word)
    # Compare against None explicitly: position 0 is a valid index.
    if position is None:
        return vector
    vector[position] = 1
    return vector

In [None]:
# Fill the training feature matrix: X[i, j] is set to 1 when the word with
# index j in word_indices occurs in training document i.
# Only the column of the current word is written. Assigning a whole one-hot
# vector to the row (X[i] = ...) for every token would overwrite the columns
# set by the previous tokens, leaving only the last token of each document,
# and would build a vocabulary-sized list per token.
for doc_idx, doc in enumerate(train_docs):
    for tagged in doc[0]:
        text = tagged.split('/')[0]
        col = word_indices.get(text)
        # Words missing from the vocabulary contribute no feature.
        if col is not None:
            X[doc_idx, col] = 1

In [None]:
# Fill the test feature matrix: X_test[i, j] is set to 1 when the word with
# index j in word_indices occurs in test document i.
# Only the column of the current word is written. Assigning a whole one-hot
# vector to the row (X_test[i] = ...) for every token would overwrite the
# columns set by the previous tokens, leaving only the last token of each
# document, and would build a vocabulary-sized list per token.
for doc_idx, doc in enumerate(test_docs):
    for tagged in doc[0]:
        text = tagged.split('/')[0]
        col = word_indices.get(text)
        # Words that are not in word_indices are ignored.
        if col is not None:
            X_test[doc_idx, col] = 1

In [None]:
# Copy the label of every document (second element of each docs entry) into
# the matching position of the label vectors.
for row, doc in enumerate(train_docs):
    Y[row] = doc[1]
for row, doc in enumerate(test_docs):
    Y_test[row] = doc[1]

In [None]:
# Print the training feature matrix X for inspection.
print(X)

In [None]:
# Print the test feature matrix X_test for inspection.
print(X_test)

In [None]:
# Req 1-2-1. Naive Bayes model (only constructed here; fit() is called later)
clf = MultinomialNB()

# Req 1-2-2. Logistic regression model (only constructed here; fit() is called later)
clf2 = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    random_state=0,
)

In [None]:
# Fit the Naive Bayes classifier on the training features X and labels Y.
clf.fit(X, Y)

In [None]:
# Fit the logistic regression classifier on the training features X and labels Y.
clf2.fit(X, Y)

In [None]:
# Req 1-3-1. Print the predicted class for one test sentence (test row 3):
# the raw sentence text followed by the prediction of each classifier.
print("Naive bayesian classifier example result: {}, {}".format(test_data[3][1], clf.predict(X_test[3])))
print("Logistic regression example result: {}, {}".format(test_data[3][1], clf2.predict(X_test[3])))

# Req 1-3-2. Print the accuracy of each classifier on the test set.
print("Naive bayesian classifier accuracy: {}".format(clf.score(X_test, Y_test)))
print("Logistic regression accuracy: {}".format(clf2.score(X_test, Y_test)))
