In [1]:
import pandas as pd
import numpy as np
import sys


In [2]:
# Load the training reviews from JSON into a DataFrame and preview the last rows.
# NOTE(review): the saved output of this cell is "ValueError: Expected object or
# value", so the load did not succeed when the notebook was last run — confirm
# the path exists and holds valid JSON (TODO: check whether the file is
# line-delimited and needs lines=True).
df = pd.read_json('/elice/data_r1/train_data.json')
df.tail()


ValueError: Expected object or value

In [None]:
# Number of rows in the training frame.
len(df)

In [None]:
# Summary statistics for the DataFrame's columns (pandas' default describe()).
df.describe()

In [None]:
# Bucket each numeric rating into a sentiment label stored in the "char" column:
#   rating <= 3      -> "NEG"
#   3 < rating <= 7  -> "NEU"
#   anything else    -> "POS"
# np.select evaluates the conditions in order and keeps the first match, so the
# whole column is labelled in one vectorised pass instead of a Python loop.
y = df["rating"].values

a = np.select([y <= 3, y <= 7], ["NEG", "NEU"], default="POS").tolist()

df["char"] = a
df.tail()
        

In [None]:
# Count the non-null entries of every column within each "char" label group.
df.groupby(df.char).count()

In [None]:
import math, sys
from konlpy.tag import Twitter

class BayesianFilter:
    """Naive Bayes text classifier with add-one (Laplace) smoothing.

    Texts are tokenised by ``split`` and counted per category by ``fit``;
    ``predict`` returns the category with the highest log-probability score.

    Counts must be updated through ``inc_word`` / ``inc_category`` (which is
    what ``fit`` does) so that the cached per-category totals stay in sync.
    """

    def __init__(self):
        self.words = set()       # every distinct word seen during training
        self.word_dict = {}      # category -> {word: occurrence count}
        self.category_dict = {}  # category -> number of texts trained
        self._word_totals = {}   # category -> total word occurrences (kept by inc_word)
        self._tagger = None      # morphological analyser, created on first split()

    # Morphological analysis --- (※1)
    def split(self, text):
        """Tokenise ``text`` and return the list of words to count.

        Uses the ``Twitter`` tagger with normalisation and stemming, so words
        are reduced to their base form. Tokens tagged "Josa", "Eomi" or
        "Punctuation" are dropped.
        """
        # Build the tagger once and reuse it: constructing a new one for every
        # text is needless repeated work during training and prediction.
        tagger = getattr(self, "_tagger", None)
        if tagger is None:
            tagger = self._tagger = Twitter()
        # Use the base form of each word.
        malist = tagger.pos(text, norm=True, stem=True)
        excluded_tags = ("Josa", "Eomi", "Punctuation")
        return [token[0] for token in malist if token[1] not in excluded_tags]

    # Count word and category occurrences --- (※2)
    def inc_word(self, word, category):
        """Record one occurrence of ``word`` inside ``category``."""
        counts = self.word_dict.setdefault(category, {})
        counts[word] = counts.get(word, 0) + 1
        # Running total so word_prob does not have to re-sum the vocabulary.
        self._word_totals[category] = self._word_totals.get(category, 0) + 1
        self.words.add(word)

    def inc_category(self, category):
        """Record that one more text of ``category`` has been trained."""
        self.category_dict[category] = self.category_dict.get(category, 0) + 1

    # Learn from a text --- (※3)
    def fit(self, text, category):
        """Train on a single ``text`` labelled with ``category``."""
        for word in self.split(text):
            self.inc_word(word, category)
        self.inc_category(category)

    # Score a word list --- (※4)
    def score(self, words, category):
        """Return log P(category) plus the sum of log P(word | category)."""
        score = math.log(self.category_prob(category))
        for word in words:
            score += math.log(self.word_prob(word, category))
        return score

    # Predict --- (※5)
    def predict(self, text):
        """Classify ``text``.

        Returns ``(best_category, score_list)`` where ``score_list`` holds a
        ``(category, score)`` pair for every trained category.
        ``best_category`` is ``None`` when nothing has been trained yet.
        """
        best_category = None
        # -inf is below every finite log score, so the first category always wins
        # the initial comparison.
        max_score = float("-inf")
        words = self.split(text)
        score_list = []
        for category in self.category_dict.keys():
            score = self.score(words, category)
            score_list.append((category, score))
            if score > max_score:
                max_score = score
                best_category = category
        return best_category, score_list

    # Occurrence count of a word inside a category
    def get_word_count(self, word, category):
        """Return how often ``word`` was seen in ``category`` (0 if never).

        A category that was trained only on texts without tokens has an entry
        in ``category_dict`` but none in ``word_dict``; it is treated as having
        no words instead of raising ``KeyError``.
        """
        return self.word_dict.get(category, {}).get(word, 0)

    # Category probability
    def category_prob(self, category):
        """Return the share of trained texts that belong to ``category``."""
        sum_categories = sum(self.category_dict.values())
        category_v = self.category_dict[category]
        return category_v / sum_categories

    # Smoothed share of a word inside a category --- (※6)
    def word_prob(self, word, category):
        """Return the add-one smoothed probability of ``word`` in ``category``."""
        n = self.get_word_count(word, category) + 1  # ---(※6a) add-one smoothing
        d = self._word_totals.get(category, 0) + len(self.words)
        return n / d

## Training

In [None]:
bf = BayesianFilter()

# Train on rows 0..699998 — the same rows the original index loop visited.
# Iterating the two columns directly avoids building two one-row DataFrame
# slices per iteration, and .iloc[:N] yields fewer rows instead of raising
# IndexError when the frame is shorter than N.
N_TRAIN_ROWS = 699999
train_rows = df.iloc[:N_TRAIN_ROWS]
for review, label in zip(train_rows["review"].values, train_rows["char"].values):
    bf.fit(review, label)
    


## Prediction

In [None]:
# Classify one review (positional row 699998) and show the predicted label,
# the per-category scores and the review text itself.
sample_review = df["review"].iloc[699998]
pre, scorelist = bf.predict(sample_review)
print("결과 =", pre)
print(scorelist)
print(sample_review)

## Test

In [None]:
# Predict a label for every line of the test input and collect them in `p`.
# The `with` block closes the file even if bf.predict raises part-way through.
# NOTE(review): no encoding is passed, so the platform default is used —
# confirm the file is readable that way.
p = []
with open("/elice/data_r1/test.input", 'r') as test_in:
    for line in test_in:
        print(line)
        pre, scorelist = bf.predict(line)
        p.append(pre)
        print("결과 =", pre)

In [None]:
import re

# Read the expected labels into `o`: only the leading run of upper-case A-Z
# characters of each line is kept.
# The `with` block closes the file even if a line turns out to be malformed.
o = []
with open("/elice/data_r1/test.output", 'r') as test_out:
    for line_no, line in enumerate(test_out, start=1):
        m = re.match('[A-Z]+', line)
        if m is None:
            # re.match returns None when the line has no leading A-Z run;
            # report which line is at fault instead of failing on m[0].
            raise ValueError(
                "test.output line %d has no label: %r" % (line_no, line))
        o.append(m[0])
        print(m[0])

In [None]:
# Distribution of the predicted labels: POS, NEU, NEG counts, one per line.
for label in ('POS', 'NEU', 'NEG'):
    print(p.count(label))

In [None]:
#print(o)
# Distribution of the expected labels: POS, NEU, NEG counts, one per line.
for label in ('POS', 'NEU', 'NEG'):
    print(o.count(label))

In [None]:
# Number of predictions collected in `p`.
len(p)

In [None]:
#del p[-1]
#len(p)

In [None]:
# Accuracy on the test set: percentage of expected labels in `o` that equal the
# prediction at the same position in `p`.
# zip() stops at the shorter list, so a missing prediction counts as a miss
# instead of raising IndexError; surplus predictions are ignored as before.
t = ["True" if expected == predicted else "false"
     for expected, predicted in zip(o, p)]

#print(t)
# Guard the division so an empty label list reports NaN rather than crashing.
print(t.count('True') / len(o) * 100 if o else float("nan"))

## Test submission

In [None]:
# Write the test-set predictions to submission.txt, one label per line.
with open('submission.txt', 'w') as submission:
    submission.writelines("%s\n" % item for item in p)


In [None]:
import elice_challenge as ec

In [None]:
# Log in through the elice_challenge helper.
# NOTE(review): what login() prompts for or stores is not visible here — confirm.
ec.login()

In [None]:
# Test the submission file.
ec.test()

In [None]:
# Predict a label for every line of the grading input and collect them in `final`.
# The `with` block closes the file even if bf.predict raises part-way through.
# NOTE(review): no encoding is passed, so the platform default is used —
# confirm the file is readable that way.
final = []
with open("/elice/data_r1/grading.input", 'r') as grading_in:
    for line in grading_in:
        print(line)
        pre, scorelist = bf.predict(line)
        final.append(pre)
        print("결과 =", pre)

## Final submission

In [None]:
# Write the grading-set predictions to submission.txt, one label per line.
with open('submission.txt', 'w') as submission:
    submission.writelines("%s\n" % item for item in final)



In [None]:
# Upload through the elice_challenge helper.
# NOTE(review): presumably this sends submission.txt — confirm.
ec.upload()

In [None]:
# Submit and grade.
ec.submit()