In [20]:
from collections import defaultdict
from glob import glob
import os
import re
import time

from lxml import html
import numpy as np
import pandas as pd
import requests

#import utils
#sudo pip install utils 설치
import json


BASEURL     = 'http://movie.naver.com/movie/point/af/list.nhn'
RATINGURL   = BASEURL + '?&page=%s'
MOVIEURL    = BASEURL + '?st=mcode&target=after&sword=%s&page=%s'

DATADIR     = 'data/ratings'
INDEXFILE   = 'index.txt'
TMPFILE     = 'data/ratings_all.txt'
RATINGSFILE = 'data/ratings.txt'
SEED        = 1234
SLEEP       = 600
NDOCS       = 200000


extract_nums = lambda s: re.search('\d+', s).group(0)
sanitize_str = lambda s: s.strip()




def parse_item(item):
    try:
        return {'review_id': item.xpath('./td[@class="ac num"]/text()')[0],     # num
                'rating': item.xpath('./td[@class="point"]/text()')[0],         # point
                'movie_id': extract_nums(item.xpath('./td[@class="title"]/a/@href')[0]),
                'review': sanitize_str(' '.join(item.xpath('./td[@class="title"]/text()'))),   
                'author': item.xpath('./td[@class="num"]/a/text()')[0],
                'date': item.xpath('./td[@class="num"]/text()')[0]
        }
    except (IndexError, AttributeError) as e:
        print(e, item.xpath('.//text()'))
        return None
    except (AssertionError) as e:
        print(e, 'Sleep for %s' % SLEEP)
        time.sleep(SLEEP)
    except Exception as e:
        print(e, '음 여기까진 생각을 못했는데...')


def crawl_rating_page(url):
    resp = requests.get(url)
    root = html.fromstring(resp.text)
    items = root.xpath('//body//table[@class="list_netizen"]//tr')[1:]
    npages = max(map(int, ([0] + root.xpath('//div[@class="paging"]//a/span/text()'))))
    return list(filter(None, [parse_item(item) for item in items])), npages


def write_json(items,filenames):
    with open(filenames,'w') as outfile:      #wb -> w 로 바꿈.
        json.dump(items,outfile)

def write_txt(contents, filename):
    with open(filename,'w') as f:
        f.write(contents)

def read_txt(filename):
    with open(filename,'r') as f:
        x = f.read()
        return x

def read_json(filenames):
    with open(filenames) as f:
        return json.loads(f.read())



In [21]:
def crawl_movie(movie_id):
    items = []
    for page_num in range(10):  # limit to 100 recent ratings per movie
        url = MOVIEURL % (movie_id, page_num + 1)
        page_items, npages = crawl_rating_page(url)
        items.extend(page_items)
        if len(items)==0:
            return []
        if page_num >= npages - 1:
            break
    if items:
        #utils.write_json(items, '%s/%s.json' % (DATADIR, movie_id))
        write_json(items, '%s/%s.json' % (DATADIR, movie_id))
        return items
    else:
        return []



In [22]:

def get_index(filename):
    if os.path.exists(filename):
        #movie_id, total = map(int, utils.read_txt(filename).split('\n')[0].split(','))
        movie_id, total = map(int, read_txt(filename).split('\n')[0].split(','))
    else:
        movie_id, total = 129406, 0
    print(movie_id, total)
    return [movie_id, total]


def put_index(movie_id, total, filename):
    #utils.write_txt('%s,%s' % (movie_id, total), filename)
    write_txt('%s,%s' % (movie_id, total), filename)




In [23]:
def merge_ratings():

    def balance_classes(df, ndocs_per_class):
        df_pos = df[df['label']==1][:int(ndocs_per_class)]
        df_neg = df[df['label']==0][:int(ndocs_per_class)]
        return df_pos.append(df_neg)


    sub_space = lambda s: re.sub('\s+', ' ', s)
    write_row = lambda l, f: f.write('\t'.join(l) + '\n')

    filenames = glob('%s/*' % DATADIR)
    with open(TMPFILE, 'w') as f:
        write_row('id document label'.split(), f)
        for filename in filenames:
            #for review in utils.read_json(filename):
            for review in read_json(filename):
                rating = int(review['rating'])
                if rating > 7:      # positive 8, 9, 10       
                    write_row([review['review_id'], sub_space(review['review']), '1'], f)
                elif rating < 5:    # negative 1, 2, 3, 4
                    write_row([review['review_id'], sub_space(review['review']), '0'], f)
                else:               # neutral (중립)
                    pass
    print('Ratings merged to %s' % TMPFILE)

    df = pd.read_csv(TMPFILE, sep='\t', quoting=3)
    df = df.fillna('')
    np.random.seed(SEED)
    df = df.iloc[np.random.permutation(len(df))]
    df = balance_classes(df, NDOCS/2)
    df.to_csv(RATINGSFILE, sep='\t', index=False)
    print('Ratings written to %s' % RATINGSFILE)


In [55]:
if __name__=='__main__':
    movie_id, total = get_index(INDEXFILE)
    Ntotal=10000
    while total < Ntotal and movie_id > 0:      # 10000대신 원래 작성자는 1000000개로 함.
        items = crawl_movie(movie_id)
        total += len(items)
        put_index(movie_id, total, INDEXFILE)
        print(MOVIEURL % (movie_id, 1), len(items), total)
        movie_id -= 1
    merge_ratings()

104786 90983
Ratings merged to data/ratings_all.txt
Ratings written to data/ratings.txt


In [None]:
#####################

In [63]:
####### test data, train data ######
import numpy as np; np.random.seed(1234)
import pandas as pd


ntrain = 7000      #전체 중 대략 20% 되는 데이터를 test data로.

data = pd.read_csv('data/ratings.txt', sep='\t', quoting=3)
data = pd.DataFrame(np.random.permutation(data))
trn, tst = data[:ntrain], data[ntrain:]

header = 'id document label'.split()
trn.to_csv('data/ratings_train.txt', sep='\t', index=False, header=header)
tst.to_csv('data/ratings_test.txt', sep='\t', index=False, header=header)

In [93]:
##########

## data

In [24]:
import platform 

print(platform.architecture())


('64bit', '')


In [59]:
import konlpy      #sudo pip install konlpy
import jpype
from konlpy.tag import Twitter
#from konlpy.tag import Okt

import pandas as pd
import numpy as np

pos_tagger = Twitter()

def tokenize(doc):
    return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]

def read_raw_data(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        print('loading data')
        data = [line.split('\t') for line in f.read().splitlines()]

        print('pos tagging to token')
        data = [(tokenize(row[1]), int(row[2])) for row in data[1:]]
    return data

  warn('"Twitter" has changed to "Okt" since KoNLPy v0.4.5.')


In [26]:

def build_vocab(tokens):
    print('building vocabulary')
    vocab = dict()
    vocab['#UNKOWN'] = 0
    vocab['#PAD'] = 1
    for t in tokens:
        if t not in vocab:
            vocab[t] = len(vocab)
    return vocab

def get_token_id(token, vocab):
    if token in vocab:
        return vocab[token]
    else:
        0 # unkown

def build_input(data, vocab):

    def get_onehot(index, size):
        onehot = [0] * size
        onehot[index] = 1
        return onehot

    print('building input')
    result = []
    for d in data:
        sequence = [get_token_id(t, vocab) for t in d[0]]
        while len(sequence) > 0:
            seq_seg = sequence[:60]
            sequence = sequence[60:]

            padding = [1] *(60 - len(seq_seg))
            seq_seg = seq_seg + padding

            result.append((seq_seg, get_onehot(d[1], 2)))

    return result 

def save_data(filename, data):
    def make_csv_str(d):
        output = '%d' % d[0]
        for index in d[1:]:
            output = '%s,%d' % (output, index)
        return output

    with open(filename, 'w', encoding='utf-8') as f:
        for d in data:
            data_str = make_csv_str(d[0])
            label_str = make_csv_str(d[1])
            f.write (data_str + '\n')
            f.write (label_str + '\n')

def save_vocab(filename, vocab):
    with open(filename, 'w', encoding='utf-8') as f:
        for v in vocab:
            f.write('%s\t%d\n' % (v, vocab[v]))
            
def load_data(filename):
    result = []
    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for i in range(int(len(lines)/2)):
            data = lines[i*2]
            label = lines[i*2 + 1]

            result.append(([int(s) for s in data.split(',')], [int(s) for s in label.split(',')]))
    return result

def load_vocab(filename):
    result = dict()
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            ls = line.split('\t')
            result[ls[0]] = int(ls[1])
    return result


In [None]:


if __name__ == '__main__':
    data = read_raw_data('data/ratings_train.txt')
    tokens = [t for d in data for t in d[0]]
    vocab = build_vocab(tokens)
    d = build_input(data, vocab)
    
    save_data('data/test_data.txt', d)
    save_vocab('data/test_vocab.txt', vocab)

    d2 = load_data('data/test_data.txt')
    vocab2 = load_vocab('data/test_vocab.txt')

    assert(len(d2) == len(d))
    for i in range(len(d)):
        assert(len(d2[i]) ==  len(d[i]))
        for j in range(len(d[i])):
            assert(d2[i][j] == d[i][j])

    for index in vocab:
        assert(vocab2[index] == vocab[index])

In [36]:
import numpy as np
import tensorflow as tf

class TextCNN(object):
    def __init__(self, sequence_length, num_classes, vocab_size, embedding_size, filter_sizes, num_filters):
        # input,  dropout
        input = tf.placeholder(tf.int32, [None, sequence_length], name='input')
        label = tf.placeholder(tf.float32, [None, num_classes], name='label')
        dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')
        
        sess = tf.Session()
        init = tf.global_variables_initializer()
        sess.run(init)

        with tf.name_scope('embedding'):
            tf.get_variable_scope().reuse_variables()
            W = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name='W')
            # [None, sequence_length, embedding_size]
            embedded_chars = tf.nn.embedding_lookup(W, input)
            # [None, sequence_length, embedding_size, 1]
            embedded_chars = tf.expand_dims(embedded_chars, -1)

        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope('conv-maxpool-%s' % filter_size):
                tf.get_variable_scope().reuse_variables()
                # convolution
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name='W')
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name='b')
                conv = tf.nn.conv2d(
                    embedded_chars,
                    W,
                    strides=[1,1,1,1],
                    padding='VALID',
                    name='conv')
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name='pool')
                pooled_outputs.append(pooled)
        # 
        num_filters_total = num_filters * len(filter_sizes)
        h_pool = tf.concat(pooled_outputs,3)
        h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])

        # dropout
        with tf.name_scope('dropout'):
            tf.get_variable_scope().reuse_variables()
            h_drop = tf.nn.dropout(h_pool_flat, dropout_keep_prob)

        # prediction
        with tf.name_scope('output'):
            tf.get_variable_scope().reuse_variables()
            W1 = tf.Variable(tf.random_uniform([num_filters_total, num_classes], -1.0, 1.0))
            #W = tf.get_variable('W', shape=[num_filters_total, num_classes], initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name='b')

            scores = tf.nn.xw_plus_b(h_drop, W1, b, name='scores')
            predictions = tf.argmax(scores, 1, name='predictions')

        with tf.name_scope('loss'):
            tf.get_variable_scope().reuse_variables()
            losses = tf.nn.softmax_cross_entropy_with_logits(labels = label,logits = scores)
            loss = tf.reduce_mean(losses)

        with tf.name_scope('accuracy'):
            tf.get_variable_scope().reuse_variables()
            correct_predictions = tf.equal(predictions, tf.argmax(label, 1))
            accuracy = tf.reduce_mean(tf.cast(correct_predictions, 'float'), name='accuracy')

        # variables
        self.input = input
        self.label = label
        self.dropout_keep_prob = dropout_keep_prob
        self.predictions = predictions
        self.loss = loss
        self.accuracy = accuracy

        
if __name__ == '__main__':
    TextCNN(59, 2, 100, 128, [3,4,5], 128)

In [68]:
#### train & test
from data import *
#from textcnn import TextCNN
import tensorflow as tf
import random
import numpy as np
import os
import sys

TRAIN_FILENAME = 'data/ratings_train.txt'
TRAIN_DATA_FILENAME = TRAIN_FILENAME + '.data'
TRAIN_VOCAB_FILENAME = TRAIN_FILENAME + '.vocab'

TEST_FILENAME = 'data/ratings_test.txt'
TEST_DATA_FILENAME = TRAIN_FILENAME + '.data'
TEST_VOCAB_FILENAME = TRAIN_FILENAME + '.vocab'

def train():

    if (os.path.exists(TRAIN_DATA_FILENAME) and os.path.exists(TRAIN_VOCAB_FILENAME)):
        print('load prebuilt train data & vocab file') 
        input = load_data(TRAIN_DATA_FILENAME)
        vocab =  load_vocab(TRAIN_VOCAB_FILENAME)
    else:
        print('build train data & vocab from raw text')
        data = read_raw_data(TRAIN_FILENAME)
        tokens = [t for d in data for t in d[0]]
        
        vocab = build_vocab(tokens)
        input = build_input(data, vocab)

        print('save train data & vocab file')
        save_data(TRAIN_DATA_FILENAME, input)
        save_vocab(TRAIN_VOCAB_FILENAME, vocab)
    
    if (os.path.exists(TEST_DATA_FILENAME) and os.path.exists(TEST_VOCAB_FILENAME)):
        print('load prebuilt test data & vocab file ')
        test_input = load_data(TEST_DATA_FILENAME)
        test_vocab = load_vocab(TEST_VOCAB_FILENAME)
    else:
        print('build test data & vocab from raw text')
        data = read_raw_data(TEST_FILENAME)
        tokens = [t for d in data for t in d[0]]
        
        test_vocab = build_vocab(tokens)
        test_input = build_input(data, test_vocab)

        print('save test data & vocab file')
        save_data(TEST_DATA_FILENAME, test_input)
        save_vocab(TEST_VOCAB_FILENAME, test_vocab)

    

    with tf.Session() as sess:
        
        sess.run(tf.global_variables_initializer())
        seq_length = np.shape(input[0][0])[0]
        num_class = np.shape(input[0][1])[0]

        print('initialize cnn filter')
        print('sequence length %d,  number of class %d, vocab size %d' % (seq_length, num_class, len(vocab)))
        
        cnn = TextCNN(seq_length, num_class, len(vocab), 128, [3,4,5], 128)

        global_step = tf.Variable(0, name='global_step', trainable=False)
        #optimizer = tf.train.AdamOptimizer(1e-3)
        optimizer = tf.train.GradientDescentOptimizer(0.001)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        def train_step(x_batch, y_batch):
            feed_dict = {
                cnn.input : x_batch,
                cnn.label : y_batch,
                cnn.dropout_keep_prob : 0.5
            }

            _, step, loss, accuracy = sess.run([train_op, global_step, cnn.loss, cnn.accuracy], feed_dict)

        def evaluate(x_batch, y_batch):
            feed_dict = {
                cnn.input : x_batch,
                cnn.label : y_batch,
                cnn.dropout_keep_prob : 1.0
            }

            step, loss, accuracy = sess.run([global_step, cnn.loss, cnn.accuracy], feed_dict)
            print("step %d, loss %f, acc %f" % (step, loss, accuracy))

        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())
        
        for i in range(100):
            try:
                batch = random.sample(input, 64) 
            
                x_batch, y_batch = zip(*batch)
                train_step(x_batch, y_batch)
                current_step = tf.train.global_step(sess, global_step)
                if current_step % 100 == 0:
                    batch = random.sample(test_input, 64)
                    x_test, y_test = zip(*batch)
                    evaluate(x_test, y_test)
                if current_step % 1000 == 0:
                    save_path = saver.save(sess, './textcnn.ckpt')
                    print('model saved : %s' % save_path)
            except:
                print ("Unexpected error:", sys.exc_info()[0])
                raise

if __name__ == '__main__':
    train()

load prebuilt train data & vocab file
load prebuilt test data & vocab file 
initialize cnn filter
sequence length 60,  number of class 2, vocab size 49897


ValueError: At least two variables have the same name: conv-maxpool-5_8/b

In [38]:
from data import *
#from textcnn import TextCNN
import tensorflow as tf
import numpy as np

TRAIN_FILENAME = 'data/ratings_train.txt'
TRAIN_DATA_FILENAME = TRAIN_FILENAME + '.data'
TRAIN_VOCAB_FILENAME = TRAIN_FILENAME + '.vocab'

SEQUENCE_LENGTH = 60
NUM_CLASS = 2

def test():
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        vocab = load_vocab(TRAIN_VOCAB_FILENAME)
        cnn = TextCNN(SEQUENCE_LENGTH, NUM_CLASS, len(vocab), 128, [3,4,5], 128)
        #saver = tf.train.Saver()
        
        new_saver = tf.train.import_meta_graph('textcnn.ckpt.meta')
        new_saver.restore(sess, tf.train.latest_checkpoint('./'))

        #saver.restore(sess, eval_dir)
        #saver.restore(sess,tf.train.latest_checkpoint('./'))


        print('model restored')

        input_text = input('사용자 평가를 문장으로 입력하세요: ')
        tokens = tokenize(input_text)
        print('입력 문장을 다음의 토큰으로 분해:')
        print(tokens)

        sequence = [get_token_id(t, vocab) for t in tokens]
        x = []
        while len(sequence) > 0:
            seq_seg = sequence[:SEQUENCE_LENGTH]
            sequence = sequence[SEQUENCE_LENGTH:]

            padding = [1] *(SEQUENCE_LENGTH - len(seq_seg))
            seq_seg = seq_seg + padding

            x.append(seq_seg)
            
        sess.run(tf.global_variables_initializer())
        feed_dict = {
            cnn.input : x,
            cnn.dropout_keep_prob : 1.0
        }

        predict = sess.run([cnn.predictions], feed_dict)
        result = np.mean(predict)
        if (result > 0.75):
            print('추천')
        elif (result < 0.25):
            print('비추천')
        else:
            print('평가 불가능')


In [66]:
if __name__ == '__main__':
    test()

INFO:tensorflow:Restoring parameters from ./textcnn.ckpt
model restored
사용자 평가를 문장으로 입력하세요: 안녕
입력 문장을 다음의 토큰으로 분해:
['안녕/Noun']
추천
