In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle
import matplotlib.pyplot as plt

import pickle
from gensim.models import word2vec
import random

from sklearn.preprocessing import StandardScaler
%matplotlib inline

### Step 0. Loading dataset

#### Step 0.1 load article cutted and article df and define y

In [2]:
with open("data/article_cutted", "rb") as file:
    docs = pickle.load(file)

In [3]:
df = pd.read_csv('data/article_preprocessed.csv')
diff_threshold = 20
df = df[abs(df['push']-df['boo']) > diff_threshold].copy()
df['type'] = np.clip(df['push']-df['boo'], 0, 1)

#### Step 0.2 create word id mapping and word vector

In [4]:
w2v = word2vec.Word2Vec.load('word2vec_model/CBOW')

In [5]:
word2id = {k:i for i, k in enumerate(w2v.wv.vocab.keys())}
id2word = {i:k for k, i in word2id.items()}

In [6]:
words_len = len(word2id)

In [30]:
embedding = np.zeros((words_len+1, 256))
for k, v in word2id.items():
    embedding[v] = w2v.wv[k]

#### Step 0.3 sentence to seq transform

In [12]:
input_length = 80
docs_id = []
for doc in docs:
    text = doc[:input_length]
    ids = [words_len]*input_length
    ids[:len(text)] = [word2id[w]if w in word2id else words_len for w in text]
    docs_id.append(ids)

In [13]:
print(docs[1])

['dear', 'all', '逢甲', '碟仙', '發生', '民國', '七十五年', '三月中', '事情', '一堆', '大學生', '玩', '碟仙', '後發', 'bbs', '成功', '預測', '地震', '小弟', '預言', '都還沒', '出生', '後面', '說', '預言', '一百', '一十六年', '兩岸', '統一', '統一', '對岸', '對岸', '統一', '應該', '不用', '猜', '真的', '存在', '預言', '這種', '事情', '倒底', '被統', '知道', '資料庫', '發文', '日期', '輕鬆', '改變', '拍照', '狀況', '下', '碟仙', '真的假', '有沒有', '科學', '經驗', '法則', '破解', '謠言', '真實', '八卦']


In [14]:
print(len(docs[1]))
print(docs_id[1])

62
[56490, 22659, 59674, 93407, 1033, 39421, 100034, 100034, 96212, 78226, 21218, 57996, 93407, 44458, 4826, 54699, 34341, 70278, 83778, 74462, 54631, 49746, 4896, 20272, 74462, 34350, 100034, 22717, 81771, 81771, 55226, 55226, 81771, 46333, 70659, 46874, 15388, 86371, 74462, 63651, 96212, 48061, 5672, 52477, 38637, 73291, 14000, 66874, 13909, 10027, 89524, 75154, 93407, 86017, 83640, 15932, 90201, 95345, 82212, 90413, 71932, 65779, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034, 100034]


### Step 1. Data preprocessing

#### Step 1.1 Creating Training and Testing sets and creating generator

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
train, test = train_test_split(df, test_size=0.2, shuffle=True, stratify=df['type'])

In [17]:
def train_data_generator(df, bz, docs_id):
    # bz: batch size 
    
    dfs = [sub_df for key,sub_df in df.groupby('type')]
    df_n = len(dfs)
    
    docs_id = np.array(docs_id)
    while True:
        selected = pd.concat([sub_df.sample(int(bz/2)) for sub_df in dfs], axis=0)
        selected = selected.sample(frac=1)
        x = docs_id[selected['idx']]
        y = np.array(selected['type'].tolist()).reshape((bz,1))
                    
        yield x, y
        
def test_data_generator(df, docs_id):
    docs_id = np.array(docs_id)
    x = docs_id[df['idx']]
    y = np.array(df['type'].tolist()).reshape((len(x),1))

    return x, y

In [18]:
X_test, Y_test = test_data_generator(test, docs_id) 

### Let's create the RNN

In [19]:
epochs = 100
batch_size = 32
update_per_epochs = 100

In [20]:
def LSTM_cell(hidden_layer_size, batch_size, number_of_layers, dropout=True, dropout_rate=0.8):
    
    layer = tf.contrib.rnn.BasicLSTMCell(hidden_layer_size)
    
    if dropout:
        layer = tf.contrib.rnn.DropoutWrapper(layer, output_keep_prob=dropout_rate)
        
    cell = tf.contrib.rnn.MultiRNNCell([layer]*number_of_layers)
    
    init_state = cell.zero_state(batch_size, tf.float32)
    
    return cell, init_state

In [21]:
def output_layer(lstm_output, in_size, out_size):
    
    x = lstm_output[:, -1, :]
    print(x)
    weights = tf.Variable(tf.truncated_normal([in_size, out_size], stddev=0.05), name='output_layer_weights')
    bias = tf.Variable(tf.zeros([out_size]), name='output_layer_bias')
    
    output = tf.matmul(x, weights) + bias
    output = tf.nn.sigmoid(output)
    return output

In [22]:
def opt_loss(logits, targets, learning_rate, grad_clip_margin):
    
    loss = tf.losses.sigmoid_cross_entropy(targets, logits)
    
    #Cliping the gradient loss
    gradients = tf.gradients(loss, tf.trainable_variables())
    clipper_, _ = tf.clip_by_global_norm(gradients, grad_clip_margin)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    train_optimizer = optimizer.apply_gradients(zip(gradients, tf.trainable_variables()))
    return loss, train_optimizer

In [23]:
class TextClassificationRNN(object):
    
    def __init__(self, learning_rate=0.001, hidden_layer_size=64, number_of_layers=1, dropout=True, 
                 dropout_rate=0.8, number_of_classes=1, gradient_clip_margin=4, input_length=input_length, wv=embedding):
    
        self.inputs = tf.placeholder(tf.int32, [None, input_length], name='input_data')
        self.targets = tf.placeholder(tf.float32, [None, 1], name='targets')
        self.bz = tf.placeholder(tf.int32, [], name='batch_size')
        
        ## embedding lookup table
        em_W = tf.Variable(wv.astype(np.float32), trainable=True)
        x = tf.nn.embedding_lookup(em_W, self.inputs)

        cell, init_state = LSTM_cell(hidden_layer_size, self.bz, number_of_layers, dropout, dropout_rate)

        outputs, states = tf.nn.dynamic_rnn(cell, x, initial_state=init_state)

        self.logits = output_layer(outputs, hidden_layer_size, number_of_classes)

        self.loss, self.opt = opt_loss(self.logits, self.targets, learning_rate, gradient_clip_margin)

In [24]:
tf.reset_default_graph()
model = TextClassificationRNN()

Tensor("strided_slice:0", shape=(?, 64), dtype=float32)
INFO:tensorflow:logits.dtype=<dtype: 'float32'>.
INFO:tensorflow:multi_class_labels.dtype=<dtype: 'float32'>.
INFO:tensorflow:losses.dtype=<dtype: 'float32'>.


In [25]:
tf.global_variables()

[<tf.Variable 'Variable:0' shape=(100035, 256) dtype=float32_ref>,
 <tf.Variable 'rnn/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0' shape=(320, 256) dtype=float32_ref>,
 <tf.Variable 'rnn/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0' shape=(256,) dtype=float32_ref>,
 <tf.Variable 'output_layer_weights:0' shape=(64, 1) dtype=float32_ref>,
 <tf.Variable 'output_layer_bias:0' shape=(1,) dtype=float32_ref>,
 <tf.Variable 'beta1_power:0' shape=() dtype=float32_ref>,
 <tf.Variable 'beta2_power:0' shape=() dtype=float32_ref>,
 <tf.Variable 'Variable/Adam:0' shape=(100035, 256) dtype=float32_ref>,
 <tf.Variable 'Variable/Adam_1:0' shape=(100035, 256) dtype=float32_ref>,
 <tf.Variable 'rnn/multi_rnn_cell/cell_0/basic_lstm_cell/kernel/Adam:0' shape=(320, 256) dtype=float32_ref>,
 <tf.Variable 'rnn/multi_rnn_cell/cell_0/basic_lstm_cell/kernel/Adam_1:0' shape=(320, 256) dtype=float32_ref>,
 <tf.Variable 'rnn/multi_rnn_cell/cell_0/basic_lstm_cell/bias/Adam:0' shape=(256,) dtype=float32_ref>,
 <tf

### Time to train the network

In [26]:
session =  tf.Session()

In [27]:
session.run(tf.global_variables_initializer())

In [28]:
from sklearn.metrics import roc_auc_score

train_generate = train_data_generator(train, batch_size, docs_id)

train_loss = []
train_auc = []
test_loss = []
test_auc = []
for i in range(epochs):
    traind_scores = []
    epoch_loss = []
    for j in range(update_per_epochs):
        X_batch, y_batch = next(train_generate) 
        
        o, c, _ = session.run([model.logits, model.loss, model.opt], feed_dict={
            model.inputs:X_batch, 
            model.targets:y_batch,
            model.bz:np.array(batch_size)
        })
        
        epoch_loss.append(c)
        traind_scores.append(roc_auc_score(y_batch, o))
    
    to, tc = session.run([model.logits, model.loss], feed_dict={
        model.inputs:X_test, 
        model.targets:Y_test,
        model.bz:np.array(len(X_test))
    })
    
    train_loss.append(np.mean(epoch_loss))
    train_auc.append(np.mean(traind_scores))
    test_loss.append(tc)
    test_auc.append(roc_auc_score(Y_test, to))
    
    if (i % 5) == 0:
        print('Epoch {}/{}'.format(i, epochs), ' Train loss: {}'.format(np.mean(epoch_loss)), 
              ' Train auc: {}'.format(np.mean(traind_scores)), 
             ' Test loss: {}'.format(tc), ' Test auc: {}'.format(roc_auc_score(Y_test, to)))

Epoch 0/100  Train loss: 0.6988576650619507  Train auc: 0.50599609375  Test loss: 0.6556000709533691  Test auc: 0.5337848836617798
Epoch 5/100  Train loss: 0.6469938158988953  Train auc: 0.741015625  Test loss: 0.4998680651187897  Test auc: 0.7179777395692383


KeyboardInterrupt: 