In [1]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding, LSTM
from keras.callbacks import EarlyStopping, TensorBoard
from keras.layers.core import Dropout
from keras.optimizers import Adam
from keras.utils.np_utils import to_categorical
from keras.utils import plot_model
import keras.backend as K
import numpy as np
import pandas as pd
from functools import partial
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
def normalize_y_pred(y_pred):
    return K.one_hot(K.argmax(y_pred), y_pred.shape[-1])

def class_true_positive(class_label, y_true, y_pred):
    y_pred = normalize_y_pred(y_pred)
    return K.cast(K.equal(y_true[:, class_label] + y_pred[:, class_label], 2), K.floatx())

def class_accuracy(class_label, y_true, y_pred):
    y_pred = normalize_y_pred(y_pred)
    return K.cast(K.equal(y_true[:, class_label], y_pred[:, class_label]),
                  K.floatx())

def class_precision(class_label, y_true, y_pred):
    y_pred = normalize_y_pred(y_pred)
    return K.sum(class_true_positive(class_label, y_true, y_pred)) / (K.sum(y_pred[:, class_label]) + K.epsilon())


def class_recall(class_label, y_true, y_pred):
    return K.sum(class_true_positive(class_label, y_true, y_pred)) / (K.sum(y_true[:, class_label]) + K.epsilon())


def class_f_measure(class_label, y_true, y_pred):
    precision = class_precision(class_label, y_true, y_pred)
    recall = class_recall(class_label, y_true, y_pred)
    return (2 * precision * recall) / (precision + recall + K.epsilon())


def true_positive(y_true, y_pred):
    y_pred = normalize_y_pred(y_pred)
    return K.cast(K.equal(y_true + y_pred, 2),
                  K.floatx())


def micro_precision(y_true, y_pred):
    y_pred = normalize_y_pred(y_pred)
    return K.sum(true_positive(y_true, y_pred)) / (K.sum(y_pred) + K.epsilon())


def micro_recall(y_true, y_pred):
    return K.sum(true_positive(y_true, y_pred)) / (K.sum(y_true) + K.epsilon())


def micro_f_measure(y_true, y_pred):
    precision = micro_precision(y_true, y_pred)
    recall = micro_recall(y_true, y_pred)
    return (2 * precision * recall) / (precision + recall + K.epsilon())


def average_accuracy(y_true, y_pred):
    class_count = y_pred.shape[-1]
    class_acc_list = [class_accuracy(i, y_true, y_pred) for i in range(class_count)]
    class_acc_matrix = K.concatenate(class_acc_list, axis=0)
    return K.mean(class_acc_matrix, axis=0)


def macro_precision(y_true, y_pred):
    class_count = y_pred.shape[-1]
    return K.sum([class_precision(i, y_true, y_pred) for i in range(class_count)]) / K.cast(class_count, K.floatx())


def macro_recall(y_true, y_pred):
    class_count = y_pred.shape[-1]
    return K.sum([class_recall(i, y_true, y_pred) for i in range(class_count)]) / K.cast(class_count, K.floatx())


def macro_f_measure(y_true, y_pred):
    precision = macro_precision(y_true, y_pred)
    recall = macro_recall(y_true, y_pred)
    return (2 * precision * recall) / (precision + recall + K.epsilon())

def weight_variable(shape):
    return K.truncated_normal(shape, stddev = 0.01)

In [3]:
#データの読み込み
all_data = pd.read_csv(filepath_or_buffer="Datas/all_data/A_extract_allData.csv", encoding="utf_8", sep=",")
print(len(all_data))
all_data.info()

57926
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57926 entries, 0 to 57925
Data columns (total 12 columns):
screen_name    57926 non-null object
user_id        57926 non-null float64
tweet_id       57926 non-null float64
tweet          57926 non-null object
tweet2         57823 non-null object
postdate       57926 non-null object
cos_day        57926 non-null float64
sin_day        57926 non-null float64
tag            57926 non-null object
image_url      57926 non-null object
image          57926 non-null int64
retweet        57926 non-null float64
dtypes: float64(5), int64(1), object(6)
memory usage: 5.3+ MB


In [4]:
#NaNデータのカウント
print(all_data.isnull().sum())
#NaNのデータを削除
use_data = all_data.dropna(how='any')
#掲載したツイート数のカウント
published_post = use_data['retweet'] == 1
published_post.sum()

screen_name      0
user_id          0
tweet_id         0
tweet            0
tweet2         103
postdate         0
cos_day          0
sin_day          0
tag              0
image_url        0
image            0
retweet          0
dtype: int64


10068

In [5]:
maxlen = 50
train = 0.7
validation = 0.1
max_words = 35000

#データをランダムにシャッフル
use_data_s = use_data.sample(frac=1, random_state=1)

# word indexを作成
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(use_data_s['tweet2'])
sequences = tokenizer.texts_to_sequences(use_data_s['tweet2'])

word_index = tokenizer.word_index
print("Found {} unique tokens.".format(len(word_index)))

data = pad_sequences(sequences, maxlen=maxlen)

# バイナリの行列に変換
categorical_labels = to_categorical(use_data_s['retweet'])
labels = np.asarray(categorical_labels)

print("Shape of data tensor:{}".format(data.shape))
print("Shape of label tensor:{}".format(labels.shape))

indices = [int(len(labels) * n) for n in [train, train + validation]]
x_train, x_validation, x_test = np.split(data, indices)
y_train, y_validation, y_test = np.split(labels, indices)

Found 34080 unique tokens.
Shape of data tensor:(57823, 50)
Shape of label tensor:(57823, 2)


In [6]:
#学習データ内の掲載データ数のカウント
count = 0
for i in y_train:
    if i[1] == 1.0:
        count+=1
print(count)

7040


In [7]:
model = Sequential()
model.add(Embedding(max_words, 50, input_length=maxlen))
model.add(Dropout(0.5))
model.add(LSTM(32, kernel_initializer=weight_variable))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))
opt = Adam(lr=1e-3, beta_1 = 0.9, beta_2 = 0.999)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['acc', macro_precision, macro_recall, macro_f_measure])
model.summary()
#plot_model(model, show_shapes=True, show_layer_names=True, to_file='N_method1_LSTM1024_model.png')

early_stopping = EarlyStopping(patience=0, verbose=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 50)            2000000   
_________________________________________________________________
dropout_1 (Dropout)          (None, 50, 50)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                10624     
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 66        
Total params: 2,010,690
Trainable params: 2,010,690
Non-trainable params: 0
_________________________________________________________________


In [8]:
history = model.fit(x_train, y_train,
                    epochs=100, 
                    batch_size = 256,
                    validation_data=(x_validation, y_validation),
                    class_weight={0:1., 1:4.73},
                    callbacks=[early_stopping])

Train on 40476 samples, validate on 5782 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 00004: early stopping


In [9]:
loss_and_metrics = model.evaluate(x_test, y_test)
print(loss_and_metrics)

[0.47227756208379806, 0.7783830523181673, 0.6724640017817456, 0.7451197832835684, 0.7055356652304768]
