In [8]:
from gensim.models import word2vec
from functools import partial
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
import keras.backend as K
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding, LSTM
from keras.layers.core import Dropout
from keras.callbacks import EarlyStopping, TensorBoard
import matplotlib.pyplot as plt

In [9]:
# Data loading helper
def read_data(file):
    """Read a UTF-8 text file and return its lines.

    Args:
        file: Path of the file to read.

    Returns:
        A list with one string per line; line terminators are kept,
        exactly as ``readlines`` yields them.
    """
    # The context manager closes the handle even when reading raises,
    # which the manual open()/close() pair did not guarantee.
    with open(file, "r", encoding="utf-8") as f:
        return f.readlines()

# Vectorization helpers
# Convert texts into padded integer sequences
def sequences_tokenizer(texts, max_words, maxlen):
    """Fit a Keras ``Tokenizer`` on *texts* and return padded index sequences.

    Args:
        texts: Iterable of text strings to fit on and to convert.
        max_words: Passed to ``Tokenizer`` as ``num_words``.
        maxlen: Passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` applied to the tokenized texts.

    Side effects:
        Prints the number of unique tokens found while fitting.
    """
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(texts)
    # The Tokenizer method is `texts_to_sequences` (plural "texts"); the
    # previous `text_to_sequences` spelling raised AttributeError.
    sequences = tokenizer.texts_to_sequences(texts)

    print("Found {} unique tokens.".format(len(tokenizer.word_index)))
    return pad_sequences(sequences, maxlen=maxlen)

# Vectorize each document with a trained Word2Vec model
# Loaded models are cached by path so that vectorizing many documents does
# not reload the same model from disk once per document.
_WORD2VEC_MODELS = {}


def _resolve_word_vectors(model):
    """Return an object that maps a word to its vector for *model*.

    *model* may be a path to a saved Word2Vec model (loaded once and cached)
    or an already-loaded model / mapping supporting ``obj[word]``.
    """
    if isinstance(model, (str, bytes)) or hasattr(model, "__fspath__"):
        cache_key = str(model)
        if cache_key not in _WORD2VEC_MODELS:
            _WORD2VEC_MODELS[cache_key] = word2vec.Word2Vec.load(model)
        model = _WORD2VEC_MODELS[cache_key]
    # Recent gensim releases expose word lookups on `model.wv`; objects
    # without that attribute (older models, plain dicts) are indexed directly.
    return getattr(model, "wv", model)


def document_vector(text, model, num_features):
    """Average the word vectors of *text* into one document vector.

    Args:
        text: A sequence of tokens, or a raw string. A raw string is split
            on whitespace so that lookups are done per token rather than
            per character.
        model: Path to a saved Word2Vec model, or an already-loaded model /
            mapping from word to vector.
        num_features: Length of the returned vector; must match the length
            of the word vectors.

    Returns:
        A float32 numpy array of length *num_features*: the sum of the
        vectors of the known words divided by the total number of tokens
        (unknown words still count in the denominator). An all-zero vector
        is returned when there are no tokens.
    """
    vectors = _resolve_word_vectors(model)
    tokens = text.split() if isinstance(text, str) else list(text)
    bag_of_centroids = np.zeros(num_features, dtype='float32')

    # Guard against division by zero for empty documents.
    if not tokens:
        return bag_of_centroids

    for word in tokens:
        try:
            word_vector = vectors[word]
        except KeyError:
            # Out-of-vocabulary word: contributes nothing to the sum.
            continue
        bag_of_centroids += word_vector

    return bag_of_centroids / len(tokens)

# Split into training data and validation data
def train_data_split(data, labels, training_samples, validation_samples):
    """Slice *data* and *labels* into a training part and a validation part.

    The first ``training_samples`` items form the training part; the next
    ``validation_samples`` items form the validation part.

    Returns:
        A tuple ``(x_train, y_train, x_val, y_val)``.
    """
    train_part = slice(None, training_samples)
    val_part = slice(training_samples, training_samples + validation_samples)
    return data[train_part], labels[train_part], data[val_part], labels[val_part]

In [10]:
# Evaluation metric functions
def normalize_y_pred(y_pred):
    """Replace each prediction by a one-hot encoding of its arg-max.

    The one-hot depth is the size of the last dimension of ``y_pred``;
    ``K.argmax`` is called with its default axis.
    """
    num_classes = y_pred.shape[-1]
    winning_class = K.argmax(y_pred)
    return K.one_hot(winning_class, num_classes)

def class_true_positive(class_label, y_true, y_pred):
    """Return a float mask of samples that are true positives for one class.

    Column ``class_label`` of ``y_true`` is added to the same column of the
    normalized prediction; entries whose sum equals 2 are marked 1.0
    (assuming ``y_true`` holds 0/1 values, that means both are 1).
    """
    one_hot_pred = normalize_y_pred(y_pred)
    combined = y_true[:, class_label] + one_hot_pred[:, class_label]
    hits = K.equal(combined, 2)
    return K.cast(hits, K.floatx())

def class_accuracy(class_label, y_true, y_pred):
    """Return a float mask of samples whose label and prediction agree.

    Only column ``class_label`` is compared, using the normalized
    (one-hot arg-max) prediction.
    """
    one_hot_pred = normalize_y_pred(y_pred)
    truth_column = y_true[:, class_label]
    pred_column = one_hot_pred[:, class_label]
    agreement = K.equal(truth_column, pred_column)
    return K.cast(agreement, K.floatx())

def class_precision(class_label, y_true, y_pred):
    """Precision for one class.

    Number of true positives divided by the number of samples predicted as
    ``class_label``; ``K.epsilon()`` in the denominator avoids division by
    zero.
    """
    one_hot_pred = normalize_y_pred(y_pred)
    true_positives = K.sum(class_true_positive(class_label, y_true, one_hot_pred))
    predicted_positives = K.sum(one_hot_pred[:, class_label])
    return true_positives / (predicted_positives + K.epsilon())


def class_recall(class_label, y_true, y_pred):
    """Recall for one class.

    Number of true positives divided by the sum of column ``class_label``
    of ``y_true``; ``K.epsilon()`` in the denominator avoids division by
    zero.
    """
    true_positives = K.sum(class_true_positive(class_label, y_true, y_pred))
    actual_positives = K.sum(y_true[:, class_label])
    return true_positives / (actual_positives + K.epsilon())


def class_f_measure(class_label, y_true, y_pred):
    """F-measure for one class, combining its precision and recall.

    ``K.epsilon()`` in the denominator avoids division by zero when both
    precision and recall are zero.
    """
    p = class_precision(class_label, y_true, y_pred)
    r = class_recall(class_label, y_true, y_pred)
    numerator = 2 * p * r
    denominator = p + r + K.epsilon()
    return numerator / denominator


def true_positive(y_true, y_pred):
    """Return a float mask over all classes marking true positives.

    ``y_true`` is added element-wise to the normalized prediction; entries
    whose sum equals 2 are marked 1.0.
    """
    one_hot_pred = normalize_y_pred(y_pred)
    combined = y_true + one_hot_pred
    hits = K.equal(combined, 2)
    return K.cast(hits, K.floatx())


def micro_precision(y_true, y_pred):
    """Micro-averaged precision.

    Total true positives over all classes divided by the sum of the
    normalized prediction matrix (plus ``K.epsilon()``).
    """
    one_hot_pred = normalize_y_pred(y_pred)
    true_positives = K.sum(true_positive(y_true, one_hot_pred))
    predicted_positives = K.sum(one_hot_pred)
    return true_positives / (predicted_positives + K.epsilon())


def micro_recall(y_true, y_pred):
    """Micro-averaged recall.

    Total true positives over all classes divided by the sum of ``y_true``
    (plus ``K.epsilon()``).
    """
    true_positives = K.sum(true_positive(y_true, y_pred))
    actual_positives = K.sum(y_true)
    return true_positives / (actual_positives + K.epsilon())


def micro_f_measure(y_true, y_pred):
    """F-measure computed from the micro-averaged precision and recall.

    ``K.epsilon()`` in the denominator avoids division by zero.
    """
    p = micro_precision(y_true, y_pred)
    r = micro_recall(y_true, y_pred)
    numerator = 2 * p * r
    denominator = p + r + K.epsilon()
    return numerator / denominator


def average_accuracy(y_true, y_pred):
    """Mean of the per-class accuracy masks over all classes.

    The mask returned by ``class_accuracy`` for every class index is
    concatenated along axis 0 and then averaged along axis 0.
    """
    per_class_masks = []
    for class_label in range(y_pred.shape[-1]):
        per_class_masks.append(class_accuracy(class_label, y_true, y_pred))
    joined = K.concatenate(per_class_masks, axis=0)
    return K.mean(joined, axis=0)


def macro_precision(y_true, y_pred):
    """Macro-averaged precision: the mean of the per-class precisions.

    The number of classes is the size of the last dimension of ``y_pred``.
    """
    class_count = y_pred.shape[-1]
    per_class = [class_precision(label, y_true, y_pred) for label in range(class_count)]
    total = K.sum(per_class)
    return total / K.cast(class_count, K.floatx())


def macro_recall(y_true, y_pred):
    """Macro-averaged recall: the mean of the per-class recalls.

    The number of classes is the size of the last dimension of ``y_pred``.
    """
    class_count = y_pred.shape[-1]
    per_class = [class_recall(label, y_true, y_pred) for label in range(class_count)]
    total = K.sum(per_class)
    return total / K.cast(class_count, K.floatx())


def macro_f_measure(y_true, y_pred):
    """F-measure computed from the macro-averaged precision and recall.

    ``K.epsilon()`` in the denominator avoids division by zero.
    """
    p = macro_precision(y_true, y_pred)
    r = macro_recall(y_true, y_pred)
    numerator = 2 * p * r
    denominator = p + r + K.epsilon()
    return numerator / denominator

In [None]:
# Main
maxlen = 100
training_samples = 8000  # training data 80 : validation data 20
max_words = 20000
wordlen = 200
N_model = "Datas/noun_tweet.model"
A_model = "Datas/other_tweet.model"

# Load the data
N_texts = read_data("Datas/N_extract_tweet.txt")
A_texts = read_data("Datas/A_extract_tweet.txt")
label = read_data("Datas/label.txt")

# Everything after the training samples is used for validation. This can
# only be computed once the data is loaded; it previously referenced the
# undefined name `lines` before any file had been read.
validation_samples = len(label) - training_samples

# Convert to sequences
N_data = sequences_tokenizer(N_texts, max_words, maxlen)
A_data = sequences_tokenizer(A_texts, max_words, maxlen)

# Convert to Word2Vec document vectors.
# Stored as numpy arrays so they can be reordered with an index array below
# (a plain list cannot be indexed that way).
WN_data = np.asarray([document_vector(a, N_model, wordlen) for a in N_texts])
WA_data = np.asarray([document_vector(b, A_model, wordlen) for b in A_texts])

# Convert the labels to a binary class matrix.
# The label lines are parsed to int explicitly; `num_classes` is the keyword
# of the Keras 2 API that the rest of this file uses (e.g. `num_words`).
categorical_labels = to_categorical([int(value) for value in label], num_classes=2)
labels = np.asarray(categorical_labels)

#print("Shape of data tensor:{}".format(N_data.shape))
#print("Shape of label tensor:{}".format(labels.shape))

# Shuffle all arrays with the same random permutation so rows stay aligned
indices = np.arange(N_data.shape[0])
np.random.shuffle(indices)
N_data = N_data[indices]
A_data = A_data[indices]
WN_data = WN_data[indices]
WA_data = WA_data[indices]
labels = labels[indices]

N_x_train, N_y_train, N_x_val, N_y_val = train_data_split(N_data, labels, training_samples, validation_samples)
A_x_train, A_y_train, A_x_val, A_y_val = train_data_split(A_data, labels, training_samples, validation_samples)
# Each split uses its own feature matrix (these two previously re-split N_data).
WN_x_train, WN_y_train, WN_x_val, WN_y_val = train_data_split(WN_data, labels, training_samples, validation_samples)
WA_x_train, WA_y_train, WA_x_val, WA_y_val = train_data_split(WA_data, labels, training_samples, validation_samples)