In [6]:
import os
import numpy as np
from keras import backend as K
import csv

from sklearn.model_selection import train_test_split

from keras.preprocessing.sequence import pad_sequences
import string

# Order GPUs by PCI bus id (see issue #152) and make only device "3" visible.
os.environ.update(
    CUDA_DEVICE_ORDER="PCI_BUS_ID",
    CUDA_VISIBLE_DEVICES="3",
)

In [2]:
# --- input data: one CSV file per class ---
TEXT_DATA_FILE_POS = "./data/train_pos.csv"
TEXT_DATA_FILE_NEG = "./data/train_neg.csv"
# set to True when the first row of each CSV is a header that must be skipped
HEADER = False

# --- train/validation split settings ---
RANDOM_SEED = 42
VALIDATION_SPLIT = 0.1

In [3]:
def f1(y_true, y_pred):
    """Batch-wise F1 score (harmonic mean of precision and recall).

    Inputs are clipped to [0, 1] and rounded before counting, so the metric
    works on hard 0/1 decisions derived from the predicted scores. Because
    Keras evaluates metrics per batch, the value is a batch-wise figure, not
    a dataset-level F1.

    Args:
        y_true: backend tensor with the ground-truth labels.
        y_pred: backend tensor with the predicted scores; it is multiplied
            element-wise with ``y_true``, so the shapes must be compatible.

    Returns:
        Scalar backend tensor holding the F1 value. It is 0 (not NaN) when
        the batch contains no true positives.
    """
    eps = K.epsilon()

    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    # recall: how many of the relevant items were selected
    recall = true_positives / (possible_positives + eps)
    # precision: how many of the selected items are relevant
    precision = true_positives / (predicted_positives + eps)

    # eps keeps the denominator non-zero when precision == recall == 0;
    # without it the result is 0/0 = NaN, which breaks callbacks that
    # monitor this metric.
    return 2 * ((precision * recall) / (precision + recall + eps))

In [8]:
def load_data(neg_path=None, pos_path=None, header=None):
    """Read the negative and positive CSV files into parallel text/label lists.

    Only the first column of every row is used. Rows from the negative file
    come first and are labelled 0, rows from the positive file follow and are
    labelled 1. Completely empty rows (blank lines) are skipped.

    Args:
        neg_path: path of the negative-class CSV; defaults to
            ``TEXT_DATA_FILE_NEG``.
        pos_path: path of the positive-class CSV; defaults to
            ``TEXT_DATA_FILE_POS``.
        header: whether the first row of each file is a header to skip;
            defaults to ``HEADER``.

    Returns:
        Tuple ``(x, y)``: ``x`` is a list of strings and ``y`` the list of
        matching integer labels.

    Raises:
        OSError: if one of the files cannot be opened.
    """
    if neg_path is None:
        neg_path = TEXT_DATA_FILE_NEG
    if pos_path is None:
        pos_path = TEXT_DATA_FILE_POS
    if header is None:
        header = HEADER

    x = []
    y = []
    for path, label in ((neg_path, 0), (pos_path, 1)):
        # `with` closes the handle even when parsing fails; newline="" is the
        # mode the csv module asks for so quoted line breaks are parsed correctly.
        with open(path, "r", encoding="utf-8", newline="") as fin:
            reader = csv.reader(fin)
            if header:
                # default of None avoids StopIteration on an empty file
                next(reader, None)
            for row in reader:
                if not row:
                    # csv.reader yields [] for blank lines; row[0] would fail
                    continue
                x.append(row[0])
                y.append(label)
    return x, y

In [9]:
# Read both CSV files, then keep the labels as a compact int8 array.
data, labels = load_data()
labels = np.array(labels, dtype=np.int8)

In [10]:
# Split the original data into training and validation sets; stratifying on
# the labels keeps the class ratio the same in both subsets.
data_train, data_val, labels_train, labels_val = train_test_split(
    data,
    np.asarray(labels, dtype="int8"),
    test_size=VALIDATION_SPLIT,
    random_state=RANDOM_SEED,
    stratify=labels,
)

In [11]:
# maximum sentence length, in characters, after padding/truncation
MAX_SEQUENCE_LENGTH = 400
# dictionary size (NOTE(review): not referenced by the cells visible here)
MAX_NB_WORDS = 74
# lowercase English letters, one list entry per character
eng_alphabet = list("abcdefghijklmnopqrstuvwxyz")

In [12]:
def create_vocab_set():
    """Build the character vocabulary used to encode the texts.

    The symbols are, in order: ``eng_alphabet``, ``string.digits``,
    ``string.punctuation`` and ``string.whitespace``. Indices start at 1,
    which leaves 0 unassigned.

    Returns:
        Tuple ``(vocab, vocab_size)``: the character -> index dict and the
        number of symbols it was built from.
    """
    symbols = [
        *eng_alphabet,
        *string.digits,
        *string.punctuation,
        *string.whitespace,
    ]
    char_to_index = {
        symbol: position for position, symbol in enumerate(symbols, start=1)
    }
    return char_to_index, len(symbols)

In [13]:
def text2sequence(text, vocab):
    """Encode every string in ``text`` as a list of vocabulary indices.

    Characters that are missing from ``vocab`` (or that map to 0) are
    dropped, so an encoded sequence can be shorter than its source string.

    Args:
        text: iterable of strings.
        vocab: dict mapping a character to its integer index.

    Returns:
        List with one list of indices per input string, in input order.
    """
    return [
        [index for index in (vocab.get(symbol, 0) for symbol in review) if index != 0]
        for review in text
    ]

In [14]:
# Character vocabulary shared by the training and validation sets.
vocab, vocab_size = create_vocab_set()

# Encode each split as index sequences, then bring every sequence to exactly
# MAX_SEQUENCE_LENGTH entries, filling with 0.
X_train = pad_sequences(
    text2sequence(data_train, vocab),
    maxlen=MAX_SEQUENCE_LENGTH,
    value=0,
)
X_val = pad_sequences(
    text2sequence(data_val, vocab),
    maxlen=MAX_SEQUENCE_LENGTH,
    value=0,
)

In [15]:
from keras.models import Sequential
from keras.layers import GlobalMaxPooling1D, Conv1D, Dropout, Embedding, Dense
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint

In [20]:
NAME = "char_cnn_emb"
EMBEDDING_DIM = 100

# initialize model: trainable character embedding -> two 1-D convolutions ->
# global max pooling -> dense classifier with a single sigmoid output
model = Sequential()
# vocab_size + 1 rows because index 0 is the padding value and real indices start at 1
model.add(Embedding(vocab_size+1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH, trainable=True))
model.add(Conv1D(activation="relu", filters=200, kernel_size=4, padding="valid"))
model.add(Conv1D(activation="relu", filters=200, kernel_size=4, padding="valid"))
model.add(GlobalMaxPooling1D())
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(100, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# callbacks initialization
# automatic generation of learning curves
#callback_1 = TensorBoard(log_dir='../logs/logs_{}'.format(NAME), histogram_freq=0,
#                             write_graph=False, write_images=False)
# stop training when the validation F1 has not improved for five epochs
callback_2 = EarlyStopping(monitor='val_f1', min_delta=0, patience=5, verbose=0, mode='max')
# best model saving
checkpoint_path = "models/model_{}.hdf5".format(NAME)
# create the target directory up front so that writing the first checkpoint
# cannot fail because the folder is missing
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
callback_3 = ModelCheckpoint(checkpoint_path, monitor='val_f1',
                                 save_best_only=True, verbose=0, mode='max')

model.compile(loss='binary_crossentropy',
              optimizer='adam', 
              metrics=[f1])

model.summary()
# validation_data is passed as a tuple, the form documented by Keras
model.fit(X_train, labels_train, validation_data=(X_val, labels_val),
          batch_size=1024, epochs=1000, callbacks=[callback_2, callback_3])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 400, 100)          7500      
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 397, 200)          80200     
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 394, 200)          160200    
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 200)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 100)               20100     
_________________________________________________________________
dropout_4 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 100)               10100     
__________

<keras.callbacks.History at 0x1ea92b2e898>

In [22]:
# Build the path from NAME exactly as the ModelCheckpoint callback does, so
# renaming the experiment cannot silently load weights from an older run.
model.load_weights("models/model_{}.hdf5".format(NAME))

In [23]:
# Sequential.predict_classes() is deprecated and was removed from newer
# Keras/TensorFlow releases; thresholding the sigmoid output at 0.5 gives the
# same int32 class labels, one row per validation sample.
arr = (model.predict(X_val) > 0.5).astype("int32")



In [24]:
# Flatten the predictions into a 1-D vector with one entry per sample.
labels_predicted = np.reshape(arr, len(arr))

In [25]:
from sklearn.metrics import f1_score

In [26]:
# F1 score of the predicted classes against the validation labels
f1_score(y_true=labels_val, y_pred=labels_predicted)

0.67361111111111105