In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from keras.models import load_model

import argparse
import time

import tensorflow as tf
import numpy as np

import build_model
import load_data
import vectorize_data
import explore_data
import train_fine_tuned_sequence_model

Using TensorFlow backend.


In [2]:
'''LOAD_DATA'''

import csv

# Parallel lists built from ../clean.csv: column 0 of every row goes to
# `text`, column 1 to `label`. `data` keeps each full row exactly as read.
text = []
label = []
data = []
# newline='' is what the csv module asks for, so that line breaks inside
# quoted fields are parsed as part of the field instead of ending the row.
with open('../clean.csv', newline='') as f:
    reader = csv.reader(f, delimiter=',')
    for row in reader:
        if not row:
            # csv.reader yields [] for a blank line; indexing it would raise.
            continue
        text.append(row[0])
        label.append(row[1])
        data.append(row)

# The first entry of each list is dropped (treated as the header row), then
# the remaining labels are converted from strings to integers.
text = text[1:]
label = [int(value) for value in label[1:]]

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples as the evaluation split. The fixed random_state
# makes the split identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    text, label, test_size=0.1, random_state=42
)

In [4]:
# Limit on the number of features. We use the top 20K features.
# batch_train_sequence_model uses this as the upper bound on `num_features`.
TOP_K = 20000


def _data_generator(x, y, num_features, batch_size):
    """Generates batches of vectorized texts for training/validation.
    # Arguments
        x: np.matrix, feature matrix.
        y: np.ndarray, labels.
        num_features: int, number of features.
        batch_size: int, number of samples per batch.
    # Returns
        Yields feature and label data in batches.
    """
    num_samples = x.shape[0]
    num_batches = num_samples // batch_size
    if num_samples % batch_size:
        num_batches += 1

    while 1:
        for i in range(num_batches):
            start_idx = i * batch_size
            end_idx = (i + 1) * batch_size
            if end_idx > num_samples:
                end_idx = num_samples
            x_batch = x[start_idx:end_idx]
            y_batch = y[start_idx:end_idx]
            yield x_batch, y_batch

In [5]:
'''Get test data'''

def preprocess(path):
    """Reads a '|'-delimited file and cleans the second field of every line.

    Each non-blank line is split on '|'. The second field is reduced to
    alphabetic characters and spaces, which drops digits, punctuation and
    the line break.

    # Arguments
        path: str, path of the file to read.
    # Returns
        A tuple (rows, ids, texts):
            rows: list of the split lines, with the second field cleaned and
                any further fields left exactly as read.
            ids: list of the first field of every line.
            texts: list of the cleaned second field of every line.
    # Raises
        IndexError: If a non-blank line contains no '|' separator.
    """
    rows = []
    ids = []
    texts = []

    # The context manager closes the file even when a line fails to parse.
    with open(path, 'r') as f:
        for line in f:
            if not line.strip():
                # Blank lines (e.g. a trailing one at end of file) hold no
                # record and would otherwise fail at fields[1].
                continue
            fields = line.split('|')
            fields[1] = ''.join(
                c for c in fields[1] if c.isalpha() or c == ' ')
            rows.append(fields)
            ids.append(fields[0])
            texts.append(fields[1])

    return rows, ids, texts

# NOTE: this rebinds `data` (until now the raw rows read from ../clean.csv)
# and shadows the builtin `id` for the rest of the notebook.
data, id, val_text = preprocess('../query_test.csv')

In [6]:
def batch_train_sequence_model(data,
                               learning_rate=1e-3,
                               epochs=10,
                               batch_size=64,
                               blocks=2,
                               filters=64,
                               dropout_rate=0.2,
                               embedding_dim=200,
                               kernel_size=3,
                               pool_size=3):
    """Trains sequence model on the given dataset.

    Besides training, this writes two files to the working directory: the
    checkpoint '.mdl_wts.hdf5' (lowest validation loss seen so far) and
    'amazon_reviews_sepcnn_model.h5' (the model as it is when training ends).

    # Arguments
        data: tuples of training and test texts and labels, laid out as
            ((train_texts, train_labels), (val_texts, val_labels)).
        learning_rate: float, learning rate for training model.
        epochs: int, number of epochs.
        batch_size: int, number of samples per batch.
        blocks: int, number of pairs of sepCNN and pooling blocks in the model.
        filters: int, output dimension of sepCNN layers in the model.
        dropout_rate: float: percentage of input to drop at Dropout layers.
        embedding_dim: int, dimension of the embedding vectors.
        kernel_size: int, length of the convolution window.
        pool_size: int, factor by which to downscale input at MaxPooling layer.
    # Returns
        A tuple (tokenizer, max_length), both taken from the result of
        vectorize_data.sequence_vectorize, so callers can vectorize further
        texts the same way.
    # Raises
        ValueError: If validation data has label values which were not seen
            in the training data.
    """
    # Get the data.
    (train_texts, train_labels), (val_texts, val_labels) = data

    # Verify that validation labels are in the same range as training labels.
    num_classes = explore_data.get_num_classes(train_labels)
    unexpected_labels = [v for v in val_labels if v not in range(num_classes)]
    if len(unexpected_labels):
        raise ValueError('Unexpected label values found in the validation set:'
                         ' {unexpected_labels}. Please make sure that the '
                         'labels in the validation set are in the same range '
                         'as training labels.'.format(
                             unexpected_labels=unexpected_labels))

    # The generators slice the labels into batches; hand them arrays so each
    # batch is an array even when the caller passed plain Python lists.
    train_labels = np.asarray(train_labels)
    val_labels = np.asarray(val_labels)

    # Vectorize texts.
    x_train, x_val, word_index, tokenizer, max_length = (
        vectorize_data.sequence_vectorize(train_texts, val_texts))

    # Number of features will be the embedding input dimension. Add 1 for the
    # reserved index 0.
    num_features = min(len(word_index) + 1, TOP_K)

    # NOTE(review): `embedding_matrix` is computed but never passed to the
    # model below, so the embeddings from 'glove.6B' are not used. Either
    # wire it into build_model.sepcnn_model or drop this call -- confirm the
    # intent.
    embedding_data_dir = 'glove.6B'
    embedding_matrix = train_fine_tuned_sequence_model._get_embedding_matrix(
        word_index, embedding_data_dir, embedding_dim)

    # Create model instance.
    model = build_model.sepcnn_model(blocks=blocks,
                                     filters=filters,
                                     kernel_size=kernel_size,
                                     embedding_dim=embedding_dim,
                                     dropout_rate=dropout_rate,
                                     pool_size=pool_size,
                                     input_shape=x_train.shape[1:],
                                     num_classes=num_classes,
                                     num_features=num_features)

    # Compile model with learning parameters.
    if num_classes == 2:
        loss = 'binary_crossentropy'
    else:
        loss = 'sparse_categorical_crossentropy'
    optimizer = tf.keras.optimizers.Adam(lr=learning_rate)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # Create callback for early stopping on validation loss. If the loss does
    # not decrease in two consecutive tries, stop training. The checkpoint
    # callback keeps the weights with the lowest validation loss on disk.
    callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2),
        tf.keras.callbacks.ModelCheckpoint('.mdl_wts.hdf5',
                                           save_best_only=True,
                                           monitor='val_loss',
                                           mode='min'),
    ]

    # Create training and validation generators.
    training_generator = _data_generator(
        x_train, train_labels, num_features, batch_size)
    validation_generator = _data_generator(
        x_val, val_labels, num_features, batch_size)

    # Number of steps needed to cover all samples once (ceiling division, so
    # a final partial batch still counts as a step).
    steps_per_epoch = (x_train.shape[0] + batch_size - 1) // batch_size
    validation_steps = (x_val.shape[0] + batch_size - 1) // batch_size

    # Train and validate model.
    history = model.fit_generator(
            generator=training_generator,
            steps_per_epoch=steps_per_epoch,
            validation_data=validation_generator,
            validation_steps=validation_steps,
            callbacks=callbacks,
            epochs=epochs,
            verbose=1)  # Progress bar per epoch.

    # Print results of the last epoch that ran.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
            acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    # Save model.
    model.save('amazon_reviews_sepcnn_model.h5')

    return tokenizer, max_length
    

In [None]:
# Bundle the splits as ((train_texts, train_labels), (val_texts, val_labels)),
# the layout batch_train_sequence_model unpacks. This rebinds `data` again.
data = (x_train, y_train), (x_test, y_test)
# The returned tokenizer and max_length are reused below to vectorize val_text.
tokenizer, max_length = batch_train_sequence_model(data)

Epoch 1/10

In [None]:
# pad_sequences lives in keras.preprocessing.sequence; without this import
# the name `sequence` is undefined on a fresh kernel.
from keras.preprocessing import sequence

# Encode the query texts with the tokenizer returned by training, then pad
# or truncate every sequence to the length returned alongside it.
x_val = tokenizer.texts_to_sequences(val_text)
x_val = sequence.pad_sequences(x_val, maxlen=max_length)

In [None]:
# Load the model from the path batch_train_sequence_model saves it to
# (relative to the working directory) instead of a machine-specific absolute
# path, so the notebook runs on any checkout.
model = load_model('amazon_reviews_sepcnn_model.h5')
pred = model.predict(x_val)
# NOTE(review): argmax over axis 1 assumes one output column per class. If
# the model ends in a single sigmoid unit (possible when training picked
# 'binary_crossentropy'), this is always 0 -- confirm against
# build_model.sepcnn_model and threshold at 0.5 instead if so.
preds = np.argmax(pred, axis=1)

In [None]:
import csv

# zip() stops at the shorter input, which would silently drop rows from the
# submission; fail loudly instead.
if len(id) != len(preds):
    raise ValueError(
        'Got {} ids but {} predictions.'.format(len(id), len(preds)))

sub = list(zip(id, preds))
# newline='' is what the csv module asks for on output files; without it,
# platforms that translate line endings get an extra blank line per row.
with open('submit1.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(sub)