# Code

## Google Colab

In [None]:
# Google Colab only: mount Google Drive through google-drive-ocamlfuse and pin
# the library versions this notebook was written against.

# Install the FUSE driver for Google Drive from the alessandro-strada PPA.
# NOTE(review): with the ordering `2>&1 > /dev/null` only stdout is discarded;
# stderr is still shown in the cell output.
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
# Authenticate the Colab user and fetch the application-default credentials.
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
# First invocation prints the authorisation URL (filtered with grep); the
# verification code is then read without echo and piped into a second invocation.
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}
# Mount the drive at ./drive (the Colab paths defined later are relative to it).
!mkdir -p drive
!google-drive-ocamlfuse drive
# Pinned versions: the notebook uses Keras 2.0.x / gensim 3.x style APIs.
!pip install -q keras==2.0.8
!pip install -q gensim==3.1.0
import keras
keras.__version__

In [None]:
# Paths used when running on Google Colab (relative to the mounted ./drive folder).
# NOTE: the "File path" cell further down assigns the same names with local
# paths; whichever of the two cells runs last wins.
LABEL_TRAINING_PATH = "drive/ml-2018spring-hw5/training_label.txt"
UNLABEL_TRAINING_PATH = "drive/ml-2018spring-hw5/training_nolabel.txt"
TESTING_PATH = "drive/ml-2018spring-hw5/testing_data.txt"
STOPLIST_PATH = "drive/stoplist.txt"
WORD2VEC_MODEL_PATH = "drive/word2vec.model"
# Checkpoint file written by the ModelCheckpoint callbacks.
RNN_MODEL_PATH = "drive/rnn_semi4real.h5"
# CSV written by output_file().
OUTPUT_PATH = "drive/output.csv"

## Import modules

In [None]:
from gensim.models import word2vec
from keras import regularizers
from keras.models import Model
from keras.layers import Input, Embedding, Dense, LSTM, Dropout, GRU, average, Bidirectional
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import logging
import string
import itertools
import pickle
import os
import re

# Logging config: timestamped INFO-level messages.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Dimensionality of the word2vec vectors and of the Embedding layer.
EMBED_DIM = 300
# Vocabulary size: passed to Tokenizer(num_words=...), sizes the embedding
# matrix, and is the input width of the bag-of-words model.
DICT_SIZE = 30000
# Every sentence is padded/truncated to this many tokens by pad_sequences.
MAX_SENT_LEN = 40
# Optional text-cleaning switches read by the three data loaders.
REMOVE_PUNC = False
REMOVE_STOPWORDS = False
REMOVE_DUPLICHAR = False

## File path

In [None]:
# Paths used for a local run. This cell rebinds the names set in the Colab
# cell above, so skip it when running on Colab.
LABEL_TRAINING_PATH = "ml-2018spring-hw5/training_label.txt"
UNLABEL_TRAINING_PATH = "ml-2018spring-hw5/training_nolabel.txt"
TESTING_PATH = "ml-2018spring-hw5/testing_data.txt"
STOPLIST_PATH = "model/stoplist.txt"
WORD2VEC_MODEL_PATH = "model/word2vec.model"
# Checkpoint file written by the ModelCheckpoint callbacks.
RNN_MODEL_PATH = "model/rnn.h5"
# CSV written by output_file().
OUTPUT_PATH = "output.csv"

In [None]:
def get_label_training_data():
    """
    Read the labelled training file at LABEL_TRAINING_PATH.

    Each non-empty line has the form ``<label> +++$+++ <text>``. The line is
    split on the FIRST separator only, so text that itself contains the
    separator is kept intact; blank lines are skipped instead of raising.

    Returns:
        (X_train, y_train): the texts (optionally cleaned according to the
        REMOVE_PUNC / REMOVE_STOPWORDS / REMOVE_DUPLICHAR switches) and the
        integer labels, in file order.

    Raises:
        ValueError: if a non-empty line has no separator or a non-integer label.
    """
    X_train = []
    y_train = []

    with open(LABEL_TRAINING_PATH, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # e.g. a trailing empty line at the end of the file
                continue
            # maxsplit=1 mirrors split(',', 1) in get_testing_data
            label, text = line.split(" +++$+++ ", 1)
            if REMOVE_PUNC:
                text = remove_punctuation(text)
            X_train.append(text)
            y_train.append(int(label))
    if REMOVE_STOPWORDS:
        X_train = remove_stopwords(X_train)
    if REMOVE_DUPLICHAR:
        X_train = remove_duplichar(X_train)
    return X_train, y_train

def get_unlabel_training_data():
    """
    Read the unlabelled training file at UNLABEL_TRAINING_PATH.

    Every line (stripped of surrounding whitespace) is one sentence. The
    REMOVE_PUNC / REMOVE_STOPWORDS / REMOVE_DUPLICHAR switches select the
    optional cleaning steps, applied in that order.

    Returns:
        The list of sentences in file order.
    """
    with open(UNLABEL_TRAINING_PATH, 'r') as file:
        sentences = [raw.strip() for raw in file]

    if REMOVE_PUNC:
        sentences = [remove_punctuation(sentence) for sentence in sentences]
    if REMOVE_STOPWORDS:
        sentences = remove_stopwords(sentences)
    if REMOVE_DUPLICHAR:
        sentences = remove_duplichar(sentences)
    return sentences

def get_testing_data():
    """
    Read the test file at TESTING_PATH.

    The first line is a header and is skipped. Every following line is
    ``<id>,<text>``; it is split on the first comma only, so commas inside
    the text are preserved. The id column is discarded.

    Returns:
        The list of test sentences in file order, cleaned according to the
        REMOVE_PUNC / REMOVE_STOPWORDS / REMOVE_DUPLICHAR switches.
    """
    sentences = []

    with open(TESTING_PATH, 'r') as file:
        # islice(..., 1, None) drops the header row
        for row in itertools.islice(file, 1, None):
            _, sentence = row.strip().split(',', 1)
            sentences.append(remove_punctuation(sentence) if REMOVE_PUNC else sentence)

    if REMOVE_STOPWORDS:
        sentences = remove_stopwords(sentences)
    if REMOVE_DUPLICHAR:
        sentences = remove_duplichar(sentences)
    return sentences

def remove_punctuation(text):
    """Return `text` with every character of string.punctuation deleted."""
    # Mapping a code point to None makes str.translate drop that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

def remove_stopwords(X, stoplist=None):
    """
    Split every sentence into words and drop the words found in `stoplist`.

    Args:
        X: iterable of sentence strings.
        stoplist: optional collection of words to remove. When omitted the
            stop list is empty, i.e. the sentences are only split on
            whitespace (the behaviour of the original hard-coded version).
            NOTE(review): STOPLIST_PATH is defined in this notebook but never
            read; load it and pass the words here to actually filter.

    Returns:
        A new list with one list of words per input sentence.
    """
    stopwords = set(stoplist) if stoplist is not None else set()
    return [[word for word in sentence.split() if word not in stopwords]
            for sentence in X]

def remove_duplichar(X):
    """
    Collapse every run of identical consecutive characters in each word to a
    single character (e.g. 'heeello' -> 'helo').

    Args:
        X: list whose items are either lists of words or raw sentence
            strings. Strings are split on whitespace first; previously a
            string was iterated character by character, which turned the
            sentence into a list of single characters.

    Returns:
        The same list object `X`, modified in place so that every item is a
        list of de-duplicated words.
    """
    for i, text in enumerate(X):
        words = text.split() if isinstance(text, str) else text
        # groupby yields one key per run of equal characters
        X[i] = [''.join(ch for ch, _ in itertools.groupby(word)) for word in words]
    return X

def preprocess(string):
    """
    Collapse any character repeated three or more times in a row down to a
    single occurrence, e.g. 'heeeeelloooo' -> 'hello' (the double 'l' is
    kept because it is repeated only twice).

    The pattern uses '.', so punctuation and spaces are collapsed as well as
    letters; newlines are not matched.
    """
    repeated_run = re.compile(r"(.)\1{2,}")
    return repeated_run.sub(r'\1', string)
    
def output_file(output):
    """
    Write the predictions to OUTPUT_PATH as CSV.

    The file starts with the header ``id,label``; row i holds the zero-based
    position i and the i-th element of `output`. No newline is written after
    the last row.
    """
    rows = ['{},{}'.format(*pair) for pair in enumerate(output)]
    body = '\n'.join(rows)
    with open(OUTPUT_PATH, 'w') as file:
        file.write("id,label\n")
        file.write(body)
        
def word2sent(X):
    """
    Join each list of words in `X` into one space-separated sentence.

    The list is modified in place and also returned.
    """
    for pos in range(len(X)):
        X[pos] = ' '.join(X[pos])
    return X

def sent2word(X):
    """
    Split each sentence string in `X` on whitespace into a list of words.

    The list is modified in place and also returned.
    """
    for pos, sentence in enumerate(X):
        X[pos] = sentence.split()
    return X

def split_data(X, y, frac=0.1):
    """
    Split (X, y) into a training part and a validation part.

    The LAST ``int(len(X) * frac)`` samples become the validation set; no
    shuffling is done.

    Args:
        X: sliceable sequence of samples (list or array).
        y: sliceable sequence of targets, same length as X.
        frac: fraction of the data to hold out for validation.

    Returns:
        ((X_train, y_train), (X_val, y_val))
    """
    val_size = int(len(X) * frac)
    # Slice from an explicit cut point: with negative indexing a val_size of
    # 0 would turn X[:-0] into an empty training set and X[-0:] into a
    # validation set containing everything.
    cut = len(X) - val_size
    return (X[:cut], y[:cut]), (X[cut:], y[cut:])

def get_semi_data(label, threshold):
    """
    Select the confident predictions for pseudo-labelling.

    A prediction is confident when it is above ``1 - threshold`` or below
    ``threshold``.

    Args:
        label: NumPy array of predicted probabilities.
        threshold: confidence margin.

    Returns:
        (where, y): `where` is the tuple returned by np.where for the
        confident entries (the positions are its first element) and `y`
        holds the corresponding predictions rounded to 0 or 1.
    """
    confident_high = label > 1 - threshold
    confident_low = label < threshold
    confident = np.logical_or(confident_high, confident_low)
    pseudo_labels = np.round(label[confident])
    return np.where(confident), pseudo_labels

## Load data

In [None]:
# Raw text of the three corpora; labels exist only for the labelled set.
X_label, y_label = get_label_training_data()
X_unlabel = get_unlabel_training_data()
X_test = get_testing_data()

# Optional clean-up of elongated words, currently disabled:
# X_label = [preprocess(sent) for sent in X_label]
# X_unlabel = [preprocess(sent) for sent in X_unlabel]

# Labelled + unlabelled sentences together (used to fit the tokenizer).
X_all = X_label + X_unlabel

### Tokenize

In [None]:
# Fit the vocabulary on all training text (labelled and unlabelled).
tokenizer = Tokenizer(num_words=DICT_SIZE)
tokenizer.fit_on_texts(X_all)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Encode sentences as word-index sequences, padded/truncated to MAX_SENT_LEN.
X = pad_sequences(tokenizer.texts_to_sequences(X_label), maxlen=MAX_SENT_LEN)
X_semi = pad_sequences(tokenizer.texts_to_sequences(X_unlabel), maxlen=MAX_SENT_LEN)
print('Shape of data tensor:', X.shape)

# Hold out the last 10% of the labelled data for validation.
train_part, val_part = split_data(X, y_label, frac=0.1)
X_train, y_train = train_part
X_val, y_val = val_part

## Train word2vec

In [None]:
# Train word2vec on every training sentence, labelled and unlabelled.
# The original cell referenced X_train_label / X_train_unlabel, which are not
# defined anywhere in this notebook; X_label / X_unlabel from the "Load data"
# cell are the raw sentence lists.
# NOTE(review): assumes the loaders returned strings (REMOVE_STOPWORDS and
# REMOVE_DUPLICHAR off); sent2word calls .split() on each item.
# The concatenation creates a fresh list, so sent2word's in-place edit does not
# touch X_label / X_unlabel and the cell can be re-run.
X_train_w2v = sent2word(X_label + X_unlabel)
w2v_model = word2vec.Word2Vec(X_train_w2v, size=EMBED_DIM)
w2v_model.save(WORD2VEC_MODEL_PATH)

### Load word2vec model

In [None]:
# Reload the word2vec model saved at WORD2VEC_MODEL_PATH (lets the training
# cell above be skipped once the file exists).
w2v_model = word2vec.Word2Vec.load(WORD2VEC_MODEL_PATH)

### Embedding matrix

In [None]:
# word -> word2vec vector, for every word in the word2vec vocabulary
embeddings_index = {token: w2v_model.wv[token] for token in w2v_model.wv.vocab}

# Row i holds the vector of the word with tokenizer index i; rows stay
# all-zero for words that word2vec does not know.
embedding_matrix = np.zeros((DICT_SIZE + 1, EMBED_DIM))
for word, i in word_index.items():
    if i <= DICT_SIZE:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

## RNN

In [None]:
# RNN classifier:
#   frozen word2vec embedding -> bidirectional GRU(512) -> GRU(256)
#   -> Dense(256, relu, L2) -> Dropout -> sigmoid
sequence_input = Input(shape=(MAX_SENT_LEN,), dtype='int32')

# Embedding initialised from embedding_matrix and kept fixed during training
embedding_layer = Embedding(len(embedding_matrix), EMBED_DIM,
                            weights=[embedding_matrix], trainable=False)
embedded_sequences = embedding_layer(sequence_input)

# Recurrent stack: the first layer returns the whole sequence for the second
bi_gru = Bidirectional(GRU(512, return_sequences=True, dropout=0.3))
top_gru = GRU(256, return_sequences=False, dropout=0.3)
output = top_gru(bi_gru(embedded_sequences))

# Fully connected head
hidden_layer = Dense(256, activation='relu',
                     kernel_regularizer=regularizers.l2(0.1))
output = Dropout(0.3)(hidden_layer(output))
preds = Dense(1, activation='sigmoid')(output)

model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Save the full model to RNN_MODEL_PATH whenever validation accuracy improves
checkpoint = ModelCheckpoint(RNN_MODEL_PATH, monitor="val_acc", verbose=1,
                             save_best_only=True, mode="max")

## BOW

In [None]:
# Bag-of-words baseline: DICT_SIZE-wide input -> Dense(256, relu, L2)
# -> Dropout -> sigmoid.
# NOTE: this cell rebinds `model` and `checkpoint` from the RNN cell, and its
# checkpoint also writes to RNN_MODEL_PATH (weights only).
sequence_input = Input(shape=(DICT_SIZE,))

hidden_layer = Dense(256, activation='relu',
                     kernel_regularizer=regularizers.l2(0.1))
output = Dropout(0.3)(hidden_layer(sequence_input))
preds = Dense(1, activation='sigmoid')(output)

model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Keep only the weights of the epoch with the best validation accuracy
checkpoint = ModelCheckpoint(RNN_MODEL_PATH, monitor="val_acc", verbose=1,
                             save_weights_only=True, save_best_only=True,
                             mode="max")

## Train

In [None]:
# Supervised training on the labelled split; the checkpoint callback stores
# the best epoch according to validation accuracy.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=128,
    callbacks=[checkpoint],
)

### Semi

In [None]:
# Self-training: after each round of fitting, unlabelled samples that the model
# predicts confidently, with the same label, `accum_num` rounds in a row are
# moved into the training set with that pseudo-label.

# Number of consecutive confident, identical predictions required before a
# sample is transferred.
# NOTE(review): this name was used but never defined in the notebook; 3 is a
# placeholder — confirm the intended value.
accum_num = 3

# One row per unlabelled sample: [pseudo-label or None, consecutive count]
account = np.array([[None, 0] for i in range(len(X_semi))])

for i in range(10):
    print ('-- iteration %d  X_train size: %d' %(i+1, len(X_train)))
    history = model.fit(X_train, y_train, validation_data=(X_val, y_val),
                    epochs=2,
                    batch_size=128,
                    callbacks=[checkpoint])

    if len(X_semi) == 0:
        # every unlabelled sample has already been transferred
        continue

    y_semi = model.predict(X_semi, batch_size=1024, verbose=True)
    # reshape(-1) instead of squeeze: stays 1-D even when one sample is left
    y_semi = np.reshape(y_semi, -1)
    indices, label = get_semi_data(y_semi, 0.05)
    # get_semi_data returns np.where's tuple; flatten it to a 1-D array of row
    # positions so the loop below visits one sample at a time
    indices = np.asarray(indices).reshape(-1)
    transfer_set = []

    for j, index in enumerate(indices):
        if account[index, 0] is None:
            account[index, 0] = label[j]
        elif account[index, 0] != label[j]:
            # Unstable labeling: restart the streak with the NEW label so the
            # count never accumulates towards a contradicted label
            account[index, 0] = label[j]
            account[index, 1] = 0
        account[index, 1] += 1
        if account[index, 1] >= accum_num:
            transfer_set.append(index)

    if len(transfer_set) > 0:
        # Append semi data to training data; cast the pseudo-labels because
        # `account` is an object array and would make y_train object-typed
        X_train = np.append(X_train, X_semi[transfer_set], 0)
        y_train = np.append(y_train, account[transfer_set, 0].astype(np.int64))
        # Delete those data from semi data (rows of X_semi and account stay aligned)
        X_semi = np.delete(X_semi, transfer_set, 0)
        account = np.delete(account, transfer_set, 0)

## Predict

### RNN

In [None]:
# Encode the test sentences the same way as the training data.
# NOTE: X_test is overwritten with the padded sequences, so this cell cannot
# be run twice without reloading the test data.
X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=MAX_SENT_LEN)
y_test = model.predict(X_test, batch_size=1024, verbose=True)

# Round the sigmoid outputs to 0/1 labels and write the submission file.
y = y_test.round().astype(np.int32).ravel()
output_file(y)

### BOW

In [None]:
# Restore the best weights saved by the checkpoint callback.
model.load_weights(RNN_MODEL_PATH)

# Bag-of-words encoding (word counts) of the test sentences.
# NOTE: X_test is overwritten here; it must still hold the raw test text.
X_test = tokenizer.texts_to_matrix(X_test, mode="count")
y_test = model.predict(X_test, batch_size=1024, verbose=True)

# Round the sigmoid outputs to 0/1 labels and write the submission file.
y = y_test.round().astype(np.int32).ravel()
output_file(y)

## Ensemble

In [None]:
from keras.models import load_model

def ensemble_models(models, model_input):
    """
    Build a model that averages the outputs of several models.

    Args:
        models: list of Keras models that can all be applied to `model_input`.
        model_input: the shared Keras Input tensor.

    Returns:
        A Keras Model named 'ensemble' mapping `model_input` to the
        element-wise average of the member outputs.
    """
    member_outputs = []
    for member in models:
        member_outputs.append(member(model_input))

    return Model(inputs=model_input,
                 outputs=average(member_outputs),
                 name='ensemble')

# NOTE(review): model_1, model_2 and model_3 are not defined anywhere in this
# notebook; presumably they were loaded with load_model in a removed cell.
models = [model_1, model_2, model_3]

# All members share one input whose shape is taken from the first model.
shared_shape = models[0].input_shape[1:]
model_input = Input(shape=shared_shape)
model_ens = ensemble_models(models, model_input)
model_ens.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model_ens.summary()

In [None]:
# Averaged probabilities from the ensemble, flattened and rounded to 0/1.
# X_test must already be encoded the way the member models expect.
label = np.squeeze(model_ens.predict(X_test, batch_size=1024, verbose=True))
y_test = np.around(label).astype(np.int32)
output_file(y_test)

In [None]:
# Persist the ensemble model. The path is hard-coded under ./drive, unlike the
# other paths, which come from the path-constant cells.
model_ens.save("drive/ensemble.h5")

# Report

### RNN

In [None]:
from keras.models import load_model

# Two sentences containing the same words in a different order: a
# sequence model can score them differently.
sentences = ["today is a good day, but it is hot", "today is hot, but it is a good day"]
text = tokenizer.texts_to_sequences(sentences)
seq = pad_sequences(text, maxlen=MAX_SENT_LEN)
result = model.predict(seq)
print(result)

### BOW

In [None]:
# Same two sentences as in the RNN report, encoded as word-count vectors;
# `model` must be the bag-of-words model here.
sentences = ["today is a good day, but it is hot", "today is hot, but it is a good day"]
text = tokenizer.texts_to_matrix(sentences, mode="count")
result = model.predict(text)
print(result)