In [None]:
import numpy as np
import string
import pandas as pd
import nltk
import keras
import re
from numpy import asarray
from sklearn import random_projection
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, Embedding, Dense, Dropout, Conv2D, MaxPool2D, Concatenate, Input, Reshape, Flatten
from keras.models import Model
from keras.models import Sequential
from keras.optimizers import SGD
from keras import metrics
from nltk.stem.snowball import SnowballStemmer
from keras import callbacks

In [None]:

# Shared, module-level text-normalisation resources used by tokenize():
# an English Snowball stemmer and a filter set holding the NLTK English
# stop words together with every ASCII punctuation character.
stemmer = SnowballStemmer(language='english')
stop_words = set(stopwords.words('english')) | set(string.punctuation)

# -------------- Helper Functions --------------
def tokenize(text):
    """Split *text* into a list of lower-cased, stemmed word tokens.

    A token produced by ``nltk.word_tokenize`` is dropped when, after
    lower-casing, it is in ``stop_words``, is purely numeric, or contains
    no ASCII letter at all. Every surviving token is stemmed with the
    module-level Snowball ``stemmer``.
    """
    stems = []
    for raw_token in nltk.word_tokenize(text):
        lowered = raw_token.lower()
        if lowered in stop_words or lowered.isnumeric():
            continue
        # Skip tokens made only of symbols/digits (no letter to stem).
        if re.search('[a-zA-Z]', lowered) is None:
            continue
        stems.append(stemmer.stem(lowered))
    return stems


def get_sequence(data, seq_length, vocab_dict):
    """Encode tokenised documents as a fixed-width integer matrix.

    Args:
        data: sized iterable of documents, each an iterable of tokens.
        seq_length: number of columns; longer documents are truncated and
            shorter ones keep trailing zeros.
        vocab_dict: mapping from token to integer index.

    Returns:
        An integer array of shape ``(len(data), seq_length)`` where row ``i``
        holds the indices of the first ``seq_length`` tokens of document
        ``i``. Tokens missing from ``vocab_dict`` are encoded as 1.
    """
    unknown_idx = 1  # 1 means the unknown word
    encoded = np.zeros((len(data), seq_length), dtype=int)
    for row, doc in enumerate(data):
        # zip stops as soon as range(seq_length) is exhausted, which
        # truncates over-long documents.
        for col, token in zip(range(seq_length), doc):
            encoded[row, col] = vocab_dict.get(token, unknown_idx)

    return encoded


def _encode_frame(df, input_length, tokenizer):
    """Convert a review DataFrame into model inputs with a fitted tokenizer.

    Args:
        df: DataFrame with ``text``, ``stars`` and ``review_id`` columns.
        input_length: ``maxlen`` passed to ``pad_sequences``.
        tokenizer: an already fitted Keras ``Tokenizer``.

    Returns:
        ``(review_ids, stars, data_matrix, word_index)`` where ``stars`` are
        the integer star ratings shifted down by one and ``word_index`` is
        ``tokenizer.word_index``.
    """
    sequences = tokenizer.texts_to_sequences(df['text'])
    data_matrix = pad_sequences(sequences, maxlen=input_length)
    stars = df['stars'].apply(int) - 1  # labels begin with 0
    return df['review_id'], stars, data_matrix, tokenizer.word_index


def read_data(file_name, input_length, tokenizer=None):
    """Read a review CSV and encode its text as padded index sequences.

    Args:
        file_name: path of a CSV with ``text``, ``stars`` and ``review_id``
            columns.
        input_length: ``maxlen`` passed to ``pad_sequences``.
        tokenizer: optional fitted Keras ``Tokenizer``. When given, it is
            reused unchanged so the word ids match the vocabulary it was
            fitted on. When omitted, a new tokenizer is fitted on this
            file's own text.

    Returns:
        ``(review_ids, stars, data_matrix, word_index)``; see
        ``_encode_frame``.
    """
    df = pd.read_csv(file_name)

    if tokenizer is None:
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(df['text'])

    return _encode_frame(df, input_length, tokenizer)


def load_data(input_length):
    """Load the train/valid/test splits encoded with one shared vocabulary.

    The tokenizer is fitted on the training text only and then reused for
    the validation and test files. Fitting a separate tokenizer per file
    would give the same word different ids in each split, so the ids fed to
    the model at evaluation time would not match the ones it was trained on.

    Args:
        input_length: number of token ids kept per review.

    Returns:
        ``(train_data_matrix, train_data_label, valid_data_matrix,
        valid_data_label, test_id_list, test_data_matrix, word_index)``.
        The label arrays are one-hot encoded; ``word_index`` is the
        training vocabulary.
    """
    # Load training data and build the vocabulary from it
    train_df = pd.read_csv("../yelp-review-dataset/train.csv")
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_df['text'])
    train_id_list, train_data_label, train_data_matrix, word_index = \
        _encode_frame(train_df, input_length, tokenizer)
    K = max(train_data_label) + 1  # labels begin with 0

    # Load validation data, encoded with the training vocabulary
    valid_id_list, valid_data_label, valid_data_matrix, _ = read_data(
        "../yelp-review-dataset/valid.csv", input_length, tokenizer)

    # Load testing data, encoded with the training vocabulary
    test_id_list, _, test_data_matrix, _ = read_data(
        "../yelp-review-dataset/test.csv", input_length, tokenizer)

    print("Vocabulary Size:", len(word_index))
    print("Training Set Size:", len(train_id_list))
    print("Validation Set Size:", len(valid_id_list))
    print("Test Set Size:", len(test_id_list))
    print("Training Set Shape:", train_data_matrix.shape)
    print("Validation Set Shape:", valid_data_matrix.shape)
    print("Testing Set Shape:", test_data_matrix.shape)

    # Converts a class vector to binary class matrix.
    # https://keras.io/utils/#to_categorical
    train_data_label = keras.utils.to_categorical(train_data_label, num_classes=K)
    valid_data_label = keras.utils.to_categorical(valid_data_label, num_classes=K)
    return train_data_matrix, train_data_label, \
        valid_data_matrix, valid_data_label, \
        test_id_list, test_data_matrix, word_index


def _build_model(input_length, input_size, embedding_size, output_size,
                 kernel_sizes, filters, strides, padding, activation,
                 dropout_rate, hidden_size, embedding_matrix=None):
    """Build a fresh, uncompiled text-CNN classifier.

    Args:
        input_length: number of token ids per review.
        input_size: ``input_dim`` of the embedding layer.
        embedding_size: width of each word vector.
        output_size: number of classes of the softmax output.
        kernel_sizes: heights of the parallel convolution kernels.
        filters: number of filters per convolution.
        strides: stride used on both axes of every convolution.
        padding: Conv2D padding mode.
        activation: activation of the convolutions and the hidden layer.
        dropout_rate: rate of both dropout layers.
        hidden_size: units of the dense hidden layer.
        embedding_matrix: optional ``(input_size, embedding_size)`` array of
            pre-trained vectors. When given, the embedding is initialised
            from it and frozen; otherwise the embedding is trained from
            scratch.

    Returns:
        A ``keras.models.Model`` mapping token ids to class probabilities.
    """
    x = Input(shape=(input_length, ))

    # embedding layer and dropout
    if embedding_matrix is not None:
        e = Embedding(input_dim=input_size,
                      output_dim=embedding_size,
                      weights=[embedding_matrix],
                      input_length=input_length,
                      trainable=False)(x)
    else:
        e = Embedding(input_dim=input_size,
                      output_dim=embedding_size,
                      input_length=input_length)(x)
    e_d = Dropout(dropout_rate)(e)

    # construct the sequence tensor for CNN (add a single channel axis)
    e_d = Reshape((input_length, embedding_size, 1))(e_d)

    # CNN layers: one branch per kernel size, each kernel spanning the
    # full embedding width, followed by max-pooling over all positions.
    conv_blocks = []
    for kernel_size in kernel_sizes:
        conv = Conv2D(filters=filters, kernel_size=(kernel_size, embedding_size),
                      padding=padding, activation=activation,
                      strides=(strides, strides))(e_d)
        maxpooling = MaxPool2D(
            pool_size=((input_length - kernel_size) // strides + 1, 1))(conv)
        conv_blocks.append(Flatten()(maxpooling))

    # concatenate CNN results
    c = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]
    c_d = Dropout(dropout_rate)(c)

    # dense layer
    d = Dense(hidden_size, activation=activation)(c_d)

    # output layer
    y = Dense(output_size, activation='softmax')(d)

    return Model(x, y)


if __name__ == '__main__':
    input_length = 300
    embedding_size = 100
    hidden_size = 100
    batch_size = 200
    dropout_rate = 0.2
    filters = 100
    kernel_sizes = [3, 4, 5]
    padding = 'valid'
    activation = 'relu'
    # NOTE(review): with strides == embedding_size each filter is evaluated
    # at only (input_length - kernel_size) // strides + 1 positions along
    # the review. Confirm this is intended rather than strides = 1.
    strides = embedding_size
    total_epoch = 50

    # load_data returns the test ids BEFORE the test matrix; unpack in that
    # order so prediction runs on the encoded reviews, not on the ids.
    train_data_matrix, train_data_label, valid_data_matrix, \
        valid_data_label, test_id_list, test_data_matrix, word_index = \
        load_data(input_length)

    # Data shape
    K = train_data_label.shape[1]

    # Keras word indices start at 1 (0 is the padding id), so the embedding
    # needs one more row than there are words.
    input_size = len(word_index) + 1
    output_size = K

    # Load pre-trained GloVe vectors when the file is available. Text mode
    # keeps the words as str so they match the keys of word_index.
    glove_path = 'glove.6B.100d.txt'
    embedding_matrix = None
    try:
        with open(glove_path, encoding='utf-8') as glove_file:
            embeddings_index = {}
            for line in glove_file:
                values = line.split()
                embeddings_index[values[0]] = asarray(values[1:], dtype='float32')
    except FileNotFoundError:
        print('%s not found; training the embedding from scratch.' % glove_path)
    else:
        print('Loaded %s word vectors.' % len(embeddings_index))
        # create a weight matrix for words in training docs; words without
        # a pre-trained vector keep an all-zero row
        embedding_matrix = np.zeros((input_size, embedding_size))
        for word, idx in word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[idx] = embedding_vector

    # NOTE(review): this monitors the training loss; 'val_loss' may be what
    # was intended since validation data is passed to fit().
    early_stopping = callbacks.EarlyStopping(monitor='loss', min_delta=0, patience=2, verbose=1, mode='auto')

    # hyperparameter search
    decay_rates = [1e-6]
    learning_rates = [0.0005, 0.002, 0.01]
    max_valid_score = -1.0  # below any accuracy so the first trial is kept
    best_lr = 0
    best_decay = 0
    best_model = None
    for lr in learning_rates:
        for decay in decay_rates:
            # A fresh model per trial: reusing one model would continue from
            # the previous trial's weights and make the comparison unfair.
            model = _build_model(input_length, input_size, embedding_size,
                                 output_size, kernel_sizes, filters, strides,
                                 padding, activation, dropout_rate,
                                 hidden_size, embedding_matrix)
            adam = keras.optimizers.Adam(lr=lr, beta_1=0.9, beta_2=0.999, epsilon=None, decay=decay, amsgrad=False)

            # compile model
            model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

            # training
            model.fit(train_data_matrix, train_data_label,
                      validation_data=(valid_data_matrix, valid_data_label),
                      epochs=total_epoch, batch_size=batch_size,
                      callbacks=[early_stopping])
            # testing
            train_score = model.evaluate(train_data_matrix, train_data_label, batch_size=batch_size)
            print('lr: {}\n decay: {}\n Training Loss: {}\n Training Accuracy: {}\n'.format(lr, decay, train_score[0], train_score[1]))
            valid_score = model.evaluate(valid_data_matrix, valid_data_label, batch_size=batch_size)
            print('lr: {}\n decay: {}\n Validation Loss: {}\n Validation Accuracy: {}\n'.format(lr, decay, valid_score[0], valid_score[1]))
            if valid_score[1] > max_valid_score:
                max_valid_score = valid_score[1]
                best_lr = lr
                best_decay = decay
                best_model = model
    print ('best learning rate:{}\n best decay rate: {}\n'.format(best_lr,best_decay))
    # predicting with the best model; +1 maps class ids back to star ratings
    test_pre = best_model.predict(test_data_matrix, batch_size=batch_size).argmax(axis=-1) + 1

