In [1]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import re
import gc

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, concatenate
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D, SpatialDropout1D
from keras.optimizers import Adam
from keras.models import Model
import keras.backend as K

from sklearn.model_selection import train_test_split, cross_val_score, KFold

  '{0}.{1}.{2}'.format(*version.hdf5_built_version_tuple)
Using TensorFlow backend.


In [2]:
CRAWL_EMBEDDING_PATH = './crawl_embedding/crawl-300d-2M.vec'
GLOVE_EMBEDDING_PATH = './glove_embedding/glove.840B.300d.txt'
EMBED_SIZE = 600
MAX_FEATURES = 100000
MAX_LEN = 256

In [3]:
#Credit goes to https://www.kaggle.com/gpreda/jigsaw-fast-compact-solution
def preprocess(data):
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
    def clean_special_chars(text, punct):
        for p in punct:
            text = text.replace(p, ' ')
        return text

    data = data.astype(str).apply(lambda x: clean_special_chars(x, punct))
    return data

In [4]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

x_train = preprocess(train['overview'])
y_train = np.asarray(train['vote_average'])

x_test = preprocess(test['overview'])
x_actual = np.asarray(test['vote_average'])

In [5]:
tokenizer = Tokenizer(num_words = MAX_FEATURES)
tokenizer.fit_on_texts(list(x_train))
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

#Pad the sequences
x_train = pad_sequences(x_train, maxlen=MAX_LEN)
x_test = pad_sequences(x_test, maxlen=MAX_LEN)

In [6]:
#Logic to build the embedding matrix taken and modified from:
#    https://www.kaggle.com/bminixhofer/simple-lstm-pytorch-version

def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

def load_embeddings(path):
    with open(path, encoding='utf-8') as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in tqdm(f))

def build_matrix(word_index, path):
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = embedding_index[word]
        except KeyError:
            continue
    return embedding_matrix

glove_matrix = build_matrix(tokenizer.word_index, GLOVE_EMBEDDING_PATH)
crawl_matrix = build_matrix(tokenizer.word_index, CRAWL_EMBEDDING_PATH)

embedding_matrix = np.concatenate([crawl_matrix, glove_matrix], axis=-1)

del glove_matrix
del crawl_matrix
gc.collect()

2196017it [03:29, 10462.07it/s]
1999996it [03:06, 10739.17it/s]


9

In [7]:
#LSTM and MLP Definition
units = 64
def custom_LSTM(embedding_matrix):
    inp = Input(shape=(MAX_LEN,))
    x = Embedding(embedding_matrix.shape[0], EMBED_SIZE, weights=[embedding_matrix], trainable=False)(inp)
    x = SpatialDropout1D(0.5)(x)
    x, forward_h, forward_c, backward_h, backward_c = Bidirectional(LSTM(units, return_sequences=True, return_state=True))(x)
    h_state = concatenate([forward_h, backward_h])
    c_state = concatenate([forward_c, backward_c])
    
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    
    x = concatenate([avg_pool, max_pool, h_state, c_state])
    
    #MLP Definition
    x = Dense(256, activation='relu')(x)
    x = Dense(128, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    output = Dense(1, activation='relu')(x)
    
    #Compiling the models together
    model=Model(inputs=inp, outputs=output)
    model.compile(loss='mean_absolute_percentage_error', optimizer=Adam(lr=1e-3, decay=0), metrics=['accuracy'])
    
    return model
    

In [8]:
#Validation using KFold
num_folds = 5
num_epochs = 3
folds = KFold(n_splits=num_folds, shuffle=True)
model = custom_LSTM(embedding_matrix)
def train(x_train, y_train, x_test):
    prediction = np.zeros((len(x_test), 1))
    
    #KFold Validation
    for fold_index, (train_index, valid_index) in enumerate(folds.split(x_train, y_train)):
        x_train_split = x_train[train_index]
        y_train_split = y_train[train_index]
        x_validation = x_train[valid_index]
        y_validation = y_train[valid_index]
        
        model.fit(x_train_split, y_train_split, batch_size=512, epochs = num_epochs, validation_data = (x_validation, y_validation))
        
        prediction += model.predict(x_test, batch_size = 512, verbose = 1)
        
    prediction /= fold_index
    
    return prediction
prediction = train(x_train, y_train, x_test)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 3032 samples, validate on 759 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 3033 samples, validate on 758 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 3033 samples, validate on 758 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 3033 samples, validate on 758 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 3033 samples, validate on 758 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [9]:
def calculate_accuracy(predictions, actual):
    margin_error = .25
    num_correct = 0
    for i in range(actual.size):
        difference = abs(predictions[i] - actual[i])
        if(difference <= margin_error):
            num_correct += 1
            
    return num_correct/actual.size
    
print(calculate_accuracy(prediction, x_actual))

[[7.03029537]
 [6.63233602]
 [7.05228829]
 [6.73217261]
 [7.13432753]
 [6.43455625]
 [6.77892661]
 [7.06108618]
 [7.2117548 ]
 [6.54170299]
 [6.87599349]
 [6.79725146]
 [6.2257911 ]
 [6.68903315]
 [6.17887414]
 [6.40115786]
 [6.74483383]
 [7.1735307 ]
 [6.5861243 ]
 [7.01486254]
 [7.06796134]
 [6.9933579 ]
 [6.66255701]
 [6.82735825]
 [6.23902273]
 [6.41591859]
 [7.17097592]
 [6.67372251]
 [6.87125349]
 [6.18485248]
 [6.65235078]
 [6.33224952]
 [7.10833359]
 [6.63595676]
 [7.08970797]
 [5.88676286]
 [6.88455391]
 [6.80971384]
 [5.86179185]
 [6.78562677]
 [6.27749026]
 [6.45777321]
 [6.95089734]
 [6.88942325]
 [6.43518114]
 [6.84863853]
 [6.39019275]
 [6.39884913]
 [6.90913022]
 [6.92496407]
 [6.8880682 ]
 [6.31056643]
 [6.95201278]
 [6.79368496]
 [6.71131229]
 [6.65659988]
 [6.77406585]
 [6.74157882]
 [7.11001289]
 [6.74103391]
 [6.4058429 ]
 [7.15133369]
 [7.31707013]
 [6.58558393]
 [6.39478326]
 [7.13413215]
 [6.32456958]
 [6.75087464]
 [7.08661711]
 [7.07520556]
 [6.38419724]
 [7.14

 [6.39376575]]
0.21285563751317177
