In [None]:
import numpy as np 
import pandas as pd 
import bz2
import gc
import chardet
import re
import os
# Print the entries of the ../input directory, where the data archives are read from.
print(os.listdir("../input"))

In [None]:
from keras.models import Model
from keras.layers import Dense, Input, Conv1D, GlobalMaxPool1D, Dropout, Concatenate, Layer, InputSpec, CuDNNLSTM, Embedding
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
from keras.utils.conv_utils import conv_output_length
from keras.regularizers import l2
from keras.constraints import maxnorm

In [None]:
# Open the two bz2-compressed fastText-format archives for reading.
# BZ2File yields bytes; nothing is decompressed until the handles are read.
train_file = bz2.BZ2File('../input/train.ft.txt.bz2', mode='r')
test_file = bz2.BZ2File('../input/test.ft.txt.bz2', mode='r')

In [None]:
# Read every line of both archives into memory (as bytes), then close the
# handles explicitly. `del` alone only drops the names and leaves closing the
# underlying files to garbage collection.
try:
    train_file_lines = train_file.readlines()
    test_file_lines = test_file.readlines()
finally:
    # Close even if a read fails part-way so no file descriptor is leaked.
    train_file.close()
    test_file.close()
del train_file, test_file

In [None]:
# Display the first raw training line (still bytes at this point) to inspect the format.
train_file_lines[0]

In [None]:
# Turn the raw byte lines into text; both splits are UTF-8 encoded.
train_file_lines = [str(raw_line, 'utf-8') for raw_line in train_file_lines]
test_file_lines = [str(raw_line, 'utf-8') for raw_line in test_file_lines]

In [None]:
# Each line is "<label> <review text>". The label is the first
# space-delimited token: '__label__1' maps to 0, anything else maps to 1.
# Splitting once (maxsplit=1) avoids tokenising the whole review just to
# read the label.
train_labels = [0 if line.split(' ', 1)[0] == '__label__1' else 1 for line in train_file_lines]
# Strip only a trailing newline instead of unconditionally dropping the last
# character, so a final line without '\n' keeps its last letter.
train_sentences = [line.split(' ', 1)[1].rstrip('\n').lower() for line in train_file_lines]

In [None]:
# Replace every digit with '0' so all numbers share a small set of tokens.
# The pattern is a raw string: '\d' in a plain literal is an invalid escape
# sequence (DeprecationWarning since Python 3.6, SyntaxWarning from 3.12).
for i, sentence in enumerate(train_sentences):
    train_sentences[i] = re.sub(r'\d', '0', sentence)

In [None]:
# Same parsing as for the training split: the first space-delimited token is
# the label ('__label__1' -> 0, anything else -> 1) and the remainder is the
# review text. Splitting once (maxsplit=1) avoids tokenising the whole review.
test_labels = [0 if line.split(' ', 1)[0] == '__label__1' else 1 for line in test_file_lines]
# Strip only a trailing newline rather than blindly dropping the last character.
test_sentences = [line.split(' ', 1)[1].rstrip('\n').lower() for line in test_file_lines]

# Replace every digit with '0'. Raw string avoids the invalid '\d' escape
# sequence warning that a plain string literal triggers.
for i, sentence in enumerate(test_sentences):
    test_sentences[i] = re.sub(r'\d', '0', sentence)

In [None]:
# A stretch of non-space characters that ends in a dot followed by three
# lowercase letters (e.g. "...something.com").
_URL_PATTERN = re.compile(r"([^ ]+(?<=\.[a-z]{3}))")
# Cheap substring pre-filter: only sentences containing one of these are rewritten.
_URL_HINTS = ('www.', 'http:', 'https:', '.com')


def _mask_urls(sentences):
    """Replace URL-like tokens with "<url>", modifying ``sentences`` in place.

    A sentence is touched only if it contains at least one substring from
    ``_URL_HINTS``; in such a sentence every match of ``_URL_PATTERN`` is
    replaced by the literal token "<url>".
    """
    for idx, sentence in enumerate(sentences):
        if any(hint in sentence for hint in _URL_HINTS):
            sentences[idx] = _URL_PATTERN.sub("<url>", sentence)


_mask_urls(train_sentences)
_mask_urls(test_sentences)

In [None]:
# The decoded raw lines are no longer needed once labels and sentences exist;
# drop the references so the memory can be reclaimed.
del train_file_lines
del test_file_lines

In [None]:
# Force a garbage-collection pass now that the raw line lists have been deleted.
gc.collect()

In [None]:
# max_features: vocabulary cap given to the tokenizer and the embedding layer.
# maxlen: fixed sequence length used for padding and for the model input.
max_features, maxlen = 40000, 100

In [None]:
# Build the vocabulary from the training sentences only (num_words is capped
# at max_features), then encode those same sentences with the fitted tokenizer.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_sentences)
tokenized_train = tokenizer.texts_to_sequences(train_sentences)

In [None]:
# Bring every encoded training review to a common length of maxlen.
X_train = sequence.pad_sequences(tokenized_train, maxlen=maxlen)

In [None]:
# Display the first padded training example together with the overall matrix shape.
X_train[0], X_train.shape

In [None]:
# Encode the test sentences with the tokenizer fitted on the training data and
# pad them to the same maxlen used for the training matrix.
tokenized_test = tokenizer.texts_to_sequences(test_sentences)
X_test = sequence.pad_sequences(tokenized_test, maxlen=maxlen)

In [None]:
# Persist the fitted tokenizer to 'tokenizer.dat' so the same vocabulary can be
# reused later. NOTE(review): this is a pickle file; only unpickle it from a
# trusted source.
from pickle import dump
with open('tokenizer.dat','wb') as fh:
    dump(tokenizer, fh)

In [None]:
# Only the padded matrices and the label lists are needed from here on;
# release the token sequences and the raw sentence lists.
del tokenized_test
del tokenized_train
del train_sentences
del test_sentences
gc.collect()

In [None]:
# Mini-batch size used for both training and evaluation.
batch_size = 2048
gc.collect()

In [None]:
# The tokenizer has already been pickled to disk; drop it from memory and
# trigger a collection pass.
del tokenizer
gc.collect()

In [None]:
embed_dim = 128


def _regularized():
    """Return fresh keyword arguments for the regularized layers.

    Every regularized layer below uses the same settings: an L2 penalty of
    4e-6 and a max-norm constraint of 10 on both kernel and bias. New
    regularizer/constraint objects are created on every call so that no
    instance is shared between layers.
    """
    return dict(
        kernel_regularizer=l2(4e-6),
        bias_regularizer=l2(4e-6),
        kernel_constraint=maxnorm(10),
        bias_constraint=maxnorm(10),
    )


# Token ids -> embeddings -> dropout.
X_inp = Input(shape=(maxlen,), dtype='int32')
X = Embedding(max_features, embed_dim)(X_inp)
X = Dropout(0.25)(X)

# Two unregularized width-3 convolutions, then three regularized ones whose
# filter count is 128 * 2**strides (256, 256, 512 for strides 1, 1, 2).
X = Conv1D(2*embed_dim, kernel_size=3)(X)
X = Conv1D(2*embed_dim, kernel_size=3)(X)
for strides in [1, 1, 2]:
    X = Conv1D(128*2**strides, kernel_size=3, strides=strides, **_regularized())(X)

# The two recurrent branches are built once, on the output of the last
# convolution. They used to be instantiated inside the loop above, which
# created four extra 512-unit LSTM layers that were never connected to the
# model output; the resulting model graph is unchanged.
# NOTE(review): both branches read the same input in the same direction; if a
# bidirectional encoder was intended, confirm and adjust.
X_1 = CuDNNLSTM(512, **_regularized())(X)
X_2 = CuDNNLSTM(512, **_regularized())(X)

# Classification head: concatenate both branches, then a small dense layer
# and a single sigmoid unit for the binary sentiment label.
X = Concatenate()([X_1, X_2])
X = Dropout(0.3)(X)
X = Dense(64, activation='relu')(X)
X = Dropout(0.1)(X)
X = Dense(1, activation='sigmoid')(X)
sentiment_model = Model(inputs=X_inp, outputs=X)


In [None]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss,
# accuracy reported as the metric.
sentiment_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the compiled model.
sentiment_model.summary()

In [None]:
# File that receives the best weights seen during training.
weight_path = "weights.hdf5"

# Stop once validation loss has failed to improve for 3 consecutive epochs.
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=3)

# Write a checkpoint only when validation loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    weight_path,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

callbacks = [checkpoint, early_stopping]

In [None]:
# Train for up to 12 epochs, holding out 15% of the training data for
# validation. The callbacks checkpoint the best weights and stop early.
# The labels are converted to a NumPy array: Keras documents `y` as an array,
# and a plain Python list can be misread as a list of per-output targets or
# rejected together with `validation_split`, depending on the Keras version.
sentiment_model.fit(
    X_train,
    np.asarray(train_labels),
    batch_size=batch_size,
    epochs=12,
    validation_split=0.15,
    shuffle=True,
    callbacks=callbacks,
)

In [None]:
# Restore the best checkpoint written during training, then score it on the
# held-out test set.
sentiment_model.load_weights(weight_path)
# Pass the labels as a NumPy array (the documented target type) rather than a
# plain Python list, whose handling differs between Keras versions.
score, acc = sentiment_model.evaluate(X_test, np.asarray(test_labels), batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)