In [9]:
import pandas as pd
import numpy as np
def load_reviews(path, skip_bad_lines=False, **read_csv_kwargs):
    """Read a review file and replace its HTML line breaks with spaces.

    Parameters
    ----------
    path : str
        Delimited text file that contains a ``review`` column.
    skip_bad_lines : bool, default False
        When True, rows with an unexpected number of fields are skipped (and
        reported) instead of aborting the whole read.
    **read_csv_kwargs
        Forwarded unchanged to ``pandas.read_csv`` (``sep``, ``encoding``, ...).

    Returns
    -------
    pandas.DataFrame
        The parsed frame, with every ``'<br /><br />'`` in ``review`` replaced
        by a single space.
    """
    if skip_bad_lines:
        try:
            # Current spelling (pandas >= 1.3): skip the malformed row and warn.
            frame = pd.read_csv(path, on_bad_lines='warn', **read_csv_kwargs)
        except TypeError:
            # Older pandas rejects the unknown keyword; `error_bad_lines` is the
            # legacy flag with the same skip-and-report behaviour (it was
            # removed in pandas 2.0, hence the attempt above comes first).
            frame = pd.read_csv(path, error_bad_lines=False, **read_csv_kwargs)
    else:
        frame = pd.read_csv(path, **read_csv_kwargs)

    frame['review'] = frame['review'].str.replace('<br /><br />', ' ')
    return frame


# Options shared by the three tab-separated files.
TSV_OPTIONS = {'sep': '\t', 'encoding': 'ISO-8859-1'}

data1 = load_reviews('labeledTrainData.tsv', **TSV_OPTIONS)

# The unlabeled file contains rows with stray separators, so bad lines are skipped.
data2 = load_reviews('unlabeledTrainData.tsv', skip_bad_lines=True, **TSV_OPTIONS)

data3 = load_reviews('testData.tsv', **TSV_OPTIONS)

data4 = load_reviews('test.csv')

# Corpus the tokenizer vocabulary is fitted on: data1 reviews followed by data4 reviews.
texts = list(data1['review']) + list(data4['review'])
print(len(texts))

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint

# Labels come straight from the two labeled frames.
y_train, y_test = data1['sentiment'], data4['sentiment']

# Cap on the vocabulary size handed to the tokenizer.
max_features = 20000

# Fit the vocabulary on `texts`, then turn each review into a list of word ids
# (train reviews first, then test reviews).
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(texts)
x_train, x_test = (
    tokenizer.texts_to_sequences(reviews)
    for reviews in (data1['review'], data4['review'])
)

# word_index holds every distinct token seen while fitting.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

b'Skipping line 43043: expected 2 fields, saw 3\n'


25000
Found 88548 unique tokens.


In [10]:
from __future__ import print_function
import numpy as np

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from keras.datasets import imdb


def create_ngram_set(input_list, ngram_value=2):
    """Return the set of distinct n-grams found in a sequence.

    Parameters
    ----------
    input_list : list
        Sequence of tokens (for example word ids).
    ngram_value : int, default 2
        Length of each n-gram.

    Returns
    -------
    set of tuple
        Every distinct run of ``ngram_value`` consecutive items.
    """
    # One view of the sequence per offset 0..n-1; zipping them position by
    # position yields each window of n consecutive items.
    shifted_views = []
    for offset in range(ngram_value):
        shifted_views.append(input_list[offset:])
    return set(zip(*shifted_views))


def add_ngram(sequences, token_indice, ngram_range=2):
    """Append n-gram ids to every sequence.

    Parameters
    ----------
    sequences : iterable of sequence
        Token-id sequences. They are not modified; each one is copied.
    token_indice : dict
        Maps an n-gram (tuple of token ids) to the id that represents it.
    ngram_range : int, default 2
        Largest n-gram length to look up; lengths 2..ngram_range are used.

    Returns
    -------
    list of list
        One list per input sequence: the original tokens followed by the ids
        of every known n-gram, ordered by n-gram length and then by position.
    """
    new_sequences = []
    for input_list in sequences:
        # list() copies and also accepts tuples or other sequences.
        tokens = list(input_list)
        augmented = list(tokens)
        for ngram_value in range(2, ngram_range + 1):
            # Slide over the ORIGINAL tokens only. Using the growing output
            # list here would build windows out of ids appended for shorter
            # n-grams, which is wasted work and can create spurious matches.
            for i in range(len(tokens) - ngram_value + 1):
                ngram = tuple(tokens[i:i + ngram_value])
                if ngram in token_indice:
                    augmented.append(token_indice[ngram])
        new_sequences.append(augmented)

    return new_sequences

# --- Hyper-parameters ---------------------------------------------------------
ngram_range = 2       # 1 keeps plain word ids; 2 adds bigram ids; 3 adds trigrams too
maxlen = 500          # length passed to pad_sequences
batch_size = 32
embedding_dims = 50
epochs = 6

# Rebuild the word-level inputs from the fitted tokenizer rather than reading
# x_train / x_test / max_features back. This cell overwrites all three, so
# reading them here made a second run stack n-grams on already augmented
# sequences and keep inflating max_features.
vocab_size = tokenizer.num_words
x_train = tokenizer.texts_to_sequences(data1['review'])
x_test = tokenizer.texts_to_sequences(data4['review'])
max_features = vocab_size


def _mean_length(seqs):
    """Average sequence length, truncated to an integer."""
    return np.mean(list(map(len, seqs)), dtype=int)


print('Loading data...')
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print('Average train sequence length: {}'.format(_mean_length(x_train)))
print('Average test sequence length: {}'.format(_mean_length(x_test)))

if ngram_range > 1:
    print('Adding {}-gram features'.format(ngram_range))

    # Collect the distinct n-grams of the training set only.
    ngram_set = set()
    for input_list in x_train:
        for i in range(2, ngram_range + 1):
            ngram_set.update(create_ngram_set(input_list, ngram_value=i))

    # N-gram ids start above the word-id range so the two never collide.
    start_index = vocab_size + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    indice_token = {token_indice[k]: k for k in token_indice}

    # Ids are consecutive from start_index, so the highest id + 1 is simply
    # start_index + len(ngram_set). This avoids materialising every key and
    # still works when no n-gram was found.
    max_features = start_index + len(ngram_set)

    x_train = add_ngram(x_train, token_indice, ngram_range)
    x_test = add_ngram(x_test, token_indice, ngram_range)
    print('Average train sequence length: {}'.format(_mean_length(x_train)))
    print('Average test sequence length: {}'.format(_mean_length(x_test)))
print(max_features)

Loading data...
20000 train sequences
5000 test sequences
Average train sequence length: 227
Average test sequence length: 225
Adding 2-gram features
Average train sequence length: 453
Average test sequence length: 413
1062002


In [63]:
# Bring every sequence to exactly `maxlen` ids so they stack into 2-D arrays.
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

# Model: embed each id, average the embeddings over the sequence, then a
# single sigmoid unit for the binary sentiment label.
print('Build model...')
model = Sequential()

model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))

model.add(GlobalAveragePooling1D())

model.add(Dense(1, activation='sigmoid'))

# The metric is requested as 'acc' on purpose: Keras >= 2.3 logs a metric under
# the exact string given here, so 'accuracy' would be logged as 'val_accuracy'
# and the checkpoint below (monitor='val_acc') would never save. Older Keras
# logs 'acc' for either spelling, so 'acc' keeps both in agreement.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

# Keep only the weights of the epoch with the best validation accuracy.
checkpoint = ModelCheckpoint('./fasttext2.model', monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test),
          callbacks=callbacks_list)


Loading data...
20000 train sequences
5000 test sequences
Average train sequence length: 227
Average test sequence length: 225
Adding 3-gram features
Average train sequence length: 678
Average test sequence length: 514
Pad sequences (samples x time)
x_train shape: (20000, 500)
x_test shape: (5000, 500)
Build model...
Train on 20000 samples, validate on 5000 samples
Epoch 1/6

Epoch 00001: val_acc improved from -inf to 0.83620, saving model to ./fasttext2.model
Epoch 2/6

Epoch 00002: val_acc improved from 0.83620 to 0.87740, saving model to ./fasttext2.model
Epoch 3/6

Epoch 00003: val_acc improved from 0.87740 to 0.89460, saving model to ./fasttext2.model
Epoch 4/6

Epoch 00004: val_acc improved from 0.89460 to 0.89740, saving model to ./fasttext2.model
Epoch 5/6

Epoch 00005: val_acc improved from 0.89740 to 0.89800, saving model to ./fasttext2.model
Epoch 6/6

Epoch 00006: val_acc improved from 0.89800 to 0.89980, saving model to ./fasttext2.model


<keras.callbacks.History at 0x7feb09218390>

In [59]:
# Score every row of x_test with the trained model (one sigmoid output per row).
# NOTE(review): x_test was built from data4 ('test.csv'), while the submission
# cell takes its ids from data3 ('testData.tsv') - confirm that both files hold
# the same reviews in the same order, otherwise ids and predictions are misaligned.
result = model.predict(x_test)

In [60]:
test_id = list(data3['id'])
print(len(test_id))

# Every id needs exactly one prediction; a mismatch would otherwise produce a
# silently truncated file or an IndexError halfway through writing.
# NOTE(review): `result` is predicted from x_test (built from data4), not from
# data3 - presumably both files list the same reviews in the same order; verify.
if len(result) != len(test_id):
    raise ValueError(
        'Got %d predictions for %d test ids' % (len(result), len(test_id)))

# `with` guarantees the file is flushed and closed even if a write fails.
with open('submission.csv', 'w') as out:
    out.write('id,sentiment\n')
    for row_id, scores in zip(test_id, result):
        # Round the sigmoid score to the nearest class label (0 or 1).
        label = int(np.rint(scores[0]))
        out.write(str(row_id) + ',' + str(label) + '\n')

5000
