In [11]:
import pandas as pd
import numpy as np

# Load the labeled training reviews and replace every doubled HTML line
# break ('<br /><br />') in the review text with a single space.
data = (
    pd.read_csv('labeledTrainData.tsv', sep='\t', encoding='ISO-8859-1')
    .assign(review=lambda frame: frame['review'].str.replace('<br /><br />', ' '))
)

# Raw review text (model input) and its sentiment label (target).
x_train, y_train = data['review'], data['sentiment']

In [12]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint

# num_words caps the vocabulary the tokenizer uses when converting texts to
# integer sequences; the same value is later used as the Embedding input size.
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)

# Always read the raw text from `data['review']` rather than from `x_train`:
# the original overwrote `x_train` with its own tokenized form, so re-running
# the cell would feed integer sequences back into the tokenizer.
tokenizer.fit_on_texts(data['review'])
x_train = tokenizer.texts_to_sequences(data['review'])

# word_index lists every distinct token seen during fitting, not only the
# `max_features` most frequent ones.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Write the model to ./keras.model only when validation accuracy improves.
checkpoint = ModelCheckpoint('./keras.model', monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

Found 80278 unique tokens.


In [15]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec

# Labeled reviews: doubled HTML line breaks are replaced by a single space.
data1 = pd.read_csv('labeledTrainData.tsv', sep='\t', encoding='ISO-8859-1')
data1['review'] = data1['review'].str.replace('<br /><br />', ' ')

# Unlabeled reviews: the file contains at least one malformed row (wrong
# field count), which must be skipped instead of aborting the load.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0, so try
# the current `on_bad_lines` spelling first ('warn' = skip and report, which
# matches the old flag's default behaviour) and fall back for older pandas.
try:
    data2 = pd.read_csv('unlabeledTrainData.tsv', sep='\t', encoding='ISO-8859-1', on_bad_lines='warn')
except TypeError:
    # pandas < 1.3 does not know `on_bad_lines`.
    data2 = pd.read_csv('unlabeledTrainData.tsv', sep='\t', encoding='ISO-8859-1', error_bad_lines=False)
data2['review'] = data2['review'].str.replace('<br /><br />', ' ')

# Combined corpus (labeled + unlabeled) used to fit the tokenizer vocabulary.
texts = list(data1['review']) + list(data2['review'])
print(len(texts))

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint

# Use `data1`, the labeled frame loaded in this cell, rather than `data`
# left over from an earlier cell: both come from the same file with the same
# clean-up, but relying on `data` makes this cell fail on a fresh kernel
# unless the earlier cell has been run.
y_train = data1['sentiment']

# num_words caps the vocabulary used when converting texts to integer
# sequences; the same value is later used as the Embedding input size.
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)

# The vocabulary is fitted on the labeled + unlabeled corpus (`texts`), but
# only the labeled reviews are converted to training sequences. Reading the
# raw text from `data1['review']` (not from `x_train`) keeps the cell
# idempotent when it is re-run.
tokenizer.fit_on_texts(texts)
x_train = tokenizer.texts_to_sequences(data1['review'])

# word_index lists every distinct token seen during fitting, not only the
# `max_features` most frequent ones.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Write the model to ./keras.model only when validation accuracy improves.
checkpoint = ModelCheckpoint('./keras.model', monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

b'Skipping line 43043: expected 2 fields, saw 3\n'


69998
Found 148988 unique tokens.


In [16]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb

# `max_features` is deliberately not redefined here; the value set alongside
# the tokenizer is reused so the Embedding size matches the vocabulary cap.

# --- input / embedding ---
maxlen = 600            # every review is padded or truncated to this many tokens
embedding_dims = 50     # width of each learned word vector

# --- convolutional feature extractor ---
filters = 250           # number of Conv1D output channels
kernel_size = 3         # tokens covered by each convolution window

# --- classifier head ---
hidden_dims = 250       # units in the hidden Dense layer

# --- training schedule ---
batch_size = 32
epochs = 4

# Turn the ragged list of token-id sequences into one fixed-width 2-D array.
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
print('x_train shape:', x_train.shape)

print('Build model...')

# 1-D convolutional sentiment classifier, declared as a single layer list:
# embedding -> dropout -> Conv1D -> global max pool -> dense head -> sigmoid.
model = Sequential([
    # Map each token id to a trainable `embedding_dims`-wide vector.
    Embedding(input_dim=max_features,
              output_dim=embedding_dims,
              input_length=maxlen),
    Dropout(rate=0.2),

    # Slide `filters` windows of `kernel_size` tokens over the sequence.
    Conv1D(filters=filters,
           kernel_size=kernel_size,
           padding='valid',
           activation='relu',
           strides=1),

    # Keep only the strongest response of each filter across the sequence.
    GlobalMaxPooling1D(),

    # Hidden layer; dropout is applied before the ReLU, as in the original.
    Dense(units=hidden_dims),
    Dropout(rate=0.2),
    Activation('relu'),

    # Single sigmoid unit: predicted probability of the positive class.
    Dense(units=1),
    Activation('sigmoid'),
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Hold out 10% of the training data for validation; `callbacks_list` carries
# the checkpoint callback. Kept as the final expression so the notebook
# displays the returned History object.
model.fit(x_train, y_train,
          epochs=epochs,
          batch_size=batch_size,
          validation_split=0.1,
          callbacks=callbacks_list)


Pad sequences (samples x time)
x_train shape: (20000, 600)
Build model...
Train on 18000 samples, validate on 2000 samples
Epoch 1/4

Epoch 00001: val_acc improved from -inf to 0.88150, saving model to ./keras.model
Epoch 2/4

Epoch 00002: val_acc improved from 0.88150 to 0.88600, saving model to ./keras.model
Epoch 3/4

Epoch 00003: val_acc improved from 0.88600 to 0.89200, saving model to ./keras.model
Epoch 4/4

Epoch 00004: val_acc did not improve from 0.89200


<keras.callbacks.History at 0x7fae9f512588>

In [None]:
# Save the last-epoch model under its own file name. The ModelCheckpoint
# callback already writes the best-validation model to './keras.model'
# (save_best_only=True); saving the final model to that same path would
# overwrite the best checkpoint with a possibly worse last-epoch model.
model.save('keras_final.model')

In [17]:
# Load the test reviews and apply the same line-break clean-up that was
# applied to the training text.
test_data = pd.read_csv('testData.tsv', sep='\t', encoding='ISO-8859-1')
test_data['review'] = test_data['review'].str.replace('<br /><br />', ' ')

# Encode with the already-fitted tokenizer, then pad/truncate to `maxlen`
# so the input has the same width the model was trained on.
x_test = sequence.pad_sequences(
    tokenizer.texts_to_sequences(test_data['review']),
    maxlen=maxlen,
)

# One sigmoid output per review.
result = model.predict(x_test)

In [18]:
# Write the submission file: one "id,sentiment" row per test review.
test_id = list(test_data['id'])
print(len(test_id))

# The `with` block guarantees the file is flushed and closed even if a write
# fails; the original left the handle open, so buffered rows might never
# reach disk.
with open('submission.csv', 'w') as out:
    out.write('id,sentiment\n')
    for review_id, prediction in zip(test_id, result):
        # Each prediction row holds a single sigmoid output; round it to the
        # nearest integer to obtain the 0/1 class label.
        label = int(np.rint(prediction[0]))
        # %-formatting also copes with ids that are not already strings.
        out.write('%s,%d\n' % (review_id, label))

5000
