In [1]:
seed = 0

import random
import numpy as np
from tensorflow import set_random_seed

random.seed(seed)
np.random.seed(seed)
set_random_seed(seed)

In [2]:
TRAIN_DATA_FILE='data/trainingset.csv'
TEST_DATA_FILE='data/testa.csv'
VALIDATE_DATE_FILE = 'data/validationset.csv'

In [3]:
import pandas as pd

train_only = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)
validation = pd.read_csv(VALIDATE_DATE_FILE)

In [4]:
train = pd.concat([train_only, validation],axis=0,ignore_index=True)

In [5]:
sentiments = train.columns.tolist()[2:]

In [7]:
def punctuation_filter(content):
    return content.replace('\"','').replace('，','').replace('。','').replace('【',
    '').replace('】','').replace('\n','').replace('；','').replace('.','').replace('～','')

import jieba

def cut(content):
    return ' '.join(jieba.cut(content))

    
def punc_cut(content):
    return punctuation_filter(cut(content))

In [15]:
def format_data(train, test, max_features, maxlen):
    """
    Convert data to proper format.
    1) Shuffle
    2) Lowercase
    3) Sentiments to Categorical
    4) Tokenize and Fit
    5) Convert to sequence (format accepted by the network)
    6) Pad
    7) Voila!
    """
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    
    train = train.sample(frac=1).reset_index(drop=True)
    train['content'] = train['content'].apply(lambda x: punc_cut(x))
    test['content'] = test['content'].apply(lambda x: punc_cut(x))

    X = train['content']
    test_X = test['content']

    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(X))

    X = tokenizer.texts_to_sequences(X)
    X = pad_sequences(X, maxlen=maxlen)
    test_X = tokenizer.texts_to_sequences(test_X)
    test_X = pad_sequences(test_X, maxlen=maxlen)

    return X, test_X

In [16]:
maxlen = 400
max_features = 20000

X, test_X = format_data(train, test, max_features, maxlen)

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\Mark\AppData\Local\Temp\jieba.cache
Loading model cost 1.259 seconds.
Prefix dict has been built succesfully.


In [13]:
from keras.layers import Input, Dense, Embedding, Flatten
from keras.layers import SpatialDropout1D
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.models import Sequential

Using TensorFlow backend.


In [14]:
epochs = 5
batch_size = 32

In [None]:
sentiments

In [18]:
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

def new_model():
    model_ = Sequential()

    # Input / Embdedding
    model_.add(Embedding(max_features, 150, input_length=maxlen))

    # CNN
    model_.add(SpatialDropout1D(0.2))

    model_.add(Conv1D(32, kernel_size=3, padding='same', activation='relu'))
    model_.add(MaxPooling1D(pool_size=2))

    model_.add(Conv1D(64, kernel_size=3, padding='same', activation='relu'))
    model_.add(MaxPooling1D(pool_size=2))

    model_.add(Flatten())

    # Output layer
    model_.add(Dense(4, activation='sigmoid'))
    model_.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model_

for senti in sentiments:
    print('model_'+ senti +'.h5')
    Y_i = to_categorical(train[senti].values, 4)
    X_train, X_val, Y_train_i, Y_val_i = train_test_split(X, Y_i, test_size=0.25, random_state=seed)
    model_ = new_model()
    model_.fit(X_train, Y_train_i, validation_data=(X_val, Y_val_i), epochs=epochs, batch_size=batch_size, verbose=2)
    model_.save('model_'+ senti +'.h5')

model_price_cost_effective.h5
Train on 90000 samples, validate on 30000 samples
Epoch 1/5
 - 387s - loss: 0.7225 - acc: 0.7651 - val_loss: 0.7263 - val_acc: 0.7597
Epoch 2/5
 - 398s - loss: 0.7017 - acc: 0.7653 - val_loss: 0.7409 - val_acc: 0.7597
Epoch 3/5
 - 439s - loss: 0.5470 - acc: 0.7937 - val_loss: 0.8873 - val_acc: 0.7153
Epoch 4/5
 - 389s - loss: 0.2940 - acc: 0.8922 - val_loss: 1.2716 - val_acc: 0.6746
Epoch 5/5
 - 389s - loss: 0.1530 - acc: 0.9457 - val_loss: 1.6767 - val_acc: 0.6630
model_price_discount.h5
Train on 90000 samples, validate on 30000 samples
Epoch 1/5
 - 397s - loss: 0.9952 - acc: 0.6142 - val_loss: 1.0014 - val_acc: 0.6073
Epoch 2/5
 - 389s - loss: 0.9783 - acc: 0.6143 - val_loss: 1.0139 - val_acc: 0.6073
Epoch 3/5
 - 399s - loss: 0.8310 - acc: 0.6143 - val_loss: 1.1557 - val_acc: 0.6073
Epoch 4/5
 - 398s - loss: 0.5945 - acc: 0.6143 - val_loss: 1.6645 - val_acc: 0.6073
Epoch 5/5
 - 399s - loss: 0.4696 - acc: 0.6155 - val_loss: 2.1125 - val_acc: 0.6073
model_

In [None]:
sentiments