In [1]:
# Single seed value shared by every random number generator this notebook touches.
seed = 0

import random
import numpy as np
from tensorflow import set_random_seed

# Apply the same seed to Python's `random`, NumPy and TensorFlow, in that order.
for _seed_fn in (random.seed, np.random.seed, set_random_seed):
    _seed_fn(seed)

In [2]:
# Locations of the three CSV splits, relative to the notebook's working directory.
TRAIN_DATA_FILE, TEST_DATA_FILE, VALIDATE_DATE_FILE = (
    'data/trainingset.csv',
    'data/testa.csv',
    'data/validationset.csv',
)

In [7]:
import pandas as pd

# Read the training, test and validation CSVs (in that order) into DataFrames.
train_only, test, validation = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_DATA_FILE, TEST_DATA_FILE, VALIDATE_DATE_FILE)
)

In [8]:
# Stack the validation rows underneath the training rows and renumber the index from 0.
train = pd.concat((train_only, validation), ignore_index=True)

In [13]:
# Every column from the third one onward is treated as a label column.
sentiments = list(train.columns[2:])

In [17]:
def punctuation_filter(content):
    """Return ``content`` with a fixed set of punctuation characters removed.

    The characters deleted are the double quote, the full-width comma, full
    stop, semicolon and tilde, the 【 】 bracket pair, the ASCII period and the
    newline. Every other character is left untouched.
    """
    for unwanted in ('"', '，', '。', '【', '】', '\n', '；', '.', '～'):
        content = content.replace(unwanted, '')
    return content

import jieba

def cut(content):
    """Segment ``content`` with jieba and return the pieces joined by single spaces."""
    pieces = jieba.cut(content)
    return ' '.join(pieces)

    
def punc_cut(content):
    """Segment ``content`` with ``cut`` and then strip punctuation from the result."""
    segmented = cut(content)
    return punctuation_filter(segmented)

In [47]:
def format_data(train, test, max_features, maxlen, y_indexes, *, shuffle=False):
    """
    Turn the raw review frames into padded integer sequences for the network.

    Steps:
    1) Optionally shuffle the training rows (see ``shuffle``)
    2) Segment the text and strip punctuation with ``punc_cut``
    3) One-hot encode the first label column named in ``y_indexes``
    4) Fit a Keras ``Tokenizer`` on the training text only
    5) Convert training and test text to integer sequences
    6) Pad / truncate every sequence to ``maxlen``

    Neither input frame is modified: the cleaned text is kept in local
    variables instead of being written back to the ``content`` column, so the
    function can be re-run on the same frames without segmenting text twice.

    Parameters
    ----------
    train, test : DataFrame
        Frames with a ``content`` text column; ``train`` must also contain the
        column ``y_indexes[0]``.
    max_features : int
        Passed to ``Tokenizer`` as ``num_words``.
    maxlen : int
        Length every sequence is padded or truncated to.
    y_indexes : sequence of str
        Label column names; only the first one is used to build ``Y``.
    shuffle : bool, keyword-only, default False
        When true, shuffle the training rows before encoding. It is off by
        default because the returned arrays carry no index: shuffling here
        makes ``X`` impossible to line up with labels read from the caller's
        ``train`` frame afterwards. Shuffle at split time instead.

    Returns
    -------
    (X, Y, test_X)
        Padded training sequences, one-hot labels for ``y_indexes[0]`` and
        padded test sequences. ``X`` and ``Y`` are row-aligned with each
        other, and with ``train`` when ``shuffle`` is false.
    """
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from keras.utils import to_categorical

    if shuffle:
        train = train.sample(frac=1).reset_index(drop=True)

    # Derive cleaned Series without assigning them back into the frames.
    X = train['content'].apply(punc_cut)
    test_X = test['content'].apply(punc_cut)

    # NOTE(review): no num_classes is given, so the width of Y is inferred from
    # the largest label value -- confirm this matches the label encoding.
    Y = to_categorical(train[y_indexes[0]].values)

    # The vocabulary comes from the training text only; the test text is
    # encoded with that same vocabulary.
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(X))

    X = pad_sequences(tokenizer.texts_to_sequences(X), maxlen=maxlen)
    test_X = pad_sequences(tokenizer.texts_to_sequences(test_X), maxlen=maxlen)

    return X, Y, test_X

In [20]:
# Padded sequence length and the vocabulary size handed to the tokenizer.
maxlen = 400
max_features = 20000

# format_data accepts exactly five positional arguments:
# (train, test, max_features, maxlen, label column names).
X, Y, test_X = format_data(train, test, max_features, maxlen, sentiments)

Using TensorFlow backend.
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Mark\AppData\Local\Temp\jieba.cache
Loading model cost 0.932 seconds.
Prefix dict has been built succesfully.


In [43]:
# Display the label column names collected from the training frame.
sentiments

['location_traffic_convenience',
 'location_distance_from_business_district',
 'location_easy_to_find',
 'service_wait_time',
 'service_waiters_attitude',
 'service_parking_convenience',
 'service_serving_speed',
 'price_level',
 'price_cost_effective',
 'price_discount',
 'environment_decoration',
 'environment_noise',
 'environment_space',
 'environment_cleaness',
 'dish_portion',
 'dish_taste',
 'dish_look',
 'dish_recommendation',
 'others_overall_experience',
 'others_willing_to_consume_again']

In [84]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for validation; random_state=seed makes the split repeatable.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.25, random_state=seed)

In [85]:
from keras.layers import Input, Dense, Embedding, Flatten
from keras.layers import SpatialDropout1D
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.models import Sequential

In [89]:
# Baseline network: embedding -> spatial dropout -> two conv/pool stages -> dense output.
model = Sequential([
    # Input / Embedding
    Embedding(max_features, 150, input_length=maxlen),

    # CNN
    SpatialDropout1D(0.2),
    Conv1D(32, kernel_size=3, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(64, kernel_size=3, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),

    # Output layer
    Dense(2, activation='sigmoid'),
])

In [90]:
# Training schedule: number of passes over the data and samples per gradient step.
epochs, batch_size = 5, 32

In [140]:
def new_model(n_classes=4):
    """Build and compile a fresh text CNN classifier.

    Architecture: embedding (150 dimensions, vocabulary ``max_features``,
    input length ``maxlen``) -> spatial dropout (0.2) -> two Conv1D/MaxPooling1D
    stages (32 then 64 filters, kernel size 3) -> flatten -> dense output.

    Parameters
    ----------
    n_classes : int, default 4
        Number of units in the output layer, i.e. the width of the one-hot
        label vectors the model is trained on.

    Returns
    -------
    A compiled ``Sequential`` model using the Adam optimizer, categorical
    cross-entropy loss and the accuracy metric.
    """
    model_ = Sequential()

    # Input / Embedding
    model_.add(Embedding(max_features, 150, input_length=maxlen))

    # CNN
    model_.add(SpatialDropout1D(0.2))

    model_.add(Conv1D(32, kernel_size=3, padding='same', activation='relu'))
    model_.add(MaxPooling1D(pool_size=2))

    model_.add(Conv1D(64, kernel_size=3, padding='same', activation='relu'))
    model_.add(MaxPooling1D(pool_size=2))

    model_.add(Flatten())

    # Output layer: the classes are mutually exclusive and the loss is
    # categorical cross-entropy, so the outputs must form a probability
    # distribution -- softmax, not independent sigmoids.
    model_.add(Dense(n_classes, activation='softmax'))
    model_.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model_

In [None]:
# Imported here so the cell runs on a fresh kernel; no earlier cell imports it
# at notebook scope.
from keras.utils import to_categorical

# Train and save one classifier per label column.
# Labels are read from `train` in its stored row order, so `X` must have been
# built from `train` without reordering its rows.
for senti in sentiments:
    model_path = 'model_' + senti + '.h5'
    print(model_path)
    Y_i = to_categorical(train[senti].values, 4)
    X_train, X_val, Y_train_i, Y_val_i = train_test_split(X, Y_i, test_size=0.25, random_state=seed)
    # Keep the network under its own name: assigning it to `new_model` would
    # replace the factory function and make the next iteration fail.
    senti_model = new_model()
    senti_model.fit(X_train, Y_train_i, validation_data=(X_val, Y_val_i), epochs=epochs, batch_size=batch_size, verbose=2)
    senti_model.save(model_path)

model_location_traffic_convenience.h5
Train on 90000 samples, validate on 30000 samples
Epoch 1/5
