In [None]:
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.cross_validation import train_test_split
from keras.utils.np_utils import to_categorical
from keras.layers.core import Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential

#----------------------------------------------------------
# Parameters
# Columns read from the CSV: the label column and the free-text column.
KEY = 'gender'
TEXT = 'description'

# Text vectorisation.
NUM_WORDS = 1200          # handed to Tokenizer(num_words = ...)
MAX_LENGTH = 500          # maxlen for pad_sequences and Embedding input_length

# Network dimensions.
OUTPUT_DIM = 64           # Embedding output_dim, also used as the LSTM unit count
HIDDEN_SIZE = 256         # units of the hidden Dense layer

# Training schedule.
EPOCHS = 20
BATCH_SIZE = 100
VALIDATION_SPLIT = 0.15   # validation_split argument of model.fit
#----------------------------------------------------------
# Load the CSV and normalise the two columns used by the rest of the cell.
FILE_PATH = 'gender-classifier-DFE-791531.csv'
label_ids = {'male':0,'female':1,'brand':2,'unknown':3}

raw_data = pd.read_csv(FILE_PATH, encoding = "ISO-8859-1")
# Missing labels are treated as 'unknown'; each label is then replaced by its integer id.
raw_data[KEY] = raw_data[KEY].fillna('unknown').map(label_ids).astype(int)
# Missing descriptions are replaced by the literal text 'NaN'.
raw_data[TEXT] = raw_data[TEXT].fillna('NaN')
print("Raw data loaded successfully...\n")
# Data preparation
# Group the rows by class id (0, 1, 2, 3 in that order) and renumber the index.
# The row order feeds the seeded sample below, so the grouping is kept as is.
data_0 = raw_data[raw_data[KEY] == 0]
data_1 = raw_data[raw_data[KEY] == 1]
data_2 = raw_data[raw_data[KEY] == 2]
data_3 = raw_data[raw_data[KEY] == 3]

raw_data = pd.concat([data_0, data_1, data_2, data_3], axis = 0, ignore_index=True)

# 75 % of the rows (fixed seed) go to training, the remaining rows to testing.
train_data = raw_data.sample(frac = 0.75, random_state = 1)
test_data = raw_data.loc[~raw_data.index.isin(train_data.index)]
assert len(train_data) + len(test_data) == len(raw_data)

train_x = train_data[TEXT]
test_x = test_data[TEXT]

# Fix the one-hot width from the full label column. Without num_classes,
# to_categorical infers the width from each split separately, so a split that
# happens to miss the highest class id would yield a narrower label matrix.
num_classes = int(raw_data[KEY].max()) + 1
train_y = to_categorical(train_data[KEY], num_classes = num_classes)
test_y = to_categorical(test_data[KEY], num_classes = num_classes)

# The vocabulary is fitted on the training texts only.
token = Tokenizer(num_words = NUM_WORDS)
token.fit_on_texts(train_x)

# Convert texts to integer sequences and pad them to MAX_LENGTH.
train_x = sequence.pad_sequences(token.texts_to_sequences(train_x),maxlen = MAX_LENGTH)
test_x = sequence.pad_sequences(token.texts_to_sequences(test_x),maxlen = MAX_LENGTH)
# Model definition
model = Sequential()

# Tokenizer(num_words = NUM_WORDS) only emits indices below NUM_WORDS (0 is the
# padding value), so the embedding table needs NUM_WORDS rows rather than a
# separate hard-coded size.
model.add(Embedding(output_dim = OUTPUT_DIM, input_dim = NUM_WORDS, input_length = MAX_LENGTH))
model.add(Dropout(0.2))
# LSTM
model.add(LSTM(OUTPUT_DIM))
# Hidden layer
model.add(Dense(units = HIDDEN_SIZE, activation = 'relu'))
model.add(Dropout(0.2))
# Output layer: one unit per one-hot label column. Each sample belongs to exactly
# one class, so the outputs are normalised with softmax and trained with
# categorical cross-entropy. Sigmoid + binary cross-entropy would score every
# class as an independent yes/no decision.
model.add(Dense(units = train_y.shape[1], activation = 'softmax'))
model.summary()
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

# The validation fraction is held out from the training arrays by Keras.
train_history = model.fit(x = train_x,y = train_y,validation_split = VALIDATION_SPLIT,epochs= EPOCHS, batch_size = BATCH_SIZE,verbose = 2)

Raw data loaded successfully...

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 500, 64)           256000    
_________________________________________________________________
dropout_5 (Dropout)          (None, 500, 64)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dense_5 (Dense)              (None, 256)               16640     
_________________________________________________________________
dropout_6 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 4)                 1028      
Total params: 306,692
Trainable params: 306,692
Non-trainable params: 0
_____________________________________