# Recurrent Neural Network

<img src="img/RNN-rolled.png"/ width="80px" height="80px">

<img src="img/RNN-unrolled.png"/ width="400px" height="400px">

# LSTM - Long Short Term Memory

<img src="img/LSTM3-chain.png"/ width="800px" height="800px">

_source: http://colah.github.io/posts/2015-08-Understanding-LSTMs_

In [21]:
from keras.optimizers import SGD
from keras.preprocessing.text import one_hot,text_to_word_sequence,base_filter
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping

In [22]:
from sklearn.cross_validation import train_test_split

In [23]:
import os
import pickle
import numpy as np
import re

In [24]:
import pandas as pd

### Dataset: Male & Female blogs

In [25]:
DATA_DIRECTORY = os.path.join('data')
print DATA_DIRECTORY

data


In [26]:
with open(os.path.join(DATA_DIRECTORY,"male_blog_list.txt"),"rb") as male_file:
    male_posts= pickle.load(male_file)
with open(os.path.join(DATA_DIRECTORY,"female_blog_list.txt"),"rb") as female_file:
    female_posts = pickle.load(female_file)

In [27]:
filtered_male_posts = []
filtered_female_posts = []

for post_male in male_posts:
    if len(post_male) == 0:
        continue
    post_male = re.sub('\\n','',post_male)
    filtered_male_posts.append(post_male)

for post_female in female_posts:
    if len(post_female) == 0:
        continue
    post_female = re.sub('\\n','',post_female)
    filtered_female_posts.append(post_female)

In [28]:
all_posts = []

In [29]:
all_posts.extend(filtered_male_posts)
all_posts.extend(filtered_female_posts)

In [30]:
type(all_posts)

list

In [31]:
all_posts[1]

"i'm gonna work on my around the world today and hopefully get half of the paper done. Then i am going to the 20 hour famine at my church."

In [32]:
len(all_posts),len(filtered_male_posts),len(filtered_female_posts)

(4842, 2595, 2247)

### Create Labels

In [33]:
# 0 for male, 1 for female
concatenate_array_rnn = np.concatenate((np.zeros(len(filtered_male_posts)),np.ones(len(filtered_female_posts))))

In [34]:
char_list = list(set(''.join(all_posts)))

### Build dict of character to index, index to character

In [35]:
char_indices = dict((c, i) for i, c in enumerate(char_list))
indices_char = dict((i, c) for i, c in enumerate(char_list))


### Build dict of label to index, index to label

In [37]:
label_indices = {'male':0,'female':1}
indices_label = {0:'male',1:'female'}

In [38]:
MAX_LENGTH = 0
i = 0
MAX_INDEX = 0
for i,n in enumerate(all_posts):
    
    if len(n) > MAX_LENGTH:
        MAX_LENGTH = len(n)
        MAX_INDEX = i
        
print(MAX_LENGTH,MAX_INDEX)

(38794, 227)


In [39]:
MAX_LENGTH = 5000

In [40]:
def blog_to_char_seq(blog):
    blog_chars = list(blog)
    blog_chars_indices = list(map(lambda char: char_indices[char], blog_chars))
    return sequence.pad_sequences([blog_chars_indices], maxlen=MAX_LENGTH)[0]

In [41]:
X = []
y = []

for n, l in zip(all_posts, concatenate_array_rnn):
    X.append(blog_to_char_seq(n))
    y.append(l)
    
X = np.array(X).astype(np.uint8)
y = np.array(y)

print(X.shape, y.shape)

((4842, 5000), (4842,))


In [42]:
y

array([ 0.,  0.,  0., ...,  1.,  1.,  1.])

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [44]:
len(char_list)

170

## A simple LSTM model

In [46]:
model = Sequential()
model.add(Embedding(len(char_list), 32, input_length=MAX_LENGTH, mask_zero=True))
model.add(LSTM(32, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))

In [47]:
model.compile(loss='binary_crossentropy',optimizer='sgd', metrics=["accuracy"])

In [None]:
model.fit(X_train,y_train,
          batch_size=32,nb_epoch=10,
          validation_split=0.1,
          verbose=1)

Train on 3921 samples, validate on 436 samples
Epoch 1/10

In [44]:
model.evaluate(x_test_rnn,y_test_rnn,batch_size=32)



[0.25045589757068515, 0.58036677396284597]

In [47]:
predicted_output = model.predict(x_test_rnn,batch_size=32)
predicted_classes = model.predict_classes(x_test_rnn, batch_size=32)



In [48]:
df = pd.DataFrame(columns=['predicted','actual'])

In [49]:
df['predicted_class'] = predicted_classes.flatten()
df['predicted'] = predicted_output.flatten()

In [50]:
df['actual'] = y_test_rnn

In [51]:
df.predicted_class.value_counts()

0    927
Name: predicted_class, dtype: int64

In [35]:
df.actual.value_counts()

0    509
1    418
Name: actual, dtype: int64

In [48]:
x_train_rnn.shape

(3706, 100)