# Deep learning
This notebook is responsible for implementing a recurrent neural network using TensorFlow.

## Database credentials

In [2]:
#Empty placeholders, overwritten from the credentials file below
db_host = "localhost"
db_user = db_pass = db_name = ""
#The file is read as three lines: user, password, database name
with open("database_credentials.txt") as cred_file:
    db_user, db_pass, db_name = [cred_file.readline().strip() for _ in range(3)]

## Dataframe-ize tweets

In [3]:
import pymysql as pms
import numpy as np
import pandas as pd

In [4]:
#Bind the handle before the try block so the cleanup in `finally` never hits an
#unbound name when pms.connect() itself raises (which would hide the real error)
con = None
try:
    con = pms.connect(host=db_host, user=db_user, passwd=db_pass, db=db_name)
    #Load every row of the search_tweets table into a dataframe
    df = pd.read_sql("""SELECT * FROM search_tweets""", con)
finally:
    #Close the connection whether or not the query succeeded
    if con is not None:
        con.close()
df.head()

Unnamed: 0,id,message
0,1,Great meeting with @Cabinet at the @WhiteHouse...
1,2,Looking forward to 3:30 P.M. meeting today at ...
2,3,"Lowest rated Oscars in HISTORY. Problem is, we..."
3,4,"JOBS, JOBS, JOBS! #MAGA"
4,5,The U.S. is acting swiftly on Intellectual Pro...


## Data exploration

In [5]:
#Every tweet concatenated into one space-delimited string
char_list = " ".join(list(df["message"]))
tweet_count = df.shape[0]

#Distinct tokens obtained by splitting on single spaces
distinct_tokens = set(char_list.split(" "))
print("Unique space-separated character orderings: {}".format(len(distinct_tokens)))
print("Tweets: {}".format(tweet_count))

#Sentence count is approximated by counting '.', '?' and '!' characters
terminator_total = sum(char_list.count(mark) for mark in (".", "?", "!"))
print("Average sentences per tweet: {}".format(terminator_total / float(tweet_count)))

Unique space-separated character orderings: 12837
Tweets: 2889
Average sentences per tweet: 2.7608168916580134


## Data preprocessing
### Lookup table
In order to create a word embedding, the words used in the tweets need to be transformed to IDs. The 2 way mapping from words->IDs and IDs->words is generated below.

In [6]:
from string import punctuation
from collections import Counter

#string.punctuation with the '#' character left out
punc = "".join(symbol for symbol in punctuation if symbol != "#")

In [7]:
sample_text = "here's some sample text. hopefully this\nworks? ok - time to give it a shot!!"
def get_lookup_tables(text):
    """
    Gets the lookup tables mapping character orderings to their IDs and vice-versa.
    IDs are consecutive integers starting at 0, assigned in order of first appearance.
    :param text: Text to create lookup tables from. Either one big string, which is
        split on whitespace, or an iterable of ready-made tokens (e.g. a list)
    :return: A tuple of maps (vocab_to_int, int_to_vocab)
    """
    #isinstance also accepts str subclasses, which would otherwise be iterated
    #character by character instead of being split into words
    if isinstance(text, str):
        text = text.split()

    #Create mappings; a token receives the next free ID the first time it is seen
    vocab_to_int = {}
    int_to_vocab = {}
    for token in text:
        if token not in vocab_to_int:
            token_id = len(vocab_to_int)
            vocab_to_int[token] = token_id
            int_to_vocab[token_id] = token
    return (vocab_to_int, int_to_vocab)
#get_lookup_tables(sample_text)

### Punctuation tokenizing
Splitting on spaces breaks the tweets up word by word. However, punctuation makes it difficult for a neural network to distinguish between "dream" and "dream!". The required tokenization mechanism, which maps each punctuation character to a word-like token, is defined below.

With this mapping mechanism, the dictionary will be used to tokenize the symbols, replacing each character with a space-prefixed token and thereby making the character its own word. When punctuation marks act as their own words, the neural network can more easily incorporate them into its produced language.

In [8]:
#Consider adding possessive/abbreviation for punctuation ... "'"
#Symbols and token names are paired by position, so both lists must stay aligned
rnn_punctuation = [".", ",", "\"", ";", "!", "?", "(", ")", "-", "\n", "|"]
rnn_punctuation_words = [
    " ~" + label.upper() + "~"
    for label in ["period", "comma", "quotation", "semicolon", "exclamation",
                  "question", "leftparen", "rightparen", "hyphen", "newline", "pipe"]
]

#Symbol -> space-prefixed token, e.g. "." -> " ~PERIOD~"
rnn_punctuation_map = dict(zip(rnn_punctuation, rnn_punctuation_words))
rnn_punctuation_map

{'\n': ' ~NEWLINE~',
 '!': ' ~EXCLAMATION~',
 '"': ' ~QUOTATION~',
 '(': ' ~LEFTPAREN~',
 ')': ' ~RIGHTPAREN~',
 ',': ' ~COMMA~',
 '-': ' ~HYPHEN~',
 '.': ' ~PERIOD~',
 ';': ' ~SEMICOLON~',
 '?': ' ~QUESTION~',
 '|': ' ~PIPE~'}

## Transform data

In [9]:
#Join all text and tokenize the punctuation
puncd_text = " ".join([m for m in df["message"]])
#Swap each mapped punctuation character for its token in a single pass; str.join
#avoids re-growing the result string once per character
filtered_text = "".join(rnn_punctuation_map.get(ch, ch) for ch in puncd_text)

#Create the lookup tables
vocab_to_int, int_to_vocab = get_lookup_tables(filtered_text)

#all_text representation by IDs
int_text = [vocab_to_int[vocab] for vocab in filtered_text.split()]

## Building the RNN
### Checking TensorFlow
I'm having difficulty with linking tensorflow-gpu to a CUDA .dll. Because of this I'm just going to run with CPU TensorFlow right now, since the main point of this project is not GPU computing.

In [10]:
from distutils.version import LooseVersion
import warnings
import tensorflow as tf

#Check tensorflow version
installed_version = LooseVersion(tf.__version__)
assert LooseVersion("1.0") <= installed_version, "Please use TensorFlow version 1.0 or newer"
print("TensorFlow Version: {}".format(tf.__version__))

#Check for GPU: an empty device name means no GPU is visible to TensorFlow
if tf.test.gpu_device_name():
    print("Default GPU device: {}".format(tf.test.gpu_device_name()))
else:
    warnings.warn("No GPU found. Please use a GPU to train your nerual network.")

#Output available devices
from tensorflow.python.client import device_lib
devices = device_lib.list_local_devices()
for device in devices:
    print(device.name)

TensorFlow Version: 1.2.1
/cpu:0


  # This is added back by InteractiveShellApp.init_path()


### Input
Create TensorFlow placeholders for the neural network for the input text, targets, and learning rate.

In [11]:
def get_inputs():
    """
    Creates the TensorFlow placeholders that feed the RNN.
    :return: Tuple (input, targets, learning rate). The first two are int32
        placeholders with two unspecified dimensions; the last is a float32 placeholder
    """
    input_placeholder = tf.placeholder(tf.int32, shape=[None, None], name="input")
    targets_placeholder = tf.placeholder(tf.int32, shape=[None, None], name="targets")
    lr_placeholder = tf.placeholder(tf.float32, name="lr")
    return (input_placeholder, targets_placeholder, lr_placeholder)

### LSTM Cell and RNN Size
Stack one or more long short-term memory (LSTM) cells using TensorFlow's BasicLSTMCell and MultiRNNCell classes.

In [12]:
def get_init_cell(batch_size, rnn_size, lstm_size=256):
    """
    Creates a stacked LSTM cell and its zero initial state.
    :param batch_size: Size of input batches (may be a scalar tensor)
    :param rnn_size: Number of LSTM cells stacked inside the MultiRNNCell
    :param lstm_size: Number of units in each LSTM cell (defaults to 256)
    :return: Tuple (cell, initial state); the state tensor is named "initial_state"
    """
    #Build a separate cell object for every layer. Repeating one instance
    #([lstm] * rnn_size) places the same object in each layer, so the layers are
    #not independent once rnn_size is greater than 1.
    layers = [tf.contrib.rnn.BasicLSTMCell(lstm_size) for _ in range(rnn_size)]
    cell = tf.contrib.rnn.MultiRNNCell(layers)
    #tf.identity gives the zero state a fixed name so it can be fed and fetched by name
    initial_state = tf.identity(cell.zero_state(batch_size, tf.float32), name="initial_state")
    return (cell, initial_state)

### Word embedding
Apply the word embedding to the input data.

In [13]:
def get_embed(input_data, vocab_size, embed_dim):
    """
    Looks up the embedding vectors for input_data.
    :param input_data: TensorFlow placeholder for text input
    :param vocab_size: Number of words in vocabulary (rows of the embedding matrix)
    :param embed_dim: Number of embedding dimensions (columns of the embedding matrix)
    :return: Embedded input
    """
    #Embedding matrix starts out uniformly random in [-1, 1)
    initial_values = tf.random_uniform((vocab_size, embed_dim), -1, 1)
    embedding_matrix = tf.Variable(initial_values)
    return tf.nn.embedding_lookup(embedding_matrix, input_data)

### Build the RNN
The single RNN cell has already been designed. Now design the recurrent network.

In [14]:
def build_rnn(cell, inputs):
    """
    Unrolls the RNN cell over the inputs with tf.nn.dynamic_rnn.
    :param cell: RNN cell
    :param inputs: Input text data
    :return: Tuple (outputs, final state); the final state tensor is named "final_state"
    """
    rnn_outputs, last_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
    #Name the final state so it can be fetched by name
    return rnn_outputs, tf.identity(last_state, name="final_state")

### Build the entire Neural Network

In [15]:
def build_nn(cell, rnn_size, input_data, vocab_size, embed_dim):
    """
    Chains embedding lookup, the RNN and a linear output layer.
    :param cell: RNN cell
    :param rnn_size: Size of RNN (accepted for interface compatibility; not used here)
    :param input_data: Input data
    :param vocab_size: Vocabulary size
    :param embed_dim: Number of embedding dimensions
    :return: Tuple (logits, final_state)
    """
    embedded_input = get_embed(input_data, vocab_size, embed_dim)
    rnn_outputs, final_state = build_rnn(cell, embedded_input)
    #activation_fn=None keeps the layer linear, producing one raw score per vocabulary entry
    logits = tf.contrib.layers.fully_connected(rnn_outputs, vocab_size, activation_fn=None)
    return logits, final_state

### Batching
Implement a batching function to create batches of input and targets. The batches should be a NumPy array with the shape (number of batches, 2, batch size, sequence length).

In [16]:
def get_batches(int_text, batch_size, seq_length):
    """
    Return batches of input and target
    :param int_text: Text with the words replaced by their IDs
    :param batch_size: The size of the batch
    :param seq_length: The length of the sequence
    :return: Batches as a NumPy array of shape
        (number of batches, 2, batch size, sequence length); index 0 on the second
        axis holds the inputs and index 1 the targets
    :raises ValueError: If int_text holds fewer than batch_size * seq_length IDs
    """
    words_per_batch = batch_size * seq_length
    num_batches = len(int_text) // words_per_batch
    if num_batches == 0:
        raise ValueError(
            "int_text has {} IDs but at least {} are needed for a single batch".format(
                len(int_text), words_per_batch))

    #Truncate input so it divides evenly into full batches
    inputs = np.array(int_text[:num_batches * words_per_batch])
    #Each target is the ID that follows its input; the last one wraps to the first ID
    targets = np.roll(inputs, -1)

    def to_batches(flat):
        #Row b of batch j covers the slice starting at
        #b * (seq_length * num_batches) + j * seq_length, so consecutive batches
        #continue each row's sequence
        return flat.reshape(batch_size, num_batches, seq_length).transpose(1, 0, 2)

    return np.stack([to_batches(inputs), to_batches(targets)], axis=1)

## Training

In [17]:
#Number of full passes over the batches in the training loop
num_epochs = 25
#Batch size handed to get_batches
batch_size = 256
#Passed to get_init_cell, where it sets how many LSTM cells are stacked
rnn_size = 1
#Width of each word-embedding vector
embed_dim = 500
#Number of word IDs per training sequence
seq_length = 15
#Value fed to the "lr" placeholder used by the optimizer
learning_rate = .01
#Reporting interval; only referenced by the disabled logging block in the training cell
show_every_n_batches = 25
#Path handed to the Saver when the trained model is written out
save_dir = './save'

### Building the TensorFlow graph

In [18]:
from tensorflow.contrib import seq2seq

#Build everything inside a dedicated graph so the training session can be bound to it
train_graph = tf.Graph()
with train_graph.as_default():
    vocab_size = len(int_to_vocab)
    input_text, targets, lr = get_inputs()
    #Dynamic shape of the fed input; its first entry is used as the batch size below
    input_data_shape = tf.shape(input_text)
    cell, initial_state = get_init_cell(input_data_shape[0], rnn_size)
    logits, final_state = build_nn(cell, rnn_size, input_text, vocab_size, embed_dim)
    
    #Probabilities for generating words
    probs = tf.nn.softmax(logits, name="probs")
    
    #Loss function; the all-ones weight tensor gives every position the same weight
    cost = seq2seq.sequence_loss(
        logits,
        targets,
        tf.ones([input_data_shape[0], input_data_shape[1]])
    )
    
    #Optimizer
    optimizer = tf.train.AdamOptimizer(lr)
    
    #Gradient clipping: each gradient value is limited to [-1, 1]; variables that
    #have no gradient (None) are skipped
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad,var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)

## Train
Train the neural network on the preprocessed data.

In [19]:
#Pre-compute all (input, target) batches once; they are reused in every epoch
batches = get_batches(int_text, batch_size, seq_length)

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch_i in range(num_epochs):
        #Start each epoch from the zero state, sized from the first batch's inputs
        state = sess.run(initial_state, {input_text: batches[0][0]})
        
        for batch_i, (x,y) in enumerate(batches):
            #Feed the previous batch's final state back in as this batch's initial state
            feed = {
                input_text:x,
                targets: y,
                initial_state: state,
                lr: learning_rate
            }
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed)
            
            #Report the loss after every batch. The string literal that follows is a
            #disabled variant that would only report every show_every_n_batches batches.
            print("Epoch {:>3} Batch {:>4}/{}\t train_loss = {:.3f}".format(
                    epoch_i,
                    batch_i,
                    len(batches),
                    train_loss
                ))
            """if (epoch_i * len(batches) + batch_i) % show_every_n_batches == 0:
                print("Epoch {:>3} Batch {:>4}/{}\t train_loss = {:.3f}".format(
                    epoch_i,
                    batch_i,
                    len(batches),
                    train_loss
                ))
                """
    #Save the trained variables; save_dir is passed to the Saver as the save path
    saver = tf.train.Saver()
    saver.save(sess, save_dir)
    print("Model trained and saved")

Epoch   0 Batch    0/20	 train_loss = 9.167
Epoch   0 Batch    1/20	 train_loss = 9.048
Epoch   0 Batch    2/20	 train_loss = 8.186
Epoch   0 Batch    3/20	 train_loss = 7.467
Epoch   0 Batch    4/20	 train_loss = 7.523
Epoch   0 Batch    5/20	 train_loss = 7.527
Epoch   0 Batch    6/20	 train_loss = 7.412
Epoch   0 Batch    7/20	 train_loss = 7.406
Epoch   0 Batch    8/20	 train_loss = 7.345
Epoch   0 Batch    9/20	 train_loss = 7.260
Epoch   0 Batch   10/20	 train_loss = 7.225
Epoch   0 Batch   11/20	 train_loss = 6.982
Epoch   0 Batch   12/20	 train_loss = 7.047
Epoch   0 Batch   13/20	 train_loss = 6.948
Epoch   0 Batch   14/20	 train_loss = 6.855
Epoch   0 Batch   15/20	 train_loss = 6.836
Epoch   0 Batch   16/20	 train_loss = 6.658
Epoch   0 Batch   17/20	 train_loss = 6.576
Epoch   0 Batch   18/20	 train_loss = 6.665
Epoch   0 Batch   19/20	 train_loss = 6.548
Epoch   1 Batch    0/20	 train_loss = 6.177
Epoch   1 Batch    1/20	 train_loss = 6.142
Epoch   1 Batch    2/20	 train_l

Epoch   9 Batch    7/20	 train_loss = 2.334
Epoch   9 Batch    8/20	 train_loss = 2.325
Epoch   9 Batch    9/20	 train_loss = 2.260
Epoch   9 Batch   10/20	 train_loss = 2.228
Epoch   9 Batch   11/20	 train_loss = 2.248
Epoch   9 Batch   12/20	 train_loss = 2.213
Epoch   9 Batch   13/20	 train_loss = 2.223
Epoch   9 Batch   14/20	 train_loss = 2.174
Epoch   9 Batch   15/20	 train_loss = 2.140
Epoch   9 Batch   16/20	 train_loss = 2.183
Epoch   9 Batch   17/20	 train_loss = 2.152
Epoch   9 Batch   18/20	 train_loss = 2.102
Epoch   9 Batch   19/20	 train_loss = 2.184
Epoch  10 Batch    0/20	 train_loss = 2.094
Epoch  10 Batch    1/20	 train_loss = 2.097
Epoch  10 Batch    2/20	 train_loss = 2.140
Epoch  10 Batch    3/20	 train_loss = 2.137
Epoch  10 Batch    4/20	 train_loss = 2.096
Epoch  10 Batch    5/20	 train_loss = 2.124
Epoch  10 Batch    6/20	 train_loss = 2.136
Epoch  10 Batch    7/20	 train_loss = 2.119
Epoch  10 Batch    8/20	 train_loss = 2.103
Epoch  10 Batch    9/20	 train_l

Epoch  18 Batch   14/20	 train_loss = 0.943
Epoch  18 Batch   15/20	 train_loss = 0.909
Epoch  18 Batch   16/20	 train_loss = 0.938
Epoch  18 Batch   17/20	 train_loss = 0.953
Epoch  18 Batch   18/20	 train_loss = 0.863
Epoch  18 Batch   19/20	 train_loss = 0.924
Epoch  19 Batch    0/20	 train_loss = 0.878
Epoch  19 Batch    1/20	 train_loss = 0.921
Epoch  19 Batch    2/20	 train_loss = 0.947
Epoch  19 Batch    3/20	 train_loss = 0.927
Epoch  19 Batch    4/20	 train_loss = 0.896
Epoch  19 Batch    5/20	 train_loss = 0.922
Epoch  19 Batch    6/20	 train_loss = 0.966
Epoch  19 Batch    7/20	 train_loss = 0.925
Epoch  19 Batch    8/20	 train_loss = 0.887
Epoch  19 Batch    9/20	 train_loss = 0.876
Epoch  19 Batch   10/20	 train_loss = 0.857
Epoch  19 Batch   11/20	 train_loss = 0.896
Epoch  19 Batch   12/20	 train_loss = 0.896
Epoch  19 Batch   13/20	 train_loss = 0.899
Epoch  19 Batch   14/20	 train_loss = 0.868
Epoch  19 Batch   15/20	 train_loss = 0.828
Epoch  19 Batch   16/20	 train_l