# Deep learning
This notebook is responsible for implementing a recurrent neural network using TensorFlow.

## Database credentials

In [1]:
# Connection settings. The host is fixed; the remaining values start out
# empty and are filled in from the credentials file below.
db_host = "localhost"
db_user, db_pass, db_name = "", "", ""

# database_credentials.txt is read line by line, in this order:
# user, password, database name. Surrounding whitespace is stripped.
with open("database_credentials.txt") as credentials_file:
    db_user, db_pass, db_name = [
        credentials_file.readline().strip() for _ in range(3)
    ]

## Dataframe-ize tweets

In [2]:
import pymysql as pms
import numpy as np
import pandas as pd

In [3]:
# Read every row of the search_tweets table into a DataFrame.
# `con` is bound before the try block so that the cleanup in `finally`
# never hits an unbound name when pms.connect() itself raises; the
# original connection error then propagates instead of a NameError.
con = None
try:
    con = pms.connect(host=db_host, user=db_user, passwd=db_pass, db=db_name)
    df = pd.read_sql("""SELECT * FROM search_tweets""", con)
finally:
    # Close the connection whether or not the query succeeded.
    if con is not None:
        con.close()
df.head()

Unnamed: 0,id,message
0,1,Great meeting with @Cabinet at the @WhiteHouse...
1,2,Looking forward to 3:30 P.M. meeting today at ...
2,3,"Lowest rated Oscars in HISTORY. Problem is, we..."
3,4,"JOBS, JOBS, JOBS! #MAGA"
4,5,The U.S. is acting swiftly on Intellectual Pro...


## Data exploration

In [4]:
# Every tweet message concatenated into one string, separated by single spaces
char_list = " ".join(df["message"])

# Distinct tokens obtained by splitting the joined text on single spaces
print("Unique space-separated character orderings: {}".format(
    len(set(char_list.split(" ")))))
print("Tweets: {}".format(len(df.index)))

# Sentence count is approximated by counting '.', '?' and '!' characters
print("Average sentences per tweet: {}".format(
    sum(char_list.count(mark) for mark in ".?!") / float(df.shape[0])))

Unique space-separated character orderings: 12837
Tweets: 2889
Average sentences per tweet: 2.7608168916580134


## Data preprocessing
### Lookup table
In order to create a word embedding, the words used in the tweets need to be transformed to IDs. The 2 way mapping from words->IDs and IDs->words is generated below.

In [5]:
from string import punctuation
from collections import Counter

In [6]:
sample_text = "here's some sample text. hopefully this\nworks? ok - time to give it a shot!!"
def get_lookup_tables(text):
    """
    Gets the lookup tables mapping character orderings to their IDs and vice-versa.

    IDs are assigned in order of first appearance, starting at 0, so the
    result is deterministic for a given input.

    :param text: Text to create lookup tables from. Either one big string
        (split on whitespace into words) or an iterable of already-split
        words. None is treated as empty input.
    :return: A tuple of maps (vocab_to_int, int_to_vocab)
    """
    if text is None:
        text = ()
    #If passed in text as big string, words are separated by whitespace
    elif isinstance(text, str):
        text = text.split()
    #Any other iterable (e.g. a list of words) is used as-is

    #Create mappings: each new word gets the next free ID
    vocab_to_int = {}
    int_to_vocab = {}
    for word in text:
        if word not in vocab_to_int:
            word_id = len(vocab_to_int)
            vocab_to_int[word] = word_id
            int_to_vocab[word_id] = word
    return (vocab_to_int, int_to_vocab)
#get_lookup_tables(sample_text)

### Punctuation tokenizing
Splitting on spaces breaks the tweets up word by word. However, punctuation makes it difficult for a neural network to distinguish between "dream" and "dream!". The required tokenization mechanism, which maps punctuation characters to word-like tokens, is built below.

With this mapping mechanism, the dictionary will be used to tokenize the symbols and add a space around each one, making the character its own word. When punctuation marks act as their own words, the neural network can more easily incorporate them into its produced language.

In [7]:
#TODO: consider adding a token for the possessive/abbreviation mark "'"
# Punctuation characters that get their own token, paired positionally
# with the names below.
rnn_punctuation = [".", ",", "\"", ";", "!", "?", "(", ")", "-", "\n", "|"]

# Each name is upper-cased and wrapped in tildes, e.g. "period" -> "~PERIOD~"
rnn_punctuation_words = [
    "~{}~".format(label.upper())
    for label in ("period", "comma", "quotation", "semicolon", "exclamation",
                  "question", "leftparen", "rightparen", "hyphen", "newline", "pipe")
]

# punctuation character -> replacement token
rnn_punctuation_map = dict(zip(rnn_punctuation, rnn_punctuation_words))
rnn_punctuation_map

{'\n': '~NEWLINE~',
 '!': '~EXCLAMATION~',
 '"': '~QUOTATION~',
 '(': '~LEFTPAREN~',
 ')': '~RIGHTPAREN~',
 ',': '~COMMA~',
 '-': '~HYPHEN~',
 '.': '~PERIOD~',
 ';': '~SEMICOLON~',
 '?': '~QUESTION~',
 '|': '~PIPE~'}

## Building the RNN
### Checking TensorFlow
I'm having difficulty with linking tensorflow-gpu to a CUDA .dll. Because of this I'm just going to run with CPU TensorFlow right now, since the main point of this project is not GPU computing.

In [8]:
from distutils.version import LooseVersion
import warnings
import tensorflow as tf

#Require TensorFlow 1.0 or newer, then report the installed version
assert LooseVersion(tf.__version__) >= LooseVersion("1.0"), "Please use TensorFlow version 1.0 or newer"
print("TensorFlow Version: {}".format(tf.__version__))

#Report the default GPU device, or warn when there is none
gpu_device = tf.test.gpu_device_name()
if gpu_device:
    print("Default GPU device: {}".format(gpu_device))
else:
    warnings.warn("No GPU found. Please use a GPU to train your nerual network.")

#List the name of every device TensorFlow can see locally
from tensorflow.python.client import device_lib
devices = device_lib.list_local_devices()
for device in devices:
    print(device.name)

TensorFlow Version: 1.2.1
/cpu:0


  # This is added back by InteractiveShellApp.init_path()


### Input
Create TensorFlow placeholders for the neural network for the input text, targets, and learning rate.

In [9]:
def get_inputs():
    """
    Creates the TensorFlow placeholders fed to the RNN.
    :return: Tuple (input, targets, learning rate). The input and targets
        placeholders are int32 with two unspecified dimensions; the
        learning rate placeholder is float32.
    """
    input_text = tf.placeholder(tf.int32, shape=[None, None], name="input")
    target_text = tf.placeholder(tf.int32, shape=[None, None], name="targets")
    learning_rate = tf.placeholder(tf.float32, name="learningrate")
    return (input_text, target_text, learning_rate)

### LSTM Cell and RNN Size
Stack one or more long short-term memory (LSTM) cells using TensorFlow's BasicLSTMCell and MultiRNNCell classes.

In [11]:
def get_init_cell(batch_size, rnn_size, lstm_size=256):
    """
    Creates a stacked LSTM cell and its zero initial state.
    :param batch_size: Size of input batches
    :param rnn_size: Number of LSTM layers stacked inside the MultiRNNCell
    :param lstm_size: Number of units in each LSTM layer (defaults to 256)
    :return: Tuple (cell, initial state), where the initial state is an
        identity op named "initial_state"
    """
    #Build a separate BasicLSTMCell per layer. Repeating a single cell
    #object ([lstm] * rnn_size) would hand the very same instance to every
    #layer, which TensorFlow 1.x does not support for stacked cells.
    layers = [tf.contrib.rnn.BasicLSTMCell(lstm_size) for _ in range(rnn_size)]
    cell = tf.contrib.rnn.MultiRNNCell(layers)
    #Name the zero state so it can be looked up by name in the graph later
    initial_state = tf.identity(cell.zero_state(batch_size, tf.float32), name="initial_state")
    return (cell, initial_state)

### Word embedding
Apply the word embedding to the input data.

In [12]:
def get_embed(input_data, vocab_size, embed_dim):
    """
    Looks up the embedding vectors for input_data.
    :param input_data: TensorFlow placeholder for text input
    :param vocab_size: Number of words in vocabulary
    :param embed_dim: Number of embedding dimensions
    :return: Embedded input
    """
    #Embedding matrix of shape (vocab_size, embed_dim), initialised
    #uniformly at random between -1 and 1
    initial_values = tf.random_uniform((vocab_size, embed_dim), minval=-1, maxval=1)
    embedding_matrix = tf.Variable(initial_values)
    embedded_input = tf.nn.embedding_lookup(embedding_matrix, input_data)
    return embedded_input

### Build the RNN
The single RNN cell has already been designed. Now design the recurrent network.

In [13]:
def build_rnn(cell, inputs):
    """
    Runs the RNN cell over the inputs with tf.nn.dynamic_rnn.
    :param cell: RNN cell
    :param inputs: Input text data
    :return: Tuple (outputs, final state), where the final state is an
        identity op named "final_state"
    """
    rnn_outputs, last_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
    #Wrap the state in a named identity op so it can be found by name
    return rnn_outputs, tf.identity(last_state, name="final_state")

### Build the entire Neural Network

In [14]:
def build_nn(cell, rnn_size, input_data, vocab_size, embed_dim):
    """
    Build part of the NN: embedding -> RNN -> fully connected output layer.
    :param cell: RNN cell
    :param rnn_size: Size of RNN (not used by this function; kept so the
        signature stays unchanged for callers)
    :param input_data: Input data
    :param vocab_size: Vocabulary size
    :param embed_dim: Number of embedding dimensions
    :return: Tuple (logits, final_state)
    """
    embedding = get_embed(input_data, vocab_size, embed_dim)
    outputs, final_state = build_rnn(cell, embedding)
    #Linear layer (no activation) producing one logit per vocabulary entry
    logits = tf.contrib.layers.fully_connected(outputs, vocab_size, activation_fn=None)
    return logits, final_state

### Batching
Implement a batching function to create batches of input and targets. The batches should be a NumPy array with the shape (number of batches, 2, batch size, sequence length).

In [15]:
def get_batches(int_text, batch_size, seq_length):
    """
    Return batches of input and target.

    The text is truncated to a whole number of batches and cut into
    batch_size contiguous rows; batch j takes the j-th seq_length-long
    slice of every row. Each target is the input shifted by one position,
    with the very last target wrapping around to the first input element.

    :param int_text: Text with the words replaced by their IDs
    :param batch_size: The size of the batch
    :param seq_length: The length of the sequence
    :return: Batches as a NumPy array with the shape
        (number of batches, 2, batch size, sequence length); index 0 of the
        second axis holds the inputs and index 1 the targets. When the text
        is too short for a single batch the first dimension is 0.
    """
    words_per_batch = batch_size * seq_length
    num_batches = len(int_text) // words_per_batch
    #Not enough text for even one batch: return an empty, correctly shaped array
    if num_batches == 0:
        return np.empty((0, 2, batch_size, seq_length), dtype=int)

    #Truncate input so only full batches remain
    inputs = np.array(int_text[:num_batches * words_per_batch])
    #Target for position p is the element at p + 1, wrapping at the end
    targets = np.roll(inputs, -1)

    def to_batches(flat):
        #(row, batch, position) -> (batch, row, position)
        return flat.reshape(batch_size, num_batches, seq_length).transpose(1, 0, 2)

    return np.stack([to_batches(inputs), to_batches(targets)], axis=1)