# Practical 5.3 Modeling Text

# Sequence classification model

In [None]:
from __future__ import print_function

## Character-level sentiment classification

### Twitter sentiment data set

For this tutorial, we will use a Twitter data set, which can be downloaded here: https://storage.googleapis.com/trl_data/twitter_sentiment.zip. Note that this data set contains shorter texts than the reviews used in Practical 5.2. Download the data and place it in the directory 'data'.

In [None]:
import os
import sys
import numpy as np
import pandas as pd
pd.options.display.max_colwidth = 100
import re
import nltk

# Directory (relative to the notebook) that holds the downloaded data set files
DATA_PATH = 'data'
# Presumably the directory for pre-trained embeddings -- not used in the cells shown here; TODO confirm
EMBEDDING_PATH = 'embedding'
# Presumably the directory for saved models -- not used in the cells shown here; TODO confirm
MODEL_PATH = 'model'

### Data preprocessing

In [None]:
# Script for preprocessing tweets by Romain Paulus
# with small modifications by Jeffrey Pennington
# from http://nlp.stanford.edu/projects/glove/preprocess-twitter.rb

def split_hashtag(found):
    """Replacement callback that rewrites one matched hashtag as tokens.

    Mirrors the hashtag rule of the preprocess-twitter.rb script cited above:
    a hashtag whose body is unchanged by upper-casing is kept whole and tagged
    with <ALLCAPS>; any other body is split in front of every capital letter.

    Args:
        found: regex match object whose full match starts with '#'.

    Returns:
        '<HASHTAG> BODY <ALLCAPS>' when the body is upper-case, otherwise
        '<HASHTAG>' followed by the space-separated pieces of the body.
    """
    hashtag_body = found.group(0)[1:]

    # Only upper-case hashtags get the <ALLCAPS> marker; tagging every
    # hashtag would make the marker carry no information.
    if hashtag_body.upper() == hashtag_body:
        return "<HASHTAG> " + hashtag_body + " <ALLCAPS>"

    # Split in front of each capital letter ("HelloWorld" -> "Hello", "World").
    # findall is used rather than re.split on a zero-width lookahead because
    # older Python versions do not split on empty matches.
    pieces = re.findall(r'[A-Z][^A-Z]*|[^A-Z]+', hashtag_body)
    return " ".join(["<HASHTAG>"] + pieces)

    
def preprocess(text):
    """Normalize a raw tweet into placeholder tokens and lower-case it.

    URLs, user mentions, emoticons, hearts, numbers, hashtags, repeated
    punctuation and elongated words are replaced by placeholder tokens, in
    that order. The result is lower-cased at the end, so the placeholder
    tokens themselves also end up in lower case.

    Args:
        text: raw tweet text.

    Returns:
        The normalized, lower-cased text.
    """
    # Different regex parts for smiley faces.
    # Raw strings: "\-" inside a plain literal is an invalid escape sequence
    # that newer Python versions warn about.
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    text = re.sub(r'https?:\/\/\S+\b|www\.(\w+\.)+\S*', '<URL>', text)
    # Force splitting words appended with slashes (once we tokenized the URLs, of course)
    text = re.sub(r'/', ' / ', text)
    text = re.sub(r'@\w+', '<USER>', text)
    text = re.sub(eyes + nose + r'[)dD]+|[(dD]+' + nose + eyes, "<SMILE>", text)
    text = re.sub(eyes + nose + r'[pP]+', "<LOLFACE>", text)
    text = re.sub(eyes + nose + r'\(+|\)+' + nose + eyes, "<SADFACE>", text)
    text = re.sub(eyes + nose + r'( \/|[\\|l*])', "<NEUTRALFACE>", text)
    text = re.sub(r'<3', "<HEART>", text)
    text = re.sub(r'[-+]?[.\d]*[\d]+[:,.\d]*', "<NUMBER>", text)
    # Numbers are replaced first, so digits inside a hashtag are already <NUMBER>.
    text = re.sub(r'#\S+', split_hashtag, text)
    # Mark punctuation repetitions (eg. "!!!" => "! <REPEAT>")
    text = re.sub(r'([!?.]){2,}', r'\1 <REPEAT>', text)
    # Mark elongated words (eg. "wayyyy" => "way <ELONG>")
    text = re.sub(r'\b(\S*?)(.)\2{2,}\b', r'\1\2 <ELONG>', text)

    return text.lower()

### Read raw data

In [None]:
import csv

# Parallel lists: raw_texts[i] and raw_labels[i] come from the same CSV row.
raw_texts, raw_labels = [], []

tweets_csv = os.path.join(DATA_PATH, 'twitter-sentiment.csv')
with open(tweets_csv, 'r') as csvfile:
    for fields in csv.reader(csvfile, delimiter=',', quotechar='"'):
        # Field 3 is stored as the tweet text, field 4 as its label.
        raw_texts.append(fields[3])
        raw_labels.append(fields[4])

### Transform labels into categorical form (one-hot encoding for multi-class output)

In [None]:
# keras.utils.np_utils is not available in newer Keras releases; fall back to
# it only when the top-level helper cannot be imported.
try:
    from keras.utils import to_categorical
except ImportError:
    from keras.utils.np_utils import to_categorical

# Integer class id assigned to each sentiment label string.
label_mapping = {'positive': 0, 'negative': 1, 'neutral': 2, 'irrelevant': 3}

# One-hot encode the labels. The class count is passed explicitly (and
# positionally, since the keyword name differs between Keras versions) so the
# encoded width is always len(label_mapping), even if the highest class id
# never occurs in the data.
labels = to_categorical(
    np.asarray([label_mapping[label] for label in raw_labels]),
    len(label_mapping))

### Preprocess raw data 

In [None]:
# Normalize every raw tweet; order is preserved, so texts[i] still lines up with labels[i].
texts = list(map(preprocess, raw_texts))

### Create character-level vocabulary index

In [None]:
# Concatenate all preprocessed tweets into one corpus string.
# join builds the string in one pass instead of appending one character at a time.
txt = ''.join(texts)

In [None]:
# Vocabulary: every distinct character that occurs in the corpus string.
chars = {ch for ch in txt}
print('total chars:', len(chars))

In [None]:
# Enumerate the vocabulary in sorted order. A plain set has no guaranteed
# iteration order across kernel sessions (string hashing is randomized in
# Python 3), so indices taken straight from the set could differ between runs
# and would not match a model saved in an earlier session.
sorted_chars = sorted(chars)

# NOTE(review): index 0 is assigned to a real character, while the input
# arrays built later are zero-initialized and use 0 as padding -- confirm
# whether padding should get its own index.

# pairs of character - index of character in look up vocabulary
char_indices = {c: i for i, c in enumerate(sorted_chars)}

# pairs of index of character - character in look up vocabulary
indices_char = {i: c for i, c in enumerate(sorted_chars)}

### Preparing data for training the model

In [None]:
# The first 4000 tweets form the training set;
# everything after them is held out as the validation set.
n_train = 4000
train_docs, val_docs = texts[:n_train], texts[n_train:]

In [None]:
# define maximum length of input sequence for the model
maxlen = 450 # 450 characters length

# initialize sequence as numpy array of zeros
# (the zeros act as padding if text length < 450 characters)
X_train = np.zeros((len(train_docs), maxlen), dtype=np.int32)
y_train = labels[:4000]

# transform sequence of characters into their integer format of sequence (based on look up vocabulary index)
for i, doc in enumerate(train_docs):
    # Slicing already caps the length at maxlen, so no explicit length check
    # is needed. The truncated text is deliberately not stored in `txt`: that
    # name holds the whole corpus string used to build the vocabulary, and
    # overwriting it would corrupt `chars` if the vocabulary cell is re-run.
    for j, char in enumerate(doc[:maxlen]):
        X_train[i, j] = char_indices[char]

In [None]:
# Validation inputs: one zero-padded row of character indices per held-out tweet.
X_valid = np.zeros((len(val_docs), maxlen), dtype=np.int32)
y_valid = labels[4000:]

for i, doc in enumerate(val_docs):
    # Slicing already caps the length at maxlen. The truncated text is
    # deliberately not stored in `txt`: that name holds the whole corpus
    # string used to build the vocabulary, and overwriting it would corrupt
    # `chars` if the vocabulary cell is re-run.
    for j, char in enumerate(doc[:maxlen]):
        X_valid[i, j] = char_indices[char]

## Character-level Recurrent Neural Networks (RNN) model

In [None]:
from keras.models import Model
from keras.layers import Dense, Input, Dropout
from keras.layers import LSTM, Lambda
import tensorflow as tf

Use the same architecture (Keras functional API) as in Practical-5.2. 

Note that here you train the model to solve a multi-class classification problem (instead of the binary classification in Practical 5.2), so you need to slightly change the structure of the prediction layer (the last output layer) and the loss function.

Does this model suffer from a similar problem?

In [None]:
# YOUR CODE HERE