# Practical 5.4 Modeling Text

# Sequence classification model

In [None]:
from __future__ import print_function

## Word-level sentiment classification

### Twitter sentiment data set

We use the same Twitter data set as in Practical-5.3, but we will train the sequence model on word sequences instead of character sequences.

In [None]:
import os
import sys
import numpy as np
import pandas as pd
pd.options.display.max_colwidth = 100
import re
import nltk

# Relative directory names used by this notebook.
# DATA_PATH is where the cells below read the tweet CSV and the filtered GloVe file from.
DATA_PATH = 'data'
# Presumably intended for embedding files; not referenced in the cells visible here -- TODO confirm.
EMBEDDING_PATH = 'embedding'
# Presumably intended for saved models; not referenced in the cells visible here -- TODO confirm.
MODEL_PATH = 'model'

### Data preprocessing

In [None]:
# Script for preprocessing tweets by Romain Paulus
# with small modifications by Jeffrey Pennington
# from http://nlp.stanford.edu/projects/glove/preprocess-twitter.rb

def split_hashtag(found):
    """Rewrite a single hashtag regex match into placeholder tokens.

    The leading '#' is dropped and the remainder (the hashtag body) is
    emitted after a "<HASHTAG>" marker:

    * if the letters of the body are all uppercase, the body is kept
      whole and followed by "<ALLCAPS>";
    * otherwise a space is inserted before every uppercase letter except
      a leading one, so "HelloWorld" becomes "Hello World".

    Placeholders of the form "<UPPERCASE>" that are already present in the
    body (for example "<NUMBER>", which preprocess() substitutes before it
    handles hashtags) are ignored by the all-caps check and are never split.

    Args:
        found: regex match object whose full match is the hashtag,
            including the leading '#'.

    Returns:
        str: the replacement text for the hashtag.
    """
    hashtag_body = found.group(0)[1:]

    # Judge capitalisation on the real letters only, not on placeholders.
    letters_only = re.sub(r'<[A-Z]+>', '', hashtag_body)
    if letters_only.isupper():
        return "<HASHTAG> " + hashtag_body + " <ALLCAPS>"

    # Zero-width substitution: put a space in front of each uppercase letter
    # that is not at the start of the body and not inside a "<...>" placeholder.
    split_body = re.sub(r'(?<!^)(?=[A-Z])(?![A-Z]*>)', ' ', hashtag_body)
    return "<HASHTAG> " + split_body

    
def preprocess(text):
    """Normalize a raw tweet into placeholder-tagged, lowercased text.

    The substitutions run in a fixed order: URLs, slash splitting, user
    mentions, emoticons (smile, lol-face, sad-face, neutral-face), hearts,
    numbers, hashtags, repeated punctuation and elongated words. The result
    is lowercased at the end, so the inserted placeholders come out in
    lowercase too (for example "<user>").

    Args:
        text (str): raw tweet text.

    Returns:
        str: the normalized, lowercased text.
    """

    # Different regex parts for smiley faces.
    # Raw strings: the backslash in the nose pattern is not a valid escape
    # in a normal string literal and triggers a warning on modern Python.
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    text = re.sub(r'https?:\/\/\S+\b|www\.(\w+\.)+\S*', '<URL>', text)
    text = re.sub(r'/', ' / ', text) # Force splitting words appended with slashes (once we tokenized the URLs, of course)
    text = re.sub(r'@\w+', '<USER>', text)
    text = re.sub(eyes + nose + r'[)dD]+|[(dD]+' + nose + eyes, "<SMILE>", text)
    text = re.sub(eyes + nose + r'[pP]+', "<LOLFACE>", text)
    text = re.sub(eyes + nose + r'\(+|\)+' + nose + eyes, "<SADFACE>", text)
    # Slashes were already padded with spaces above, hence the " /" alternative.
    text = re.sub(eyes + nose + r'( \/|[\\|l*])', "<NEUTRALFACE>", text)
    text = re.sub(r'<3', "<HEART>", text)
    text = re.sub(r'[-+]?[.\d]*[\d]+[:,.\d]*', "<NUMBER>", text)
    text = re.sub(r'#\S+', split_hashtag, text) # Hashtags are rewritten by split_hashtag()
    text = re.sub(r'([!?.]){2,}', r'\1 <REPEAT>', text) # Mark punctuation repetitions (eg. "!!!" => "! <REPEAT>")
    text = re.sub(r'\b(\S*?)(.)\2{2,}\b', r'\1\2 <ELONG>', text) # Mark elongated words (eg. "wayyyy" => "way <ELONG>")
    # All-caps word marking is left disabled:
    #text = re.sub(r'(?<![<A-Z])([^a-z0-9()<>\'`\-]){2,}', lambda x: x.group(1).lower() + ' <ALLCAPS>', text)

    return text.lower()

### Read raw data

In [None]:
import csv

# Walk the CSV once, collecting column 3 (the text later passed to
# preprocess) and column 4 (the label later mapped to a class id).
raw_texts, raw_labels = [], []
csv_file_path = os.path.join(DATA_PATH, 'twitter-sentiment.csv')

with open(csv_file_path, 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    for row in reader:
        raw_texts.append(row[3])
        raw_labels.append(row[4])

### Transform labels into categorical form (one-hot encoding for multi-class output)

In [None]:
# Import from the public `keras.utils` namespace; the private
# `keras.utils.np_utils` module path is not available in newer Keras releases.
from keras.utils import to_categorical

# Integer class id assigned to each label string.
label_mapping = {'positive': 0, 'negative': 1, 'neutral': 2, 'irrelevant': 3}

# Map every raw label to its class id, then one-hot encode. `num_classes` is
# pinned to the size of the mapping so the encoded width does not depend on
# which classes happen to occur in the data.
label_ids = np.asarray([label_mapping[label] for label in raw_labels])
labels = to_categorical(label_ids, num_classes=len(label_mapping))


### Preprocess raw data 

In [None]:
# Normalize every raw tweet, keeping the original order.
texts = list(map(preprocess, raw_texts))

### Create vocabulary index

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

MAX_NUM_WORDS = 10000      # value passed to Tokenizer(num_words=...)
MAX_SEQUENCE_LENGTH = 32   # value passed to pad_sequences(maxlen=...)

# NOTE(review): Tokenizer is used with its default `filters`; if those strip
# '<' and '>', the placeholders produced by preprocess() lose their angle
# brackets -- confirm this matches the token forms in the embedding file.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Work on a copy so that adding the extra '<eos>' entry (id 0) does not
# mutate the fitted tokenizer's own word_index.
vocab = dict(tokenizer.word_index)
vocab['<eos>'] = 0

### Preparing data for model input

In [None]:
# Rows before the split index form the training set; the rest is validation.
split_at = 4000
x_train, x_val = data[:split_at], data[split_at:]
y_train, y_val = labels[:split_at], labels[split_at:]
print(x_train.shape, y_train.shape, x_val.shape, y_val.shape)

### Load pretrained word embedding (GloVe)

Note that this GloVe file is originally from https://nlp.stanford.edu/projects/glove/ and was trained on 2 billion tweets with a vocabulary of 1.2 million word forms. The embedding is filtered down (by vocabulary, not by vector dimension) so that it only contains word tokens seen in the Twitter data we use.

In [None]:
# function to load pretrained embedding
def load_embedding(vocab, dimension, filename):
    """Build an embedding matrix for `vocab` from a text embedding file.

    Each usable line of the file is a word followed by `dimension`
    space-separated floats. Lines with a different number of fields are
    skipped. Words of `vocab` that never occur in the file keep an all-zero
    row. Reading stops early once every word of `vocab` has been found.

    Args:
        vocab (dict): maps word -> non-negative integer row index.
        dimension (int): length of each embedding vector.
        filename (str): path of the UTF-8 encoded embedding file.

    Returns:
        numpy.ndarray: float32 array of shape
        (max(vocab.values()) + 1, dimension); shape (0, dimension) when
        `vocab` is empty.
    """
    import io  # local import: io.open accepts `encoding` on Python 2 and 3

    print('loading embeddings from "%s"' % filename, file=sys.stderr)

    # An empty vocabulary would make max() raise; return an empty matrix instead.
    num_rows = max(vocab.values()) + 1 if vocab else 0
    embedding = np.zeros((num_rows, dimension), dtype=np.float32)
    if not vocab:
        return embedding

    seen = set()
    # Decode explicitly as UTF-8 so non-ASCII tokens are read the same way
    # regardless of the platform's default locale encoding.
    with io.open(filename, encoding='utf-8') as fp:
        for line in fp:
            tokens = line.strip().split(' ')
            if len(tokens) != dimension + 1:
                continue  # malformed line or a different vector size
            word = tokens[0]
            if word not in vocab:
                continue
            embedding[vocab[word]] = [float(x) for x in tokens[1:]]
            seen.add(word)
            if len(seen) == len(vocab):
                break  # every vocabulary word has a vector; no need to read on
    return embedding

In [None]:
# NOTE(review): the file is read from DATA_PATH even though EMBEDDING_PATH is
# defined earlier in the notebook -- confirm which directory is intended.
glove_file = os.path.join(DATA_PATH,'glove.twitter.27B.100d.filtered.txt')
# The dimension argument (100) matches the "100d" in the file name.
weights = load_embedding(vocab, 100, glove_file)

## Word-level Recurrent Neural Networks (RNN) model

Construct an LSTM model that uses word sequences as input to learn the sentiment polarity of a given text. Consider using the following layers:

* Input layer
* Embedding layer: initialize with pretrained embedding (GloVe)
* LSTM layer
* Prediction (Dense) layer

In [None]:
from keras.layers import Embedding, Input, LSTM, Dense
from keras.models import Model

In [None]:
# YOUR CODE HERE