In [50]:
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, Dropout, add, concatenate
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import LearningRateScheduler

import os
import re

In [51]:
EMBEDDING_FILES = [
    'embeddings/crawl-300d-2M.vec',
    'embeddings/glove.840B.300d.txt'
]

NUM_MODELS = 2
BATCH_SIZE = 128
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
EPOCHS = 4
MAX_LEN = 220

# Helper Functiosn

## Processing Embeddings

In [52]:
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')


def load_embeddings(path):
    with open(path) as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in f)


def build_matrix(word_index, path):
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = embedding_index[word]
        except KeyError:
            pass
    return embedding_matrix

## Build and define the model

In [53]:
def build_model(embedding_matrix, model_name):
    words = Input(shape=(max_length,))
    x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words)
    x = SpatialDropout1D(0.3)(x)
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)

    hidden = concatenate([
        GlobalMaxPooling1D()(x),
        GlobalAveragePooling1D()(x),
    ])
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='tanh')(hidden)])
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    result = Dense(11, activation='softmax')(hidden)
    
    model = Model(inputs=words, outputs=result)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    model.name = model_name

    return model

# Data Preprocessing

In [54]:
def load_directory_data(directory):
    data={}
    data["sentence"] = []
    data["sentiment"] = []
    for file_path in os.listdir(directory):
        with tf.gfile.GFile(os.path.join(directory, file_path), "r") as f:
            data["sentence"].append(f.read())
            data["sentiment"].append(re.match("\d+_(\d+)\.txt", file_path).group(1))
    return pd.DataFrame.from_dict(data)

def load_dataset(directory):
    pos_df = load_directory_data(directory + "/pos")
    neg_df = load_directory_data(directory + "/neg")
    pos_df["polarity"] = 1
    neg_df["polarity"] = 0
    return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)

In [55]:
train_df = load_dataset("data/aclImdb/train/")
test_df = load_dataset("data/aclImdb/test/")

In [56]:
x_train = train_df["sentence"]
y_train = train_df["sentiment"]

x_test = test_df["sentence"]
y_test = test_df["sentiment"]

In [57]:
from keras.utils import to_categorical

y_train = to_categorical(train_df["sentiment"])
y_test = to_categorical(test_df["sentiment"])

In [58]:
set(train_df['sentiment'].values), train_df["sentiment"][1], y_train[1]

({'1', '10', '2', '3', '4', '7', '8', '9'},
 '9',
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.], dtype=float32))

In [59]:
x_train[1], y_train[1]

('You spend most of this two-hour film wondering "what\'s the story regarding the lead character?" <br /><br />Will Smith, as a low-key "Ben Thomas" will keep you guessing. The last 20-25 minutes is when you find out, and it\'s a shocker....but you knew something dramatic was going to be revealed. Until then, Smith, plays it mysterious, almost stalking people. You know he has a good reason for doing it, but it\'s never really explained, once again, to keep us guessing until the end.<br /><br />All of it, including a on again/off again but touching romance with Rosario Dawkins ("Emily Posa") might make some viewers frustrated or wanting to quit this film.....but don\'t because the final long segment puts all the pieces of this puzzle together.<br /><br />This is a two-hour film and not the typical action-packed macho Will Smith film. In fact, the most shocking aspect might be seeing the drawn, sad face of Smith throughout this story. It almost doesn\'t even look like him in a number of 

## Tokenisation

In [60]:
tokenizer_obj = text.Tokenizer()
total_reviews = x_train + x_test
tokenizer_obj.fit_on_texts(total_reviews)

max_length = max([len(s.split()) for s in total_reviews])
vocab_size = len(tokenizer_obj.word_index) + 1

X_train_tokens = tokenizer_obj.texts_to_sequences(x_train)
X_test_tokens = tokenizer_obj.texts_to_sequences(x_test)

X_train_pad = sequence.pad_sequences(X_train_tokens, maxlen=max_length, padding="post")
X_test_pad = sequence.pad_sequences(X_test_tokens, maxlen=max_length, padding="post")

In [61]:
# embedding_matrix = np.concatenate(
#     [build_matrix(tokenizer_obj.word_index, f) for f in EMBEDDING_FILES], axis=-1)

# np.savetxt('embedding_concat.txt', embedding_matrix , fmt='%d')

In [62]:
embedding_matrix = np.loadtxt('embedding_concat.txt', dtype=int)

# Training Loop

In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler, TensorBoard
LOG_DIR = '/media/eigenstir/1TBSecondary/tbgraphs'

tbCallBack = TensorBoard(log_dir=LOG_DIR, histogram_freq=0, write_graph=True, write_images=True)
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)
mc = ModelCheckpoint('best_model_mk1.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)
ls_sched = LearningRateScheduler(lambda epoch: 1e-3 * (0.6 ** global_epoch))

In [None]:
weights = []
for model_idx in range(NUM_MODELS):
    model = build_model(embedding_matrix, model_name = str(model_idx))
    for global_epoch in range(EPOCHS):
        model.fit(
            X_train_pad,
            y_train,
            validation_data=(X_test_pad, y_test),
            batch_size=BATCH_SIZE,
            epochs=3,
            verbose=2,
            callbacks=[tbCallBack, es, ModelCheckpoint('best_model_mk1' + str(model_idx) + '.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)]
        )
        weights.append(2 ** global_epoch)

Train on 25000 samples, validate on 25000 samples
Epoch 1/3
 - 203s - loss: 2.0389 - val_loss: 2.0202
Epoch 2/3




# Test Time

In [14]:
model = build_model(embedding_matrix, 'test')
model.load_weights('best_model_mk11.h5')

In [37]:
idx = 5
test_text = test_df["sentence"][idx]
test_df["sentence"][idx], test_df["sentiment"][idx]

('When at the very start of the film Paleontologist Donald Sutherland arrives at the Argyle family\'s house and it comes out he is the undeniable alibi for one of the members executed for murdering his mother two years ago your sensation is that you are about to watch a top thriller; an innocent man has been convicted and a killer is still around. But as the film runs along your disappointment increases inevitably.<br /><br />"Ordeal by Innocence" is a dull and at times even boring film that doesn\'t raise at any moment. Nothing interesting happens all along and even the final revealing of the facts lacks surprise and intensity (wether you guessed or not).<br /><br />Donald Sutherland, Cristopher Plummer, Faye Dunaway and Sarah Miles (far from her good performance in "Ryan\'s Daughter") just pass through their roles and not very enthusiastically either.<br /><br />You won\'t miss much if you skip this one.',
 '4')

In [38]:
test_text = tokenizer_obj.texts_to_sequences(test_text)
test_text = sequence.pad_sequences(test_text, maxlen=max_length, padding="post")

In [47]:
prediction = model.predict(test_text)

In [49]:
prediction[0]

array([2.8707360e-05, 2.6086321e-01, 1.4081886e-02, 2.1433288e-02,
       5.5947327e-03, 4.3497581e-05, 6.8869776e-05, 6.2136460e-02,
       5.8546316e-02, 1.5213890e-02, 6.3218963e-01], dtype=float32)

In [40]:
# prediction = np.average(prediction, axis=0)

In [43]:
np.argmax(prediction.flatten())

10

Testing texts: Uber

Neg:

1. #uber's customer service department is worst than comcast.
1. #Uber ...get your act right! # frustrated#waiting for competition
1. .@Uber charges more if they think you're willing to pay more. Uber thinks you're willing to pay more when your battery is about to die. And Uber knows when your battery is about to die.

Pos:
1. Ride-hailing industry expected to grow eightfold to $285 billion by 2030
1. Uber is awesome!
1. Best service ever with Uber


In [139]:
imdb = tf.keras.datasets.imdb

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


# Business Times Singapore

In [None]:

from urllib.request import urlopen
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import time
import pprint

date_sentiments = {}

for i in range(1,11):
    page = urlopen('https://www.businesstimes.com.sg/search/facebook?page='+str(i)).read()
    soup = BeautifulSoup(page, features="html.parser")
    posts = soup.findAll("div", {"class": "media-body"})
    for post in posts:
        time.sleep(1)
        url = post.a['href']
        date = post.time.text
        print(date, url)
        try:
            link_page = urlopen(url).read()
        except:
            url = url[:-2]
            link_page = urlopen(url).read()
        link_soup = BeautifulSoup(link_page)
        sentences = link_soup.findAll("p")
        passage = ""
        for sentence in sentences:
            passage += sentence.text
        sentiment = sia.polarity_scores(passage)['compound']
        date_sentiments.setdefault(date, []).append(sentiment)

date_sentiment = {}

for k,v in date_sentiments.items():
    date_sentiment[datetime.strptime(k, '%d %b %Y').date() + timedelta(days=1)] = round(sum(v)/float(len(v)),3)

earliest_date = min(date_sentiment.keys())

print(date_sentiment)

# Backtrader


In [None]:
import backtrader as bt
import backtrader.indicators as btind
import datetime
import os.path
import sys

class Sentiment(bt.Indicator):
    lines = ('sentiment',)
    plotinfo = dict(
        plotymargin=0.15,
        plothlines=[0],
        plotyticks=[1.0, 0, -1.0])
    
    def next(self):
        self.date = self.data.datetime
        date = bt.num2date(self.date[0]).date()
        prev_sentiment = self.sentiment
        if date in date_sentiment:
            self.sentiment = date_sentiment[date]
        self.lines.sentiment[0] = self.sentiment


class SentimentStrat(bt.Strategy):
    params = (
        ('period', 15),
        ('printlog', True),
    )

    def log(self, txt, dt=None, doprint=False):
        ''' Logging function for this strategy'''
        if self.params.printlog or doprint:
            dt = dt or self.datas[0].datetime.date(0)
            print('%s, %s' % (dt.isoformat(), txt))

    def __init__(self):
        # Keep a reference to the "close" line in the data[0] dataseries
        self.dataclose = self.datas[0].close
        # Keep track of pending orders
        self.order = None
        self.buyprice = None
        self.buycomm = None
        self.sma = bt.indicators.SimpleMovingAverage(
            self.datas[0], period=self.params.period)
        self.date = self.data.datetime
        self.sentiment = None
        Sentiment(self.data)
        
    def notify_order(self, order):
        if order.status in [order.Submitted, order.Accepted]:
            # Buy/Sell order submitted/accepted to/by broker - Nothing to do
            return
        
        # Check if an order has been completed
        # Attention: broker could reject order if not enough cash
        if order.status in [order.Completed]:
            if order.isbuy():
                self.log(
                    'BUY EXECUTED, Price: %.2f, Cost: %.2f, Comm %.2f' %
                    (order.executed.price,
                     order.executed.value,
                     order.executed.comm))
                self.buyprice = order.executed.price
                self.buycomm = order.executed.comm
            else:  # Sell
                self.log('SELL EXECUTED, Price: %.2f, Cost: %.2f, Comm %.2f' %
                         (order.executed.price,
                          order.executed.value,
                          order.executed.comm))
                
            self.bar_executed = len(self)     
            
        elif order.status in [order.Canceled, order.Margin, order.Rejected]:
            self.log('Order Canceled/Margin/Rejected')
            
        # Write down: no pending order
        self.order = None
        
    def notify_trade(self, trade):
        if not trade.isclosed:
            return

        self.log('OPERATION PROFIT, GROSS %.2f, NET %.2f' %
                 (trade.pnl, trade.pnlcomm))
    
    ### Main Strat ###
    def next(self):
        # log closing price of the series from the reference
        self.log('Close, %.2f' % self.dataclose[0])
        
        date = bt.num2date(self.date[0]).date()
        prev_sentiment = self.sentiment
        if date in date_sentiment:
            self.sentiment = date_sentiment[date]
        
        # Check if an order is pending. if yes, we cannot send a 2nd one
        if self.order:
            return
        print(self.sentiment)
        # If not in the market and previous sentiment not none
        if not self.position and prev_sentiment:
            # buy if current close more than sma AND sentiment increased by >= 0.5
            if self.dataclose[0] > self.sma[0] and self.sentiment - prev_sentiment >= 0.5:
                self.log('BUY CREATE, %.2f' % self.dataclose[0])
                self.order = self.buy()
                
        # Already in the market and previous sentiment not none
        elif prev_sentiment:
            # sell if current close less than sma AND sentiment decreased by >= 0.5
            if self.dataclose[0] < self.sma[0] and self.sentiment - prev_sentiment <= -0.5:
                self.log('SELL CREATE, %.2f' % self.dataclose[0])
                self.order = self.sell()

    def stop(self):
        self.log('(MA Period %2d) Ending Value %.2f' %
                 (self.params.period, self.broker.getvalue()), doprint=True)
        

if __name__ == '__main__':
    cerebro = bt.Cerebro()
    
    # Strategy
    cerebro.addstrategy(SentimentStrat)

    # Data Feed
    data = bt.feeds.YahooFinanceData(
        dataname = 'FB',
        fromdate = earliest_date,
        todate = datetime.datetime(2018,11,25),
        reverse = False
    )
    
    cerebro.adddata(data)

    cerebro.broker.setcash(100000.0)
    cerebro.addsizer(bt.sizers.FixedSize, stake=10)
    cerebro.broker.setcommission(commission=0.001)
    print('Starting Portfolio Value: %.2f' % cerebro.broker.getvalue())
    cerebro.run()
    print('Final Portfolio Value: %.2f' % cerebro.broker.getvalue())
    
    cerebro.plot()

# Saving a model

In [None]:
import sys
from keras.models import load_model
import tensorflow as tf
# from keras import backend as K
from tensorflow.python.framework import graph_util
from tensorflow.python.framework import graph_io
from tensorflow.python.saved_model import signature_constants
from tensorflow.python.saved_model import tag_constants



In [None]:
keras.backend.set_learning_phase(0)

model = build_model(embedding_matrix, model_name = str(model_idx))
config = model.get_config()
weights = model.get_weights()
new_model = Model.from_config(config)
new_model.set_weights(weights)

Export the model

In [None]:
from tensorflow.python.saved_model import builder as saved_model_builder
from tensorflow.python.saved_model import utils
from tensorflow.python.saved_model import tag_constants, signature_constants
from tensorflow.python.saved_model.signature_def_utils_impl import build_signature_def, predict_signature_def
from tensorflow.contrib.session_bundle import exporter

export_path = 'exported_model'
builder = saved_model_builder.SavedModelBuilder(export_path)

signature = predict_signature_def(inputs={'text': new_model.input},
                                  outputs={'sentiment': new_model.output})

with keras.get_session() as sess:
    builder.add_meta_graph_and_variables(sess=sess,
                                         tags=[tag_constants.SERVING],
                                         signature_def_map={'predict': signature})
    builder.save()

# Another type of loop

In [15]:
embedding_matrix = np.loadtxt('embedding_concat.txt', dtype=int)
words = Input(shape=(max_length,))
x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words)
x = SpatialDropout1D(0.3)(x)
x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)
x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)

hidden = concatenate([
        GlobalMaxPooling1D()(x),
        GlobalAveragePooling1D()(x),
    ])
hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
result = Dense(11, activation='sigmoid')(hidden)
    
model = Model(inputs=words, outputs=result)
model.compile(loss='binary_crossentropy', optimizer='adam')

IndentationError: unexpected indent (<ipython-input-15-c5479d416f9e>, line 3)

In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler

tbCallBack = keras.callbacks.TensorBoard(log_dir=LOG_DIR, histogram_freq=0, write_graph=True, write_images=True)
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)
ls_sched = LearningRateScheduler(lambda epoch: 1e-3 * (0.6 ** global_epoch))

In [None]:
model.fit(X_train_pad, y_train, batch_size=64, epochs=10, 
          validation_data=(X_test_pad, y_test), callbacks=[tbCallBack, es, mc])