In [564]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import glob
import torch
from sklearn.preprocessing import StandardScaler

In [565]:
# Root directory of the MAEC dataset. Later cells parse each entry name as
# <8-digit date>_<company>, so stray files would break them.
file_path = "/Users/vishwa/Desktop/MAEC-master/MAEC_Dataset"
# Filter the macOS Finder metadata entry instead of calling list.remove(),
# which raises ValueError whenever '.DS_Store' is not present.
file_list = [name for name in os.listdir(file_path) if name != '.DS_Store']

In [566]:
# Select the earnings calls of 2018 and collect every distinct company suffix.
start = 2018
end = 2018

subset = []      # entries whose year prefix lies in [start, end]
companies = []   # distinct company suffixes over the whole listing, first-seen order

for entry in file_list:
    # The first four characters of an entry name are parsed as the year.
    if start <= int(entry[:4]) <= end:
        subset.append(entry)

    # Everything from the tenth character onwards is the company identifier.
    ticker = entry[9:]
    if ticker not in companies:
        companies.append(ticker)
        

In [567]:
earning_calls = sorted(subset)

In [8]:
# to check features used 
print(df.columns)
len(df.columns)

Index(['Mean pitch', 'Standard deviation', 'Minimum pitch', 'Maximum pitch',
       'Mean intensity', 'Minimum intensity', 'Maximum intensity',
       'Number of pulses', 'Number of periods', 'Mean period',
       'Standard deviation of period', 'Fraction of unvoiced',
       'Number of voice breaks', 'Degree of voice breaks', 'Jitter local',
       'Jitter local absolute', 'Jitter rap', 'Jitter ppq5', 'Jitter ddp',
       'Shimmer local', 'Shimmer local dB', 'Shimmer apq3', 'Shimmer apq5',
       'Shimmer apq11', 'Shimmer dda', 'Mean autocorrelation', 'Mean NHR',
       'Mean HNR', 'Audio Length'],
      dtype='object')


29

In [9]:
len(earning_calls)

320

In [10]:
# Sanity-check the selected calls: each transcript line should pair with one row
# of audio features. Also gathers sentence-count statistics and the date range.
cnt_ineq=0   # calls whose transcript line count differs from the feature row count
file_ineq=0
max_len=0    # longest call, measured in sentences
min_date=30000000
max_date=0
tot_len=0    # sentences summed over all calls

for i in range(len(earning_calls)):
    file = earning_calls[i]

    text_file_path = r'/Users/vishwa/Desktop/MAEC-master/MAEC_Dataset/'+ file +'/text.txt'
    # Read inside a context manager so the handle is closed after every call
    # (previously one open file was leaked per iteration).
    with open(text_file_path, "r") as f:
        content = f.read()
    lines = content.split("\n") # split at new line character

    if lines[-1]=='':
        lines = lines[:-1] # last line blank removal

    aud_file_path = r'/Users/vishwa/Desktop/MAEC-master/MAEC_Dataset/'+ file +'/features.csv'
    df = pd.read_csv(aud_file_path)

    # each sentence is mapped with the corresponding audio features
    # number of sentences in an earning call
    num_of_sent = len(lines)

    if num_of_sent != df.shape[0]:
        print("error")
        file_no = i
        cnt_ineq += 1

    max_len = max(max_len, num_of_sent)
    tot_len += num_of_sent

    # The first eight characters of the entry name are the call date as an integer.
    val = int(file[:8])
    min_date = min(min_date, val)
    max_date = max(max_date, val)

# Guard the average so an empty selection reports 0.0 instead of dividing by zero.
avg_len = tot_len/len(earning_calls) if earning_calls else 0.0

print("Max number of sentences in a call: {}".format(max_len))
print("Avg number of sentences in a call: {}".format(avg_len))
print("Min Date: {}".format(min_date))
print("Max Date: {}".format(max_date))

Max number of sentences in a call: 444
Avg number of sentences in a call: 174.29375
Min Date: 20180104
Max Date: 20180621


## Scraping volatility data

In [13]:
import re
from io import StringIO
from datetime import datetime, timedelta

import requests
import pandas as pd

In [14]:
# Our data spans these dates
# 1 st Jan 2018 to 31 st Dec 2018 
# yahoo finance historical data

def get_yahoofinance_hist(company_idx, period1=1514764800, period2=1546214400, timeout=30):
    """Download the daily close-price history of one ticker from Yahoo Finance.

    Parameters
    ----------
    company_idx : str
        Ticker symbol substituted into the download URL (e.g. 'K').
    period1, period2 : int, optional
        Values of the URL's ``period1`` / ``period2`` query parameters. The
        defaults are the values that used to be hard-coded in the URL.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame or int
        A frame with the columns 'Date' (parsed as dates) and 'Close' on
        success, or the sentinel ``-1`` when the download or parsing fails.
        Callers detect failure by checking for the int.
    """
    # Example of a fully formatted URL:
    # https://query1.finance.yahoo.com/v7/finance/download/GPN?period1=1514764800&period2=1546214400&interval=1d&events=history&includeAdjustedClose=true
    download_link = (
        'https://query1.finance.yahoo.com/v7/finance/download/{company}'
        '?period1={period1}&period2={period2}'
        '&interval=1d&events=history&includeAdjustedClose=true'
    )
    url = download_link.format(company=company_idx, period1=period1, period2=period2)

    try:
        # The context manager releases the session's connections when done.
        with requests.Session() as session:
            response = session.get(url, timeout=timeout)
            # Raises requests.HTTPError for any 4xx/5xx status, 404 included,
            # so no separate status-code branch is needed.
            response.raise_for_status()
            csv_text = response.text

        df = pd.read_csv(StringIO(csv_text), parse_dates=['Date'])

        # filtering as we only need close price
        return df[['Date', 'Close']]

    # Only network/HTTP failures and malformed payloads are turned into the
    # sentinel; a bare `except:` would also swallow KeyboardInterrupt and
    # genuine programming errors.
    except (requests.RequestException, ValueError, KeyError):
        return -1
    

In [15]:
df = get_yahoofinance_hist('K')

In [16]:
df

Unnamed: 0,Date,Close
0,2018-01-02,67.970001
1,2018-01-03,67.650002
2,2018-01-04,68.730003
3,2018-01-05,68.940002
4,2018-01-08,69.169998
...,...,...
245,2018-12-21,57.639999
246,2018-12-24,55.820000
247,2018-12-26,57.150002
248,2018-12-27,57.139999


In [17]:
closePrice = {}        # company identifier -> price frame from get_yahoofinance_hist
error_info = []        # names of the calls whose company could not be downloaded
error_info_idx = []    # positions of those calls inside earning_calls

# Download prices once per company; remember every call whose download failed.
for i, call_name in enumerate(earning_calls):
    company_idx = call_name[9:]

    # Already downloaded for an earlier call of the same company.
    if company_idx in closePrice:
        continue

    df = get_yahoofinance_hist(company_idx)
    if type(df) == int:
        # int sentinel: the url was not accessible
        error_info.append(call_name)
        error_info_idx.append(i)
        continue

    closePrice[company_idx] = df
            

In [18]:
len(earning_calls)

320

In [19]:
# dictionary of closing price for various companies
len(closePrice.keys())

238

In [21]:
# Work on a copy: a plain assignment would only alias the list, so deleting
# entries from fil_calls would silently delete them from earning_calls too.
fil_calls = list(earning_calls)

In [22]:
# removing all calls for which data was unavailable
# The list is rebuilt from earning_calls rather than deleting by position in
# place: the positions in error_info_idx refer to earning_calls, and an in-place
# `del` removes further (wrong) entries every time the cell is re-run.
failed_positions = set(error_info_idx)
fil_calls = [call for pos, call in enumerate(earning_calls) if pos not in failed_positions]

In [23]:
# filtered calls
len(fil_calls)

298

In [272]:
x = '20180510_FTK'

In [274]:
comp = x[9:]
df = closePrice[comp]

In [283]:
from datetime import datetime

datetime_object = datetime.strptime('2018-05-10', '%Y-%m-%d')

In [284]:
datetime_object

datetime.datetime(2018, 5, 10, 0, 0)

In [300]:
indices=[]   # row position of each located call date inside its company's price frame
tou = 3 # timesteps
# Each row holds 2*tou+2 = 8 closing prices: tou+1 days before the call (one
# extra so that p(i-1) is available for the first return), the call day itself,
# and tou days after it.
grdVals = np.zeros((len(fil_calls), 2*tou+2))
idx = 0

for call in fil_calls:
    comp = call[9:]
    df = closePrice[comp]

    date = call[:4]+'-'+call[4:6]+'-'+ call[6:8]

    # Locate the call date with one vectorised comparison instead of a Python
    # loop over every row; the last match wins, as before.
    date_keys = df['Date'].astype(str).str[:10]
    hits = np.flatnonzero((date_keys == date).to_numpy())
    dt = int(hits[-1]) if len(hits) else -1

    if dt == -1:
        # No price row for the call date. Previously the code carried on with
        # dt == -1 and silently picked prices relative to the LAST row of the
        # frame; store NaN so the gap stays visible downstream.
        print("error")
        vals = [float('nan')] * (2*tou+2)
    else:
        indices.append(dt)
        close = df['Close']
        day_close = close.iloc[dt]

        if (dt-tou-1) < 0:
            # not enough history: repeat the call-day close tou+1 times
            vals = [day_close]*(tou+1)
        else:
            # the tou+1 trading days before the call
            vals = list(close.iloc[dt-tou-1:dt])

        vals = vals + [day_close]

        # The window needs rows dt+1 .. dt+tou, so the last index must be
        # <= n-1. The old test `dt+tou > n` let dt+tou == n through, produced a
        # short slice and failed on the row assignment below.
        if (dt+tou) >= df.shape[0]:
            # not enough future data: repeat the call-day close tou times
            vals = vals + [day_close]*tou
        else:
            vals = vals + list(close.iloc[dt+1:dt+tou+1])

    grdVals[idx] = vals
    idx += 1

In [301]:
vals

[43.799999,
 43.700001,
 43.349998,
 45.0,
 40.950001,
 39.900002,
 40.099998,
 41.049999]

In [302]:
grdVals.shape

(298, 8)

In [303]:
fil_calls[5]

'20180123_HAFC'

In [304]:
grdVals[5]

array([32.049999, 31.700001, 32.200001, 32.200001, 32.099998, 31.75    ,
       31.950001, 31.6     ])

In [305]:
grdVals_df= pd.DataFrame(grdVals)
grdVals_df.to_csv(r'/Users/vishwa/Desktop/grdVals.csv')

## Approach - Getting Direct sentence embeddings from SentenceBERT

In [None]:
!pip install sentence-transformers

In [411]:
from sentence_transformers import SentenceTransformer

# to get the sentence_model
sentence_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [412]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import numpy as np

[nltk_data] Downloading package punkt to /Users/vishwa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [390]:
document = ["I ate dinner.", 
       "Bedford is an existing site that we have."]

In [393]:
def pad_sent_embed(embed, max_len, dim_size=768):
    """Left-pad (or truncate) a stack of sentence embeddings to ``max_len`` rows.

    Zero rows are placed BEFORE the real embeddings so the actual sentences end
    up at the end of the sequence.

    Parameters
    ----------
    embed : array-like of shape (n_sentences, dim)
        One embedding vector per sentence.
    max_len : int
        Number of rows of the result.
    dim_size : int, optional
        Embedding width, used only when ``embed`` is empty and carries no width
        of its own. Otherwise the width is taken from ``embed``.

    Returns
    -------
    numpy.ndarray of shape (max_len, dim), dtype float64
        ``embed`` preceded by zero rows. When ``embed`` has more than
        ``max_len`` rows only the first ``max_len`` are kept; previously this
        case crashed on a negative array dimension.

    Raises
    ------
    ValueError
        If ``embed`` is non-empty but not two-dimensional.
    """
    embed = np.asarray(embed)

    if embed.ndim != 2:
        if embed.size:
            raise ValueError("embed must be 2-D (n_sentences, dim)")
        # e.g. the encoder was given an empty document
        embed = embed.reshape(0, dim_size)

    n_keep = min(len(embed), max_len)
    padded = np.zeros((max_len, embed.shape[1]))
    if n_keep:
        padded[max_len - n_keep:] = embed[:n_keep]

    return padded

In [394]:
# Lower-case every sentence of the demo document and split it into word tokens.
token_sent = [word_tokenize(sentence.lower()) for sentence in document]

In [None]:
token_sent

In [417]:
embeddings = sentence_model.encode(document)
import pickle

#Store sentences & embeddings on disc
with open('/Users/vishwa/Desktop/embeds/embeddings.pkl', "wb") as fOut:
    pickle.dump({'sentences': document, 'embeddings': embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL)

#Load sentences & embeddings from disc
with open('/Users/vishwa/Desktop/embeds/embeddings.pkl', "rb") as fIn:
    stored_data = pickle.load(fIn)
    stored_sentences = stored_data['sentences']
    stored_embeddings = stored_data['embeddings']

In [420]:
stored_embeddings.shape

(2, 768)

In [None]:
sentence_embeddings = sentence_model.encode(document)

In [None]:
len(sentence_embeddings[0])

In [None]:
sentence_embeddings

In [None]:
# pad_sent_embed requires the target length; the one-argument call raised TypeError.
# NOTE(review): max_len mirrors how the full pipeline calls the helper; confirm
# that this is the intended length for the demo document.
sentence_embeddings=pad_sent_embed(sentence_embeddings, max_len)
sentence_embeddings

In [554]:
# Pass 1: recompute the sentence statistics on the filtered calls (max_len sizes
# the padded tensor). Pass 2: embed every transcript and store the padded result.
cnt_ineq=0
file_ineq=0
max_len=0
min_date=30000000
max_date=0
tot_len=0

for i in range(len(fil_calls)):
    file =fil_calls[i]

    text_file_path = r'/Users/vishwa/Desktop/MAEC-master/MAEC_Dataset/'+ file +'/text.txt'
    # Context manager closes the transcript handle (one was leaked per call before).
    with open(text_file_path, "r") as f:
        content = f.read()
    lines = content.split("\n") # split at new line character

    if lines[-1]=='':
        lines = lines[:-1] # last line blank removal

    aud_file_path = r'/Users/vishwa/Desktop/MAEC-master/MAEC_Dataset/'+ file +'/features.csv'
    df = pd.read_csv(aud_file_path)

    # each sentence is mapped with the corresponding audio features
    # number of sentences in an earning call
    num_of_sent= len(lines)

    if num_of_sent != df.shape[0]:
        print("error")
        file_no=i
        cnt_ineq += 1

    if num_of_sent > max_len:
        max_len=num_of_sent

    tot_len+= num_of_sent

    val = int(file[:8])

# Guard the average so an empty selection reports 0.0 instead of dividing by zero.
avg_len = tot_len/len(fil_calls) if fil_calls else 0.0

print("Max number of sentences in a call: {}".format(max_len))
print("Avg number of sentences in a call: {}".format(avg_len))

# (calls, sentences, embedding width); every call is left-padded to max_len sentences
text_data = np.zeros((len(fil_calls),max_len,768))

for i in range(len(fil_calls)):
    file =fil_calls[i]

    text_file_path = r'/Users/vishwa/Desktop/MAEC-master/MAEC_Dataset/'+ file +'/text.txt'
    with open(text_file_path, "r") as f:
        content = f.read()
    lines = content.split("\n") # split at new line character

    if lines[-1]=='':
        lines = lines[:-1] # last line blank removal

    sentence_embeddings = sentence_model.encode(lines)
    sentence_embeddings_padded = pad_sent_embed(sentence_embeddings,max_len)

    text_data[i] = np.array(sentence_embeddings_padded)

    # Cache the padded embeddings of each call on disk so that encoding does not
    # have to be repeated.
    with open('/Users/vishwa/Desktop/embeds/'+ file +'.pkl', "wb") as fOut:
        pickle.dump({'embeddings': sentence_embeddings_padded}, fOut, protocol=pickle.HIGHEST_PROTOCOL)

    if i%10==0:
        print("Iterations complete {}".format(i))

Max number of sentences in a call: 444
Avg number of sentences in a call: 175.29530201342283
Iterations complete 200


In [27]:
import pickle
with open('/Users/vishwa/Desktop/embeds/20180122_SFBS.pkl', "rb") as fIn:
    stored_data = pickle.load(fIn)
    stored_embeddings = stored_data['embeddings']

In [555]:
# Directory holding the pickled, padded embeddings (one <call>.pkl per call).
file_path = "/Users/vishwa/Desktop/embeds"
# Filter the Finder metadata entry instead of list.remove(), which raises
# ValueError when '.DS_Store' is absent; the result is sorted as before.
file_list = sorted(name for name in os.listdir(file_path) if name != '.DS_Store')

In [28]:
# Rebuild the text tensor from the per-call pickles written during embedding.
textData = np.zeros((len(fil_calls),max_len,768))

for row, call_name in enumerate(fil_calls):
    pkl_path = '/Users/vishwa/Desktop/embeds/'+call_name+'.pkl'
    with open(pkl_path, "rb") as fIn:
        stored_data = pickle.load(fIn)
    stored_embeddings = stored_data['embeddings']

    textData[row] = np.array(stored_embeddings)

In [29]:
textData.shape

(298, 444, 768)

In [None]:
#import pad_sequences
!pip install tensorflow
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [385]:
# Distinct company identifiers among the filtered calls, in first-seen order
# (dict keys keep insertion order, which gives an order-preserving de-duplication).
comp_list = list(dict.fromkeys(call_name[9:] for call_name in fil_calls))

In [387]:
len(comp_list)

238

## Populating Audio Feature Data

In [433]:
# Statistics over the audio feature files of the filtered calls.
cnt_ineq=0   # calls whose transcript line count differs from the feature row count
file_ineq=0
max_seg=0    # largest number of audio segments (feature rows) in one call
min_date=30000000
max_date=0
tot_seg=0    # audio segments summed over all calls

for i in range(len(fil_calls)):
    file =fil_calls[i]

    text_file_path = r'/Users/vishwa/Desktop/MAEC-master/MAEC_Dataset/'+ file +'/text.txt'
    # Context manager closes the transcript handle (one was leaked per call before).
    with open(text_file_path, "r") as f:
        content = f.read()
    lines = content.split("\n") # split at new line character

    if lines[-1]=='':
        lines = lines[:-1] # last line blank removal

    aud_file_path = r'/Users/vishwa/Desktop/MAEC-master/MAEC_Dataset/'+ file +'/features.csv'
    df = pd.read_csv(aud_file_path)

    # each sentence is mapped with the corresponding audio features
    # number of audio utterance segments in an earning call
    num_of_seg= df.shape[0]

    if len(lines)!=num_of_seg:
        print("error")
        file_no=i
        cnt_ineq += 1

    max_seg = max(max_seg, num_of_seg)
    tot_seg+= num_of_seg

    # The first eight characters of the entry name are the call date as an integer.
    val = int(file[:8])
    min_date = min(min_date, val)
    max_date = max(max_date, val)

# Guard the average so an empty selection reports 0.0 instead of dividing by zero.
avg_seg = tot_seg/len(fil_calls) if fil_calls else 0.0

print("Max number of audio utterances in a call: {}".format(max_seg))
print("Avg number of sentences in a call: {}".format(avg_seg))
print("Min Date: {}".format(min_date))
print("Max Date: {}".format(max_date))

Max number of audio utterances in a call: 444
Avg number of sentences in a call: 175.29530201342283
Min Date: 20180104
Max Date: 20180605


In [33]:
# Allocate the audio tensor: one (segments x features) matrix per call, filled
# with zeros here and populated call by call in a later cell.
# Shorter calls are padded up to num_segments rows.

num_calls = len(fil_calls)
# 29 matches the number of feature columns printed earlier in this notebook.
num_features = 29
# NOTE(review): this uses max_len (the max sentence count) rather than max_seg;
# the printed outputs show both as 444, so they agree for this data - confirm.
num_segments = max_len

audioData = np.zeros((num_calls,num_segments,num_features))

In [34]:
np.shape(audioData)

(298, 444, 29)

In [35]:
# to pad audio segment data or truncate if necessary
def pad_aud_seg(audio_features, max_seg_len=None):
    """Left-pad (or truncate) one call's audio feature matrix to a fixed length.

    Zero rows are placed BEFORE the real segments so the actual audio ends up
    at the end of the sequence.

    Parameters
    ----------
    audio_features : numpy.ndarray of shape (n_segments, n_features)
        One row of audio features per utterance segment.
    max_seg_len : int, optional
        Number of rows of the result. When omitted, the notebook-level
        ``max_len`` is used, which was the previous (implicit) behaviour.

    Returns
    -------
    numpy.ndarray of shape (max_seg_len, n_features)
        The first ``max_seg_len`` rows when the input is at least that long,
        otherwise the input preceded by zero rows.
    """
    if max_seg_len is None:
        # Fall back to the global only when the caller does not say otherwise,
        # so the helper no longer depends on hidden notebook state.
        max_seg_len = max_len

    seg, num_features = audio_features.shape[0], audio_features.shape[1]

    if seg >= max_seg_len:
        # truncate: keep the first max_seg_len segments
        return np.array(audio_features[:max_seg_len, :])

    # Integer zeros reproduce the dtype promotion of the nested-list padding
    # that was used before.
    zero_padding = np.zeros((max_seg_len - seg, num_features), dtype=int)
    return np.concatenate((zero_padding, audio_features), axis=0)

In [36]:
import math
from sklearn.preprocessing import MinMaxScaler

# Fill audioData: read each call's features, turn Praat-style "--undefined--"
# markers and missing values into 0, pad to a common length and store the result.
idx = 0

# Spellings of the "undefined" marker that are replaced by NaN before filling.
undefined_markers = ['--undefined--', '--undefined-', '--undefined-- ']

for i in range(len(fil_calls)):
    file =fil_calls[i]

    text_file_path = r'/Users/vishwa/Desktop/MAEC-master/MAEC_Dataset/'+ file +'/text.txt'
    # Context manager closes the transcript handle (one was leaked per call before).
    with open(text_file_path, "r") as f:
        content = f.read()
    lines = content.split("\n") # split at new line character

    if lines[-1]=='':
        lines = lines[:-1] # last line blank removal

    aud_file_path = r'/Users/vishwa/Desktop/MAEC-master/MAEC_Dataset/'+ file +'/features.csv'
    df = pd.read_csv(aud_file_path)

    # each sentence is mapped with the corresponding audio features
    # number of audio utterance segments in an earning call
    num_of_seg= df.shape[0]

    if len(lines)!=num_of_seg:
        print("error")
        file_no=i
        cnt_ineq += 1

    # replacing undefined str type
    df = df.replace(undefined_markers, float('nan'))
    # Fill on the frame itself: `df[col].fillna(0, inplace=True)` works on a
    # column selection and is not guaranteed to write back into df (it has no
    # effect under pandas copy-on-write).
    df = df.fillna(0)

    audio_feat = np.array(df)
    # pad the audio segment for efficient batching
    padded_values = pad_aud_seg(audio_feat)

    audioData[idx] = padded_values

    if idx%100==0:
        print("Files completed : {}".format(idx))

    idx +=1

Files completed : 0
Files completed : 100
Files completed : 200


In [37]:
audioData.shape

(298, 444, 29)

## Computing the past 3-day volatility and the next 3-day volatility

In [390]:
# sending a list of closing price
def compute_volatility(close_pr,close_pr_prev):
    """Log-volatility of the day-over-day price ratios.

    Computes ``ln(sqrt(sum((r - mean(r))**2) / len(close_pr)))`` where
    ``r[k] = close_pr[k] / close_pr_prev[k]``.

    Parameters
    ----------
    close_pr : sequence of float
        Closing prices of the days in the window.
    close_pr_prev : sequence of float
        Closing price of the day before each entry of ``close_pr``.

    Returns
    -------
    float
        The log-volatility. ``-inf`` when all ratios are identical (zero
        variance); callers test for that value. ``nan`` when ``close_pr`` is
        empty.
    """
    if len(close_pr)==0:
        # Checked up front: previously "error" was printed only after the mean
        # had been taken, and the function then failed on a division by zero.
        print("error")
        return float('nan')

    return_pr = np.asarray([pr/pr_prev for pr,pr_prev in zip(close_pr,close_pr_prev)])

    diff_rt = return_pr - np.mean(return_pr)

    # log(0) = -inf is an expected outcome for a flat window; silence only that
    # warning so real numerical problems stay visible.
    with np.errstate(divide='ignore'):
        vol = np.log(np.sqrt(np.sum(np.multiply(diff_rt,diff_rt))/(len(close_pr))))

    return vol

In [391]:
# scraped and processed data
vol_file_path = r'/Users/vishwa/Desktop/grdVals.csv'
closing_pr = pd.read_csv(vol_file_path)
closing_pr= closing_pr.drop(['Unnamed: 0'],axis=1)
closing_pr.columns

Index(['0', '1', '2', '3', '4', '5', '6', '7'], dtype='object')

In [392]:
# Columns '0'..'7' hold the eight closing prices of a call; '4' is the call day.
# volatility of the previous 3 days (model input)
pre_vol = []

# volatility of the next 3 days -- variable to be predicted
post_vol = []

for _, prices in closing_pr.iterrows():
    # returns of days 1..3, each relative to the day before it
    pre_vol.append(compute_volatility(
        [prices['1'], prices['2'], prices['3']],
        [prices['0'], prices['1'], prices['2']],
    ))
    # returns of days 5..7, each relative to the day before it
    post_vol.append(compute_volatility(
        [prices['5'], prices['6'], prices['7']],
        [prices['4'], prices['5'], prices['6']],
    ))

  vol = np.log(np.sqrt(sum(np.multiply(diff_rt,diff_rt))/(len(close_pr))))


In [393]:
# Finite pre-call volatilities; their mean later replaces the -inf entries.
# `np.float` was an alias of the builtin float and has been removed from NumPy,
# so the builtin is used directly.
neg_inf = float('-inf')
corr = [vol for vol in pre_vol if vol != neg_inf]

In [394]:
# Replace -inf pre-call volatilities (flat price windows) with the mean of the
# finite ones. `np.float` has been removed from NumPy; use the builtin float.
neg_inf = float('-inf')
for i in range(len(pre_vol)):
    if pre_vol[i] == neg_inf:
        pre_vol[i] = np.mean(corr)

In [395]:
# Replace -inf post-call volatilities with the mean held in `corr`.
# `np.float` has been removed from NumPy; use the builtin float.
# NOTE(review): `corr` is built from the PRE-call volatilities, so the targets
# are filled with a pre-call mean - confirm that this is intended.
neg_inf = float('-inf')
for i in range(len(post_vol)):
    if post_vol[i] == neg_inf:
        post_vol[i] = np.mean(corr)

## Model

In [411]:
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import concatenate

In [516]:
import tensorflow
tensorflow.random.set_seed(2)

### Model 1: Concatenation Layer - CNN and Past Values

In [538]:

# Model 1 graph: two per-timestep encoders (text, audio) whose scalar outputs
# are fused by a small convolution, reduced to one value and then combined with
# the past volatility.
feature_emb_dim = 768
feature_aud_dim = 29 # number of audio features
# Not referenced again inside this cell.
batch_size = 8

# Text Encoder
# our text already is in the form of embeddings
input_text = keras.Input(shape=(max_len,feature_emb_dim))
# Add 1 bidirectional LSTM; return_sequences=True keeps one output per timestep
layer1_t = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(input_text)
# one tanh-squashed scalar per timestep
output_t = layers.Dense(1, activation="tanh")(layer1_t)

# Audio Encoder
# our audio is in the form of features
input_aud = keras.Input(shape=(max_len,feature_aud_dim))
# Add 1 bidirectional LSTM; return_sequences=True keeps one output per timestep
layer1_a = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(input_aud)
# one tanh-squashed scalar per timestep
output_a = layers.Dense(1, activation="tanh")(layer1_a)

# Fusion
# stack the text and audio scalars side by side: 2 values per timestep
fusion = concatenate([output_t, output_a])
# add a trailing channel axis so the sequence can be fed to Conv2D
fusion2 = layers.Reshape((max_len,2,1))(fusion)
# A (4,2) kernel with "valid" padding spans both modalities over 4 consecutive
# timesteps, leaving max_len-3 positions (441 for max_len 444, as displayed below).
# NOTE(review): input_shape looks unnecessary here because the layer is applied
# directly to fusion2 - confirm.
fusion3 = layers.Conv2D(filters=1,kernel_size=(4,2),padding="valid",activation="linear",input_shape=(max_len,2))(fusion2)
# flatten to one vector per call
fusion4 = layers.Reshape((fusion3.shape[1],))(fusion3)
fusion5 = layers.Dense(128, activation="linear")(fusion4)

# All of these dense layers use linear activations, so together they form a
# single affine map.
layer_txt_aud = layers.Dense(64, activation="linear")(fusion5)

# combined output from both text and audio
output_txt_aud = layers.Dense(1, activation="linear")(layer_txt_aud)

# Input from past val (one scalar per call)
input_past = keras.Input(shape=(1,))
input_final = concatenate([output_txt_aud, input_past])

# output volatility
output_vol = layers.Dense(1, activation="linear")(input_final)

# Final Vector gives a dense layer as op

In [539]:
fusion4

<KerasTensor: shape=(None, 441) dtype=float32 (created by layer 'reshape_51')>

In [540]:
# Tying the entire model
model_cnn = Model(inputs=[input_text, input_aud,input_past], outputs=output_vol)
# `lr=` is a deprecated alias and `decay=` is rejected by current Keras
# optimizers. The schedule below gives the same rate as the legacy `decay`
# argument: lr / (1 + decay * step), with lr = 1e-3 and decay = 1e-3 / 200.
lr_schedule = keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=1e-3, decay_steps=1, decay_rate=1e-3 / 200)
opt = Adam(learning_rate=lr_schedule)
model_cnn.compile(loss='mean_squared_error', optimizer=opt)

In [541]:
input_aud

<KerasTensor: shape=(None, 444, 29) dtype=float32 (created by layer 'input_148')>

In [542]:
# Train and test split
# The calls are ordered in time, so the split is positional (first `split` calls
# for training, the rest for testing) rather than random.
split = 200

# np.array(...) copies each slice, so the scaling applied later does not modify
# textData / audioData.
train_txt, test_txt = np.array(textData[:split]), np.array(textData[split:])
train_aud, test_aud = np.array(audioData[:split]), np.array(audioData[split:])
train_past, test_past = np.array(pre_vol[:split]), np.array(pre_vol[split:])

# targets: post-call volatility
train_y, test_y = np.array(post_vol[:split]), np.array(post_vol[split:])

In [543]:
# pre processing the data
# in this case we will have to preprocess each branch

# scaling the audio data
from sklearn.preprocessing import MinMaxScaler

# One StandardScaler per timestep: fitted on the training calls only, then
# re-used unchanged on the test calls.
scaler_layer = {step: StandardScaler() for step in range(train_aud.shape[1])}

for step, scaler in scaler_layer.items():
    train_aud[:, step, :] = scaler.fit_transform(train_aud[:, step, :])

for step in range(test_aud.shape[1]):
    test_aud[:, step, :] = scaler_layer[step].transform(test_aud[:, step, :])

In [544]:
train_txt.shape

(200, 444, 768)

In [546]:
# train the model
print("training model...")
model_cnn.fit(
x=[train_txt, train_aud,train_past], y=train_y,
epochs=25, batch_size=8)

training model...
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x1e3252b20>

In [547]:
results = model_cnn.evaluate([test_txt,test_aud,test_past], test_y, batch_size=8)
# The model is compiled with a loss only (no metrics), so evaluate() returns the
# single MSE loss value; there is no accuracy to report.
print("test loss (MSE):", results)

test loss, test acc: 1.9353705644607544


In [None]:
# Print each prediction next to its true post-call volatility.
preds = model_cnn.predict([test_txt,test_aud,test_past])
for predicted, actual in zip(preds, test_y):
    print("{} {}".format(predicted[0], actual))

In [568]:
#!pip install h5py

In [569]:
#!pip install keras

In [552]:
from keras.models import Sequential
from keras.layers import Dense
from keras.models import model_from_json

# serialize model to JSON
model_json = model_cnn.to_json()
with open("cnn3.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model_cnn.save_weights("cnn3.h5")
print("Saved model to disk")

Saved model to disk


In [553]:
# load json and create model
# The context manager closes the file even if reading fails.
with open("cnn3.json", 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("cnn3.h5")
print("Loaded model from disk")

# evaluate loaded model on test data
loaded_model.compile(loss='mean_squared_error', optimizer=opt,metrics=['mean_squared_error'])
score = loaded_model.evaluate([test_txt,test_aud,test_past],test_y, verbose=0)
# score[0] is the loss, score[1] the first compiled metric
print("%s: %.4f" % (loaded_model.metrics_names[1], score[1]))

Loaded model from disk
mean_squared_error: 1.9354


In [554]:
test_txt.shape

(98, 444, 768)

### Model 2: Concatenation Layer - LSTM and Past Value

In [555]:

# Model 2 graph: same text and audio encoders as Model 1, but the two
# per-timestep scalars are fused by an LSTM instead of a convolution.
feature_emb_dim = 768
feature_aud_dim = 29
# Not referenced again inside this cell.
batch_size = 8

# Text Encoder
# our text already is in the form of embeddings
input_text = keras.Input(shape=(max_len,feature_emb_dim))
# Add 1 bidirectional LSTM; return_sequences=True keeps one output per timestep
layer1_t = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(input_text)
# one tanh-squashed scalar per timestep
output_t = layers.Dense(1, activation="tanh")(layer1_t)

# Audio Encoder
# our audio is in the form of features
input_aud = keras.Input(shape=(max_len,feature_aud_dim))
# Add 1 bidirectional LSTM; return_sequences=True keeps one output per timestep
layer1_a = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(input_aud)
# one tanh-squashed scalar per timestep
output_a = layers.Dense(1, activation="tanh")(layer1_a)

# Fusion
# stack the text and audio scalars side by side: 2 values per timestep
fusion = concatenate([output_t, output_a])
# single-unit LSTM that still emits one value per timestep
fusion2 = layers.LSTM(1, return_sequences=True)(fusion)
# drop the trailing unit axis: one vector of per-timestep values per call
fusion3 = layers.Reshape((fusion2.shape[1],))(fusion2)
fusion4 = layers.Dense(1, activation="linear")(fusion3)

# All of these dense layers use linear activations, so together they form a
# single affine map.
layer_txt_aud = layers.Dense(64, activation="linear")(fusion4)
output_txt_aud = layers.Dense(1, activation="linear")(layer_txt_aud)

# Input from past val (one scalar per call)
input_past = keras.Input(shape=(1,))
input_final = concatenate([output_txt_aud, input_past])

# output volatility
output_vol = layers.Dense(1, activation="linear")(input_final)

# Final Vector gives a dense layer as op

In [556]:
input_final

<KerasTensor: shape=(None, 2) dtype=float32 (created by layer 'concatenate_111')>

In [557]:
# Input to the model includes text input, audio input and the past volatility

# Tying the entire model
model_lstm = Model(inputs=[input_text, input_aud,input_past], outputs=output_vol)
# `lr=` is a deprecated alias and `decay=` is rejected by current Keras
# optimizers. The schedule below gives the same rate as the legacy `decay`
# argument: lr / (1 + decay * step), with lr = 1e-3 and decay = 1e-3 / 200.
lr_schedule = keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=1e-3, decay_steps=1, decay_rate=1e-3 / 200)
opt = Adam(learning_rate=lr_schedule)
model_lstm.compile(loss='mean_squared_error', optimizer=opt)

In [558]:
# Train and test split
# Positional split: the first `split` calls train the model, the rest test it.
split = 200

# np.array(...) copies each slice, so the scaling below does not modify
# textData / audioData.
train_txt, test_txt = np.array(textData[:split]), np.array(textData[split:])
train_aud, test_aud = np.array(audioData[:split]), np.array(audioData[split:])
train_past, test_past = np.array(pre_vol[:split]), np.array(pre_vol[split:])

# targets: post-call volatility
train_y, test_y = np.array(post_vol[:split]), np.array(post_vol[split:])

# pre processing the data
# in this case we will have to preprocess each branch

# scaling the audio data
from sklearn.preprocessing import MinMaxScaler

# One StandardScaler per timestep: fitted on the training calls only, then
# re-used unchanged on the test calls.
scaler_layer = {step: StandardScaler() for step in range(train_aud.shape[1])}

for step, scaler in scaler_layer.items():
    train_aud[:, step, :] = scaler.fit_transform(train_aud[:, step, :])

for step in range(test_aud.shape[1]):
    test_aud[:, step, :] = scaler_layer[step].transform(test_aud[:, step, :])

In [559]:
test_aud.shape

(98, 444, 29)

In [560]:
# train the model
print("training model...")
# The former `cnt += 1` was dropped: `cnt` is never defined anywhere in this
# notebook, so the cell raised NameError on a fresh kernel, and nothing reads it.
model_lstm.fit(
x=[train_txt, train_aud,train_past], y=train_y,
#validation_data=([test_txt,test_aud,test_past], test_y),
epochs=25, batch_size=8)

training model...
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x1cdd38e20>

In [561]:
results = model_lstm.evaluate([test_txt,test_aud,test_past], test_y, batch_size=8)
# The model is compiled with a loss only (no metrics), so evaluate() returns the
# single MSE loss value; there is no accuracy to report.
print("test loss (MSE):", results)

test loss, test acc: 0.8469723463058472


In [562]:
# from keras.models import Sequential
from keras.layers import Dense
from keras.models import model_from_json

# serialize model to JSON
model_json = model_lstm.to_json()
with open("model_lstm.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model_lstm.save_weights("model_lstm.h5")
print("Saved model to disk")
 


Saved model to disk


In [563]:
# load json and create model
# The context manager closes the file even if reading fails.
with open('model_lstm.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model_lstm.h5")
print("Loaded model from disk")

# evaluate loaded model on test data
loaded_model.compile(loss='mean_squared_error', optimizer=opt,metrics=['mean_squared_error'])
score = loaded_model.evaluate([test_txt,test_aud,test_past],test_y, verbose=0)
# score[0] is the loss, score[1] the first compiled metric
print("%s: %.4f" % (loaded_model.metrics_names[1], score[1]))

Loaded model from disk
mean_squared_error: 0.8470


In [518]:
print(loaded_model) 
print(loaded_model.summary())

<tensorflow.python.keras.engine.functional.Functional object at 0x1e824b460>
Model: "model_20"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_111 (InputLayer)          [(None, 444, 768)]   0                                            
__________________________________________________________________________________________________
input_112 (InputLayer)          [(None, 444, 29)]    0                                            
__________________________________________________________________________________________________
bidirectional_118 (Bidirectiona (None, 444, 256)     918528      input_111[0][0]                  
__________________________________________________________________________________________________
bidirectional_119 (Bidirectiona (None, 444, 128)     48128       input_112[0][0]                  
______________