 - Required Libraries

In [None]:
import tensorflow.compat.v1 as tf
# Turn off TensorFlow 2.x behaviour so the tf.compat.v1 session API below is usable.
tf.disable_v2_behavior()
# Session configuration: ask TensorFlow to grow GPU memory on demand.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# NOTE(review): `session` is created here but never handed to Keras in the visible
# code — confirm that the allow_growth setting reaches the session Keras trains in.
session = tf.Session(config=config)


In [None]:
import os
import pickle
import numpy as np
import pandas as pd
import re
import calendar
import warnings
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer,text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential, Model
from keras.preprocessing.text import Tokenizer
from keras.layers import Bidirectional, Embedding, LSTM, Dense, Conv1D, GlobalMaxPool1D, MaxPool1D, MaxPooling1D, Dropout, Activation , Flatten , Input, concatenate
from keras.callbacks import ModelCheckpoint
from keras import backend as K
from    keras.utils.vis_utils    import plot_model

 - Keras imports

 - Load the Data

In [None]:
# Input location and file names are defined once and reused, so changing the
# data directory no longer requires editing every read call.
DATA_DIR = '../input/nlp-project-data/Data'
reutersFile = 'news_reuters.csv'
stockFile = 'stockReturns.json'

# The Reuters CSV has no header row, so the column names are supplied here.
df1 = pd.read_csv(os.path.join(DATA_DIR, reutersFile), header=None,
                  names=['ticker', 'company', 'pub_date', 'headline', 'first_sent', 'category'])
# Stock returns; reshaped later by reformat_y_data.
df2 = pd.read_json(os.path.join(DATA_DIR, stockFile))

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df2.head()

In [None]:
def reformat_y_data(data, tickerType='mid'):
    """Reshape one column of per-ticker return mappings into a long label table.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame indexed by ticker whose ``tickerType`` column holds one
        mapping (date -> return) per ticker.
    tickerType : str
        Name of the column to expand (default ``'mid'``).

    Returns
    -------
    pandas.DataFrame
        Columns ``ticker``, ``pub_date``, ``price`` and ``y``, where ``y`` is 1
        when ``price >= 0`` and 0 otherwise. Ticker/date pairs with no value
        are omitted.
    """
    # One row per ticker, one column per date.
    wide = data[tickerType].apply(pd.Series)
    # rename() is used without inplace=True: the in-place form is documented to
    # return None, so chaining on it relied on an implementation accident.
    # dropna() makes the "skip missing pairs" behaviour explicit instead of
    # depending on the pandas-version-specific default of stack().
    tmp = wide.stack().dropna().rename('price').reset_index()
    tmp['y'] = np.where(tmp['price'] >= 0, 1, 0)
    # The unnamed index levels come back from reset_index() as level_0/level_1.
    return tmp.rename(columns={'level_0': 'ticker', 'level_1': 'pub_date'})

def clean_and_merge_data(X, Y):
    """Left-join news rows with their stock labels on (ticker, pub_date).

    Parameters
    ----------
    X : pandas.DataFrame
        News data with at least ``ticker`` and ``pub_date`` columns.
    Y : pandas.DataFrame
        Label data with at least ``ticker`` and ``pub_date`` columns.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Rows of ``X`` whose ticker occurs in ``Y``, with the columns of ``Y``
        attached; news rows without a matching date keep NaN labels.
    """
    y_tickers = set(Y['ticker'])
    X = X.loc[X['ticker'].isin(y_tickers)]
    # Align the key dtypes with X itself (not a notebook global) and do it on a
    # copy, so the caller's Y frame is left untouched.
    Y = Y.assign(
        pub_date=Y['pub_date'].astype(X['pub_date'].dtype),
        ticker=Y['ticker'].astype(X['ticker'].dtype),
    )
    return X.merge(Y, on=['ticker', 'pub_date'], how='left')

def clean_text(sent):
    """Normalise one headline or sentence string.

    Steps, in order: collapse runs of spaces, expand the literal abbreviation
    ``U.S.`` to ``United States``, strip a leading upper-case/digit prefix,
    strip a leading lone punctuation mark, remove "<Month> <digits>" dates,
    and collapse spaces again.

    Parameters
    ----------
    sent : str
        Raw text.

    Returns
    -------
    str
        Cleaned text (may keep a single leading or trailing space).
    """
    monthStrings = list(calendar.month_name)[1:] + list(calendar.month_abbr)[1:]
    monthPattern = '|'.join(monthStrings)
    sent = re.sub(r' +', ' ', sent)
    # Dots are escaped: the unescaped pattern also rewrote text such as "UPS ".
    sent = re.sub(r'U\.S\.', 'United States', sent)
    sent = re.sub(r'^(\W?[A-Z\s\d]+\b-?)', '', sent)
    sent = re.sub(r'^ ?\W ', '', sent)
    # \b keeps month names from matching at the tail of a longer word.
    sent = re.sub(r'\b({}) \d+'.format(monthPattern), '', sent)
    sent = re.sub(r' +', ' ', sent)
    return sent

def tokenize_sent(col):
    """Split every text in ``col`` into a list of word tokens, keeping case.

    Returns a list with one token list per input text, in input order.
    """
    tokenized = []
    for text in col:
        tokenized.append(text_to_word_sequence(text, lower=False))
    return tokenized

def filt_to_one(x, random_state=10):
    """Reduce a group of news rows to a single row.

    A group with at most one row is returned unchanged. Otherwise rows whose
    ``category`` is ``'topStory'`` are preferred when any exist, and if more
    than one candidate is left, one is drawn with ``random_state``.
    """
    if x.shape[0] <= 1:
        return x

    candidates = x
    has_top_story = 'topStory' in candidates['category'].unique()
    if has_top_story:
        candidates = candidates.loc[candidates['category'] == 'topStory']

    if candidates.shape[0] > 1:
        return candidates.sample(n=1, random_state=random_state)
    return candidates

In [None]:
# Long-format labels taken from the 'short' column of the returns data.
cleanY = reformat_y_data(df2, tickerType='short')
# News rows joined with their labels on (ticker, pub_date).
merged = clean_and_merge_data(X=df1, Y=cleanY)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
merged.head()

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df1.head()

 - Clean up the Data

In [None]:
# Combine headline and first sentence into one text column in a single
# vectorised assignment. The previous row-by-row loop wrote into
# merged['final_text'][i] before that column existed (KeyError on a fresh
# kernel) and used chained assignment. A space separator keeps the last word
# of the headline from fusing with the first word of the sentence.
merged['final_text'] = merged['headline'] + ' ' + merged['first_sent']

In [None]:
# Preview only the first rows instead of rendering the whole frame.
merged.head()

In [None]:

# Clean up text
merged['headline'] = merged.headline.apply(clean_text)
merged['first_sent'] = merged.first_sent.apply(clean_text)
# final_text is the model input (X below). It is built from BOTH cleaned fields;
# previously it was overwritten with the first sentence only (cleaned twice),
# which silently dropped the headline from the model input.
merged['final_text'] = merged['headline'] + ' ' + merged['first_sent']
# Turn sentences into tokens
merged['headline_token'] = tokenize_sent(merged.headline)
merged['first_sent_token'] = tokenize_sent(merged.first_sent)
# Get one record per company/day
finalData = merged.groupby(by=['ticker', 'pub_date']).apply(filt_to_one)
# Combine Headline and First Sentence into one token list
finalData['final_text_tokens'] = finalData['headline_token'] + finalData.first_sent_token
# Remove observations with missing stock price
finalData.dropna(inplace=True)
# The key columns are renamed (ticker2 / pub_date2) so that reset_index() can
# restore 'ticker' and 'pub_date' from the group index without a name clash.
# NOTE(review): this positional rename assumes exactly these 12 columns in this
# order — confirm against the pandas version in use.
new_columns = ['ticker2', 'company', 'pub_date2',
            'headline', 'first_sent', 'category',
            'price', 'y', 'final_text','headline_token',
            'first_sent_token','final_text_tokens']
finalData.columns = new_columns
finalData.reset_index(inplace=True)
# Model inputs: raw combined text and the binary label.
X = finalData['final_text'].values
y = finalData['y'].values

In [None]:
# Preview only the first rows instead of rendering the whole frame.
finalData.head()

In [None]:
# Token lists of the first sentences, one entry per row of finalData.
data=finalData['first_sent_token']
# Number of rows in the final data set.
print(len(data))

In [None]:
# Average number of tokens per first sentence. The divisor is the actual
# number of rows rather than a hard-coded 11390, so the figure stays correct
# when the data set changes.
n_rows = len(data)
j = sum(len(tokens) for tokens in data) / n_rows if n_rows else 0.0
print(j)

 - Create a train and test set, retaining the same test set for every model

In [None]:
# Split data into training and testing sets, stratified on y. The seed is fixed
# so every model (and every re-run) is evaluated on the same test set.
SPLIT_SEED = 10
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=SPLIT_SEED)
trainTokensAsString = X_train
testTokensAsString = X_test

# Vocabulary cap passed to the tokenizer. Set to 200, the value the tokenizer
# was actually constructed with (the former constant of 40 was never used).
MAX_NUM_WORDS = 200

# Length every word-id sequence is padded/truncated to.
MAX_SEQUENCE_LENGTH = 100

# Fit the tokenizer on the training texts only, then build padded sequences.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(X_train)
sequences = tokenizer.texts_to_sequences(X_train)
train_data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Convert y_train to its one-hot encoded version.
word_index = tokenizer.word_index
y_train_labels = to_categorical(np.asarray(y_train))

# Check shape of train_data and y_train_labels.
print('Shape of data tensor:', train_data.shape)
print('Shape of label tensor:', y_train_labels.shape)

# Encode the test texts with the SAME tokenizer. Fitting a second tokenizer on
# X_test gave test words different integer ids from the training vocabulary,
# so the test sequences did not line up with the embedding matrix built from
# `word_index`. The old names are kept as aliases for compatibility.
tokenizer_test = tokenizer
sequences_test = tokenizer.texts_to_sequences(X_test)
test_data = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)
word_index_text = word_index
# Convert y_test to its one-hot encoded version.
y_test_labels = to_categorical(np.asarray(y_test))

# Check shape of test_data and y_test_labels.
print('Shape of data tensor:', test_data.shape)
print('Shape of label tensor:', y_test_labels.shape)

 - Load word embeddings

In [None]:
# Load the pre-trained GloVe vectors (glove.6B.100d.txt) into a dict mapping
# word -> float32 vector. The file is expected at the path opened below.
GLOVE_DIR=''
embeddings_index = {}
# `with` guarantees the file is closed even if a line fails to parse, and the
# explicit encoding avoids platform-dependent decoding of non-ASCII tokens.
with open('../input/glove6b100/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

 - Create Embedding Matrix

In [None]:
# Dimensionality of each word vector.
EMBEDDING_DIM = 100

# One row per tokenizer index plus one extra row for index 0.
n_embedding_rows = len(word_index) + 1
embedding_matrix = np.zeros((n_embedding_rows, EMBEDDING_DIM))

# Copy the pre-trained vector of every known word into its row; words missing
# from the embedding index keep their all-zero row.
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

# Frozen embedding layer initialised from the matrix above.
embedding_layer = Embedding(
    n_embedding_rows,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)


 - Define functions to calculate precision and recall

## Model 2: CNN

In [None]:
def vectorize_sentences(data, lexicon, maxlen=200):
    """One-hot encode every item of ``data`` token by token and pad to ``maxlen``.

    Parameters
    ----------
    data : iterable
        Each item is iterated token by token (a plain string yields characters).
    lexicon : dict
        Maps token -> integer index. Tokens missing from it are encoded with
        ``lexicon['<UNK>']``.
    maxlen : int
        Length passed to ``pad_sequences``.

    Returns
    -------
    The padded array produced by ``pad_sequences``; each token is a one-hot row
    of width ``len(lexicon) + 1``.
    """
    # Width is derived from the lexicon argument itself (not from a notebook
    # global), and the identity matrix is built once instead of per sentence.
    one_hot = np.eye(len(lexicon) + 1)
    X = []
    for sentences in data:
        x = [lexicon[token] if token in lexicon else lexicon['<UNK>'] for
                                 token in sentences]
        # Fancy indexing copies the selected rows, so entries do not share memory.
        X.append(one_hot[x])
    return (pad_sequences(X, maxlen=maxlen))

def create_cnn_model(char_maxlen, vocab_size,
                     nb_filter=100, filter_kernels = [4] * 4,
                     pool_size=3, n_dense_nodes=100,
                     drop_out=.2, n_out=2):
    """Build and compile the character-level CNN classifier.

    Architecture: four Conv1D layers (kernel sizes ``filter_kernels[0..3]``,
    ``nb_filter`` filters each, ReLU) with max-pooling after the first, second
    and fourth convolution, then Flatten -> Dense(``n_dense_nodes``, ReLU) ->
    Dropout(``drop_out``) -> Dense(``n_out``, softmax).

    The model takes input of shape ``(char_maxlen, vocab_size)`` and is compiled
    with categorical cross-entropy, Adam, and the accuracy / recall / precision
    metrics.
    """
    input_dims = (char_maxlen, vocab_size)
    inputs = Input(shape=input_dims, name='char_input_layer')

    # Whether a max-pool follows each of the four convolution stages.
    pool_after_stage = (True, True, False, True)

    features = inputs
    for stage in range(4):
        conv_kwargs = dict(kernel_size=filter_kernels[stage],
                           padding='valid', activation='relu')
        if stage == 0:
            conv_kwargs['input_shape'] = input_dims
        features = Conv1D(nb_filter, **conv_kwargs)(features)
        if pool_after_stage[stage]:
            features = MaxPool1D(pool_size=pool_size)(features)

    features = Flatten()(features)
    features = Dense(n_dense_nodes, activation='relu')(features)
    features = Dropout(drop_out)(features)

    output_layer = Dense(n_out, activation='softmax', name='output')(features)

    model = Model(inputs=inputs, outputs=output_layer)
    model.compile(loss='categorical_crossentropy', optimizer="adam",
                  metrics=['accuracy', recall, precision])
    return model

# Hyper-parameters of the character-level CNN.
char_maxlen = 1024
nb_filter = 128
dense_outputs = 1024
filter_kernels = [7, 5, 5, 3]
pool_size = 5

trainTokensAsString = X_train
testTokensAsString = X_test

# Join all training texts into one string to collect the character inventory.
oneTxt = ' '.join(trainTokensAsString)

# Sorted so the character -> index mapping is identical on every run; plain set
# iteration order can differ between kernels, which changes the encoding.
chars = sorted(set(oneTxt))
vocab_size = len(chars) + 1
print('total chars:', vocab_size)
# Index 0 is left unused, 1 is the unknown-character slot, real characters
# start at 2.
char_indices = dict((c, i + 2) for i, c in enumerate(chars))
indices_char = dict((i + 2, c) for i, c in enumerate(chars))

char_indices['<UNK>'] = 1
indices_char[1] = '<UNK>'

# One-hot encode both splits with the training vocabulary, padded to char_maxlen.
trainCharData = vectorize_sentences(trainTokensAsString, char_indices, char_maxlen)
testCharData = vectorize_sentences(testTokensAsString, char_indices, char_maxlen)
# Show both shapes (a bare expression in the middle of a cell displays nothing).
trainCharData.shape, testCharData.shape



In [None]:
def precision(y_true, y_pred):
    """Batch precision metric: rounded true positives / rounded predicted positives.

    ``K.epsilon()`` is added to the denominator to avoid division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def recall(y_true, y_pred):
    """Batch recall metric: rounded true positives / rounded actual positives.

    ``K.epsilon()`` is added to the denominator to avoid division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())
def train_and_test_model(model, x_train, y_train, x_test, y_test,
                         modelSaveName, modelSavePath='',
                         batch_size=64, epochs=2, validation_split=.1):
    """Fit ``model``, checkpoint it after every epoch, and evaluate on the test set.

    Parameters
    ----------
    model : compiled Keras model whose metrics are accuracy, recall, precision
        (in that order), so ``evaluate`` returns four values.
    x_train, y_train : training inputs and one-hot labels.
    x_test, y_test : test inputs and one-hot labels.
    modelSaveName : str
        Base file name of the checkpoint; ``.hdf5`` is appended.
    modelSavePath : str
        Directory for the checkpoint (default: current directory).
    batch_size, epochs, validation_split : forwarded to ``model.fit``.

    Returns
    -------
    tuple
        ``(model, accuracy, recall, precision)`` measured on the test set.
    """
    # summary() prints by itself and returns None; wrapping it in print()
    # added a stray "None" line to the output.
    model.summary()

    filepath = os.path.join(modelSavePath, modelSaveName + '.hdf5')
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1)
    callbacks_list = [checkpoint]
    model.fit(x=x_train, y=y_train, batch_size=batch_size,
              epochs=epochs, validation_split=validation_split,
              callbacks=callbacks_list,verbose=1)

    # evaluate() returns the loss first, followed by the compiled metrics.
    score, acc, rec, prec = model.evaluate(x_test, y_test, batch_size=batch_size)
    return (model, acc, rec, prec)

# Number of output units passed to every model builder below.
n_out = 2




In [None]:
# Hyper-parameters for the character-level CNN, gathered in one place.
cnn_hyperparams = dict(
    char_maxlen=char_maxlen,
    vocab_size=vocab_size,
    nb_filter=nb_filter,
    filter_kernels=filter_kernels,
    pool_size=pool_size,
    n_dense_nodes=dense_outputs,
    drop_out=.5,
    n_out=n_out,
)
cnn_model = create_cnn_model(**cnn_hyperparams)

#plot_model(cnn_model, to_file="cnnmodel.png",show_shapes=True)

# Column 0 of the one-hot axis is sliced off so the last dimension equals vocab_size.
cnn_train_x = trainCharData[:, :, 1:]
cnn_test_x = testCharData[:, :, 1:]
cnn_res = train_and_test_model(cnn_model, cnn_train_x, y_train_labels,
                               cnn_test_x, y_test_labels,
                               'cnn_model', epochs=1)

In [None]:
train_data.shape

# **ML**

In [None]:
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
!pip install xgboost
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import neighbors
# Classical baselines, each configured separately and then collected in the
# order expected by the evaluation loop.
perceptron_clf = Perceptron()
logreg_clf = LogisticRegression(C=1000.0, solver='liblinear', random_state=0,
                                class_weight='balanced')
svm_clf = SVC(kernel='rbf', gamma=0.2, C=0.5, class_weight='balanced')
mlp_clf = MLPClassifier(hidden_layer_sizes=(200,100,50), activation='relu',
                        solver='adam', alpha=0.0001)
knn_clf = neighbors.KNeighborsClassifier(n_neighbors=6, n_jobs=1)
tree_clf = DecisionTreeClassifier(criterion='entropy', max_depth=10,
                                  random_state=0, class_weight='balanced')
forest_clf = RandomForestClassifier(criterion='entropy', n_estimators=100,
                                    random_state=1, n_jobs=2)
xgb_clf = XGBClassifier(n_estimators=200, use_label_encoder=False, max_depth=4,
                        learning_rate=0.1, scale_pos_weight=0.5)

models = [perceptron_clf, logreg_clf, svm_clf, mlp_clf,
          knn_clf, tree_clf, forest_clf, xgb_clf]

In [None]:
# Shapes of the word-id matrices and label vectors fed to the classical models.
for split_array in (train_data, y_train, test_data, y_test):
    print(split_array.shape)

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training matrix only; fit() returns the scaler itself.
sc = StandardScaler().fit(train_data)

# Per-column scale and mean learned from the training data.
print(sc.scale_, sc.mean_)

# Apply the training-set statistics to both splits.
X_train_std, X_test_std = (sc.transform(split) for split in (train_data, test_data))

In [None]:
# Display names for the classifiers; they are paired positionally with `models`
# via zip() below, so the order here must match the order of that list.
model_names = ['Perceptron','LogisticRegression','No-linear-SVM','neural_network','KNN', 'DecisionTreeClassifier', 'RandomForest', 'XGBOOST']
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score
def train_and_get_importance(model, model_name):
    """Fit ``model`` on the standardised training data and print test metrics.

    Reads the notebook globals ``X_train_std``, ``y_train``, ``X_test_std`` and
    ``y_test``. Prints the number of misclassified samples, accuracy, the
    classification report, the confusion matrix and the ROC AUC. Returns None.
    """
    print(model_name+'----traning')
    model.fit(X_train_std,y_train)
    y_pred = model.predict(X_test_std)
    print('Misclassified samples: %d' % (y_test != y_pred).sum())
    print('Accuracy: %.2f' % metrics.accuracy_score(y_test, y_pred))
    print(metrics.classification_report(y_test, y_pred, digits=4, target_names=['0', '1']))
    con_mat=metrics.confusion_matrix(y_test, y_pred)
    print(con_mat)
    # ROC/AUC needs a continuous score. Feeding hard 0/1 predictions collapses
    # the curve to a single operating point, so use class-1 probabilities or the
    # decision function when the estimator offers one.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test_std)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(X_test_std)
    else:
        y_score = y_pred
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_score)
    print ('auc score', auc(false_positive_rate, true_positive_rate))
   

# Train and report every classical model together with its display name.
for model, model_name in zip(models, model_names):
    train_and_get_importance(model=model, model_name=model_name)

# **Model 1: RNN**

In [None]:
from keras.layers import LSTM,GRU
from keras.layers.recurrent import SimpleRNN
def create_rnn_model2(seq_input_len, embed_matrix, 
                     n_RNN_nodes, n_dense_nodes, 
                     recurrent_dropout=0.2, 
                     drop_out=.2, n_out=2):
    """Build and compile the word-level bidirectional LSTM classifier.

    Architecture: Embedding initialised from ``embed_matrix`` (zero index
    masked) -> two stacked Bidirectional LSTM layers of ``n_RNN_nodes`` units
    (only the first returns sequences) -> Dense(``n_dense_nodes``, ReLU) ->
    Dropout(``drop_out``) -> Dense(``n_out``, softmax).

    The model takes integer sequences of length ``seq_input_len`` and is
    compiled with categorical cross-entropy, Adam, and the accuracy / recall /
    precision metrics.
    """
    word_input = Input(shape=(seq_input_len,), name='word_input_layer')
    sequence = Embedding(input_dim=embed_matrix.shape[0],
                         output_dim=embed_matrix.shape[1],
                         weights=[embed_matrix],
                         mask_zero=True,
                         name='word_embedding_layer')(word_input)

    # (layer name, return_sequences) for the two stacked recurrent layers.
    recurrent_plan = (('hidden_layer1', True), ('hidden_layer2', False))
    for layer_name, keep_sequences in recurrent_plan:
        lstm = LSTM(units=n_RNN_nodes, return_sequences=keep_sequences,
                    recurrent_dropout=recurrent_dropout,
                    dropout=drop_out, name=layer_name)
        sequence = Bidirectional(lstm)(sequence)

    hidden = Dense(units=n_dense_nodes, activation='relu', name='dense_layer')(sequence)
    hidden = Dropout(drop_out)(hidden)
    output_layer = Dense(units=n_out, activation='softmax',
                         name='output_layer')(hidden)

    model = Model(inputs=[word_input], outputs=output_layer)
    model.compile(loss='categorical_crossentropy', optimizer="adam",
                  metrics=['accuracy', recall, precision])
    return model

# Hyper-parameters of the word-level RNN; the input length is the padded
# sequence length of train_data.
rnn_hyperparams = dict(
    seq_input_len=train_data.shape[-1],
    embed_matrix=embedding_matrix,
    recurrent_dropout=.4,
    drop_out=.5,
    n_RNN_nodes=500,
    n_dense_nodes=500,
    n_out=n_out,
)
rnn_model = create_rnn_model2(**rnn_hyperparams)


In [None]:
# Write a diagram of the RNN architecture, including tensor shapes, to rnnmodel.png.
plot_model(rnn_model, to_file="rnnmodel.png",show_shapes=True)

In [None]:
# Number of training epochs for the RNN run.
nb_epoch = 1

# Fit on the padded word-id sequences and evaluate on the held-out split.
rnn_res = train_and_test_model(
    model=rnn_model,
    x_train=train_data,
    y_train=y_train_labels,
    x_test=test_data,
    y_test=y_test_labels,
    modelSaveName='rnn_model',
    epochs=nb_epoch,
)

## Model 3: RNN+CNN

In [None]:
from keras.layers import LSTM,GRU
from keras.layers.recurrent import SimpleRNN
def create_cnn_rnn_model(rnn_input_len, char_maxlen, vocab_size,
                         embed_matrix, n_RNN_nodes, 
                         nb_filter=100, filter_kernels = [4] * 4,
                         pool_size=3, n_dense_nodes=100,
                         recurrent_dropout=0.2, 
                         drop_out=.2, n_out=2):
    """Build and compile the two-input model: word-level GRU plus char-level CNN.

    Word branch: Embedding initialised from ``embed_matrix`` (zero index masked)
    followed by two stacked Bidirectional GRU layers of ``n_RNN_nodes`` units.
    Character branch: four Conv1D layers (kernel sizes ``filter_kernels[0..3]``)
    with max-pooling after the first, second and fourth, then Flatten.
    The branches are concatenated (CNN output first) and passed through two
    Dense(``n_dense_nodes``, ReLU) + Dropout blocks and a softmax output of
    ``n_out`` units.

    Inputs are ``[word ids of length rnn_input_len, one-hot chars of shape
    (char_maxlen, vocab_size)]``. Compiled with categorical cross-entropy, Adam,
    and the accuracy / recall / precision metrics.
    """
    char_dims = (char_maxlen, vocab_size)
    word_input = Input(shape=(rnn_input_len,), name='word_input_layer')
    char_input = Input(shape=char_dims, name='char_input_layer')

    # ---- word branch ----
    word_branch = Embedding(input_dim=embed_matrix.shape[0],
                            output_dim=embed_matrix.shape[1],
                            weights=[embed_matrix],
                            mask_zero=True,
                            name='word_embedding_layer')(word_input)
    for layer_name, keep_sequences in (('hidden_layer1', True), ('hidden_layer2', False)):
        gru = GRU(units=n_RNN_nodes, return_sequences=keep_sequences,
                  recurrent_dropout=recurrent_dropout,
                  dropout=drop_out, name=layer_name)
        word_branch = Bidirectional(gru)(word_branch)

    # ---- character branch ----
    pool_after_stage = (True, True, False, True)
    char_branch = char_input
    for stage in range(4):
        conv_kwargs = dict(kernel_size=filter_kernels[stage],
                           padding='valid', activation='relu')
        if stage == 0:
            conv_kwargs['input_shape'] = char_dims
        char_branch = Conv1D(nb_filter, **conv_kwargs)(char_branch)
        if pool_after_stage[stage]:
            char_branch = MaxPool1D(pool_size=pool_size)(char_branch)
    char_branch = Flatten()(char_branch)

    # ---- joint head ----
    head = concatenate([char_branch, word_branch])
    for dense_name in ('dense_layer', None):
        head = Dense(n_dense_nodes, activation='relu', name=dense_name)(head)
        head = Dropout(drop_out)(head)

    main_output = Dense(n_out, activation='softmax', name='output_layer')(head)

    model = Model(inputs=[word_input, char_input], outputs=[main_output])
    model.compile(loss='categorical_crossentropy', optimizer="adam",
                  metrics=['accuracy', recall, precision])
    return model

In [None]:
# Hyper-parameters of the combined word-GRU / char-CNN model.
cnn_rnn_hyperparams = dict(
    rnn_input_len=train_data.shape[-1],
    char_maxlen=char_maxlen,
    vocab_size=vocab_size,
    embed_matrix=embedding_matrix,
    n_RNN_nodes=500,
    nb_filter=nb_filter,
    filter_kernels=filter_kernels,
    pool_size=pool_size,
    n_dense_nodes=400,
    recurrent_dropout=0.4,
    drop_out=.5,
    n_out=n_out,
)
cnn_rnn_model = create_cnn_rnn_model(**cnn_rnn_hyperparams)
# Write the architecture diagram (with shapes) to cnn_rnn_model.png.
plot_model(cnn_rnn_model, to_file="cnn_rnn_model.png", show_shapes=True)

In [None]:
# Number of training epochs for the combined model.
nb_epoch = 1

# Each split is a pair: padded word ids and one-hot characters (with column 0
# of the one-hot axis sliced off).
cnn_rnn_train_x = [train_data, trainCharData[:, :, 1:]]
cnn_rnn_test_x = [test_data, testCharData[:, :, 1:]]
cnn_rnn_res = train_and_test_model(
    model=cnn_rnn_model,
    x_train=cnn_rnn_train_x,
    y_train=y_train_labels,
    x_test=cnn_rnn_test_x,
    y_test=y_test_labels,
    modelSaveName='cnn_rnn_model',
    epochs=nb_epoch,
)

## 4. Compare the performance of all models in a table (accuracy, recall, and precision)

In [None]:
# Test-set metrics per model; elements 1..3 of each result tuple are
# (accuracy, recall, precision).
metric_rows = {
    'rnn_mod': rnn_res[1:4],
    'cnn_rnn_mod': cnn_rnn_res[1:4],
}
pd.DataFrame.from_records(list(metric_rows.values()),
                          columns=['accuracy', 'recall', 'precision'],
                          index=list(metric_rows.keys()))

In [None]:
# Same table restricted to the RNN model.
rnn_metric_row = rnn_res[1:4]
pd.DataFrame.from_records([rnn_metric_row],
                          columns=['accuracy', 'recall', 'precision'],
                          index=['rnn_mod'])

## 5. Look at your labeling and print out the underlying data compared to the labels - for each model print out 2-3 examples of a good classification and a bad classification. Make an assertion why your model does well or poorly on those outputs.

In [None]:
def print_classifications(classifications, classType, test_y, test_text):
    """Print the selected test examples with their actual stock movement.

    Parameters
    ----------
    classifications : index array selecting the examples to show.
    classType : str
        Label inserted into the heading (e.g. 'correct').
    test_y : array of labels; truthy entries are reported as 'positive'.
    test_text : array of texts aligned with ``test_y``.
    """
    chosen_texts = [''.join(sent) for sent in test_text[classifications]]
    movements = np.where(test_y[classifications], 'positive', 'negative')

    print('Examples of {} predictions:\n'.format(classType))
    for position, news in enumerate(chosen_texts):
        print('Stock movement was {}'.format(movements[position]))
        print('News info:\n{}'.format(news))
        print('')

In [None]:
def predict_and_print_samples(model, modelName, test_x, test_y=y_test, test_text = X_test):
    """Print sample predictions of ``model`` on the test set.

    Shows up to three correctly and up to three incorrectly classified
    examples, followed by the three examples with the highest predicted
    probability for class 1 next to their actual labels.

    Parameters
    ----------
    model : fitted model whose ``predict`` returns one probability row per example.
    modelName : str
        Name used in the heading.
    test_x : inputs passed to ``model.predict``.
    test_y : array of integer labels aligned with ``test_x``
        (default: the notebook's ``y_test`` at definition time).
    test_text : array of texts aligned with ``test_x``
        (default: the notebook's ``X_test`` at definition time).
    """
    print('Stats for {} model'.format(modelName))
    res = model.predict(test_x)
    class_res = np.argmax(res, axis=1)
    is_correct = class_res == np.asarray(test_y)
    # Positions of the first three hits and the first three misses.
    good_class = np.flatnonzero(is_correct)[:3]
    bad_class = np.flatnonzero(~is_correct)[:3]
    print_classifications(good_class, 'correct', test_y, test_text)
    print_classifications(bad_class, 'INcorrect', test_y, test_text)
    # Use the labels passed by the caller, not the notebook-level y_test, so a
    # custom test_y is honoured in this table as well.
    y_test_df = pd.DataFrame(test_y)
    top3MostProbPosArg = np.argsort(res[:, 1])[-3:]
    top3Y = y_test_df.iloc[top3MostProbPosArg]
    top3Probs = pd.Series(res[top3MostProbPosArg, 1], index=top3Y.index)
    top3Data = pd.concat([top3Y, top3Probs], axis=1)
    top3Data.columns = ['Actual', 'PositiveProb']
    print('')
    print('Top 3 Most Positive Probability:')
    print(top3Data)

In [None]:
# Element 0 of each result tuple is the trained model.
predict_and_print_samples(rnn_res[0], 'RNN', test_data)
#predict_and_print_samples(cnn_res[0], 'CNN', testCharData[:, :, 1:])
# The combined model takes both the word-id and the character inputs.
predict_and_print_samples(cnn_rnn_res[0], 'CNN_RNN', [test_data, testCharData[:, :, 1:]])

In [None]:
from numba import cuda
# Select GPU 0, close the CUDA context, then select GPU 0 again.
# NOTE(review): presumably meant to release GPU memory held by this kernel —
# confirm that the TensorFlow session is unusable afterwards as expected.
cuda.select_device(0)
cuda.close()
cuda.select_device(0)