In [4]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
from collections import Counter
from importlib.machinery import SourceFileLoader
from os.path import join
from torchtext.vocab import GloVe
import seaborn as sns

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import *
from nltk import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords' ,quiet=True)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout

import re

STOPWORDS = set(stopwords.words('english'))
from bs4 import BeautifulSoup

# from google.colab import files, drive
import os

# Web Scraping
import requests
from bs4 import BeautifulSoup

In [39]:
# Read csvs
df_train = pd.read_csv("../input/finance-dataset/finance_train.csv")
df_test = pd.read_csv("../input/finance-dataset/finance_test.csv")

# Constants
PUNCTUATION = '!#$%&()*,-./:;<=>?@^_`{|}~'
PUNCTUATION_RE = re.compile("[%s]" % PUNCTUATION)
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))


# Hyperparameters
MAX_NB_WORDS = None
MAX_SEQUENCE_LENGTH = None
EPOCHS = None
EMBEDDING_DIM = None
BATCH_SIZE = None

# Clean text - returns lowercase text with removed chars and stopwords
def clean_text(text:str):
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    text = text.replace('x', '')
    text = PUNCTUATION_RE.sub('', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) 
    return text

# Padding and indexing of unique words
def pad_sequences_train(train, test):
  tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=PUNCTUATION, lower=True)
  tokenizer.fit_on_texts(train['Sentence'].values)
  word_index = tokenizer.word_index
  X = tokenizer.texts_to_sequences(train['Sentence'].values)
  X_train = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
  Xt = tokenizer.texts_to_sequences(test['Sentence'].values)
  X_test = pad_sequences(Xt, maxlen=MAX_SEQUENCE_LENGTH)
  return X_train, X_test

# Run model
def run_model(xtr, ytr, xt, yt, labelnum, epochs=5, max_sequence_length=256, max_nb_words=1000, embedding_dim=300):
  # Problematic input
  if any(x is None for x in [xtr, ytr, xt, yt, epochs, max_sequence_length, max_nb_words, embedding_dim]):
    print('Replace the None values above with your new values before calling the run_model() function.')
    return None, None, None
  
  # NN
  model = Sequential() # Container
  model.add(Embedding(max_nb_words+1, embedding_dim, mask_zero=True, input_length=max_sequence_length)) # Embedding
  model.add(SpatialDropout1D(0.2)) # Dropout
  model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2)) # LSTM layer
  model.add(Dense(labelnum, activation='softmax')) # Densely connected layer with softmax activation
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

  # Performance
  print(model.summary())
  history = model.fit(xtr, 
                    ytr, 
                    epochs=epochs, 
                    batch_size=BATCH_SIZE,
                    validation_split=0.2,
                    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
  test_loss, test_accuracy = model.evaluate(xt, yt)
  return model, history, test_accuracy

# Softmax 
def softmax(model_output):
    ret = []
    for logits in model_output:
        total = sum(np.exp(logits))
        tmp = []
        for logit in logits:
            tmp.append(np.exp(logit)/total)
        ret.append(tmp)
    return ret
        

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Get rid of unwanted characters and punctuation to reduce noise
df_train["Sentence"] = df_train["Sentence"].map(clean_text)
df_test["Sentence"] = df_test["Sentence"].map(clean_text)

In [7]:
df_train.head()
df_test.head()

Unnamed: 0,Sentence,Label
0,third quarter 2010 net sales increased 52 eur ...,2
1,foundries division reports sales increased 97 ...,2
2,financing project come mainly china,1
3,sukhraj dulai 2900 block boni sue court culdes...,1
4,finland leading metals group outokumpu said fo...,2


In [8]:
MAX_SEQUENCE_LENGTH = 256
MAX_NB_WORDS = 1000


In [9]:
# Pad X train and X test, with the model fitted to X train
X_train, X_test = pad_sequences_train(df_train, df_test)
X_train

array([[  0,   0,   0, ..., 347,  91, 161],
       [  0,   0,   0, ..., 285,  93, 484],
       [  0,   0,   0, ...,  39, 185, 654],
       ...,
       [  0,   0,   0, ...,  32,  11,  16],
       [  0,   0,   0, ...,  11,  94,  15],
       [  0,   0,   0, ..., 909,  33, 218]], dtype=int32)

In [10]:
X_test

array([[  0,   0,   0, ...,   4,   1,   2],
       [  0,   0,   0, ...,  32,  11,  31],
       [  0,   0,   0, ..., 786, 534, 270],
       ...,
       [  0,   0,   0, ...,  25, 954, 363],
       [  0,   0,   0, ..., 536, 931,  27],
       [  0,   0,   0, ...,   2,   1,   2]], dtype=int32)

In [11]:
# Get dummies to turn labels into a format where it can be processed by a probabilistic model
Y_train = pd.get_dummies(df_train["Label"]).values
Y_test = pd.get_dummies(df_test["Label"]).values
# Y_train

In [None]:
# Y_test

In [35]:
# Initialize model
label_count = 3
label_map = {0:"negative", 1:"neutral", 2:"positive"}

model, history, test_accuracy = run_model(X_train, Y_train, X_test, Y_test, label_count, 5, 256, 1000, 300)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 256, 300)          300300    
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 256, 300)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 303       
Total params: 461,003
Trainable params: 461,003
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [36]:
# Scrape BBC News for company activities
r1 = requests.get("https://www.bbc.com/news/business/companies")
coverpage = r1.content

soup1 = BeautifulSoup(coverpage, 'html5lib')
coverpage_news = soup1.find_all('h3')

filtered_news = []
for news in coverpage_news:
    filtered_news.append(news.get_text())
    
print(filtered_news[:10])

["Takeover of UK tech firm 'raises serious concerns'", "Takeover of UK tech firm 'raises serious concerns'", 'M&S says recovery plan boosting sales and profits', "Morrisons backs US firm's improved takeover offer", 'Apple delays return to the office until 2022', 'OnlyFans to ban sexually explicit content', "Amazon 'planning to open department stores in US'", 'Lloyds aiming to become giant UK landlord', "Takeover of UK tech firm 'raises serious concerns'", 'M&S says recovery plan boosting sales and profits']


In [37]:
# preprocess data
filtered_news = pd.DataFrame(filtered_news, columns=["Sentence"]).Sentence.map(clean_text)
print(type(filtered_news))
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=PUNCTUATION, lower=True)
tokenizer.fit_on_texts(df_train['Sentence'].values)
Xn = tokenizer.texts_to_sequences(filtered_news.values)
X_new = pad_sequences(Xn, maxlen=MAX_SEQUENCE_LENGTH)
print(X_new.shape)

<class 'pandas.core.series.Series'>
(66, 256)


In [44]:
preds = model.predict(X_new)
softmax_preds = softmax(preds)
softmax_preds[:10]

[[0.2554752026256853, 0.44378737313502536, 0.30073742423928934],
 [0.2554752026256853, 0.44378737313502536, 0.30073742423928934],
 [0.2921404651012038, 0.32220456224991284, 0.3856549726488834],
 [0.22413627531428393, 0.21964790993671893, 0.5562158147489972],
 [0.21400915122446423, 0.5709291508161224, 0.21506169795941335],
 [0.21827027859876366, 0.5634004105961071, 0.2183293108051292],
 [0.23197913738248538, 0.533084181571281, 0.23493668104623364],
 [0.2836805272483922, 0.3355237188619162, 0.3807957538896916],
 [0.2554752026256853, 0.44378737313502536, 0.30073742423928934],
 [0.2921404651012038, 0.32220456224991284, 0.3856549726488834]]

In [54]:
positives = [news+"    prob:"+str(softmax[2]) for news, softmax in zip(filtered_news, softmax_preds) if 2 == np.argmax(softmax)]
positives

['ms says recovery plan boosting sales profits    prob:0.3856549726488834',
 'morrisons backs us firms improved takeover offer    prob:0.5562158147489972',
 'lloyds aiming become giant uk landlord    prob:0.3807957538896916',
 'ms says recovery plan boosting sales profits    prob:0.3856549726488834',
 'morrisons backs us firms improved takeover offer    prob:0.5562158147489972',
 'lloyds aiming become giant uk landlord    prob:0.3807957538896916',
 'ms says recovery plan boosting sales profits    prob:0.3856549726488834',
 'morrisons backs us firms improved takeover offer    prob:0.5562158147489972',
 'lloyds aiming become giant uk landlord    prob:0.3807957538896916',
 'former netfli staffers charged insider trading    prob:0.3693195176525851',
 'brazil hopes world get taste favourite spirit    prob:0.38014123795756083',
 'five ways firms reach purple pound    prob:0.34317161304867894',
 'ms says recovery plan boosting sales profits    prob:0.3856549726488834',
 'morrisons backs us fi

In [47]:
neutrals = [news for news, softmax in zip(filtered_news, softmax_preds) if 1 == np.argmax(softmax)]
neutrals

['takeover uk tech firm raises serious concerns',
 'takeover uk tech firm raises serious concerns',
 'apple delays return office 2022',
 'onlyfans ban seually eplicit content',
 'amazon planning open department stores us',
 'takeover uk tech firm raises serious concerns',
 'apple delays return office 2022',
 'onlyfans ban seually eplicit content',
 'amazon planning open department stores us',
 'apple delays return office 2022',
 'onlyfans ban seually eplicit content',
 'amazon planning open department stores us',
 'toyota cut production 40 amid chip crisis',
 'franco manca owner planning big epansion',
 'never mind going electric park',
 'trust big tech health data',
 'music always need',
 'couples handle awkward chats money',
 'boohoo ceo says clothes arent throwaway',
 'touring france postbreit puzzle',
 'fisherwomen honoured photography ehibition',
 'takeover uk tech firm raises serious concerns',
 'apple delays return office 2022',
 'sky broadband says online access problems resolv

In [49]:
negatives = [news for news, softmax in zip(filtered_news, softmax_preds) if 0 == np.argmax(softmax)]
negatives

['ftse bosses earn 86 times average wage',
 '40 million tmobile customers hit data breach',
 'bank scam victims speak trouble getting refunds',
 'ftse bosses earn 86 times average wage',
 'home depot discriminated blm supporter',
 'mobile app']