In [1]:
# import required libraries
import pandas as pd
import numpy as np
!pip install sklearn
from sklearn.model_selection import train_test_split
import tensorflow as tf



I will be using a dataset downloaded from Kaggle cited as:

Malo, P., Sinha, A., Korhonen, P., Wallenius, J., & Takala, P. (2014). Good debt or bad debt: Detecting semantic orientations in economic texts. Journal of the Association for Information Science and Technology, 65(4), 782-796.

I chose this dataset as it is very relevant to our use-case: it contains 4844 financial news headlines categorized by their sentiment from the "perspective of a retail investor".

In [2]:
# create dataframe
df = pd.read_csv('./financialnews.csv', encoding='latin-1')      # latin-1 encoding required to fix error
df

Unnamed: 0,neutral,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing ."
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...
...,...,...
4840,negative,LONDON MarketWatch -- Share prices ended lower...
4841,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4842,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4843,negative,Net sales of the Paper segment decreased to EU...


In [3]:
# change column names
df.columns = ['Sentiment', 'SentimentText']
df

Unnamed: 0,Sentiment,SentimentText
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...
...,...,...
4840,negative,LONDON MarketWatch -- Share prices ended lower...
4841,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4842,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4843,negative,Net sales of the Paper segment decreased to EU...


In [4]:
# convert text in the Sentiment column to a number
mapper = {'negative': 0,
          'neutral': 1,
          'positive': 2}
df.Sentiment = df.Sentiment.map(mapper)
df

Unnamed: 0,Sentiment,SentimentText
0,1,Technopolis plans to develop in stages an area...
1,0,The international electronic industry company ...
2,2,With the new production plant the company woul...
3,2,According to the company 's updated strategy f...
4,2,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...
...,...,...
4840,0,LONDON MarketWatch -- Share prices ended lower...
4841,1,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4842,0,Operating profit fell to EUR 35.4 mn from EUR ...
4843,0,Net sales of the Paper segment decreased to EU...


In [5]:
# count the number of values that are negative, neutral, or positive
df.Sentiment.value_counts()

1    2878
2    1363
0     604
Name: Sentiment, dtype: int64

I notice that we may be dealing with a bad dataset as the datapoints are skewed towards neutral/positive sentiment. I'll try to fix this by keeping only 604 datapoints from each Sentiment classification.

In [6]:
for i in range(3):
    if len(df.loc[df.Sentiment == i]) > 604:
        print(f'{i} is too large')
    else:
        print(f'{i} is ok')

0 is ok
1 is too large
2 is too large


In [7]:
# create three separate dataframes to work with
df_neg = df[df['Sentiment'] == 0]
df_neg = df_neg.reset_index(drop=True)

df_neu = df[df['Sentiment'] == 1]
df_neu = df_neu.reset_index(drop=True)

df_pos = df[df['Sentiment'] == 2]
df_pos = df_pos.reset_index(drop=True)

In [8]:
# drop (2878-604=2274) random rows from df_neu
remove_neu = 2274
drop_ind_neu = np.random.choice(df_neu.index, remove_neu, replace=False)   # sample 'remove_neu' number of random row id's
df_neu_subset = df_neu.drop(drop_ind_neu)                                  # remove random rows from df
len(df_neu_subset)

604

In [9]:
# drop (1363-604=759) random rows from df_pos
remove_pos = 759
drop_ind_pos = np.random.choice(df_pos.index, remove_pos, replace=False)
df_pos_subset = df_pos.drop(drop_ind_pos)
len(df_pos_subset)

604

In [10]:
# concatenate the three dataframes into a new one
new_df = pd.concat([df_neu_subset, df_pos_subset, df_neg])
new_df = new_df.reset_index(drop=True)
new_df

Unnamed: 0,Sentiment,SentimentText
0,1,At the request of Finnish media company Alma M...
1,1,The company supports its global customers in d...
2,1,In June it sold a 30 percent stake to Nordstje...
3,1,"The 718,430 new Series A shares will become su..."
4,1,Tiimari operates 194 stores in six countries -...
...,...,...
1807,0,HELSINKI Thomson Financial - Shares in Cargote...
1808,0,LONDON MarketWatch -- Share prices ended lower...
1809,0,Operating profit fell to EUR 35.4 mn from EUR ...
1810,0,Net sales of the Paper segment decreased to EU...


In [11]:
new_df.Sentiment.value_counts()

1    604
2    604
0    604
Name: Sentiment, dtype: int64

Now that we have the same number of datapoints for each type of sentiment, we can proceed with the analysis.

In [12]:
# split data into train and test set (test set will be 20% of our data)
train, valid = train_test_split(new_df, test_size=0.2)           

In [13]:
# convert train/test data into np arrays so that ml model can take in as input
train_text = np.array(train['SentimentText'].tolist().copy())
train_labels = keras.utils.to_categorical(train['Sentiment'].astype('int64'))

In [14]:
valid_text = np.array(valid['SentimentText'].tolist().copy())
valid_labels = keras.utils.to_categorical(valid['Sentiment'].astype('int64'))

In [15]:
# declaring variables for preprocessing 

vocab_size = 1000     # number of words tokenizer will keep in its memory
embedding_dim = 16
max_length = 142      # we'll accept sentences of max. 142 characters
trunc_type = 'post'
padding_type = 'post'
oov_token = '<OOV>'   # placeholder for words not in tokenizer

# instantiating our tokenizer
tokenizer = tf.keras.preprocessing.text.Tokenizer(
                num_words=vocab_size,
                oov_token=oov_token)

In [16]:
tokenizer.fit_on_texts(train_text)
#tokenizer.word_index

sequences = tokenizer.texts_to_sequences(train_text)          # use tokenizer on our sentences to create sequences
 
    
    
padded = tf.keras.preprocessing.sequence.pad_sequences(       # ensure sentences are of the same length
            sequences, maxlen=max_length,
            padding=padding_type,
            truncating=trunc_type)                 

In [17]:
# repeat for the testing text
testing_sequences = tokenizer.texts_to_sequences(valid_text)

testing_padded = tf.keras.preprocessing.sequence.pad_sequences(
                    testing_sequences,
                    maxlen=max_length,
                    padding=padding_type,
                    truncating=trunc_type)

In [18]:
# need to save the tokenizer on local computer so we can load it later
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

The next task is to build a simple neural network in Keras:

In [19]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')
])

# use 'sparse_categorical_accuracy' instead of 'accuracy' to fix "model.state_updates is deprecated" error 
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 142, 16)           16000     
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 51        
Total params: 16,323
Trainable params: 16,323
Non-trainable params: 0
_________________________________________________________________


In [20]:
# fitting and training the model on our modified dataset

num_epochs = 30                             # number of times to go over the dataset
history = model.fit(padded, 
                    train_labels, 
                    epochs=num_epochs, 
                    validation_data=(testing_padded, valid_labels))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [21]:
# testing our model
phrase = ['TSLA will compound by 10% over the next 10 years']

test_sequences = tokenizer.texts_to_sequences(phrase)
test_padded = tf.keras.preprocessing.sequence.pad_sequences(       
                test_sequences, maxlen=max_length,
                padding=padding_type,
                truncating=trunc_type)

pred = model.predict(test_padded)
classes = np.argmax(pred, axis=-1)
dict_sentiment = {0:'Negative', 1:'Neutral', 2:'Positive'}
print(f'{phrase} : {dict_sentiment[int(classes)]}')                # outputs neutral or, sometimes, positive

['TSLA will compound by 10% over the next 10 years'] : Positive


In [22]:
# save our model
model.save('model_1')

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: model_1\assets
