In [1]:
#importing library
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [2]:
#loading the dataset
df=pd.read_csv('../input/sentiment-analysis-for-financial-news/all-data.csv',names=['Label','text'],encoding='latin-1')

In [3]:
#loading the dataset
df

Unnamed: 0,Label,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


In [4]:
#checking for missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4846 entries, 0 to 4845
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   4846 non-null   object
 1   text    4846 non-null   object
dtypes: object(2)
memory usage: 75.8+ KB


In [5]:
#preprocessing
df['Label'].unique()

array(['neutral', 'negative', 'positive'], dtype=object)

In [6]:
#checking for missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4846 entries, 0 to 4845
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   4846 non-null   object
 1   text    4846 non-null   object
dtypes: object(2)
memory usage: 75.8+ KB


In [7]:
#getting unique value
df['Label'].unique()

array(['neutral', 'negative', 'positive'], dtype=object)

In [8]:
#creating the function to converting into numberical dataset

def get_sequences(texts):
    tokenizer=Tokenizer()
    tokenizer.fit_on_texts(texts)
    sequences=tokenizer.texts_to_sequences(texts)
    max_seq_length=np.max(list(map(lambda x:len(x),sequences)))
    sequences=pad_sequences(sequences,maxlen=max_seq_length,padding='post')
    return sequences

In [9]:
get_sequences(df['text'])

array([[  94,    5, 3498, ...,    0,    0,    0],
       [ 840,  336,    5, ...,    0,    0,    0],
       [   1,  293,  656, ...,    0,    0,    0],
       ...,
       [  42,   31,  242, ...,    0,    0,    0],
       [  30,   27,    2, ...,    0,    0,    0],
       [  27,    3,   35, ...,    0,    0,    0]], dtype=int32)

In [10]:
#Preprocessing 
def preprocess_inputs(df):
    #copying the dataset
    
    df=df.copy()
    #converting the dataset into the numerical form 
    sequences=get_sequences(df['text'])
    #labeling the 
    label_mapping={
        'negative':0,
        'neutral':1,
        'positive':2
    }
    y=df['Label'].replace(label_mapping)
    #train_test_split
    train_sequences,test_sequences,y_train,y_test=train_test_split(sequences,y,train_size=0.7,shuffle=True,random_state=1)
    #returning the dataset
    return train_sequences,test_sequences,y_train,y_test

In [11]:
#train_test_split
train_sequences,test_sequences,y_train,y_test=preprocess_inputs(df)

In [12]:
#pad sequences
train_sequences

array([[5442,  510,   16, ...,    0,    0,    0],
       [  22, 1628,    4, ...,    0,    0,    0],
       [1141,  936,  136, ...,    0,    0,    0],
       ...,
       [   1,  419,   16, ...,    0,    0,    0],
       [2586,  123, 3247, ...,    0,    0,    0],
       [  30,  615,  555, ...,    0,    0,    0]], dtype=int32)

In [13]:
#training the model
inputs=tf.keras.Input(shape=(train_sequences.shape[1],))
x=tf.keras.layers.Embedding(
    input_dim=10123,
    output_dim=128,
    input_length=train_sequences.shape[1]
)(inputs)
x=tf.keras.layers.GRU(256,activation='tanh')(x)
outputs=tf.keras.layers.Dense(3,activation='softmax')(x)
model=tf.keras.Model(inputs=inputs,outputs=outputs)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'])
history=model.fit(
    train_sequences,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=100,
    callbacks=[tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True)])

2022-05-10 10:54:13.694875: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-10 10:54:13.795328: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-10 10:54:13.796545: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-10 10:54:13.798247: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

Epoch 1/100


2022-05-10 10:54:18.217705: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


In [14]:
#evaluating the model
model.evaluate(test_sequences,y_test)



[0.9354650378227234, 0.5845942497253418]

In [15]:
y_test.value_counts()

1    850
2    420
0    184
Name: Label, dtype: int64