# 1. Import libraries

In [7]:
import numpy as np
import pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import tensorflow as tf

# 2. Load data

In [29]:
# Read the headline dataset; column names are supplied explicitly via `names`
# and the file is decoded as latin-1.
data = pd.read_csv(
    'all-data.csv',
    names=['Sentiments', 'News_headlines'],
    encoding='latin-1',
)

# Transpose the five-row preview so each headline is shown as its own column.
data.head().T

Unnamed: 0,0,1,2,3,4
Sentiments,neutral,neutral,negative,positive,positive
News_headlines,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .","Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .","The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .",With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .,"According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales ."


In [30]:
# Summarise column dtypes and non-null counts of the loaded frame (`data`).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4846 entries, 0 to 4845
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Sentiments      4846 non-null   object
 1   News_headlines  4846 non-null   object
dtypes: object(2)
memory usage: 75.8+ KB


We can see that the dataset has 4846 rows and two columns, namely `Sentiments` and `News_headlines`.

In [31]:
# Count missing values per column of the loaded frame (`data`).
data.isna().sum()

Sentiments        0
News_headlines    0
dtype: int64

In [32]:
# Class distribution of the target labels in the loaded frame (`data`).
data['Sentiments'].value_counts()

neutral     2879
positive    1363
negative     604
Name: Sentiments, dtype: int64

The counts above show that the dataset contains three sentiment categories:

Neutral
Positive
Negative
Out of 4846 headlines, 2879 are labelled neutral, 1363 positive, and the remaining 604 negative.

# Preprocessing

In [36]:
def get_sequences(texts):
    """Encode texts as a padded matrix of integer token ids.

    A new ``Tokenizer`` is fitted on ``texts``, every text is converted to a
    list of word indices, and the lists are padded at the end (``'post'``) to
    the length of the longest one. The vocabulary length and the maximum
    sequence length are printed as a side effect.

    Args:
        texts: Collection of text strings accepted by
            ``Tokenizer.fit_on_texts`` (the notebook passes a DataFrame column).

    Returns:
        The array produced by ``pad_sequences``: one row per text, padded to
        the longest encoded sequence.
    """
    text_tokenizer = Tokenizer()
    text_tokenizer.fit_on_texts(texts)
    encoded = text_tokenizer.texts_to_sequences(texts)

    # +1 presumably accounts for id 0, which shows up only as padding in the
    # output -- TODO confirm against the Tokenizer documentation.
    vocab_length = len(text_tokenizer.word_index) + 1
    print("Vocab length:", vocab_length)

    # Longest encoded text determines the padded width.
    max_seq_length = np.max([len(token_ids) for token_ids in encoded])
    print("Maximum sequence length:", max_seq_length)

    return pad_sequences(encoded, maxlen=max_seq_length, padding='post')


In [37]:
def preprocess_inputs(data):
    """Turn the raw headline frame into train/test token sequences and labels.

    Args:
        data: DataFrame with a 'News_headlines' text column and a
            'Sentiments' column holding 'negative', 'neutral' or 'positive'.

    Returns:
        Tuple ``(train_sequences, test_sequences, y_train, y_test)`` from a
        shuffled 70/30 split with ``random_state=1``. Labels are encoded as
        negative=0, neutral=1, positive=2.

    Raises:
        ValueError: If 'Sentiments' contains a value that is not one of the
            three known labels.
    """
    data = data.copy()

    # Tokenize the frame that was passed in, not a notebook-level global, so
    # the features always come from the same rows as the labels below.
    sequences = get_sequences(data['News_headlines'])

    label_mapping = {
        'negative': 0,
        'neutral': 1,
        'positive': 2
    }

    # `map` performs an explicit lookup: any label missing from the mapping
    # becomes NaN and is rejected here instead of silently passing through
    # as a string into the training targets.
    y = data['Sentiments'].map(label_mapping)
    unmapped = y.isna()
    if unmapped.any():
        unknown = sorted(data.loc[unmapped, 'Sentiments'].astype(str).unique())
        raise ValueError("Unexpected sentiment labels: {}".format(unknown))
    y = y.astype('int64')

    train_sequences, test_sequences, y_train, y_test = train_test_split(
        sequences, y, train_size=0.7, shuffle=True, random_state=1
    )

    return train_sequences, test_sequences, y_train, y_test

In [38]:
# Build the padded token sequences and integer labels for both splits.
(
    train_sequences,
    test_sequences,
    y_train,
    y_test,
) = preprocess_inputs(data)

Vocab length: 10123
Maximum sequence length: 71


In [39]:
# Inspect the padded training token-id matrix.
train_sequences

array([[5442,  510,   16, ...,    0,    0,    0],
       [  22, 1628,    4, ...,    0,    0,    0],
       [1141,  936,  136, ...,    0,    0,    0],
       ...,
       [   1,  419,   16, ...,    0,    0,    0],
       [2586,  123, 3247, ...,    0,    0,    0],
       [  30,  615,  555, ...,    0,    0,    0]])

In [40]:
 # Inspect the padded test token-id matrix.
 test_sequences

array([[   1, 1967,  159, ...,    0,    0,    0],
       [ 554,   16, 6822, ...,    0,    0,    0],
       [  42,   31,  242, ...,    0,    0,    0],
       ...,
       [  94,    5, 2084, ...,    0,    0,    0],
       [   1,  259,  184, ...,    0,    0,    0],
       [ 618,   12,  538, ...,    0,    0,    0]])

In [41]:
# Inspect the integer-encoded training labels.
y_train

545     2
2374    0
4217    1
1071    1
716     2
       ..
2895    1
2763    1
905     2
3980    1
235     2
Name: Sentiments, Length: 3392, dtype: int64

# 3. Feature extraction:

Our objective is to predict the sentiment of a given news headline. The `News_headlines` column is our only feature and the `Sentiments` column is our target variable.

# Training

In [42]:
# Derive the embedding table size from the data instead of hard-coding the
# number printed by an earlier cell: token ids are non-negative integers, so
# the table needs (largest id + 1) rows. Both splits are checked because a
# token may occur only in the test headlines.
vocab_size = int(max(train_sequences.max(), test_sequences.max())) + 1
sequence_length = train_sequences.shape[1]

# Token ids -> 128-d embeddings -> GRU emitting one state per time step ->
# flattened -> 3-way softmax over the sentiment classes.
inputs = tf.keras.Input(shape=(sequence_length,))
# The sequence length is already fixed by the Input shape, so it is not
# repeated on the Embedding layer.
x = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=128
)(inputs)
x = tf.keras.layers.GRU(256, return_sequences=True, activation='tanh')(x)
x = tf.keras.layers.Flatten()(x)
outputs = tf.keras.layers.Dense(3, activation='softmax')(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

# Sparse loss because the targets are integer class ids, not one-hot vectors.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# epochs=100 is only an upper bound: early stopping halts training once the
# validation loss has not improved for 3 epochs and restores the best weights.
history = model.fit(
    train_sequences,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=100,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True
        )
    ]
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100


# Result

In [43]:
# Evaluate on the held-out split; with the compiled metrics the result is
# [loss, accuracy].
results = model.evaluate(test_sequences, y_test, verbose=0)
test_loss, test_accuracy = results[0], results[1]

print("    Test Loss: {:.5f}".format(test_loss))
print("Test Accuracy: {:.2f}%".format(test_accuracy * 100))

    Test Loss: 0.65712
Test Accuracy: 75.65%
