<a href="https://colab.research.google.com/github/yassineoo/sentiment_analyses_MLB_CNN_LSTM_BERT/blob/main/LSTM_VS_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Team members:
- ADJLANE Aymen Abdeldjalil
- ATTOU Yassine
- BOUCENNA Abderrahmane
- GHODBANE Youcef Islam

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

# Read the labelled finance sentences from the CSV in the working directory.
data = pd.read_csv(filepath_or_buffer='finance_sentiment.csv')

# Preview the first ten rows as a quick check of the loaded columns.
data.head(n=10)

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral
5,$SPY wouldn't be surprised to see a green close,positive
6,Shell's $70 Billion BG Deal Meets Shareholder ...,negative
7,SSH COMMUNICATIONS SECURITY CORP STOCK EXCHANG...,negative
8,Kone 's net sales rose by some 14 % year-on-ye...,positive
9,The Stockmann department store will have a tot...,neutral


### Split the data

In [None]:
from sklearn.model_selection import train_test_split

# Pull the raw sentences and their sentiment labels out as NumPy arrays.
sentences = np.array(data["Sentence"])
sentiments = np.array(data["Sentiment"])

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
Xtxt_train, Xtxt_test, Y_train, Y_test = train_test_split(
    sentences,
    sentiments,
    test_size=0.3,
    random_state=0,
)

# Display the number of sentences in each split.
Xtxt_train.shape, Xtxt_test.shape

((4089,), (1753,))

# LSTM Model

### Load trained weights of a Word2Vec model

In [None]:
# Make a directory named kaggle and copy the kaggle.json file there.
!mkdir -p ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/

# Change the permission of the file
!chmod 600 ~/.kaggle/kaggle.json

# Download the dataset
!kaggle datasets download -d umbertogriffo/googles-trained-word2vec-model-in-python
# Extract the data
!mkdir /content/googles-trained-word2vec-model
!unzip /content/googles-trained-word2vec-model-in-python.zip -d /content/googles-trained-word2vec-model

In [None]:
from gensim.models import KeyedVectors

# Location of the extracted binary Word2Vec file.
WORD2VEC_BIN_PATH = "/content/googles-trained-word2vec-model/GoogleNews-vectors-negative300.bin"

# Load the pretrained vectors from the binary word2vec format.
word2vec_pretrained = KeyedVectors.load_word2vec_format(WORD2VEC_BIN_PATH, binary=True)

# Plain word -> vector mapping so later cells can use dict.get() for lookups.
word2vec_pretrained_dict = {
    word: vector
    for word, vector in zip(word2vec_pretrained.key_to_index, word2vec_pretrained.vectors)
}

### Data Preprocessing

In [None]:
from keras.preprocessing.sequence import pad_sequences

# Build the word -> integer-id vocabulary. The tokenizer is fitted on the train
# and test sentences together so that every word in either split gets an id in
# `word_index`.
token = tf.keras.preprocessing.text.Tokenizer(num_words=None)
token.fit_on_texts(list(Xtxt_train) + list(Xtxt_test))

# Convert each sentence into its list of word ids.
xtrain_seq = token.texts_to_sequences(Xtxt_train)
xtest_seq = token.texts_to_sequences(Xtxt_test)

# Zero-pad (at the end) BOTH splits to one shared length. Padding each split
# to its own longest sentence would give train and test matrices of different
# widths, so the model would see a different amount of trailing padding at
# test time than it saw during training.
max_len = max(len(seq) for seq in xtrain_seq + xtest_seq)
xtrain_pad = pad_sequences(xtrain_seq, maxlen=max_len, padding='post')
xtest_pad = pad_sequences(xtest_seq, maxlen=max_len, padding='post')

# Vocabulary mapping (word -> id, ids start at 1; 0 is the padding value).
word_index = token.word_index

In [None]:
# One 300-dimensional row per vocabulary id; row 0 is left for the padding id.
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, 300))

# Copy the pretrained vector for every vocabulary word that has one.
# Words missing from the pretrained dictionary keep their all-zero row.
for word, row in word_index.items():
    pretrained_vector = word2vec_pretrained_dict.get(word)
    if pretrained_vector is None:
        continue
    embedding_matrix[row] = pretrained_vector

### Build Custom Metrics (F1-Score)

In [None]:
from keras import backend as K

def recall_m(y_true, y_pred):
    """Recall over the given tensors: true positives / actual positives.

    Both the element-wise product ``y_true * y_pred`` and ``y_true`` are
    clipped to [0, 1] and rounded before being summed, so each element
    counts as either 0 or 1. ``K.epsilon()`` in the denominator avoids a
    division by zero when there are no positives.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision over the given tensors: true positives / predicted positives.

    Both the element-wise product ``y_true * y_pred`` and ``y_pred`` are
    clipped to [0, 1] and rounded before being summed, so each element
    counts as either 0 or 1. ``K.epsilon()`` in the denominator avoids a
    division by zero when nothing is predicted positive.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(predicted) + K.epsilon())

def f1_m_():
    """Return a metric function ``f1_m(y_true, y_pred)`` computing the F1 score.

    The returned function combines ``precision_m`` and ``recall_m`` as their
    harmonic mean; ``K.epsilon()`` guards the division when both are zero.
    """
    def f1_m(y_true, y_pred):
        p = precision_m(y_true, y_pred)
        r = recall_m(y_true, y_pred)
        harmonic_half = (p * r) / (p + r + K.epsilon())
        return 2 * harmonic_half
    return f1_m

### Define LSTM model

In [None]:
# Every layer/model class used below is imported here; previously Sequential,
# SpatialDropout1D, LSTM and Dense were used without being imported, which
# raises NameError on a fresh kernel.
from keras.layers import Dense, Dropout, Embedding, LSTM, SpatialDropout1D
from keras.models import Sequential

lstm_model = Sequential()

# Frozen embedding layer initialised with the pretrained 300-d vectors
# (one row per vocabulary id, plus row 0 for padding).
lstm_model.add(Embedding(len(word_index)+1,300,weights=[embedding_matrix], trainable = False))

# Drop whole embedding channels, then encode the sequence with a single LSTM.
lstm_model.add(SpatialDropout1D(0.3))
lstm_model.add(LSTM(300, dropout = 0.3, recurrent_dropout = 0.3))

# Two wide, heavily regularised dense layers on top of the LSTM output.
lstm_model.add(Dense(1024 , activation = 'relu'))
lstm_model.add(Dropout(0.8))

lstm_model.add(Dense(1024, activation = 'relu'))
lstm_model.add(Dropout(0.8))

# 3-way softmax output trained against one-hot targets.
lstm_model.add(Dense(3, activation = 'softmax'))
lstm_model.compile(loss='categorical_crossentropy', optimizer = 'adam', metrics = ['acc'])

In [None]:
from sklearn.preprocessing import LabelBinarizer

# One-hot encode the string sentiment labels for the 3-way softmax output.
# `output` was previously used here without ever being defined, so the cell
# failed on a fresh kernel. The fitted binarizer is kept in `label_binarizer`
# so predictions can be mapped back to label strings afterwards.
label_binarizer = LabelBinarizer()
output = label_binarizer.fit_transform(Y_train)

lstm_model.fit(xtrain_pad, y=output, batch_size=512, epochs=100, verbose=1)

### LSTM - Classification Report

In [None]:
import timeit
from sklearn.metrics import classification_report

# Time inference over the whole padded test set.
t = timeit.default_timer()
Ypred = lstm_model.predict(xtest_pad)
print('LSTM', timeit.default_timer()-t)

# Decode the model's predictions (`Ypred`) back to label strings before
# scoring; the previous code passed an undefined name `y` here.
# `label_binarizer` must be the same fitted binarizer that produced the
# one-hot training targets, so the class order matches the model's outputs.
print(classification_report(Y_test, label_binarizer.inverse_transform(Ypred), zero_division=0))

# BERT model

### Load the Tokenizer & the model

In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification

# Checkpoint shared by the tokenizer and the classification model.
BERT_CHECKPOINT = 'bert-base-uncased'

# Tokenizer matching the checkpoint's vocabulary.
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

# Sequence classifier with a 3-class output head.
bert_model = TFBertForSequenceClassification.from_pretrained(
    BERT_CHECKPOINT,
    num_labels=3,
)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Preparing the data & Training

In [None]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Encode the string sentiment labels as integers, then as one-hot vectors.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(Y_train)
Y_train_one_hot = to_categorical(integer_encoded, num_classes=3)

# Tokenize the training sentences into padded/truncated TensorFlow tensors.
train_texts = list(Xtxt_train)
train_encodings = tokenizer(
    train_texts,
    padding=True,
    truncation=True,
    return_tensors="tf",
    max_length=512,
)

# Compile with a small learning rate; the loss is told it receives raw logits.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
bert_model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Feed only the token ids and the attention mask, then train for three epochs.
train_inputs = {
    name: train_encodings[name] for name in ('input_ids', 'attention_mask')
}
bert_model.fit(train_inputs, Y_train_one_hot, epochs=3, batch_size=8)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7c93f3243820>

### BERT - Classification Report

In [None]:
# One-hot encode the test labels with the encoder fitted on the training labels
test_integer_encoded = label_encoder.transform(Y_test)
Y_test_one_hot = to_categorical(test_integer_encoded, num_classes=3)

# Tokenize test data
test_encodings = tokenizer(list(Xtxt_test), padding=True, truncation=True, return_tensors="tf", max_length=512)

# Evaluate on the held-out set and print the loss and accuracy. This cell's
# saved output shows these two values, but the code producing them was
# missing, so the output could not be reproduced from a fresh run.
test_loss, test_accuracy = bert_model.evaluate(
    {'input_ids': test_encodings['input_ids'], 'attention_mask': test_encodings['attention_mask']},
    Y_test_one_hot,
    batch_size=8,
)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

Test Loss: 0.49501001834869385
Test Accuracy: 0.7923559546470642


In [None]:
from sklearn.metrics import classification_report

# Run the fine-tuned model on the tokenized test sentences.
test_inputs = {
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
}
predictions = bert_model.predict(test_inputs)

# The predicted class is the index of the largest logit in each row.
logits = predictions.logits
predicted_classes = tf.argmax(logits, axis=1).numpy()

# Recover integer class ids from the one-hot test labels for comparison.
true_classes = tf.argmax(Y_test_one_hot, axis=1).numpy()

# Per-class precision/recall/F1, labelled with the original class names.
report = classification_report(
    true_classes,
    predicted_classes,
    target_names=label_encoder.classes_,
)
print(report)

              precision    recall  f1-score   support

    negative       0.64      0.33      0.44       283
     neutral       0.79      0.91      0.85       940
    positive       0.86      0.84      0.85       530

    accuracy                           0.80      1753
   macro avg       0.76      0.70      0.71      1753
weighted avg       0.79      0.80      0.78      1753

