# **Required Libraries**

In [2]:
# %% [Cell 1 - Install All Dependencies]
# Install core packages with pinned versions
!pip install --upgrade --force-reinstall \
  numpy==1.23.5 \
  tensorflow==2.12.0 \
  transformers==4.30 \
  scipy==1.10.1 \
  gensim==4.3.2 \
  --no-deps

# Environment configuration
%env TF_FORCE_GPU_ALLOW_GROWTH=true

Collecting numpy==1.23.5
  Using cached numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Collecting tensorflow==2.12.0
  Using cached tensorflow-2.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting transformers==4.30
  Using cached transformers-4.30.0-py3-none-any.whl.metadata (113 kB)
Collecting scipy==1.10.1
  Using cached scipy-1.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
Collecting gensim==4.3.2
  Using cached gensim-4.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.3 kB)
Using cached numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
Using cached tensorflow-2.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (586.0 MB)
Using cached transformers-4.30.0-py3-none-any.whl (7.2 MB)
Using cached scipy-1.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.1 MB)
Using cached gensim-4

env: TF_FORCE_GPU_ALLOW_GROWTH=true


# **2. Data Loading & Exploration**

**I will use the Financial PhraseBank dataset from Hugging Face datasets**

In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [3]:
from datasets import load_dataset

dataset = load_dataset('financial_phrasebank', 'sentences_allagree')
df = dataset['train'].to_pandas()
df.columns = ['text', 'label']
df.head()

# EDA
print(f"Dataset size: {len(df)}")
print("Class distribution:")
print(df['label'].value_counts())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Dataset size: 2264
Class distribution:
label
1    1391
2     570
0     303
Name: count, dtype: int64


In [3]:
df.head()

Unnamed: 0,text,label
0,"According to Gran , the company has no plans t...",1
1,"For the last quarter of 2010 , Componenta 's n...",2
2,"In the third quarter of 2010 , net sales incre...",2
3,Operating profit rose to EUR 13.1 mn from EUR ...,2
4,"Operating profit totalled EUR 21.1 mn , up fro...",2


# **3. Text Preprocessing**
Advanced preprocessing with NLTK and custom financial terms handling


In [4]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [5]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
nltk.download(['stopwords', 'wordnet', 'punkt'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
# Custom financial stopwords and terms
FIN_STOPWORDS = {'company', 'million', 'billion', 'percent', 'said', 'also', 'year', 'quarter'}
BASE_STOPWORDS = set(stopwords.words('english')) - {'not', 'no', 'nor', 'against'}
STOPWORDS = BASE_STOPWORDS.union(FIN_STOPWORDS)

lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)

    # Tokenize and lemmatize
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in STOPWORDS]

    return ' '.join(tokens)

df['cleaned_text'] = df['text'].apply(preprocess_text)
print("Original text:", df['text'].iloc[0])
print("Cleaned text:", df['cleaned_text'].iloc[0])

Original text: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
Cleaned text: according gran no plan move production russia although growing


# **4. Model Training - Three Approaches**
I will implement three different approaches for comparison







# **Approach 1: Traditional ML (Logistic Regression + TF-IDF)**

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [9]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'], df['label'], test_size=0.2, random_state=42
)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    stop_words=list(STOPWORDS)
)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [10]:
# Model Training
lr = LogisticRegression(
    class_weight='balanced',
    solver='saga',
    max_iter=1000
)
lr.fit(X_train_tfidf, y_train)

# Evaluation
print("Logistic Regression Performance:")
print(classification_report(y_test, lr.predict(X_test_tfidf)))

Logistic Regression Performance:
              precision    recall  f1-score   support

           0       0.55      0.70      0.61        56
           1       0.89      0.93      0.91       276
           2       0.79      0.61      0.69       121

    accuracy                           0.81       453
   macro avg       0.74      0.75      0.74       453
weighted avg       0.82      0.81      0.81       453



# **Approach 2: Deep Learning (LSTM + Word2Vec)**

In [13]:
# Cell 1: Fix numpy version conflict
!pip uninstall -y numpy
!pip install numpy==1.23.5 --no-deps
!pip install tensorflow==2.12.0

Collecting gast<=0.4.0,>=0.2.1 (from tensorflow==2.12.0)
  Downloading gast-0.4.0-py3-none-any.whl.metadata (1.1 kB)
Collecting keras<2.13,>=2.12.0 (from tensorflow==2.12.0)
  Downloading keras-2.12.0-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3 (from tensorflow==2.12.0)
  Downloading protobuf-4.25.6-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting tensorboard<2.13,>=2.12 (from tensorflow==2.12.0)
  Downloading tensorboard-2.12.3-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow-estimator<2.13,>=2.12.0 (from tensorflow==2.12.0)
  Downloading tensorflow_estimator-2.12.0-py2.py3-none-any.whl.metadata (1.3 kB)
Collecting wrapt<1.15,>=1.11.0 (from tensorflow==2.12.0)
  Downloading wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
INFO: pip is looking at multiple versions of jax to determine which versio

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec

In [11]:
def preprocess_lstm(text):
    # Keep numbers, $, % and basic cleaning
    text = re.sub(r'[^\w\s$%]', '', text)  # Changed regex pattern
    text = text.lower().strip()
    return ' '.join(nltk.word_tokenize(text))

# Apply only to LSTM data
df['lstm_text'] = df['text'].apply(preprocess_lstm)

# Word2Vec Embeddings (train on modified text)
sentences = [text.split() for text in df['lstm_text']]  # Use lstm_text column

In [18]:
w2v_model = Word2Vec(sentences, vector_size=50, window=5, min_count=1, workers=1)

# Tokenization (now using lstm_text)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['lstm_text'])
vocab_size = len(tokenizer.word_index) + 1

# Sequence padding
X = tokenizer.texts_to_sequences(df['lstm_text'])
X = pad_sequences(X, maxlen=100)

In [21]:
import numpy as np

# Train-test split
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(
    X, df['label'], test_size=0.2, random_state=42
)

# Embedding matrix - ADD NULL EMBEDDING HANDLING
embedding_matrix = np.zeros((vocab_size, 50))
unknown_words = []
for word, i in tokenizer.word_index.items():
    try:
        embedding_matrix[i] = w2v_model.wv[word]
    except KeyError:
        unknown_words.append(word)
        embedding_matrix[i] = np.random.normal(scale=0.6, size=(100,))  # Random initialization

print(f"{len(unknown_words)} unknown words received random embeddings")

# LSTM Model - ARCHITECTURE
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        vocab_size, 50,
        weights=[embedding_matrix],
        input_length=100,
        trainable=True
    ),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(
        64,
        dropout=0.2,
        recurrent_dropout=0.2,
        return_sequences=False  # EXPLICITLY SET THIS
    )),
    tf.keras.layers.Dense(32, activation='relu'),  # Reduced from 64
    tf.keras.layers.Dropout(0.3),  # Reduced from 0.4
    tf.keras.layers.Dense(3, activation='softmax')
])

# OPTIMIZED COMPILATION
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0007),  # Lower LR
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']  # Simplified metrics
)

# TRAINING WITH EARLY STOPPING
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',  # More stable than loss
    patience=5,  # Increased patience
    mode='max',
    restore_best_weights=True
)

# Added model checkpointing
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'best_lstm.h5',
    monitor='val_accuracy',
    save_best_only=True
)

history = model.fit(
    X_train_lstm, y_train_lstm,
    epochs=30,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stop, checkpoint],  # Added checkpoint
    verbose=1  # Simpler progress
)


# ENHANCED EVALUATION
print("\nLSTM Performance:")
test_loss, test_acc = model.evaluate(X_test_lstm, y_test_lstm, verbose=0)
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test Loss: {test_loss:.4f}")

# CONFIDENCE SCORES
y_probs = model.predict(X_test_lstm)
print("\nSample Predictions:")
for i in range(3):
    print(f"Text: {df['text'].iloc[i][:50]}...")
    print(f"True: {y_test_lstm.iloc[i]} Pred: {np.argmax(y_probs[i])}")
    print(f"Confidence: {np.max(y_probs[i]):.2%}\n")

0 unknown words received random embeddings
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30

LSTM Performance:
Test Accuracy: 0.7881
Test Loss: 0.5557

Sample Predictions:
Text: According to Gran , the company has no plans to mo...
True: 1 Pred: 1
Confidence: 87.04%

Text: For the last quarter of 2010 , Componenta 's net s...
True: 1 Pred: 1
Confidence: 91.53%

Text: In the third quarter of 2010 , net sales increased...
True: 0 Pred: 2
Confidence: 61.33%



# **Approach 3: Transformer (FinBERT)**

In [22]:
!pip install tokenizers==0.13.3 --force-reinstall

Collecting tokenizers==0.13.3
  Downloading tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/7.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/7.8 MB[0m [31m88.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m7.8/7.8 MB[0m [31m116.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m79.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.21.1
    Uninstalling tokenizers-0.21.1:
      Successfully uninstalled tokenizers-0.21.1
[31mERROR: pip's dependency resolver does not current

In [23]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [24]:
# FinBERT-specific preprocessing (preserves casing/punctuation)
def preprocess_finbert(text):
    # Only clean whitespace, keep original casing and symbols
    return ' '.join(text.strip().split())

df['text_finbert'] = df['text'].apply(preprocess_finbert)

In [25]:
# Load FinBERT pretrained model
model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)  # FinBERT tokenizer
finbert = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Use minimally processed text from special preprocessing
X_train_finbert, X_test_finbert, y_train_finbert, y_test_finbert = train_test_split(
    df['text_finbert'],  # Uses raw text with whitespace cleanup only
    df['label'],
    test_size=0.2,
    random_state=42
)



config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]



tf_model.h5:   0%|          | 0.00/439M [00:00<?, ?B/s]

Some layers from the model checkpoint at yiyanghkust/finbert-tone were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at yiyanghkust/finbert-tone.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [None]:
# Direct tokenization without custom cleaning
def finbert_tokenize(data):
    return tokenizer(
        data.tolist(),
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="tf"
    )

# Tokenize using FINBERT'S native tokenizer
train_encodings = finbert_tokenize(X_train_finbert)  # Use finbert-split data
test_encodings = finbert_tokenize(X_test_finbert)

# Model configuration
optimizer = Adam(learning_rate=2e-5)  # Lower rate for fine-tuning
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
finbert.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Training
history = finbert.fit(
    dict(train_encodings),
    y_train_finbert,
    epochs=1,
    batch_size=4,  # Smaller batches for CPU
    validation_split=0.2
)

# IMMEDIATELY SAVE AFTER 1 EPOCH
finbert.save_pretrained("finbert_partial")
tokenizer.save_pretrained("finbert_partial")

# Evaluation
print("FinBERT Performance:")
finbert.evaluate(dict(test_encodings), y_test_finbert)

FinBERT Performance:


[0.22143246233463287, 0.940397322177887]

In [None]:
finbert.save_pretrained("finbert_sentiment")
tokenizer.save_pretrained("finbert_sentiment")

# Compress for easy download
!zip -r finbert_sentiment.zip finbert_sentiment/

  adding: finbert_sentiment/ (stored 0%)
  adding: finbert_sentiment/special_tokens_map.json (deflated 42%)
  adding: finbert_sentiment/tf_model.h5 (deflated 7%)
  adding: finbert_sentiment/tokenizer_config.json (deflated 45%)
  adding: finbert_sentiment/vocab.txt (deflated 50%)
  adding: finbert_sentiment/config.json (deflated 49%)
  adding: finbert_sentiment/tokenizer.json (deflated 70%)


In [28]:
from google.colab import files
files.download('finbert_sentiment.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [29]:
# Mount Drive (run first)
from google.colab import drive
drive.mount('/content/drive')

# Save to Drive
!cp -r finbert_sentiment.zip "/content/drive/My Drive/"

Mounted at /content/drive
