<a href="https://colab.research.google.com/github/tetdp/ittle-Lemon-Food-Ordering-App/blob/main/AI_Driven_Sentiment_Analysis_for_Stock_Market_News_LSTM_MODEL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from datasets import load_dataset
from collections import Counter
from torch.utils.data import WeightedRandomSampler
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Seed torch's global RNG so weight initialisation and sampling are repeatable
SEED = 42
torch.manual_seed(SEED)

# Load spaCy English model (used further down for named-entity extraction)
nlp = spacy.load("en_core_web_sm")

# Load the dataset
dataset = load_dataset("NickyNicky/finance-financialmodelingprep-stock-news-sentiments-rss-feed")

# Use a subset of the dataset due to memory limitations.
# Clamp to the split size so a shorter split does not raise an IndexError.
train_split = dataset['train']
data = train_split.select(range(min(70000, len(train_split))))

# Ensure that the dataset has the required columns
if 'text' not in data.column_names or 'sentiment' not in data.column_names:
    raise KeyError("Expected 'text' or 'sentiment' columns not found.")

# Extract features and labels (materialised as plain lists for scikit-learn)
X = list(data['text'])
y = data['sentiment']

# Convert sentiment labels to binary (1 for positive, 0 for anything else)
y = [1 if sentiment.lower() == 'positive' else 0 for sentiment in y]

# Split the RAW text first, so that the TF-IDF vocabulary and IDF weights are
# learned from the training portion only (fitting on everything leaks test-set
# statistics into the features). Stratify to keep the class ratio identical in
# both partitions, since the labels are heavily imbalanced.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

# Vectorize the text data: fit on train, only transform test.
# float32 halves the size of the dense matrices; the tensors below are float32 anyway.
vectorizer = TfidfVectorizer(max_features=5000, dtype=np.float32)
X_train = vectorizer.fit_transform(X_train_text).toarray()
X_test = vectorizer.transform(X_test_text).toarray()

# Convert data to PyTorch tensors (as_tensor avoids a copy when dtype already matches)
X_train_tensor = torch.as_tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.as_tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Create a custom Dataset for PyTorch
class StockNewsDataset(Dataset):
    """Pairs each feature row with its label for use with a DataLoader.

    Args:
        X: Indexable collection of feature rows.
        y: Indexable collection of labels, one per row of ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` do not have the same length.
    """

    def __init__(self, X, y):
        # Fail fast: a mismatch would otherwise surface as an IndexError in the
        # middle of an epoch, or silently ignore trailing labels.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]

# Counter the class imbalance by drawing training samples with a probability
# inversely proportional to the frequency of their class.
class_counts = Counter(y_train)
class_weights = []
for cls in range(len(class_counts)):
    class_weights.append(1.0 / class_counts[cls])
sample_weights = list(map(lambda label: class_weights[int(label)], y_train))
sampler = WeightedRandomSampler(
    weights=sample_weights,
    num_samples=len(sample_weights),
    replacement=True,
)

# Wrap the tensors in datasets, then build the loaders: the training loader
# draws through the weighted sampler, the test loader iterates in order.
train_dataset = StockNewsDataset(X_train_tensor, y_train_tensor)
test_dataset = StockNewsDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, sampler=sampler)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

# Define the LSTM-based model
class SentimentAnalysisLSTM(nn.Module):
    """LSTM followed by a linear head, applied to one feature vector per sample.

    Each input row is treated as a sequence of length 1, so the LSTM sees a
    single time step per sample.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Number of hidden units in every LSTM layer.
        output_dim: Number of values produced per sample (raw logits).
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=2):
        super().__init__()
        # nn.LSTM applies dropout only BETWEEN stacked layers; with a single
        # layer a non-zero value has no effect and triggers a UserWarning.
        lstm_dropout = 0.2 if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=lstm_dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map a ``(batch_size, input_dim)`` tensor to ``(batch_size, output_dim)`` logits."""
        # Add an extra dimension to the input for LSTM (batch, seq_len, input_dim)
        x = x.unsqueeze(1)  # Now the shape will be (batch_size, seq_len=1, input_dim)
        lstm_out, _ = self.lstm(x)
        lstm_out = lstm_out[:, -1, :]  # Use the output of the last time step
        output = self.fc(lstm_out)
        return output

# Initialize the LSTM model
input_dim = X_train.shape[1]  # Number of features after vectorization
hidden_dim = 128  # Hidden units in LSTM
output_dim = 1  # Binary classification
model = SentimentAnalysisLSTM(input_dim, hidden_dim, output_dim)

# Loss and optimizer.
# train_loader already draws through a WeightedRandomSampler, so the batches
# the model sees are roughly class-balanced. Adding pos_weight = neg/pos (< 1
# here) on top of that corrects the imbalance a second time and biases the
# model towards predicting "negative", so the plain loss is used.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training the model
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        # squeeze(-1) removes only the output_dim axis; a bare squeeze() would
        # also drop the batch axis for a batch of size 1 and break the loss.
        outputs = model(inputs).squeeze(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss / len(train_loader)}")

# Evaluating the model
model.eval()
y_pred = []
y_true = []

with torch.no_grad():
    for inputs, labels in test_loader:
        # Keep the batch axis so extend() below always receives a 1-D array
        outputs = model(inputs).squeeze(-1)
        predicted = (torch.sigmoid(outputs) > 0.5).float()
        y_pred.extend(predicted.numpy())
        y_true.extend(labels.numpy())

# Confusion Matrix and Classification Report
print("Confusion Matrix:")
cm = confusion_matrix(y_true, y_pred)
print(cm)

print("Classification Report:")
print(classification_report(y_true, y_pred))

# Helper that pulls organisation-like entities out of a piece of text
def extract_company_name(text):
    """
    Run the spaCy pipeline over ``text`` and collect the entities it tags
    as ORG or GPE.

    Returns the matching entity strings in document order, or a one-element
    list holding a placeholder message when nothing matched.
    """
    wanted_labels = ("ORG", "GPE")
    found = []
    for entity in nlp(text).ents:
        if entity.label_ in wanted_labels:
            found.append(entity.text)
    if not found:
        return ["No company name identified"]
    return found

# User input for prediction
# Switch to inference mode once, before the loop, instead of on every iteration.
model.eval()
while True:
    try:
        user_input = input("Enter news article text (or 'quit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # stdin closed or the cell was interrupted: leave the loop cleanly
        break

    if user_input.lower() == 'quit':
        break
    if not user_input:
        # Nothing to score; an empty string would become an all-zero vector
        continue

    # Vectorize user input
    user_input_vectorized = vectorizer.transform([user_input]).toarray()
    user_input_tensor = torch.tensor(user_input_vectorized, dtype=torch.float32)

    # Predict the sentiment
    with torch.no_grad():
        output = model(user_input_tensor).squeeze()
        sentiment_score = torch.sigmoid(output).item()
        sentiment = 'positive' if sentiment_score > 0.5 else 'negative'

    # Extract company name(s) from user input
    companies = extract_company_name(user_input)

    # Display the results
    print(f"Sentiment: {sentiment}")
    print(f"Sentiment Score: {sentiment_score:.4f}")
    print(f"Company Name(s): {', '.join(companies)}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/787 [00:00<?, ?B/s]

(…)-00000-of-00001-ccd537eba2831636.parquet:   0%|          | 0.00/49.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/142000 [00:00<?, ? examples/s]

Epoch 1/5, Loss: 0.10579465478977987
Epoch 2/5, Loss: 0.0662853336318263
Epoch 3/5, Loss: 0.054196915586612056
Epoch 4/5, Loss: 0.04606054861338011
Epoch 5/5, Loss: 0.03962635120762778
Confusion Matrix:
[[1121  280]
 [2714 9885]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.29      0.80      0.43      1401
         1.0       0.97      0.78      0.87     12599

    accuracy                           0.79     14000
   macro avg       0.63      0.79      0.65     14000
weighted avg       0.90      0.79      0.82     14000

Enter news article text (or 'quit' to quit): Nvidia dismissed a large number of its employees
Sentiment: negative
Sentiment Score: 0.1928
Company Name(s): Nvidia
Enter news article text (or 'quit' to quit): NVIDIA distributed 40% dividends
Sentiment: negative
Sentiment Score: 0.0925
Company Name(s): NVIDIA
Enter news article text (or 'quit' to quit): Nvidia dismissed a large number of its employees
Sentiment: negativ