In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
import glob
import os

In [2]:
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, dropout_prob=0.3):
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True, bidirectional=True, dropout=dropout_prob)
        self.fc1 = nn.Linear(hidden_dim * 2, 32)
        self.fc2 = nn.Linear(32, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_prob)
    
    def forward(self, x):
        h0 = torch.zeros(self.layer_dim * 2, x.size(0), self.hidden_dim).requires_grad_()
        c0 = torch.zeros(self.layer_dim * 2, x.size(0), self.hidden_dim).requires_grad_()
        out, _ = self.lstm(x, (h0.detach(), c0.detach()))
        out = self.fc1(out[:, -1, :])
        out = self.dropout(self.relu(out))
        out = self.fc2(out)
        return out

def prepare_stock_data(df, ma_periods=[5, 10, 20, 50]):
    data = df.copy()
    data['Date'] = pd.to_datetime(data['Date'])
    data.set_index('Date', inplace=True)
    for period in ma_periods:
        data[f'MA_{period}'] = data['Adj Close'].rolling(window=period).mean()
    data['Price_Change'] = data['Adj Close'].pct_change()
    data['Volume_Change'] = data['Volume'].pct_change()
    data.dropna(inplace=True)
    return data

def create_sequences(data, sequence_length):
    X, y = [], []
    for i in range(len(data) - sequence_length):
        X.append(data[i:(i + sequence_length), :])
        y.append(data[i + sequence_length, 0])
    return np.array(X), np.array(y)

In [3]:
def prepare_and_train_model(df, features, sequence_length=20, test_size=0.2, learning_rate=0.001, epochs=100):
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(df[features])
    X, y = create_sequences(scaled_data, sequence_length)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False)
    
    X_train, X_test = torch.tensor(X_train, dtype=torch.float32), torch.tensor(X_test, dtype=torch.float32)
    y_train, y_test = torch.tensor(y_train, dtype=torch.float32), torch.tensor(y_test, dtype=torch.float32)
    
    input_dim = X_train.shape[2]
    model = LSTMModel(input_dim=input_dim, hidden_dim=128, layer_dim=2, output_dim=1)
    criterion = nn.HuberLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)
    
    # Training loop
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            output = model(batch_X)
            loss = criterion(output.view(-1), batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        
        if (epoch + 1) % 20 == 0:  # Print every 20 epochs
            avg_epoch_loss = epoch_loss / len(train_loader)
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {avg_epoch_loss:.4f}')
    
    return model, scaler, (X_train, X_test, y_train, y_test)

def evaluate_stock_predictions(y_true, y_pred):
    y_true, y_pred = y_true.detach().numpy(), y_pred.detach().numpy()
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    return {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'MAPE': mape,
        'R2': r2
    }

In [4]:
def process_all_stocks(data_folder="../data/"):
    # Get all stock files
    stock_files = glob.glob(os.path.join(data_folder, "processed_stock_*.csv"))
    
    # Initialize results dictionary
    results = {}
    
    # Features sets
    features_with_sentiment = ['Adj Close', 'Volume', 'Price_Change', 'Volume_Change', 
                             'MA_5', 'MA_10', 'MA_20', 'MA_50', 'sentiment']
    features_without_sentiment = ['Adj Close', 'Volume', 'Price_Change', 'Volume_Change', 
                                'MA_5', 'MA_10', 'MA_20', 'MA_50']
    
    for file in stock_files:
        stock_name = os.path.basename(file).replace('processed_stock_data_', '').replace('.csv', '')
        print(f"\nProcessing {stock_name}...")
        
        # Load and prepare data
        df = pd.read_csv(file)
        df = prepare_stock_data(df)
        
        results[stock_name] = {
            'with_sentiment': {},
            'without_sentiment': {}
        }
        
        # Train and evaluate model with sentiment
        print("Training model with sentiment...")
        model, scaler, (_, X_test, _, y_test) = prepare_and_train_model(df, features_with_sentiment)
        model.eval()
        predictions = model(X_test).view(-1).detach()
        results[stock_name]['with_sentiment'] = evaluate_stock_predictions(y_test, predictions)
        
        # Train and evaluate model without sentiment
        print("Training model without sentiment...")
        model, scaler, (_, X_test, _, y_test) = prepare_and_train_model(df, features_without_sentiment)
        model.eval()
        predictions = model(X_test).view(-1).detach()
        results[stock_name]['without_sentiment'] = evaluate_stock_predictions(y_test, predictions)
    
    return results

In [5]:
def display_results(results):
    # Create DataFrame for easy comparison
    rows = []
    for stock, metrics in results.items():
        for metric in ['MSE', 'RMSE', 'MAE', 'MAPE', 'R2']:
            rows.append({
                'Stock': stock,
                'Metric': metric,
                'With Sentiment': metrics['with_sentiment'][metric],
                'Without Sentiment': metrics['without_sentiment'][metric],
                'Difference': metrics['with_sentiment'][metric] - metrics['without_sentiment'][metric]
            })
    
    df_results = pd.DataFrame(rows)
    
    # Calculate improvement percentages
    df_results['Improvement %'] = (df_results['Difference'] / df_results['Without Sentiment'] * 100)
    
    # Format and display results
    pd.set_option('display.float_format', lambda x: '{:.4f}'.format(x))
    df_pivot = df_results.pivot(index=['Stock', 'Metric'], 
                              columns=[],
                              values=['With Sentiment', 'Without Sentiment', 'Improvement %'])
    
    return df_pivot

In [6]:
# Run the analysis
results = process_all_stocks()
comparison_df = display_results(results)
print("\nResults Comparison:")
print(comparison_df)

# Save results to CSV
comparison_df.to_csv('sentiment_analysis_comparison.csv')

# Create visualization of improvements
plt.figure(figsize=(15, 8))
metrics_to_plot = ['RMSE', 'MAE', 'MAPE']  # Selecting key metrics for visualization

for metric in metrics_to_plot:
    improvements = []
    stocks = []
    for stock in results.keys():
        with_sentiment = results[stock]['with_sentiment'][metric]
        without_sentiment = results[stock]['without_sentiment'][metric]
        improvement = ((without_sentiment - with_sentiment) / without_sentiment) * 100
        improvements.append(improvement)
        stocks.append(stock)
    
    plt.plot(stocks, improvements, marker='o', label=f'{metric} Improvement %')

plt.title('Sentiment Analysis Impact on Model Performance')
plt.xlabel('Stocks')
plt.ylabel('Improvement Percentage')
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('sentiment_analysis_impact.png')
plt.close()


Processing goog...
Training model with sentiment...


  from .autonotebook import tqdm as notebook_tqdm


Epoch [20/100], Loss: 0.0025
Epoch [40/100], Loss: 0.0017
Epoch [60/100], Loss: 0.0012
Epoch [80/100], Loss: 0.0011
Epoch [100/100], Loss: 0.0014
Training model without sentiment...
Epoch [20/100], Loss: 0.0043
Epoch [40/100], Loss: 0.0024
Epoch [60/100], Loss: 0.0019
Epoch [80/100], Loss: 0.0014
Epoch [100/100], Loss: 0.0017

Processing d...
Training model with sentiment...
Epoch [20/100], Loss: 0.0034
Epoch [40/100], Loss: 0.0028
Epoch [60/100], Loss: 0.0024
Epoch [80/100], Loss: 0.0018
Epoch [100/100], Loss: 0.0018
Training model without sentiment...
Epoch [20/100], Loss: 0.0045
Epoch [40/100], Loss: 0.0034
Epoch [60/100], Loss: 0.0033
Epoch [80/100], Loss: 0.0024
Epoch [100/100], Loss: 0.0020

Processing baba...
Training model with sentiment...
Epoch [20/100], Loss: 0.0040
Epoch [40/100], Loss: 0.0034
Epoch [60/100], Loss: 0.0023
Epoch [80/100], Loss: 0.0023
Epoch [100/100], Loss: 0.0024
Training model without sentiment...
Epoch [20/100], Loss: 0.0047
Epoch [40/100], Loss: 0.0037
E

  mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100


Epoch [20/100], Loss: 0.0045
Epoch [40/100], Loss: 0.0038
Epoch [60/100], Loss: 0.0034
Epoch [80/100], Loss: 0.0022
Epoch [100/100], Loss: 0.0021

Processing aapl...
Training model with sentiment...


  mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100


Epoch [20/100], Loss: 0.0133
Epoch [40/100], Loss: 0.0101
Epoch [60/100], Loss: 0.0089
Epoch [80/100], Loss: 0.0064
Epoch [100/100], Loss: 0.0054
Training model without sentiment...
Epoch [20/100], Loss: 0.0087
Epoch [40/100], Loss: 0.0082
Epoch [60/100], Loss: 0.0056
Epoch [80/100], Loss: 0.0050
Epoch [100/100], Loss: 0.0048

Processing msft...
Training model with sentiment...
Epoch [20/100], Loss: 0.0026
Epoch [40/100], Loss: 0.0019
Epoch [60/100], Loss: 0.0016
Epoch [80/100], Loss: 0.0013
Epoch [100/100], Loss: 0.0011
Training model without sentiment...
Epoch [20/100], Loss: 0.0043
Epoch [40/100], Loss: 0.0033
Epoch [60/100], Loss: 0.0026
Epoch [80/100], Loss: 0.0022
Epoch [100/100], Loss: 0.0016

Processing fb...
Training model with sentiment...
Epoch [20/100], Loss: 0.0045
Epoch [40/100], Loss: 0.0035
Epoch [60/100], Loss: 0.0027
Epoch [80/100], Loss: 0.0020
Epoch [100/100], Loss: 0.0015
Training model without sentiment...
Epoch [20/100], Loss: 0.0043
Epoch [40/100], Loss: 0.0028


  mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100


Epoch [20/100], Loss: 0.0119
Epoch [40/100], Loss: 0.0077
Epoch [60/100], Loss: 0.0071
Epoch [80/100], Loss: 0.0055
Epoch [100/100], Loss: 0.0042

Results Comparison:
              With Sentiment  Without Sentiment  Improvement %
Stock Metric                                                  
goog  MSE             0.0019             0.0025       -21.8540
      RMSE            0.0441             0.0499       -11.5998
      MAE             0.0336             0.0410       -18.0558
      MAPE            4.1450             4.9485       -16.2378
      R2              0.7210             0.6430        12.1317
d     MSE             0.0024             0.0024         1.6986
      RMSE            0.0495             0.0490         0.8457
      MAE             0.0388             0.0382         1.5008
      MAPE            8.5367             8.3787         1.8863
      R2              0.9077             0.9092        -0.1696
baba  MSE             0.0010             0.0011        -7.2442
      RMSE    

  mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
  'Difference': metrics['with_sentiment'][metric] - metrics['without_sentiment'][metric]
  improvement = ((without_sentiment - with_sentiment) / without_sentiment) * 100
