In [6]:
pip install feedparser pandas_market_calendars

Collecting pandas_market_calendars
  Downloading pandas_market_calendars-4.4.2-py3-none-any.whl.metadata (9.1 kB)
Collecting exchange-calendars>=3.3 (from pandas_market_calendars)
  Downloading exchange_calendars-4.5.8-py3-none-any.whl.metadata (37 kB)
Collecting pyluach (from exchange-calendars>=3.3->pandas_market_calendars)
  Downloading pyluach-2.2.0-py3-none-any.whl.metadata (4.3 kB)
Collecting korean-lunar-calendar (from exchange-calendars>=3.3->pandas_market_calendars)
  Downloading korean_lunar_calendar-0.3.1-py3-none-any.whl.metadata (2.8 kB)
Downloading pandas_market_calendars-4.4.2-py3-none-any.whl (108 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.1/108.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading exchange_calendars-4.5.8-py3-none-any.whl (196 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m196.8/196.8 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading korean_lunar_calendar-0.3.1-py3-none-any.whl (9.

In [7]:
import pandas as pd
import yfinance as yf
import feedparser
from datetime import datetime, timedelta
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense
import numpy as np
import pandas_market_calendars as mcal

In [8]:
# Download VADER lexicon
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [9]:
# Initialize VADER SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

In [22]:
# Function to get sentiment score
def get_sentiment_score(text):
    return sia.polarity_scores(text)['compound']

In [23]:
# Function to generate date range for the past 2 years (excluding today's date)
def generate_date_range():
    end_date = datetime.now() - timedelta(days=1)
    start_date = end_date - timedelta(days=2*365)
    return pd.date_range(start=start_date, end=end_date)

In [24]:
# Function to fetch headlines for a given date from Yahoo Finance RSS feed for a specific stock ticker
def fetch_headlines(date, stock):
    url = f"https://finance.yahoo.com/rss/headline?s={stock}"
    feed = feedparser.parse(url)
    headlines = [
        entry.title for entry in feed.entries
        if datetime(*entry.published_parsed[:6]).date() == date.date()
    ]
    return headlines

In [25]:
# Function to fill missing sentiment scores with backward interpolation
def fill_missing_scores(df):
    df['Sentiment'] = df['Sentiment'].interpolate(method='linear', limit_direction='backward')
    return df

In [26]:
# Function to adjust historical prices for stock splits
def adjust_for_splits(hist):
    splits = hist['Stock Splits']
    for date, split in splits[splits != 0].items():
        hist.loc[:date, 'Close'] /= split
    return hist

In [27]:
def get_trading_days():
    nyse = mcal.get_calendar('NYSE')
    schedule = nyse.schedule(start_date=(datetime.now() - timedelta(days=2*365)).strftime('%Y-%m-%d'),
                              end_date=(datetime.now() + timedelta(days=30)).strftime('%Y-%m-%d'))
    return schedule.index.to_pydatetime()


In [28]:
# Get stock ticker from user
stock = input("Enter the stock ticker: ")

Enter the stock ticker: AAPL


In [29]:
# Fetch historical stock data for the past 2 years (excluding today's date)
stock_data = yf.Ticker(stock)
hist = stock_data.history(start=(datetime.now() - timedelta(days=2*365)), end=(datetime.now() - timedelta(days=1)))


In [30]:
# Adjust historical prices for stock splits
hist = adjust_for_splits(hist)

In [31]:
# Generate date range for the past 2 years (excluding today's date)
date_range = generate_date_range()

In [32]:
# Create a DataFrame with dates
df = pd.DataFrame(date_range, columns=['Date'])
df['Date'] = df['Date'].dt.strftime('%Y-%m-%d')
df.set_index('Date', inplace=True)

In [34]:
# Generate Sentiment Scores
df['Headlines'] = df.index.to_series().apply(lambda date: fetch_headlines(datetime.strptime(date, '%Y-%m-%d'), stock))
df['Sentiment'] = df['Headlines'].apply(lambda headlines: get_sentiment_score(' '.join(headlines)) if headlines else None)
df = fill_missing_scores(df)

In [45]:
# Add closing prices to the DataFrame
# Convert the index to DatetimeIndex if it's not already
hist.index = pd.to_datetime(hist.index)
hist['Date'] = hist.index.strftime('%Y-%m-%d')
hist.set_index('Date', inplace=True)
df['Close'] = hist['Close']

In [46]:
# Interpolate missing closing prices
df['Close'] = df['Close'].interpolate(method='linear', limit_direction='backward').interpolate(method='linear', limit_direction='forward')
df = df[['Close', 'Sentiment']]

In [47]:
# Fill any remaining NaN values in closing prices with forward interpolation as a fallback
df['Close'] = df['Close'].interpolate(method='linear', limit_direction='forward')

# Remove the Headlines column and keep only Date, Closing Price, Sentiment Score
df = df[['Close', 'Sentiment']]

In [48]:
# Standardize the closing prices and sentiment scores
scaler = StandardScaler()
df[['Close', 'Sentiment']] = scaler.fit_transform(df[['Close', 'Sentiment']])

In [49]:
# Create Features and Labels
X = df[['Close', 'Sentiment']].values

train_size = int(len(X) * 0.8)
train, test = X[:train_size], X[train_size:]

train_X, train_y = train[:-1], train[1:, 0]
test_X, test_y = test[:-1], test[1:, 0]

train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))


In [50]:
# Build LSTM Model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

model.fit(train_X, train_y, epochs=50, batch_size=32, validation_data=(test_X, test_y), verbose=2, shuffle=False)


  super().__init__(**kwargs)


Epoch 1/50
19/19 - 3s - 155ms/step - loss: 0.6214 - val_loss: 2.5165
Epoch 2/50
19/19 - 0s - 6ms/step - loss: 0.5730 - val_loss: 2.4611
Epoch 3/50
19/19 - 0s - 7ms/step - loss: 0.5338 - val_loss: 2.4070
Epoch 4/50
19/19 - 0s - 6ms/step - loss: 0.4961 - val_loss: 2.3520
Epoch 5/50
19/19 - 0s - 7ms/step - loss: 0.4590 - val_loss: 2.2934
Epoch 6/50
19/19 - 0s - 7ms/step - loss: 0.4223 - val_loss: 2.2283
Epoch 7/50
19/19 - 0s - 7ms/step - loss: 0.3860 - val_loss: 2.1540
Epoch 8/50
19/19 - 0s - 6ms/step - loss: 0.3498 - val_loss: 2.0682
Epoch 9/50
19/19 - 0s - 6ms/step - loss: 0.3136 - val_loss: 1.9693
Epoch 10/50
19/19 - 0s - 5ms/step - loss: 0.2777 - val_loss: 1.8562
Epoch 11/50
19/19 - 0s - 7ms/step - loss: 0.2427 - val_loss: 1.7289
Epoch 12/50
19/19 - 0s - 5ms/step - loss: 0.2088 - val_loss: 1.5895
Epoch 13/50
19/19 - 0s - 7ms/step - loss: 0.1767 - val_loss: 1.4399
Epoch 14/50
19/19 - 0s - 6ms/step - loss: 0.1467 - val_loss: 1.2832
Epoch 15/50
19/19 - 0s - 7ms/step - loss: 0.1194 - val_

<keras.src.callbacks.history.History at 0x7f679b49e7a0>

In [51]:
# Get the number of trading days to predict from user
n_days = int(input("Enter the number of trading days to predict: "))

Enter the number of trading days to predict: 5


In [52]:
# Filter for trading days
trading_days = get_trading_days()
future_predictions_scaled = []
last_sequence_scaled = X[-1].reshape((1, 1, X.shape[1]))

predicted_dates = []
for current_date in trading_days:
    if len(future_predictions_scaled) >= n_days:
        break

    next_pred_scaled = model.predict(last_sequence_scaled)
    future_predictions_scaled.append(next_pred_scaled[0][0])
    predicted_dates.append(current_date.strftime('%Y-%m-%d'))

    last_sequence_scaled = np.concatenate([last_sequence_scaled[:, :, 1:], next_pred_scaled.reshape(1, 1, 1)], axis=2)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 182ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step


In [53]:
# Transform back to Original Scale
future_predictions_scaled_array = np.array(future_predictions_scaled).reshape(-1, 1)
future_predictions_original_array = scaler.inverse_transform(
    np.concatenate((future_predictions_scaled_array, np.zeros_like(future_predictions_scaled_array)), axis=1)
)[:, 0]


In [54]:
#Dispaly Predictions
print(f"Predicted closing prices for the next {n_days} trading days:")
for date, price in zip(predicted_dates, future_predictions_original_array):
    print(f"{date}: {price:.2f}")

Predicted closing prices for the next 5 trading days:
2022-11-21: 220.74
2022-11-22: 184.39
2022-11-23: 215.00
2022-11-25: 185.81
2022-11-28: 210.82


In [55]:
# Get the number of trading days to predict from user
n_days = int(input("Enter the number of trading days to predict: "))


# Filter for trading days
trading_days = get_trading_days()
future_predictions_scaled = []
last_sequence_scaled = X[-1].reshape((1, 1, X.shape[1]))


predicted_dates = []
for current_date in trading_days:
    if len(future_predictions_scaled) >= n_days:
        break

    next_pred_scaled = model.predict(last_sequence_scaled)
    future_predictions_scaled.append(next_pred_scaled[0][0])
    predicted_dates.append(current_date.strftime('%Y-%m-%d'))

    last_sequence_scaled = np.concatenate([last_sequence_scaled[:, :, 1:], next_pred_scaled.reshape(1, 1, 1)], axis=2)


# Transform back to Original Scale
future_predictions_scaled_array = np.array(future_predictions_scaled).reshape(-1, 1)
future_predictions_original_array = scaler.inverse_transform(
    np.concatenate((future_predictions_scaled_array, np.zeros_like(future_predictions_scaled_array)), axis=1)
)[:, 0]


#Dispaly Predictions
print(f"Predicted closing prices for the next {n_days} trading days:")
for date, price in zip(predicted_dates, future_predictions_original_array):
    print(f"{date}: {price:.2f}")

Enter the number of trading days to predict: 20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32