I chose the LSTM model to predict NVDA stock prices because it's well-suited for capturing sequential patterns in time-series data. LSTM's ability to remember and learn from past price movements allows it to effectively analyze historical stock data and make future predictions. This model excels at understanding the complex, non-linear relationships often found in stock prices, making it a reliable choice for forecasting in the financial market.


Data Source are downloaded from https://www.investing.com/equities/nvidia-corp-historical-data

# Data Loading and Preprocessing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import datetime
import glob
import os

In [2]:
# Function to create a DataFrame by concatenating multiple CSV files from a given directory
def create_df(data_dir):
    # List all CSV files in the directory
    file_list = glob.glob(os.path.join(data_dir, '*.csv'))
    data = []

    # Read and append data from each CSV file to create a consolidated DataFrame
    for file in file_list:
        df = pd.read_csv(file)
        data.append(df)

    # Concatenate the data from all CSV files
    df = pd.concat(data)

    # Set the "Date" column to a datetime format, handling different date formats
    if df["Date"].str.contains("/").any():
        try:
            df["Date"] = pd.to_datetime(df['Date'], format="%m/%d/%Y")
        except ValueError:
            df["Date"] = pd.to_datetime(df['Date'], format="%d/%m/%Y")
    elif df["Date"].str.contains(",").any():
        df["Date"] = pd.to_datetime(df["Date"], format="%b %d, %Y")

    # Sort the DataFrame by the "Date" column
    df = df.sort_values(by="Date")

    # Clean the data and set the "Date" column as the index
    df = df.drop(columns=["Change %","Open","High","Low", "Vol."])
    df = df.set_index("Date")

    # Resample the data to fill missing dates and reset the index
    df = df.resample('D').ffill()
    df = df.reset_index()

    # Convert the "Price" column to float type, handling comma-separated values
    if df["Price"].dtypes == object :
      df["Price"] = df["Price"].str.replace(",","").astype(np.float64)

    # Apply a logarithmic transformation to the "Price" column
    df["Price"] = np.log(df["Price"])

    return df

In [3]:
# Load data
df = create_df(r"/content/Untitled Folder")
df

Unnamed: 0,Date,Price
0,1999-01-25,-0.798508
1,1999-01-26,-0.867501
2,1999-01-27,-0.867501
3,1999-01-28,-0.867501
4,1999-01-29,-0.916291
...,...,...
9030,2023-10-16,6.133290
9031,2023-10-17,6.085365
9032,2023-10-18,6.044911
9033,2023-10-19,6.042657


In [4]:
# Initialize the MinMaxScaler for data scaling
scaler = MinMaxScaler()

# Function to split data into features and target
def split_features_target(dataframe,sequence_length):
  scaled_data = scaler.fit_transform(dataframe['Price'].values.reshape(-1, 1))
  X = []
  y = []
  for i in range(len(scaled_data) - sequence_length):
      X.append(scaled_data[i:i+sequence_length, 0])
      y.append(scaled_data[i+sequence_length, 0])

  X = np.array(X)
  y = np.array(y)
  return X,y

In [5]:
# Split data into features (X) and target (y) for training
sequence_length = 30 # You can adjust this value
X, y = split_features_target(df,30)

# Split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [6]:
# Build an LSTM model
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], 1)))
model.add(LSTM(units=50))
model.add(Dense(1))
# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

In [7]:
# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=64)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7fa7206ed780>

In [8]:
# Make predictions
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)

# Inverse transform the predictions to the original scale
train_predictions = scaler.inverse_transform(train_predictions)
test_predictions = scaler.inverse_transform(test_predictions)



In [9]:
# Evaluate the model using RMSE
from sklearn.metrics import mean_squared_error
import math

# Concatenate predictions and actual values for training and testing sets
all_predictions = np.concatenate((np.exp(train_predictions), np.exp(test_predictions)))
all_actual = np.concatenate((np.exp(y_train), np.exp(y_test)))

# Calculate the overall RMSE for the entire dataset
overall_rmse = math.sqrt(mean_squared_error(all_actual, all_predictions))

print(f"Overall RMSE: {overall_rmse:.4f}")

Overall RMSE: 86.3769


In [10]:
# Create a DataFrame with matching lengths for plotting
data_length = len(test_predictions)
df_pred = pd.DataFrame({
    'Date': df.iloc[-data_length:,0],
    'Testing Predictions': test_predictions.flatten(),
    'Actual Prices': df['Price'].tail(data_length)
})

# Create an interactive plot for visualizing the predictions
fig = px.line(df_pred, x='Date', y=['Testing Predictions', 'Actual Prices'],
              labels={'value': 'Stock Price(log)', 'variable': 'Data Type'},
              title="NVDA Stock Price Prediction In Log Scale")

fig.show()

In [11]:
# Create a DataFrame with matching lengths for USD price plotting
data_length = len(test_predictions)
df_pred2 = pd.DataFrame({
    'Date': df.iloc[-data_length:,0],
    'Testing Predictions': np.exp(test_predictions.flatten()),
    'Actual Prices': np.exp(df['Price'].tail(data_length))
})

# Create an interactive plot for visualizing the predictions in USD
fig = px.line(df_pred2, x='Date', y=['Testing Predictions', 'Actual Prices'],
              labels={'value': 'Stock Price(USD)', 'variable': 'Data Type'},
              title="NVDA Stock Price Prediction In Original Scale")

fig.show()

In [12]:
# Create a copy of the original DataFrame to predict future prices
df_copy = df
numday_futurepredict = 18

# Predict future stock prices day by day
for _ in range(numday_futurepredict):
    # Get the last 30 days of data and scale it
    last_30_days = df_copy['Price'].tail(30).values.reshape(-1, 1)
    scaled_last_30_days = scaler.fit_transform(last_30_days)[0]

    # Make a prediction for the next day
    new_y = model.predict(np.array([scaled_last_30_days]))
    new_y = scaler.inverse_transform(new_y)

    # Calculate the date for the next day
    last_date = df_copy.iloc[-1, 0]
    new_date = last_date + pd.Timedelta(days=1)

    # Create a new DataFrame with the new date and predicted price
    new_df = pd.DataFrame({'Date': [new_date], 'Price': [new_y[0, 0]]})

    # Append the new data to the copy of the original DataFrame
    df_copy = pd.concat([df_copy, new_df], ignore_index=True)



In [13]:
# Convert the log-scaled prices back to USD
df_copy["Price"] = np.exp(df_copy["Price"])

In [14]:
# Display the predictions
predictions = df_copy.iloc[-14:].reset_index(drop=True)
predictions

Unnamed: 0,Date,Price
0,2023-10-25,415.548638
1,2023-10-26,414.456887
2,2023-10-27,416.433919
3,2023-10-28,418.752512
4,2023-10-29,420.346964
5,2023-10-30,420.346964
6,2023-10-31,420.346964
7,2023-11-01,425.633385
8,2023-11-02,420.417925
9,2023-11-03,422.527543


In [15]:
# Create a DataFrame with matching lengths for visualizing predictions
df_pred = pd.DataFrame({
    'Date': df_copy.iloc[-30-numday_futurepredict:,0],
    'Predictions': df_copy.iloc[-30-numday_futurepredict:,1],
    'Actual Prices': df_copy.iloc[-30-numday_futurepredict:-numday_futurepredict,1]
})

# Create an interactive plot for visualizing future predictions
fig = px.line(df_pred, x='Date', y=['Predictions', 'Actual Prices'],
              labels={'value': 'Stock Price', 'variable': 'Data Type'},
              title="NVDA Stock Price Prediction")

fig.show()

In [16]:
#Export the predictions
predictions.to_csv("HUASHIAOHONG_Project_1.csv", index=False)