In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import math


In [2]:
# Load each CSV
nvda = pd.read_csv("NVDA_daily_data.csv")
tsm = pd.read_csv("TSM_daily_data.csv")
samsung = pd.read_csv("005930.KS_daily_data.csv")
asml = pd.read_csv("ASML_daily_data.csv")
cdns = pd.read_csv("CDNS_daily_data.csv")
snps = pd.read_csv("SNPS_daily_data.csv")


In [3]:
dfs = [nvda, tsm, samsung, asml, cdns, snps]

for df in dfs:
    df["Date"] = pd.to_datetime(df["Date"])
    df.sort_values("Date", inplace=True)
    df.reset_index(drop=True, inplace=True)

nvda.head()
nvda.columns


Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')

In [4]:
def prep_supplier(df, prefix):
    out = df[["Date", "Close", "Volume"]].copy()
    out.rename(
        columns={
            "Close": f"{prefix}_Close",
            "Volume": f"{prefix}_Volume"
        },
        inplace=True
    )
    return out


In [5]:
asml_s = prep_supplier(asml, "ASML")
cdns_s = prep_supplier(cdns, "CDNS")
snps_s = prep_supplier(snps, "SNPS")
tsm_s  = prep_supplier(tsm, "TSM")
ssng_s = prep_supplier(samsung, "SSNG")  # Samsung


In [6]:
data = nvda[["Date", "Close", "Volume"]].copy()
data.rename(
    columns={
        "Close": "NVDA_Close",
        "Volume": "NVDA_Volume"
    },
    inplace=True
)


In [12]:
suppliers = [asml_s, cdns_s, snps_s, tsm_s, ssng_s]

for sup in suppliers:
    data = data.merge(sup, on="Date", how="inner")

data.head()



In [16]:
# Target = next day's NVDA close
data["NVDA_Close_next"] = data["NVDA_Close"].shift(-1)

# Drop last row (its target is NaN because there is no next day after it)
data = data.dropna(subset=["NVDA_Close_next"])

data.head()

Unnamed: 0,Date,NVDA_Close,NVDA_Volume,ASML_Close_x,ASML_Volume_x,CDNS_Close_x,CDNS_Volume_x,SNPS_Close_x,SNPS_Volume_x,TSM_Close_x,...,ASML_Close,ASML_Volume,CDNS_Close,CDNS_Volume,SNPS_Close,SNPS_Volume,TSM_Close,TSM_Volume,SSNG_Close,SSNG_Volume
0,2000-01-04,0.094922,300480000,40.734375,968800,22.0,1796600,30.4375,1291000,18.078094,...,40.734375,968800,22.0,1796600,30.4375,1291000,18.078094,2557142,6110.0,74195000
1,2000-01-05,0.091797,188352000,39.609375,1458133,21.625,2758000,30.703125,3343800,18.196564,...,39.609375,1458133,21.625,2758000,30.703125,3343800,18.196564,2295467,5580.0,74680000
2,2000-01-06,0.085807,120480000,37.171875,3517867,20.9375,1505800,28.46875,3107400,17.438372,...,37.171875,3517867,20.9375,1505800,28.46875,3107400,17.438372,1539456,5620.0,54390000
3,2000-01-07,0.08724,71184000,38.015625,1631200,22.3125,1891000,28.75,1710800,18.101789,...,38.015625,1631200,22.3125,1891000,28.75,1710800,18.101789,1511230,5540.0,40305000
4,2000-01-10,0.090104,239856000,41.625,1341867,23.375,1485600,30.15625,1718600,18.836285,...,41.625,1341867,23.375,1485600,30.15625,1718600,18.836285,1102626,5770.0,46880000


In [17]:
data.feature_cols = ["NVDA_Close",
    "NVDA_Volume",
    "ASML_Close",
    "ASML_Volume",
    "CDNS_Close",
    "CDNS_Volume",
    "SNPS_Close",
    "SNPS_Volume",
    "TSM_Close",
    "TSM_Volume",
    "SSNG_Close",
    "SSNG_Volume"]

X = data[feature_cols]
y = data["NVDA_Close_next"]


  data.feature_cols = ["NVDA_Close",


In [18]:
n_rows = len(data)
train_size = int(n_rows * 0.8)

X_train = X.iloc[:train_size]
y_train = y.iloc[:train_size]

X_test = X.iloc[train_size:]
y_test = y.iloc[train_size:]

print(X_train.shape, X_test.shape)


(4795, 12) (1199, 12)


In [19]:
rf = RandomForestRegressor(
    n_estimators=300,   # number of trees
    max_depth=None,     # let trees grow fully (you can limit this)
    random_state=42,    # reproducibility
    n_jobs=-1           # use all CPU cores
)

rf.fit(X_train, y_train)


0,1,2
,n_estimators,300
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [20]:
y_pred = rf.predict(X_test)

rmse = math.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.4f}")
print(f"MAE : {mae:.4f}")
print(f"R^2 : {r2:.4f}")


RMSE: 38.8581
MAE : 24.1627
R^2 : -0.6152


In [21]:
# Take the last row in 'data' as "today"
last_row = data.iloc[[-1]]  # keep as a DataFrame

X_latest = last_row[feature_cols]

next_day_prediction = rf.predict(X_latest)[0]

print("Last date in dataset:", last_row["Date"].values[0])
print("Predicted NVDA close for next trading day:", next_day_prediction)


Last date in dataset: 2024-09-23T00:00:00.000000000
Predicted NVDA close for next trading day: 7.008241728146871
