# Problem 1: Raw Data Processing

Objective: Ingest and process raw stock market datasets.

Tasks:


1.Download the ETF and stock datasets from the primary dataset available at https://www.kaggle.com/datasets/jacksoncrow/stock-market-dataset.


2.Setup a data structure to retain all data from ETFs and stocks with the following columns:

Symbol: string

Security Name: string

Date: string (YYYY-MM-DD)

Open: float

High: float

Low: float

Close: float

Adj Close: float

Volume: int


3.Convert the resulting dataset into a structured format (e.g. Parquet).

In [None]:
!pip install kaggle

In [4]:
!kaggle datasets download -d jacksoncrow/stock-market-dataset

Downloading stock-market-dataset.zip to C:\Users\y46ju\OneDrive\Documents\Python Scripts\riskThinkingWorkSample\experiment




  0%|          | 0.00/522M [00:00<?, ?B/s]
  0%|          | 1.00M/522M [00:01<11:33, 788kB/s]
  0%|          | 2.00M/522M [00:01<06:05, 1.49MB/s]
  1%|          | 3.00M/522M [00:01<03:56, 2.31MB/s]
  1%|          | 4.00M/522M [00:01<03:01, 3.00MB/s]
  1%|          | 5.00M/522M [00:02<02:33, 3.54MB/s]
  1%|1         | 6.00M/522M [00:02<02:20, 3.85MB/s]
  1%|1         | 7.00M/522M [00:02<02:33, 3.51MB/s]
  2%|1         | 8.00M/522M [00:03<02:59, 3.00MB/s]
  2%|1         | 9.00M/522M [00:03<02:48, 3.19MB/s]
  2%|1         | 10.0M/522M [00:04<05:20, 1.67MB/s]
  2%|2         | 13.0M/522M [00:05<02:47, 3.19MB/s]
  3%|2         | 14.0M/522M [00:05<02:40, 3.33MB/s]
  3%|2         | 15.0M/522M [00:05<02:30, 3.53MB/s]
  3%|3         | 16.0M/522M [00:05<02:29, 3.54MB/s]
  3%|3         | 17.0M/522M [00:06<02:34, 3.43MB/s]
  3%|3         | 18.0M/522M [00:06<02:31, 3.49MB/s]
  4%|3         | 19.0M/522M [00:06<02:30, 3.51MB/s]
  4%|3         | 20.0M/522M [00:07<02:20, 3.76MB/s]
  4%|4         | 21.0

In [5]:
import zipfile

In [6]:
with zipfile.ZipFile('stock-market-dataset.zip', 'r') as zip_file:
    zip_file.extractall('./')

In [27]:
#!pip install pandas
!pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-11.0.0-cp311-cp311-win_amd64.whl (20.5 MB)
     ---------------------------------------- 0.0/20.5 MB ? eta -:--:--
     ---------------------------------------- 0.0/20.5 MB ? eta -:--:--
     ---------------------------------------- 0.0/20.5 MB ? eta -:--:--
     ---------------------------------------- 0.0/20.5 MB ? eta -:--:--
     ---------------------------------------- 0.0/20.5 MB ? eta -:--:--
     ---------------------------------------- 0.0/20.5 MB ? eta -:--:--
     ---------------------------------------- 0.0/20.5 MB 93.5 kB/s eta 0:03:40
     ---------------------------------------- 0.0/20.5 MB 93.5 kB/s eta 0:03:40
     --------------------------------------- 0.0/20.5 MB 108.9 kB/s eta 0:03:09
     --------------------------------------- 0.1/20.5 MB 142.2 kB/s eta 0:02:24
     --------------------------------------- 0.1/20.5 MB 142.2 kB/s eta 0:02:24
     --------------------------------------- 0.1/20.5 MB 145.6 kB/s eta 0:02:21
    

In [4]:
import pandas as pd
import os
import concurrent.futures
import time

In [21]:
dtypes = {
    'Symbol': str,
    'Security Name': str,
    'Date': str,
    'Open': float,
    'High': float,
    'Low': float,
    'Close': float,
    'Adj Close': float,
    'Volume': int
    }

In [12]:
symbols_valid_meta = pd.read_csv("symbols_valid_meta.csv")

In [15]:
symbols_valid_meta = symbols_valid_meta[['Symbol', 'Security Name', 'ETF']]

In [17]:
symbols_valid_meta_stocks = symbols_valid_meta[symbols_valid_meta.ETF=='N']
symbols_valid_meta_etfs = symbols_valid_meta[symbols_valid_meta.ETF=='Y']

In [19]:
symbols_valid_meta_stocks_symbol = symbols_valid_meta_stocks.Symbol.to_list()
symbols_valid_meta_stocks_security_name = symbols_valid_meta_stocks['Security Name'].to_list()
symbols_valid_meta_etfs_symbol = symbols_valid_meta_etfs.Symbol.to_list()
symbols_valid_meta_etfs_security_name = symbols_valid_meta_etfs['Security Name'].to_list()

In [25]:
os.mkdir('stocks_result')
os.mkdir('etfs_result')

In [None]:
for i in range(len(symbols_valid_meta_stocks_symbol)):
    df = pd.read_csv('stocks/'+symbols_valid_meta_stocks_symbol[i]+'.csv')
    df['Symbol'] = symbols_valid_meta_stocks_symbol[i]
    df['Security Name'] = symbols_valid_meta_stocks_security_name[i]
    df = df.astype(dtypes)
    df.to_parquet('stocks_result/'+symbols_valid_meta_stocks_symbol[i]+'.parquet')

In [37]:
def process_stock_file(symbol):
    df = pd.read_csv('stocks/'+symbol+'.csv')
    df['Symbol'] = symbol
    df['Security Name'] = symbols_valid_meta_stocks_security_name[symbols_valid_meta_stocks_symbol.index(symbol)]
    df = df.astype(dtypes)
    df.to_parquet('stocks_result/'+symbol+'.parquet')

# Create a thread pool with a maximum threads
max_threads_list = [1, 4, 16, 32, 64]
for max_threads in max_threads_list:
    start_time = time.time()
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
        # Submit each stock symbol to the thread pool for processing
        futures = [executor.submit(process_stock_file, symbol) for symbol in symbols_valid_meta_stocks_symbol]
        # Wait for all threads to complete
        concurrent.futures.wait(futures)
    end_time = time.time()
    execution_time = end_time - start_time
    print("Execution time:", execution_time)

Execution time: 95.2776210308075
Execution time: 47.03400111198425
Execution time: 44.99587678909302
Execution time: 45.331650257110596
Execution time: 47.69900417327881


In [38]:
def process_etf_file(symbol):
    df = pd.read_csv('etfs/'+symbol+'.csv')
    df['Symbol'] = symbol
    df['Security Name'] = symbols_valid_meta_etfs_security_name[symbols_valid_meta_etfs_symbol.index(symbol)]
    df = df.astype(dtypes)
    df.to_parquet('etfs_result/'+symbol+'.parquet')

# Create a thread pool with a maximum threads
max_threads_list = [1, 4, 16, 32, 64]
for max_threads in max_threads_list:
    start_time = time.time()
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
        # Submit each stock symbol to the thread pool for processing
        futures = [executor.submit(process_etf_file, symbol) for symbol in symbols_valid_meta_etfs_symbol]
        # Wait for all threads to complete
        concurrent.futures.wait(futures)
    end_time = time.time()
    execution_time = end_time - start_time
    print("Execution time:", execution_time)

Execution time: 28.3250253200531
Execution time: 16.208886861801147
Execution time: 15.483115673065186
Execution time: 15.678836822509766
Execution time: 15.579127311706543


max threads = 16

# Problem 2: Feature Engineering

Objective: Build some feature engineering on top of the dataset from Problem 1.

Tasks:

Calculate the moving average of the trading volume (Volume) of 30 days per each stock and ETF, and retain it in a newly added column vol_moving_avg.

Similarly, calculate the rolling median and retain it in a newly added column adj_close_rolling_med.

Retain the resulting dataset into the same format as Problem 1, but in its own stage/directory distinct from the first.

(Bonus) Write unit tests for any relevant logic.

In [44]:
import unittest

In [39]:
os.mkdir('stage')
os.mkdir('stage/stocks_result')
os.mkdir('stage/etfs_result')

In [68]:
# Read in the data
df = pd.read_parquet("stocks_result/A.parquet")

# Convert the Date column to a datetime data type
df["Date"] = pd.to_datetime(df["Date"])

# Set the Date column as the DataFrame's index
df.set_index("Date", inplace=True)

# Calculate the rolling 30-day average of the Volume column
df["vol_moving_avg"] = df["Volume"].rolling(window=30).mean()
df["adj_close_rolling_med"] = df["Adj Close"].rolling(window=30).median()

# Drop any rows with missing values
df.dropna(inplace=True)
df.reset_index(inplace=True)
df = df.astype(dtypes)
print(df)

            Date       Open       High        Low      Close  Adj Close   
0     1999-12-31  56.866951  57.179901  54.542202  55.302216  47.562416  \
1     2000-01-03  56.330471  56.464592  48.193848  51.502148  44.294170   
2     2000-01-04  48.730328  49.266811  46.316166  47.567955  40.910591   
3     2000-01-05  47.389126  47.567955  43.141987  44.617310  38.372894   
4     2000-01-06  44.080830  44.349072  41.577251  42.918453  36.911816   
...          ...        ...        ...        ...        ...        ...   
5090  2020-03-26  70.000000  74.449997  69.650002  73.720001  73.532867   
5091  2020-03-27  71.550003  73.209999  70.279999  70.910004  70.730003   
5092  2020-03-30  71.059998  73.180000  71.059998  72.669998  72.669998   
5093  2020-03-31  72.339996  72.800003  70.500000  71.620003  71.620003   
5094  2020-04-01  69.470001  70.230003  68.150002  68.919998  68.919998   

       Volume Symbol                            Security Name  vol_moving_avg   
0     1931100     

In [56]:
class TestRollingMetrics(unittest.TestCase):
    
    def setUp(self):
        # Create a sample DataFrame with dates and prices
        dates = pd.date_range('2023-04-01', periods=100, freq='D')
        prices = [i**2 for i in range(100)]
        self.df = pd.DataFrame({'Date': dates, 'Adj Close': prices})
        self.df.set_index('Date', inplace=True)

    def test_adj_close_rolling_median(self):
        # Calculate the rolling median using the DataFrame's built-in method
        df_roll = self.df.rolling(window=30).median()

        # Calculate the rolling median using our custom function
        adj_close_rolling_median = self.df['Adj Close'].rolling(window=30).median()

        # Compare the two results
        self.assertTrue(adj_close_rolling_median.equals(df_roll['Adj Close']))

    def test_volume_30d_ma(self):
        # Add a sample Volume column
        self.df['Volume'] = [i**3 for i in range(100)]

        # Calculate the 30-day moving average using the DataFrame's built-in method
        df_roll = self.df.rolling(window=30).mean()

        # Calculate the 30-day moving average using our custom function
        volume_30d_ma = self.df['Volume'].rolling(window=30).mean()

        # Compare the two results
        self.assertTrue(volume_30d_ma.equals(df_roll['Volume']))


In [57]:
suite = unittest.TestLoader().loadTestsFromTestCase(TestRollingMetrics)
unittest.TextTestRunner().run(suite)

..
----------------------------------------------------------------------
Ran 2 tests in 0.009s

OK


<unittest.runner.TextTestResult run=2 errors=0 failures=0>

In [69]:
def fe_stock_file(symbol):
    df = pd.read_parquet('stocks_result/'+symbol+'.parquet')
    # Convert the Date column to a datetime data type
    df["Date"] = pd.to_datetime(df["Date"])

    # Set the Date column as the DataFrame's index
    df.set_index("Date", inplace=True)

    # Calculate the rolling 30-day average of the Volume column
    df["vol_moving_avg"] = df["Volume"].rolling(window=30).mean()
    df["adj_close_rolling_med"] = df["Adj Close"].rolling(window=30).median()

    # Drop any rows with missing values
    df.dropna(inplace=True)
    df.reset_index(inplace=True)
    df = df.astype(dtypes)
    df.to_parquet('stage/stocks_result/'+symbol+'.parquet')
    
# Create a thread pool with a maximum threads
max_threads_list = [16]
for max_threads in max_threads_list:
    start_time = time.time()
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
        # Submit each stock symbol to the thread pool for processing
        futures = [executor.submit(fe_stock_file, symbol) for symbol in symbols_valid_meta_stocks_symbol]
        # Wait for all threads to complete
        concurrent.futures.wait(futures)
    end_time = time.time()
    execution_time = end_time - start_time
    print("Execution time:", execution_time)

Execution time: 91.37823987007141


In [71]:
def fe_etf_file(symbol):
    df = pd.read_parquet('etfs_result/'+symbol+'.parquet')
    # Convert the Date column to a datetime data type
    df["Date"] = pd.to_datetime(df["Date"])

    # Set the Date column as the DataFrame's index
    df.set_index("Date", inplace=True)

    # Calculate the rolling 30-day average of the Volume column
    df["vol_moving_avg"] = df["Volume"].rolling(window=30).mean()
    df["adj_close_rolling_med"] = df["Adj Close"].rolling(window=30).median()

    # Drop any rows with missing values
    df.dropna(inplace=True)
    df.reset_index(inplace=True)
    df = df.astype(dtypes)
    df.to_parquet('stage/etfs_result/'+symbol+'.parquet')
    
# Create a thread pool with a maximum threads
max_threads_list = [16]
for max_threads in max_threads_list:
    start_time = time.time()
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
        # Submit each stock symbol to the thread pool for processing
        futures = [executor.submit(fe_etf_file, symbol) for symbol in symbols_valid_meta_etfs_symbol]
        # Wait for all threads to complete
        concurrent.futures.wait(futures)
    end_time = time.time()
    execution_time = end_time - start_time
    print("Execution time:", execution_time)

Execution time: 26.856921672821045


# Problem 3: Integrate ML Training

Objective: Integrate an ML predictive model training step into the data pipeline.

You can use the following simple Random Forest model as a reference:

In [62]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.2.2-cp311-cp311-win_amd64.whl (8.3 MB)
     ---------------------------------------- 0.0/8.3 MB ? eta -:--:--
     ---------------------------------------- 0.0/8.3 MB ? eta -:--:--
     ---------------------------------------- 0.0/8.3 MB ? eta -:--:--
     ---------------------------------------- 0.0/8.3 MB ? eta -:--:--
     ---------------------------------------- 0.0/8.3 MB ? eta -:--:--
     ---------------------------------------- 0.0/8.3 MB ? eta -:--:--
     ---------------------------------------- 0.0/8.3 MB 81.9 kB/s eta 0:01:41
     ---------------------------------------- 0.0/8.3 MB 81.9 kB/s eta 0:01:41
     ---------------------------------------- 0.0/8.3 MB 98.5 kB/s eta 0:01:24
     ---------------------------------------- 0.0/8.3 MB 98.5 kB/s eta 0:01:24
     ---------------------------------------- 0.1/8.3 MB 126.1 kB/s eta 0:01:06
     ---------------------------------------- 0.1/8.3 MB 126.9 kB/s eta 0:01:05
     -

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, 

In [33]:
data = pd.read_parquet('stage/stocks_result/'+'A'+'.parquet')

In [34]:
# Assume `data` is loaded as a Pandas DataFrame
data['Date'] = pd.to_datetime(data['Date'])
data.set_index('Date', inplace=True)

# Remove rows with NaN values
data.dropna(inplace=True)

# Select features and target
features = ['vol_moving_avg', 'adj_close_rolling_med']
target = ['Volume']

X = data[features]
y = data[target]

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a RandomForestRegressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on test data
y_pred = model.predict(X_test)

# Calculate the Mean Absolute Error and Mean Squared Error
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

  model.fit(X_train, y_train)


In [35]:
mae, mse, rmse

(1186762.5740922473, 4129717606796.904, 2032170.663796942)

In [78]:
!pip install -f http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o



Looking in links: http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html
Collecting h2o
  Downloading h2o-3.40.0.4.tar.gz (177.6 MB)
     ---------------------------------------- 0.0/177.6 MB ? eta -:--:--
     ---------------------------------------- 0.0/177.6 MB ? eta -:--:--
     ---------------------------------------- 0.0/177.6 MB ? eta -:--:--
     ---------------------------------------- 0.0/177.6 MB ? eta -:--:--
     ---------------------------------------- 0.0/177.6 MB ? eta -:--:--
     ---------------------------------------- 0.0/177.6 MB ? eta -:--:--
     -------------------------------------- 0.0/177.6 MB 100.9 kB/s eta 0:29:21
     -------------------------------------- 0.0/177.6 MB 109.3 kB/s eta 0:27:05
     -------------------------------------- 0.0/177.6 MB 109.3 kB/s eta 0:27:05
     -------------------------------------- 0.1/177.6 MB 142.6 kB/s eta 0:20:45
     -------------------------------------- 0.1/177.6 MB 145.6 kB/s eta 0:20:20
     -----------------

In [2]:
import h2o
from h2o.automl import H2OAutoML

In [5]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 19.0.2+7-44, mixed mode, sharing)
  Starting server from C:\Users\y46ju\anaconda3\envs\riskthinking\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\y46ju\AppData\Local\Temp\tmp2scpj3ew
  JVM stdout: C:\Users\y46ju\AppData\Local\Temp\tmp2scpj3ew\h2o_y46ju_started_from_python.out
  JVM stderr: C:\Users\y46ju\AppData\Local\Temp\tmp2scpj3ew\h2o_y46ju_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Asia/Hong_Kong
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.4
H2O_cluster_version_age:,3 days
H2O_cluster_name:,H2O_from_python_y46ju_kgm5de
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.910 Gb
H2O_cluster_total_cores:,32
H2O_cluster_allowed_cores:,32


In [None]:
h2o_train, h2o_test = train_test_split(data, test_size=0.2, random_state=42)

In [29]:
h2o_train, h2o_test = h2o.H2OFrame(h2o_train), h2o.H2OFrame(h2o_test)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [31]:
aml = H2OAutoML(max_models=20, seed=1)
aml.train(x=features, y=target[0], training_frame=h2o_train)

AutoML progress: |
05:46:57.133: AutoML: XGBoost is not available; skipping it.

███████████████████████████████████████████████████████████████| (done) 100%


key,value
Stacking strategy,cross_validation
Number of base models (used / total),9/20
# GBM base models (used / total),3/10
# DeepLearning base models (used / total),4/7
# DRF base models (used / total),2/2
# GLM base models (used / total),0/1
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5
Metalearner fold_column,

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
mae,1172603.5,62572.926,1261678.4,1139371.8,1203159.5,1160682.8,1098125.1
mean_residual_deviance,3763734800000.0,553092450000.0,4396738700000.0,3206608800000.0,3553183700000.0,4310217000000.0,3351925500000.0
mse,3763734800000.0,553092450000.0,4396738700000.0,3206608800000.0,3553183700000.0,4310217000000.0,3351925500000.0
null_deviance,4478887700000000.0,828659200000000.0,5492251000000000.0,4183988000000000.0,3837815300000000.0,5220853000000000.0,3659530400000000.0
r2,0.3100844,0.0381178,0.3301585,0.3485484,0.247884,0.3171034,0.3067277
residual_deviance,3072828400000000.0,508424400000000.0,3658086700000000.0,2725617400000000.0,2885185000000000.0,3564549600000000.0,2530703700000000.0
rmse,1935892.5,141664.02,2096840.2,1790700.6,1884989.0,2076106.2,1830826.5
rmsle,0.4103421,0.0118807,0.4197982,0.4116511,0.423571,0.4006482,0.3960417


In [32]:
lb = aml.leaderboard
lb.head(rows=lb.nrows)

model_id,rmse,mse,mae,rmsle,mean_residual_deviance
StackedEnsemble_AllModels_1_AutoML_1_20230502_54657,1944710.0,3781880000000.0,1173810.0,0.409369,3781880000000.0
StackedEnsemble_BestOfFamily_1_AutoML_1_20230502_54657,1945220.0,3783890000000.0,1172980.0,0.408562,3783890000000.0
GBM_grid_1_AutoML_1_20230502_54657_model_2,1955870.0,3825410000000.0,1190180.0,0.412887,3825410000000.0
DeepLearning_grid_1_AutoML_1_20230502_54657_model_1,1967840.0,3872410000000.0,1206750.0,0.421371,3872410000000.0
DeepLearning_grid_2_AutoML_1_20230502_54657_model_1,1969190.0,3877700000000.0,1185460.0,0.414661,3877700000000.0
GBM_grid_1_AutoML_1_20230502_54657_model_5,1969350.0,3878320000000.0,1191910.0,0.416927,3878320000000.0
DeepLearning_grid_1_AutoML_1_20230502_54657_model_2,1970900.0,3884450000000.0,1229040.0,0.427123,3884450000000.0
DeepLearning_1_AutoML_1_20230502_54657,1972000.0,3888780000000.0,1209000.0,0.419118,3888780000000.0
DeepLearning_grid_3_AutoML_1_20230502_54657_model_1,1977520.0,3910570000000.0,1212870.0,0.421113,3910570000000.0
GBM_grid_1_AutoML_1_20230502_54657_model_4,1981140.0,3924920000000.0,1213080.0,0.422635,3924920000000.0


In [37]:
perf = aml.leader.model_performance(h2o_test)
perf

In [38]:
model_path = h2o.save_model(model=aml.get_best_model(), path="./", force=True)
print(model_path)

C:\Users\y46ju\OneDrive\Documents\Python Scripts\riskThinkingWorkSample\experiment\StackedEnsemble_AllModels_1_AutoML_1_20230502_54657


In [6]:
saved_model = h2o.load_model("StackedEnsemble_AllModels_1_AutoML_1_20230502_54657")

In [41]:
h2o_test

Open,High,Low,Close,Adj Close,Volume,Symbol,Security Name,vol_moving_avg,adj_close_rolling_med
19.3133,20.1001,19.3133,19.8999,17.1148,3739300.0,A,"Agilent Technologies, Inc. Common Stock",3716530.0,16.761
36.6094,36.7382,36.3734,36.6309,34.284,2206100.0,A,"Agilent Technologies, Inc. Common Stock",3109330.0,34.3476
11.4807,11.7239,11.402,11.4735,9.86776,5414400.0,A,"Agilent Technologies, Inc. Common Stock",2813750.0,9.84623
33.1259,33.505,33.1259,33.2403,31.0105,4099200.0,A,"Agilent Technologies, Inc. Common Stock",4103120.0,29.5147
67.686,68.3566,64.3777,64.3777,55.3677,1123200.0,A,"Agilent Technologies, Inc. Common Stock",2342830.0,64.5957
17.9828,18.2976,17.8827,17.9185,16.3557,4399200.0,A,"Agilent Technologies, Inc. Common Stock",4390770.0,17.6224
23.9628,24.6209,23.7697,24.2203,22.108,5897400.0,A,"Agilent Technologies, Inc. Common Stock",4561900.0,21.8641
70.17,70.83,69.91,70.38,69.2888,2090200.0,A,"Agilent Technologies, Inc. Common Stock",2182750.0,66.2713
24.0558,24.2847,23.7053,23.7697,21.6966,4984200.0,A,"Agilent Technologies, Inc. Common Stock",5131300.0,23.7436
24.1774,24.7425,24.0129,24.0272,21.9317,4682100.0,A,"Agilent Technologies, Inc. Common Stock",4499870.0,23.0776


In [7]:
frame = h2o.H2OFrame({"vol_moving_avg":3716530, "adj_close_rolling_med":16.761})

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [8]:
frame

vol_moving_avg,adj_close_rolling_med
3716530.0,16.761


In [9]:
saved_model.predict(frame)

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


predict
3717830.0
