# Stock Predictor

## Create S3 bucket for data

In [108]:
import boto3
s3 = boto3.resource('s3')

In [109]:
bucket_name = 'sevagstockpriceproject'
try:
    s3.create_bucket(Bucket=bucket_name, 
                     CreateBucketConfiguration={
                         'LocationConstraint': 'us-east-2'
                     })
    print('S3 bucked created successfully')
except Exception as e:
    print(e)

An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.


## Fetch DOW stock data

In [110]:
%pip install yfinance

import pandas as pd
from datetime import datetime
import yfinance as yf

# start and end dates for stock data
start_date = datetime(2021, 1, 1)
end_date = datetime(2023, 1, 1)

# fetch data
df = yf.download('DJIA', start=start_date, end=end_date)
df



[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-02-24,23.780001,24.202999,23.650000,24.202999,20.650463,22700
2022-02-25,24.410000,24.580999,24.190001,24.580999,20.972975,26800
2022-02-28,24.670000,24.670000,24.440001,24.598000,20.987482,4100
2022-03-01,24.780001,24.780001,24.219999,24.271999,20.709337,8300
2022-03-02,24.650000,24.709999,24.420000,24.660000,21.040384,14300
...,...,...,...,...,...,...
2022-12-23,21.830000,21.910000,21.629999,21.809999,20.103123,28500
2022-12-27,21.799999,21.879999,21.770000,21.780001,20.075474,21700
2022-12-28,21.990000,22.190001,21.750000,21.760000,20.057037,68300
2022-12-29,21.650000,21.670000,21.600000,21.639999,20.137999,85000


In [111]:
# reset index
df.reset_index(inplace=True)
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2022-02-24,23.780001,24.202999,23.650000,24.202999,20.650463,22700
1,2022-02-25,24.410000,24.580999,24.190001,24.580999,20.972975,26800
2,2022-02-28,24.670000,24.670000,24.440001,24.598000,20.987482,4100
3,2022-03-01,24.780001,24.780001,24.219999,24.271999,20.709337,8300
4,2022-03-02,24.650000,24.709999,24.420000,24.660000,21.040384,14300
...,...,...,...,...,...,...,...
210,2022-12-23,21.830000,21.910000,21.629999,21.809999,20.103123,28500
211,2022-12-27,21.799999,21.879999,21.770000,21.780001,20.075474,21700
212,2022-12-28,21.990000,22.190001,21.750000,21.760000,20.057037,68300
213,2022-12-29,21.650000,21.670000,21.600000,21.639999,20.137999,85000


## Extract features

The target for each day corresponds to the next day's open

In [112]:
# remove adj close and date columns
df.drop(axis=1, columns=['Adj Close', 'Date'], inplace=True)
df

Unnamed: 0,Open,High,Low,Close,Volume
0,23.780001,24.202999,23.650000,24.202999,22700
1,24.410000,24.580999,24.190001,24.580999,26800
2,24.670000,24.670000,24.440001,24.598000,4100
3,24.780001,24.780001,24.219999,24.271999,8300
4,24.650000,24.709999,24.420000,24.660000,14300
...,...,...,...,...,...
210,21.830000,21.910000,21.629999,21.809999,28500
211,21.799999,21.879999,21.770000,21.780001,21700
212,21.990000,22.190001,21.750000,21.760000,68300
213,21.650000,21.670000,21.600000,21.639999,85000


In [113]:
# ignore last row for features since does not contain a target for next day
df_features = df.iloc[:-1,:]

# fetch targets for each day
df_targets = df.iloc[1:,0].rename('Targets')
df_targets

1      24.410000
2      24.670000
3      24.780001
4      24.650000
5      24.770000
         ...    
210    21.830000
211    21.799999
212    21.990000
213    21.650000
214    21.820000
Name: Targets, Length: 214, dtype: float64

In [114]:
pd.options.mode.chained_assignment = None

# attach targtets to features
df_features['Target'] = list(df_targets)

# reinsert target as first column
col = df_features.pop('Target')
df_features.insert(0, 'Target', col)

df_final = df_features
df_final

Unnamed: 0,Target,Open,High,Low,Close,Volume
0,24.410000,23.780001,24.202999,23.650000,24.202999,22700
1,24.670000,24.410000,24.580999,24.190001,24.580999,26800
2,24.780001,24.670000,24.670000,24.440001,24.598000,4100
3,24.650000,24.780001,24.780001,24.219999,24.271999,8300
4,24.770000,24.650000,24.709999,24.420000,24.660000,14300
...,...,...,...,...,...,...
209,21.830000,21.860001,21.860001,21.549999,21.719999,29800
210,21.799999,21.830000,21.910000,21.629999,21.809999,28500
211,21.990000,21.799999,21.879999,21.770000,21.780001,21700
212,21.650000,21.990000,22.190001,21.750000,21.760000,68300


## Train-test split

In [115]:
import numpy as np

train_size = 0.8

train_data, test_data = np.split(df_final, [int(train_size*len(df_final))])

  return bound(*args, **kwds)


## Set bucket path

In [116]:
import os

# save train, test data in bucket

# define bucket paths
train_csv_path = 's3://{}/{}/{}/{}'.format(bucket_name, 'xgboost', 'train', 'train.csv')
test_csv_path = 's3://{}/{}/{}/{}'.format(bucket_name, 'xgboost', 'test', 'test.csv')

# save data to paths
train_data.to_csv(train_csv_path, index=False, header=False)
test_data.to_csv(test_csv_path, index=False, header=False)

## Build XGBoost Model

In [117]:
import sagemaker
from sagemaker import image_uris
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput

xgboost_container = image_uris.retrieve('xgboost', boto3.Session().region_name, '1.2-2')
display(xgboost_container)

'257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-xgboost:1.2-2'

## Define XGBoost Hyperparameters

In [118]:
hyperparameters = {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        "verbosity":"1",
        "objective":"reg:squarederror",
        "num_round":"1000",
        "early_stopping_rounds":"10"
        }

## Set output path for trained model

In [119]:
output_path = 's3://{}/{}/{}/'.format(bucket_name, 'xgboost', 'output')

## Construct SageMaker Estimator

In [120]:
estimator = sagemaker.estimator.Estimator(image_uri=xgboost_container, 
                                          hyperparameters=hyperparameters,
                                          role=sagemaker.get_execution_role(),
                                          instance_count=1, 
                                          instance_type='ml.m4.xlarge', 
                                          volume_size=5, 
                                          output_path=output_path,
                                         use_spot_instances=True,
                                         max_run=300,
                                         max_wait=600)

## Data type and paths for training and validation

In [121]:
train_input = TrainingInput('s3://{}/{}/{}/'.format(bucket_name, 'xgboost', 'train'), content_type='csv')
test_input = TrainingInput('s3://{}/{}/{}/'.format(bucket_name, 'xgboost', 'test'), content_type='csv')

## Train XGBoost

In [122]:
estimator.fit({'train': train_input, 'validation': test_input})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-01-10-04-20-16-902


2024-01-10 04:20:17 Starting - Starting the training job...
2024-01-10 04:20:31 Starting - Preparing the instances for training.........
2024-01-10 04:21:57 Downloading - Downloading input data...
2024-01-10 04:22:27 Downloading - Downloading the training image...
2024-01-10 04:23:18 Training - Training image download completed. Training in progress....
2024-01-10 04:23:38 Uploading - Uploading generated training model[34m[2024-01-10 04:23:34.765 ip-10-0-123-210.us-east-2.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-01-10:04:23:34:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-01-10:04:23:34:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2024-01-10:04:23:34:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-01-10:04:23:34:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2024-01-10:04:23:34:INFO] Determined de

## Deploy model as Endpoint

In [123]:
from sagemaker.serializers import CSVSerializer

xgb_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge', serializer=CSVSerializer())

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-01-10-04-47-08-352
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-01-10-04-47-08-352
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-01-10-04-47-08-352


-----!

## Define CSV serializer for inputs as list

In [136]:
def csv_serialize_input(input):
    
    serialized_input = ','.join(map(str, input[0]))
    return serialized_input

## Make predictions using Endpoint

In [140]:
# get some data point
df_new_point = yf.download('DJIA', start=datetime(2023, 1, 3), end=datetime(2023, 1, 4))
df_new_point.reset_index(inplace=True)
df_new_point.drop(axis=1, columns=['Adj Close', 'Date'], inplace=True)

# extract features
data_features_list = df_new_point.values.tolist()

# serialize
serialized_input = csv_serialize_input(data_features_list)

# obtain prediction
y_pred = xgb_predictor.predict(serialized_input).decode('utf-8')
y_pred

[*********************100%%**********************]  1 of 1 completed


'21.66800308227539\n'

## Lambda function handler

In [151]:
ENDPOINT_NAME = xgb_predictor.endpoint_name # 'sagemaker-xgboost-2024-01-10-04-47-08-352'

runtime = boto3.client('runtime.sagemaker')

def lambda_handler(event, context):
    
    inputs = event['data']
    result = []
    
    for input in inputs:

        serialized_input = ','.join(map(str, input))

        response = runtime.invoke_endpoint(
            EndpointName=ENDPOINT_NAME, 
            ContentType='text/csv',
            Body=serialized_input
        )
    
        result.append(response['Body'].read().decode())
    
    return result

## Invoke API

In [155]:
import requests

API_ENDPOINT = 'https://uf7b1iydg1.execute-api.us-east-2.amazonaws.com/xgbmodel'

json = {
    "data": [[21.540000915527344,
  21.739999771118164,
  21.540000915527344,
  21.600000381469727,
  18600.0]]
}

r = requests.post(url=API_ENDPOINT, json=json)

In [156]:
print(f'Status Code: {r.status_code}, Response: {r.json()}')

Status Code: 200, Response: ['21.66800308227539\n']


## Close session

In [157]:
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)

bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker:Deleting endpoint with name: sagemaker-xgboost-2024-01-10-04-47-08-352


[{'ResponseMetadata': {'RequestId': 'DFGKM1ZQEYF2FCGR',
   'HostId': 'w0fY529IYekcVYjYgy43IPlI9Lsj+9D7bxLlJUKI9hY25c613oRueSzzLM17+pC1DWjibvJ03+1tU7wklTPDpQ==',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'w0fY529IYekcVYjYgy43IPlI9Lsj+9D7bxLlJUKI9hY25c613oRueSzzLM17+pC1DWjibvJ03+1tU7wklTPDpQ==',
    'x-amz-request-id': 'DFGKM1ZQEYF2FCGR',
    'date': 'Wed, 10 Jan 2024 19:29:56 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'xgboost/output/sagemaker-xgboost-2024-01-10-04-20-16-902/debug-output/events/000000000000/000000000000_worker_0.tfevents'},
   {'Key': 'xgboost/output/sagemaker-xgboost-2024-01-10-04-20-16-902/debug-output/training_job_end.ts'},
   {'Key': 'xgboost/output/sagemaker-xgboost-2024-01-10-04-20-16-902/debug-output/claim.smd'},
   {'Key': 'xgboost/output/sagemaker-xgboost-2024-01-10-04-20-16-902/debug-output/events/0000000000