# Install Yahoo Finance package

In [11]:
!pip install yfinance

[0m

# 1. S3 Bucket
## 1.1 Create S3 Bucket

import boto3
import uuid

In [14]:
# Region for the S3 bucket; it has to be the same region as the notebook instance
region = 'us-east-1'

# Fixed bucket name
# NOTE(review): uuid is imported above but not used here; the name is hard-coded.
bucket_name = 'yahoofinancestockprice-us-east-1'

s3 = boto3.resource('s3', region_name=region)

# A LocationConstraint is only supplied for regions other than us-east-1
create_bucket_kwargs = {'Bucket': bucket_name}
if region != 'us-east-1':
    create_bucket_kwargs['CreateBucketConfiguration'] = {'LocationConstraint': region}

try:
    s3.create_bucket(**create_bucket_kwargs)
    print(f'S3 bucket "{bucket_name}" has been created')
except Exception as e:
    # Failures are reported, not raised, so the notebook keeps running
    print('S3 error: ', e)


S3 bucket "yahoofinancestockprice-us-east-1" has been created


# 2. Dataset
## 2.1 Create train and validation csv

In [30]:
import pandas as pd
from datetime import datetime
import yfinance as yf

# Download window: from 2020-01-01 up to the moment this cell is executed
start_date = datetime(2020, 1, 1)
end_date = datetime.now()

# Fetch the NVDA price history and move the index into a regular column
data = yf.download('NVDA', start=start_date, end=end_date).reset_index()

data

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2020-01-02,5.968750,5.997750,5.918000,5.997750,5.973633,237536000
1,2020-01-03,5.877500,5.945750,5.852500,5.901750,5.878019,205384000
2,2020-01-06,5.808000,5.931750,5.781750,5.926500,5.902670,262636000
3,2020-01-07,5.955000,6.044250,5.909750,5.998250,5.974132,314856000
4,2020-01-08,5.994000,6.051000,5.953750,6.009500,5.985336,277108000
...,...,...,...,...,...,...,...
1119,2024-06-13,129.389999,129.800003,127.160004,129.610001,129.610001,260704500
1120,2024-06-14,129.960007,132.839996,128.320007,131.880005,131.880005,309320400
1121,2024-06-17,132.990005,133.729996,129.580002,130.979996,130.979996,288504400
1122,2024-06-18,131.139999,136.330002,130.690002,135.580002,135.580002,294335100


## 2.2 Format Dataset

In [16]:
# Build the modelling frame WITHOUT mutating `data`, so this cell is idempotent:
# re-running it always starts from the downloaded frame instead of dropping one
# more trailing row each time, and `data` keeps its original columns for any
# later cell that inspects them.
data_final = (
    data
    # Drop unwanted columns if they exist
    .drop(columns=['Adj Close', 'Date'], errors='ignore')
    # Set the 'Open' price of the next day as the target price
    .assign(Target=lambda frame: frame['Open'].shift(-1))
    # Drop rows containing NaN; the last row has no next-day 'Open'
    .dropna()
)

# Ensure 'Target' is the first column
column_order = ['Target'] + [col for col in data_final.columns if col != 'Target']
data_final = data_final[column_order]

data_final

Unnamed: 0,Target,Open,High,Low,Close,Volume
0,5.877500,5.968750,5.997750,5.918000,5.997750,237536000
1,5.808000,5.877500,5.945750,5.852500,5.901750,205384000
2,5.955000,5.808000,5.931750,5.781750,5.926500,262636000
3,5.994000,5.955000,6.044250,5.909750,5.998250,314856000
4,6.096250,5.994000,6.051000,5.953750,6.009500,277108000
...,...,...,...,...,...,...
1118,129.389999,123.059998,126.879997,122.570000,125.199997,299595000
1119,129.960007,129.389999,129.800003,127.160004,129.610001,260704500
1120,132.990005,129.960007,132.839996,128.320007,131.880005,309320400
1121,131.139999,132.990005,133.729996,129.580002,130.979996,288504400


## 2.3 Split Train Test Data

In [18]:
import numpy as np

# Shuffle all rows with a fixed seed so the split is reproducible
# NOTE(review): rows are shuffled before splitting, so training and test rows are
# interleaved in time — confirm this is intended for price data.
data_randomized = data_final.sample(frac=1, random_state=123)

# Split data into training (first 70% of shuffled rows) and testing sets.
# Positional slicing replaces np.split, which on a DataFrame relies on the
# deprecated DataFrame.swapaxes; the resulting frames are the same.
split_index = int(0.7 * len(data_randomized))
train_data = data_randomized.iloc[:split_index]
test_data = data_randomized.iloc[split_index:]

# Print the shapes of the datasets
print(train_data.shape, test_data.shape)

(786, 6) (337, 6)


## 2.4 Upload to S3 Bucket

In [20]:
prefix = 'xgboost-as-a-built-in-algo'

# Local staging files for each split
train_csv_local_path = '/tmp/train.csv'
test_csv_local_path = '/tmp/test.csv'

# Object keys inside the bucket
train_csv_s3_path = f'{prefix}/train/train.csv'
test_csv_s3_path = f'{prefix}/test/test.csv'

# Write both splits locally, without the index and without a header row
for split_frame, local_path in (
    (train_data, train_csv_local_path),
    (test_data, test_csv_local_path),
):
    split_frame.to_csv(local_path, index=False, header=False)

# Upload to S3 (note: `s3` is rebound here to a low-level client)
s3 = boto3.client('s3')

try:
    for local_path, s3_key in (
        (train_csv_local_path, train_csv_s3_path),
        (test_csv_local_path, test_csv_s3_path),
    ):
        s3.upload_file(local_path, bucket_name, s3_key)
    print('Files have been uploaded to S3')
except Exception as e:
    # Failures are reported, not raised, so the notebook keeps running
    print('S3 upload error: ', e)


Files have been uploaded to S3


# 3. [XGBoost](https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html)

## 3.1 XGBoost hyperparameters and image

In [22]:
import sagemaker
from sagemaker import image_uris
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput

In [23]:
# Hyperparameters handed to the XGBoost container (all values given as strings)
hyperparameters = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="reg:squarederror",
    num_round="100",
)

# S3 location passed to the estimator as its output path
output_path = f's3://{bucket_name}/{prefix}/output'

# Resolve the XGBoost 1.7-1 image URI for the configured region
xgboost_container = image_uris.retrieve(framework="xgboost", region=region, version="1.7-1")

# Print the retrieved image URI to verify correctness
print("XGBoost Container URI: ", xgboost_container)

# Estimator settings, collected in one place before constructing the estimator
estimator_settings = dict(
    image_uri=xgboost_container,
    hyperparameters=hyperparameters,
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.m4.xlarge',
    volume_size=5,  # 5 GB
    output_path=output_path,
    use_spot_instances=True,
    max_run=300,
    max_wait=600,
)

# Construct a SageMaker estimator that calls the xgboost-container
estimator = sagemaker.estimator.Estimator(**estimator_settings)

XGBoost Container URI:  683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.7-1


<sagemaker.estimator.Estimator at 0x7f01fcade050>

## 3.2 Execute the XGBoost training Job

In [24]:
# Both channels are CSV files stored under <bucket>/<prefix>/<channel folder>/
content_type = "csv"
train_input = TrainingInput(f"s3://{bucket_name}/{prefix}/train/", content_type=content_type)
test_input = TrainingInput(f"s3://{bucket_name}/{prefix}/test/", content_type=content_type)

# Launch the XGBoost training job; the test split is supplied as the 'validation' channel
training_channels = {'train': train_input, 'validation': test_input}
estimator.fit(training_channels)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-06-20-21-13-13-708


2024-06-20 21:13:13 Starting - Starting the training job...
2024-06-20 21:13:32 Starting - Preparing the instances for training...
2024-06-20 21:14:06 Downloading - Downloading input data...
2024-06-20 21:14:41 Downloading - Downloading the training image......
2024-06-20 21:15:32 Training - Training image download completed. Training in progress...[34m[2024-06-20 21:15:53.339 ip-10-2-250-139.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-06-20 21:15:53.367 ip-10-2-250-139.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-06-20:21:15:53:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-06-20:21:15:53:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2024-06-20:21:15:53:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-06-20:21:15:53:INFO] Running XGBoost Sagemaker in algorithm mode[0m


## 3.3 Deploy the trained XGBoost model as an endpoint

In [31]:
from sagemaker.serializers import CSVSerializer

print("start")

# Deploy the trained estimator behind a single ml.m4.xlarge instance;
# request payloads are serialized with CSVSerializer
deploy_settings = dict(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    serializer=CSVSerializer(),
)
xgb_predictor = estimator.deploy(**deploy_settings)

print("Model deployed successfully!")

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-06-20-21-43-49-375


start


INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-06-20-21-43-49-375
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-06-20-21-43-49-375


-------!Model deployed successfully!


In [32]:
# Display the name of the endpoint returned by estimator.deploy(); the Lambda
# handler section reads this same attribute to know which endpoint to invoke.
xgb_predictor.endpoint_name

'sagemaker-xgboost-2024-06-20-21-43-49-375'

## 3.4 Make predictions using Endpoints

In [48]:
# Prediction window: from 2024-06-19 up to the moment this cell is executed
start_date_pred = datetime(2024, 6, 19)
end_date_pred = datetime.now()

df_data = yf.download('NVDA', start=start_date_pred, end=end_date_pred)

# Drop unwanted columns if they exist.
# The check is made against df_data itself (errors='ignore'), not against the
# training frame `data`: whether `data` still carries these columns depends on
# which cells were run before, so guarding on it could leave 'Date' and
# 'Adj Close' in the payload sent to the endpoint.
df_data = (
    df_data
    .reset_index()
    .drop(columns=['Adj Close', 'Date'], errors='ignore')
)

# Plain array of the remaining columns, one row per downloaded day
df_data_arrays = df_data.values
df_data_arrays

[*********************100%%**********************]  1 of 1 completed


array([[1.39850006e+02, 1.40759995e+02, 1.29529999e+02, 1.30779999e+02,
        5.04887012e+08]])

## 3.5 Serialize the Data and Invoke the Endpoint

In [49]:
# Send the feature array to the endpoint, then decode the response as UTF-8 text
raw_prediction = xgb_predictor.predict(df_data_arrays)
Y_pred_function = raw_prediction.decode('utf-8')
print(Y_pred_function)

122.11325073242188



# 4. Lambda function handler

In [53]:
ENDPOINT_NAME = xgb_predictor.endpoint_name
runtime = boto3.client('runtime.sagemaker')

def lambda_handler(event, context):
    """Invoke the deployed endpoint with the first row of ``event['data']``.

    Args:
        event: Mapping with a ``'data'`` key holding a sequence of rows, each
            row being a sequence of values.
            NOTE(review): only the first row (``inputs[0]``) is sent to the
            endpoint; any further rows are ignored — confirm this is intended.
        context: Not used by this handler.

    Returns:
        The endpoint's response body, decoded to ``str``.

    Raises:
        KeyError: If ``event`` has no ``'data'`` key.
        IndexError: If ``event['data']`` is empty.
    """
    inputs = event['data']

    # Turn the first row into a single comma-separated line
    serialized_input = ','.join(map(str, inputs[0]))

    response = runtime.invoke_endpoint(EndpointName=ENDPOINT_NAME, 
                                       ContentType='text/csv', 
                                       Body=serialized_input)

    result = response['Body'].read().decode()
    
    return result

Input_json = {'data':
              [[1,2,3,4,5], [213,2144321,3412324,3241,2134]]
             }

# The handler ignores `context`, so pass None explicitly. The previous call used
# IPython's `_` (the last displayed cell output), which is hidden kernel state.
result = lambda_handler(Input_json, None)

result

'5.761270046234131\n'

# 5. Send Result via Email