In [1]:
bucket_name="mystockpriceprediction"
scaler_filename = "minmax_scaler.joblib"
filename='latest_data.csv'
prefix = 'built-in-xgboost-algo'
minmax_scaler_path_key='scaler/minmax_scaler.joblib'
latest_data_path_key='latest_data/latest_data.csv'

In [2]:
import boto3
s3_resource=boto3.resource('s3')
s3_client = boto3.client("s3")

In [4]:
try:
    s3_resource.create_bucket(Bucket=bucket_name,CreateBucketConfiguration={'LocationConstraint': 'eu-north-1'})
    print('S3 bucket has been created successfully')
except Exception as e:
    print('S3 error: ',e)

S3 bucket has been created successfully


In [5]:
!pip install -qU yfinance

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split  # Data splitting
from sklearn.preprocessing import MinMaxScaler  # Scaling
import yfinance as yf
import joblib
import sagemaker
from sklearn.metrics import mean_absolute_error, mean_squared_error  # Model evaluation



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [7]:
#initialize parameters
start_date="2022-01-01"
end_date="2024-01-01"

In [8]:
#get the data
aapl=yf.download('AAPL',start=start_date,end=end_date)
aapl

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed


Price,Close,High,Low,Open,Volume
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2022-01-03,178.879913,179.734962,174.653874,174.771805,104487900
2022-01-04,176.609634,179.793920,176.039607,179.489254,99310400
2022-01-05,171.911850,177.071564,171.636666,176.521197,94537600
2022-01-06,169.042053,172.285305,168.688244,169.730012,96904000
2022-01-07,169.209152,171.145275,168.088758,169.916771,86709100
...,...,...,...,...,...
2023-12-22,192.444580,194.243775,191.818335,194.015137,37122800
2023-12-26,191.897873,192.732856,191.679185,192.454528,28919300
2023-12-27,191.997269,192.345186,189.949565,191.341219,48087700
2023-12-28,192.424713,193.498269,192.017156,192.981369,34049900


In [9]:
def clean_df(df):
    df.columns = df.columns.droplevel(1)
    df=df.reset_index(drop=True)
    return df

In [10]:
aapl=clean_df(aapl)
aapl.head()

Price,Close,High,Low,Open,Volume
0,178.879913,179.734962,174.653874,174.771805,104487900
1,176.609634,179.79392,176.039607,179.489254,99310400
2,171.91185,177.071564,171.636666,176.521197,94537600
3,169.042053,172.285305,168.688244,169.730012,96904000
4,169.209152,171.145275,168.088758,169.916771,86709100


In [11]:
def get_features(df):
    
    #Helps model detect long-term trends.
    df['SMA_10'] = df['Close'].rolling(window=10).mean()  # 10-day simple moving average
    df['EMA_50'] = df['Close'].ewm(span=50, adjust=False).mean()  # 50-day Exponential Moving Average
    df['High_Low_Range'] = df['High'] - df['Low']   #Measures daily volatility, which can indicate breakouts or trend reversals.
    df['Open_Close_Change'] = (df['Close'] - df['Open'])/df['Open']  #If the Close > Open, it suggests bullish momentum; otherwise, it's bearish.
    
    # Create lag features 
    df['Close_Lag_1'] = df['Close'].shift(1)    # previous day's closing price
    df['Volume_Lag_1'] = df['Volume'].shift(1)  # Previous day's volume
 

    
    # Drop NaN values created due to rolling calculations
    df.dropna(inplace=True)
    return df

In [12]:
aapl2=get_features(aapl.copy())


In [18]:
# Define feature set (X) and target variable (y)
X = aapl2[['SMA_10' ,'EMA_50','High_Low_Range', 'Open_Close_Change','Volume_Lag_1','Close_Lag_1']]  # Input features
y= aapl2[['Close']].shift(-1)  # moves the "Close" column one step up, so each row’s target is the next day's closing price. This means for a given day's data, the model will learn to predict the closing price of the next day.


In [20]:
y.dropna(inplace=True)
y.rename(columns={'Close':'Target'},inplace=True)
y.head()

Price,Target
9,166.879913
10,163.371307
11,161.680878
12,159.616959
13,158.840546


In [21]:
X=X.head(-1)
X.head()

Price,SMA_10,EMA_50,High_Low_Range,Open_Close_Change,Volume_Lag_1,Close_Lag_1
9,171.878433,176.496662,2.643741,0.010097,84505800.0,169.228806
10,170.678433,176.119534,3.076162,-0.00997,80440800.0,170.093658
11,169.354601,175.619604,5.051606,-0.022176,90956700.0,166.879913
12,168.331503,175.072987,5.405415,-0.014792,94815000.0,163.371307
13,167.388994,174.466868,3.960693,-0.012225,91420500.0,161.680878


In [22]:
# Splitting into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)


In [23]:
# Scale features between 0 and 1 for better model performance
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [27]:
# Save the scaler locally using joblib
joblib.dump(scaler, scaler_filename)

['minmax_scaler.joblib']

In [24]:
train_data = pd.concat([y_train, pd.DataFrame(X_train_scaled)], axis=1)
test_data = pd.concat([y_test, pd.DataFrame(X_test_scaled)], axis=1)

In [25]:
#Store the latest 50 data entries for extracting features for inference
from io import StringIO

latest_data=aapl.tail(50)
# Convert DataFrame to CSV
csv_buffer = StringIO()
latest_data.to_csv(csv_buffer, index=False)

In [29]:
train_csv_path='s3://{}/{}/{}/{}'.format(bucket_name,prefix,'train','train.csv')
test_csv_path='s3://{}/{}/{}/{}'.format(bucket_name,prefix,'test','test.csv')


train_data.to_csv(train_csv_path,index=False,header= False)
test_data.to_csv(test_csv_path,index=False,header= False)
s3_resource.Bucket(bucket_name).upload_file(scaler_filename,minmax_scaler_path_key)
s3_resource.Bucket(bucket_name).put_object(Key=latest_data_path_key,Body=csv_buffer.getvalue())


s3.Object(bucket_name='mystockpriceprediction', key='latest_data/latest_data.csv')

In [30]:
from sagemaker.session import Session       #Kickstart a training session
from sagemaker.inputs import TrainingInput
from sagemaker import image_uris        #Get the container image for xgboost 

In [31]:
xgboost_container=image_uris.retrieve("xgboost",boto3.Session().region_name,"1.2-2")
display(xgboost_container)

'662702820516.dkr.ecr.eu-north-1.amazonaws.com/sagemaker-xgboost:1.2-2'

In [32]:
hyperparameters={
    "eta":"0.05",
    "num_round":"500",
    "objective":"reg:squarederror",
    "early_stopping_rounds":"10",
    "verbosity":"3",
    "subsample":"0.8",
    "min_child_weight":"8",
    "max_depth":"12",
    "gamma":"5"
}

In [33]:
output_path='s3://{}/{}/{}/'.format(bucket_name,prefix,'output')

In [34]:
estimator=sagemaker.estimator.Estimator(image_uri=xgboost_container,
                                        hyperparameters=hyperparameters,
                                        role=sagemaker.get_execution_role(),
                                        instance_count=1,
                                        instance_type='ml.m5.xlarge',
                                        output_path=output_path,
                                        volume_size=5,
                                        use_spot_instances=True,
                                        max_run=300,
                                        max_wait=600)

In [35]:
content_type="csv" 
train_input=TrainingInput('s3://{}/{}/{}/{}'.format(bucket_name,prefix,'train','train.csv'),content_type=content_type)
test_input=TrainingInput('s3://{}/{}/{}/{}'.format(bucket_name,prefix,'test','test.csv'),content_type=content_type)

In [36]:
estimator.fit({'train':train_input, 'validation':test_input})

2025-03-18 14:16:44 Starting - Starting the training job......
..25-03-18 14:17:36 Starting - Preparing the instances for training.
.....03-18 14:18:14 Downloading - Downloading the training image.
.[34m[2025-03-18 14:19:28.783 ip-10-0-231-235.eu-north-1.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-03-18:14:19:28:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-03-18:14:19:28:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2025-03-18:14:19:28:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-03-18:14:19:28:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2025-03-18:14:19:28:INFO] Determined delimiter of CSV input is ','[0m
[34m[2025-03-18:14:19:28:INFO] Determined delimiter of CSV input is ','[0m
[34m[2025-03-18:14:19:28:INFO] Determined delimiter of CSV input is ','[0m
[34m[2025-03-18:14:19:28:INFO] Dete

In [37]:
from sagemaker.serializers import CSVSerializer
xgb_predictor=estimator.deploy(initial_instance_count=1,instance_type='ml.m5.xlarge',serializer=CSVSerializer())  

---------!

In [38]:
xgb_predictor.endpoint_name

'sagemaker-xgboost-2025-03-18-14-20-16-190'

Making prediction using endpoint

In [39]:
start_date = "2024-01-01"
duration=5
test = yf.download('AAPL', start=start_date, end=pd.Timestamp(start_date)+pd.Timedelta(days=duration))


[*********************100%***********************]  1 of 1 completed


In [40]:
test=clean_df(test)
test.head()

Price,Close,High,Low,Open,Volume
0,184.532074,187.315366,182.792518,186.033057,82488700
1,183.150375,184.770652,182.335262,183.120556,58414500
2,180.824341,181.997291,179.800489,181.062899,71983600
3,180.098709,181.669281,179.094742,180.903888,62303300


In [41]:
n=len(test)
n

4

In [44]:
aug=pd.concat([latest_data,test],axis=0)
aug

Price,Close,High,Low,Open,Volume
451,174.183395,176.546069,173.915356,174.759162,59302900
452,171.622131,174.143644,171.383872,174.034443,64189300
453,171.741257,172.743903,168.693587,169.666467,55980100
454,172.17807,172.406392,170.202544,171.790908,43816600
455,169.855103,171.800833,169.408365,170.629426,57157000
456,165.675735,170.133072,164.464611,169.130411,70625300
457,166.996048,167.730669,165.616162,165.695582,58499100
458,169.050964,169.924566,167.641298,167.790216,51131000
459,169.527496,169.65654,166.678368,168.11783,44846000
460,172.704224,172.962326,168.88223,169.755832,56934900


In [45]:
aug=get_features(aug)
aug.head()

Price,Close,High,Low,Open,Volume,SMA_10,EMA_50,High_Low_Range,Open_Close_Change,Close_Lag_1,Volume_Lag_1
460,172.704224,172.962326,168.88223,169.755832,56934900,170.353442,172.883756,4.080097,0.017368,169.527496,44846000.0
461,176.278015,176.486479,174.183367,174.242928,77334800,170.562904,173.016864,2.303112,0.01168,172.704224,56934900.0
462,175.3647,175.533477,172.088723,172.972247,79763700,170.937161,173.108936,3.444754,0.013831,176.278015,77334800.0
463,177.925934,178.124476,174.927918,175.096679,63841300,171.555629,173.297838,3.196558,0.016158,175.3647,79763700.0
464,180.497101,181.112585,177.667831,177.876295,70530000,172.387532,173.580162,3.444754,0.014734,177.925934,63841300.0


In [48]:
X_unseen = aug[['SMA_10' ,'EMA_50','High_Low_Range', 'Open_Close_Change','Volume_Lag_1','Close_Lag_1']].tail(n)  # Input features
X_unseen


Price,SMA_10,EMA_50,High_Low_Range,Open_Close_Change,Volume_Lag_1,Close_Lag_1
0,192.23484,187.25944,4.522848,-0.008068,42628800.0,191.380951
1,191.077786,187.0983,2.43539,0.000163,82488700.0,184.532074
2,189.583755,186.852262,2.196802,-0.001318,58414500.0,183.150375
3,188.226901,186.587417,2.574539,-0.004451,71983600.0,180.824341


In [49]:
# Reuse the same scaler fitted during training
# Download file from S3
s3_resource.Bucket(bucket_name).download_file(minmax_scaler_path_key, scaler_filename)

# Load the scaler
scaler = joblib.load(scaler_filename)
X_unseen_scaled = scaler.transform(X_unseen)

In [50]:
y_pred=xgb_predictor.predict(X_unseen_scaled).decode('utf-8') #byte to string
y_pred

'125.48660278320312\n125.48660278320312\n125.48660278320312\n125.48660278320312\n'

In [55]:
predictions = np.array([float(row) for row in y_pred.strip().split("\n") if row])
predictions

array([125.48660278, 125.48660278, 125.48660278, 125.48660278])

In [56]:
# Predict the next day's closing price
actual_prices = aug['Close'].shift(-1).tail(n).dropna()
predictions = predictions[:len(actual_prices)]  # Align lengths

In [57]:
# Calculate Mean Absolute Error (MAE) and Root Mean Squared Error (RMSE)

mae = mean_absolute_error(actual_prices, predictions)
rmse = np.sqrt(mean_squared_error(actual_prices, predictions))

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")


Mean Absolute Error (MAE): 55.87
Root Mean Squared Error (RMSE): 55.89


In [58]:
start_date = "2024-01-05"
duration=1
real_time_data = yf.download('AAPL', start=start_date, end=pd.Timestamp(start_date)+pd.Timedelta(days=duration))
real_time_data.columns = real_time_data.columns.droplevel(1)
real_time_data=np.array(real_time_data.reset_index(drop=True)).tolist()
real_time_data

[*********************100%***********************]  1 of 1 completed


[[180.0987091064453,
  181.66928147459845,
  179.09474225448028,
  180.90388776119542,
  62303300.0]]