In [172]:
!pip install yfinance



In [173]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import datetime
import seaborn as sns
sns.set_style('whitegrid')
from plotly.subplots import make_subplots

In [174]:
yf.__version__

'0.2.52'

In [175]:
# Define stock tickers and date range
tickers = ['PG', 'JNJ',
           '^GSPC']
start_date = '2010-03-02'
end_date = '2024-02-28'

# Fetch data
stock_data = {}
for ticker in tickers:
    stock_data[ticker] = yf.download(ticker, start=start_date, end=end_date)
    print(f"Downloaded {ticker}")

# Save individual CSVs (optional)
for ticker, data in stock_data.items():
   data.to_csv(f"{ticker}_data.csv")

[*********************100%***********************]  1 of 1 completed

Downloaded PG



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Downloaded JNJ
Downloaded ^GSPC


In [176]:
# Convert 'Date' column to datetime objects with UTC timezone and extract the date
for ticker, data in stock_data.items():
    # The index is already a DatetimeIndex, so we can access the date directly
    data['Date'] = data.index.date
    stock_data[ticker] = data.set_index('Date')  # Set 'Date' as index for each stock DataFrame

In [177]:
#Fetch data for PG
pg_data = stock_data['PG']
pg_data = pg_data.xs('PG',axis=1,level='Ticker')
pg_data.head()

Price,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-03-02,41.14719,41.250526,40.908226,41.15365,8004000
2010-03-03,41.043861,41.269911,40.96636,41.114905,8608300
2010-03-04,41.121361,41.431372,40.95344,41.108446,8600300
2010-03-05,41.134266,41.192393,40.720921,41.134266,10101100
2010-03-08,40.811359,41.121368,40.708023,41.043866,11293300


In [178]:
# Fetch data for JNJ
jnj_data = stock_data['JNJ']
jnj_data = jnj_data.xs('JNJ',axis=1,level='Ticker')
jnj_data.head()

Price,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-03-02,41.169708,41.2995,40.897148,41.085346,9014000
2010-03-03,41.143742,41.370875,41.02693,41.267042,9360400
2010-03-04,41.254066,41.338431,41.124275,41.228108,8067200
2010-03-05,41.55909,41.578558,41.241101,41.312486,10633600
2010-03-08,41.662903,41.799188,41.546091,41.585031,33380500


In [179]:
# Fetch data for'^GSPC'
gspc_data = stock_data['^GSPC']
gspc_data = gspc_data.xs('^GSPC',axis=1,level='Ticker')
gspc_data.head()

Price,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-03-02,1118.310059,1123.459961,1116.51001,1117.01001,4134680000
2010-03-03,1118.790039,1125.640015,1116.579956,1119.359985,3951320000
2010-03-04,1122.969971,1123.72998,1116.660034,1119.119995,3945010000
2010-03-05,1138.699951,1139.380005,1125.119995,1125.119995,4133000000
2010-03-08,1138.5,1141.050049,1136.77002,1138.400024,3774680000


Feature Engineering:

Computing the stocks the "Returns" feature

Creating a “Tomorrow” column

Creating a “stocks-Direction” column

Computing the Return, Tomorrow, and Stock_Direction columns for each stocks

In [181]:
# For PG stock
pg_data['Return'] = pg_data['Close'].pct_change()
# For PG stock:
pg_data['Tomorrow'] = pg_data['Close'].shift(-1)
# For PG stock
pg_data['Stocks_Direction'] = (pg_data['Close'] > pg_data['Close'].shift(1)).astype(int)

In [182]:
pg_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3522 entries, 2010-03-02 to 2024-02-27
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Close             3522 non-null   float64
 1   High              3522 non-null   float64
 2   Low               3522 non-null   float64
 3   Open              3522 non-null   float64
 4   Volume            3522 non-null   int64  
 5   Return            3521 non-null   float64
 6   Tomorrow          3521 non-null   float64
 7   Stocks_Direction  3522 non-null   int32  
dtypes: float64(6), int32(1), int64(1)
memory usage: 233.9+ KB


In [183]:
#pg_data.dropna(inplace=True)  # Removes rows with any missing values

In [184]:
# Handling 'Return' column NaNs
pg_data.loc[:, 'Return'] = pg_data['Return'].fillna(0)  # Fill first row with 0
pg_data.loc[:, 'Return'] = pg_data['Return'].ffill()  # Fill other potential NaNs using forward fill

# Handling 'Tomorrow' column NaNs
pg_data = pg_data.iloc[:-1]  # Drop the last row

# Handing "Stock_Direction column NaNs"
pg_data.loc[:, 'Stocks_Direction'] = pg_data['Stocks_Direction'].fillna(0)  # Fill first row with 0
pg_data.loc[:, 'Stocks_Direction'] = pg_data['Stocks_Direction'].ffill()  # Fill other potential NaNs using forward fill


In [185]:
pg_data.isna().sum()

Price
Close               0
High                0
Low                 0
Open                0
Volume              0
Return              0
Tomorrow            0
Stocks_Direction    0
dtype: int64

In [186]:
pg_data.head()

Price,Close,High,Low,Open,Volume,Return,Tomorrow,Stocks_Direction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-03-02,41.14719,41.250526,40.908226,41.15365,8004000,0.0,41.043861,0
2010-03-03,41.043861,41.269911,40.96636,41.114905,8608300,-0.002511,41.121361,0
2010-03-04,41.121361,41.431372,40.95344,41.108446,8600300,0.001888,41.134266,1
2010-03-05,41.134266,41.192393,40.720921,41.134266,10101100,0.000314,40.811359,1
2010-03-08,40.811359,41.121368,40.708023,41.043866,11293300,-0.00785,40.882393,0


In [187]:
# For JNJ stock
jnj_data['Return'] = jnj_data['Close'].pct_change()
# For JNJ stock:
jnj_data['Tomorrow'] = jnj_data['Close'].shift(-1)
# For JNJ stock
jnj_data['Stocks_Direction'] = (jnj_data['Close'] > jnj_data['Close'].shift(1)).astype(int)


In [188]:
jnj_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3522 entries, 2010-03-02 to 2024-02-27
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Close             3522 non-null   float64
 1   High              3522 non-null   float64
 2   Low               3522 non-null   float64
 3   Open              3522 non-null   float64
 4   Volume            3522 non-null   int64  
 5   Return            3521 non-null   float64
 6   Tomorrow          3521 non-null   float64
 7   Stocks_Direction  3522 non-null   int32  
dtypes: float64(6), int32(1), int64(1)
memory usage: 233.9+ KB


In [189]:
# Handling 'Return' column NaNs
jnj_data.loc[:, 'Return'] = jnj_data['Return'].fillna(0)  # Fill first row with 0
jnj_data.loc[:, 'Return'] = jnj_data['Return'].ffill()  # Fill other potential NaNs using forward fill

# Handling 'Tomorrow' column NaNs
jnj_data = jnj_data.iloc[:-1]  # Drop the last row

# Handing "Stock_Direction' column NaNs
jnj_data.loc[:, 'Stocks_Direction'] = jnj_data['Stocks_Direction'].fillna(0)  # Fill first row with 0
jnj_data.loc[:, 'Stocks_Direction'] = jnj_data['Stocks_Direction'].ffill()

In [190]:
jnj_data.isna().sum()

Price
Close               0
High                0
Low                 0
Open                0
Volume              0
Return              0
Tomorrow            0
Stocks_Direction    0
dtype: int64

In [191]:
jnj_data.head()

Price,Close,High,Low,Open,Volume,Return,Tomorrow,Stocks_Direction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-03-02,41.169708,41.2995,40.897148,41.085346,9014000,0.0,41.143742,0
2010-03-03,41.143742,41.370875,41.02693,41.267042,9360400,-0.000631,41.254066,0
2010-03-04,41.254066,41.338431,41.124275,41.228108,8067200,0.002681,41.55909,1
2010-03-05,41.55909,41.578558,41.241101,41.312486,10633600,0.007394,41.662903,1
2010-03-08,41.662903,41.799188,41.546091,41.585031,33380500,0.002498,41.708332,1


In [192]:
# For ^GSPC stock
gspc_data['Return'] = gspc_data['Close'].pct_change()
# For ^GSPC stock:
gspc_data['Tomorrow'] = gspc_data['Close'].shift(-1)
# For ^GSPC stock
gspc_data['Stocks_Direction'] = (gspc_data['Close'] > gspc_data['Close'].shift(1)).astype(int)

In [193]:
gspc_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3522 entries, 2010-03-02 to 2024-02-27
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Close             3522 non-null   float64
 1   High              3522 non-null   float64
 2   Low               3522 non-null   float64
 3   Open              3522 non-null   float64
 4   Volume            3522 non-null   int64  
 5   Return            3521 non-null   float64
 6   Tomorrow          3521 non-null   float64
 7   Stocks_Direction  3522 non-null   int32  
dtypes: float64(6), int32(1), int64(1)
memory usage: 233.9+ KB


In [194]:
gspc_data.loc[:, 'Return'] = gspc_data['Return'].fillna(0)  # Fill first row with 0
gspc_data.loc[:, 'Return'] = gspc_data['Return'].ffill()

gspc_data = gspc_data.iloc[:-1]  # Drop the last row

gspc_data.loc[:, 'Stocks_Direction'] = gspc_data['Stocks_Direction'].fillna(0)
gspc_data.loc[:, 'Stocks_Direction'] = gspc_data['Stocks_Direction'].ffill()

In [195]:
gspc_data.isna().sum()

Price
Close               0
High                0
Low                 0
Open                0
Volume              0
Return              0
Tomorrow            0
Stocks_Direction    0
dtype: int64

In [196]:
gspc_data.head()

Price,Close,High,Low,Open,Volume,Return,Tomorrow,Stocks_Direction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-03-02,1118.310059,1123.459961,1116.51001,1117.01001,4134680000,0.0,1118.790039,0
2010-03-03,1118.790039,1125.640015,1116.579956,1119.359985,3951320000,0.000429,1122.969971,1
2010-03-04,1122.969971,1123.72998,1116.660034,1119.119995,3945010000,0.003736,1138.699951,1
2010-03-05,1138.699951,1139.380005,1125.119995,1125.119995,4133000000,0.014007,1138.5,1
2010-03-08,1138.5,1141.050049,1136.77002,1138.400024,3774680000,-0.000176,1140.449951,0


Data Preparation, Modelling & Evaluation:

In [198]:
#!pip install mlflow pandas numpy scikit-learn xgboost tensorflow statsmodels


In [199]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import mlflow
import mlflow.xgboost
import mlflow.keras

In [209]:
X = pg_data[["Close", "High", "Low", "Open", "Volume"]]  # Features
y = pg_data["Stocks_Direction"]  # Target

In [211]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [213]:
# Normalization
scaler = StandardScaler()
scaled_data = scaler.fit_transform(pg_data)

In [215]:
# Initialize and train the XGBoost model
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)

# Make predictions
xgb_predictions = xgb_model.predict(X_test)

# Evaluate the model
xgb_accuracy = accuracy_score(y_test, xgb_predictions)
print(f"XGBoost Accuracy: {xgb_accuracy}")
print(classification_report(y_test, xgb_predictions))

XGBoost Accuracy: 0.7460992907801418
              precision    recall  f1-score   support

           0       0.75      0.73      0.74       351
           1       0.74      0.76      0.75       354

    accuracy                           0.75       705
   macro avg       0.75      0.75      0.75       705
weighted avg       0.75      0.75      0.75       705



In [217]:
# Initialize KFold for cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)  # You can adjust n_splits

# Perform cross-validation and get accuracy scores
xgb_cv_scores = cross_val_score(xgb_model, X, y, cv=kfold, scoring='accuracy')

# Print the cross-validation scores
print("XGBoost Cross-Validation Scores:", xgb_cv_scores)
print("XGBoost Mean Accuracy:", xgb_cv_scores.mean())
print("XGBoost Accuracy Standard Deviation:", xgb_cv_scores.std())

XGBoost Cross-Validation Scores: [0.74609929 0.73863636 0.75       0.73721591 0.76420455]
XGBoost Mean Accuracy: 0.747231221792392
XGBoost Accuracy Standard Deviation: 0.009707926575230865


In [219]:
# Scale data for LSTM
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape data for LSTM (samples, timesteps, features)
X_train_reshaped = X_train_scaled.reshape(X_train_scaled.shape[0], 1, X_train_scaled.shape[1])
X_test_reshaped = X_test_scaled.reshape(X_test_scaled.shape[0], 1, X_test_scaled.shape[1])

# Build the LSTM model
lstm_model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])),
    Dropout(0.2),
    LSTM(50),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# Compile the model
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
lstm_model.fit(X_train_reshaped, y_train, epochs=20, batch_size=32, validation_data=(X_test_reshaped, y_test), verbose=1)

# Make predictions
lstm_predictions = (lstm_model.predict(X_test_reshaped) > 0.5).astype(int)

# Evaluate the model
lstm_accuracy = accuracy_score(y_test, lstm_predictions)
print(f"LSTM Accuracy: {lstm_accuracy}")
print(classification_report(y_test, lstm_predictions))

Epoch 1/20


  super().__init__(**kwargs)


[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.5164 - loss: 0.6924 - val_accuracy: 0.5021 - val_loss: 0.6949
Epoch 2/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5374 - loss: 0.6905 - val_accuracy: 0.5021 - val_loss: 0.6955
Epoch 3/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5192 - loss: 0.6919 - val_accuracy: 0.5021 - val_loss: 0.6959
Epoch 4/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5423 - loss: 0.6890 - val_accuracy: 0.5021 - val_loss: 0.6953
Epoch 5/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5121 - loss: 0.6926 - val_accuracy: 0.5021 - val_loss: 0.6964
Epoch 6/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5109 - loss: 0.6938 - val_accuracy: 0.5021 - val_loss: 0.6956
Epoch 7/20
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━

In [220]:
from sklearn.model_selection import cross_val_score, KFold

In [223]:
from sklearn.metrics import accuracy_score

# Initialize lists to store cross-validation scores
lstm_cv_scores = []

# Perform cross-validation
for train_index, test_index in kfold.split(X):
    # Get data for current fold
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
    
    # Scale data for LSTM
    X_train_fold_scaled = scaler.fit_transform(X_train_fold)
    X_test_fold_scaled = scaler.transform(X_test_fold)
    
    # Reshape data for LSTM
    X_train_fold_reshaped = X_train_fold_scaled.reshape(X_train_fold_scaled.shape[0], 1, X_train_fold_scaled.shape[1])
    X_test_fold_reshaped = X_test_fold_scaled.reshape(X_test_fold_scaled.shape[0], 1, X_test_fold_scaled.shape[1])
    
    # Train the model
    lstm_model.fit(X_train_fold_reshaped, y_train_fold, epochs=20, batch_size=32, verbose=0)  # Suppress verbose output

    # Make predictions
    lstm_predictions_fold = (lstm_model.predict(X_test_fold_reshaped) > 0.5).astype(int)
    
    # Calculate accuracy for current fold
    accuracy_fold = accuracy_score(y_test_fold, lstm_predictions_fold)
    
    # Store the accuracy
    lstm_cv_scores.append(accuracy_fold)

# Print the cross-validation scores
print("LSTM Cross-Validation Scores:", lstm_cv_scores)
print("LSTM Mean Accuracy:", np.mean(lstm_cv_scores))
print("LSTM Accuracy Standard Deviation:", np.std(lstm_cv_scores))

[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
LSTM Cross-Validation Scores: [0.49645390070921985, 0.5411931818181818, 0.5696022727272727, 0.5553977272727273, 0.6051136363636364]
LSTM Mean Accuracy: 0.5535521437782076
LSTM Accuracy Standard Deviation: 0.035588070016927466


In [227]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler

# Assuming you have the trained XGBoost model (xgb_model) and the latest data (pg_data)

# Prepare the data for the forecast
# Select the last 14 days of data for prediction
last_14_days_data = pg_data.tail(14)

# Select the features for prediction
forecast_features = last_14_days_data[["Close", "High", "Low", "Open", "Volume"]]

# Scale the features using the same scaler used for training
forecast_features_scaled = scaler.transform(forecast_features)

# Make predictions for the next 14 days
forecast_predictions = xgb_model.predict(forecast_features_scaled)

# Create a DataFrame for the forecast results
forecast_df = pd.DataFrame({'Date': last_14_days_data.index, 'Stocks_Direction': forecast_predictions})

# Print the forecast
print(forecast_df)

          Date  Stocks_Direction
0   2024-02-06                 0
1   2024-02-07                 0
2   2024-02-08                 0
3   2024-02-09                 0
4   2024-02-12                 0
5   2024-02-13                 0
6   2024-02-14                 0
7   2024-02-15                 0
8   2024-02-16                 0
9   2024-02-20                 0
10  2024-02-21                 0
11  2024-02-22                 0
12  2024-02-23                 0
13  2024-02-26                 0
