In [4]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'intel-stock-data:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F5413902%2F8988991%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240825%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240825T151902Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D58a1b759fd425be7741dff42edd7d12775b5a0940523ea73dbfedf0fa6d3097cfcaa12bf2fc6697571d83800fe1fc152c7ba0cfbc12cb61be5685647f61c55ea21517f233f590a9e968d5753c90a5ce86cd4567bf9cf0a01203ea728d933570ba556b15a1fe49fd81ee0b022f961adc839da48c1e56484563b85e66c04b5d20086e7ab17f33a864038e96a65c93201f7d678c3b081aaa20c813b518b980e8e8df3a2f6226aafbff262b6f1352bfbd7b414cfb6f431945cd6f9c4ccc4287f1076cbdd8c66f6195fd23fd1149b8570488368423d2a4bdfbaeb2eb7fabde2294569a0c1dd090bbb97501562c1f99902a4b918fa7b87262f16863dd998cc2280af21'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading intel-stock-data, 422525 bytes compressed
Downloaded and uncompressed: intel-stock-data
Data source import complete.


In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
import plotly.graph_objs as go
import seaborn as sns
import plotly.io as pio
from keras.layers import Dropout
import warnings
warnings.simplefilter("ignore", category=Warning)

In [6]:
df = pd.read_csv('/kaggle/input/intel-stock-data/INTC.csv',
                 usecols=lambda column: column != "Unnamed: 0",
                 parse_dates=['Date'], index_col='Date')
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1980-03-17,0.182651,0.185573,0.182651,0.182651,10924800
1980-03-18,0.182651,0.184112,0.18119,0.18119,17068800
1980-03-19,0.185573,0.188496,0.185573,0.185573,18508800
1980-03-20,0.185573,0.187765,0.184843,0.184843,11174400
1980-03-21,0.18119,0.18119,0.178267,0.178267,12172800


In [7]:
df['year'] = df.index.year
df['month'] = df.index.month
df['day'] =df.index.day
df['dayofweek'] = df.index.dayofweek
df['weekno'] = df.index.isocalendar().week
df['isweekend'] = df.index.weekday // 5
df['seasons'] = df['month'].apply(lambda month: 1 if month in [12,1,2] else 2
                                   if month in [3,4,5] else 3 if month in [6,7,8] else 4)
df.sort_index(inplace= True)

In [8]:
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,year,month,day,dayofweek,weekno,isweekend,seasons
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1980-03-17,0.182651,0.185573,0.182651,0.182651,10924800,1980,3,17,0,12,0,2
1980-03-18,0.182651,0.184112,0.18119,0.18119,17068800,1980,3,18,1,12,0,2
1980-03-19,0.185573,0.188496,0.185573,0.185573,18508800,1980,3,19,2,12,0,2
1980-03-20,0.185573,0.187765,0.184843,0.184843,11174400,1980,3,20,3,12,0,2
1980-03-21,0.18119,0.18119,0.178267,0.178267,12172800,1980,3,21,4,12,0,2


**Correlation**

In [9]:
corr_matrix  = df.corr().abs()
target ="Close"
normalized_corr = (corr_matrix[f'{target}'] - corr_matrix[f'{target}'].min()) /(corr_matrix[f'{target}'].max() - corr_matrix[f'{target}'].min())
n= 6
top_features = normalized_corr.sort_values(ascending = False).index[:n].to_list()
print(f"Top features correlated with {target}:",top_features)
print("Correlation scores normalized to range [0, 1]:\n ", normalized_corr[top_features])

Top features correlated with Close: ['Close', 'Low', 'High', 'Open', 'year', 'Volume']
Correlation scores normalized to range [0, 1]:
  Close     1.000000
Low       0.999844
High      0.999830
Open      0.999674
year      0.862057
Volume    0.220322
Name: Close, dtype: float64


**sequence Creator**

In [1]:
def create_sequences_optimized (data, seq_length, target_idx):

  data_values = data.values.astype('float32')
  num_samples = len(data) - seq_length
  num_features = data.shape[1]

  xs = np.empty((num_samples,seq_length, num_features),dtype = 'float32')
  ys = np.empty(num_samples,dtype='float32')

  for i in range(num_samples):
    xs[i]  = data_values[i:i+seq_length]
    ys[i] = data_values[i+seq_length, target_idx]

  return xs,ys

In [11]:
def lstm_model(df_model, features, sequence, batch, target_column):

    data = df_model[features].copy()

    scalers = {}

    for feature in features:
        scaler = MinMaxScaler(feature_range=(0, 1))
        data[feature] = scaler.fit_transform(data[[feature]])
        scalers[feature] = scaler

    target_idx = features.index(target_column)

    seq_length = sequence
    X, y = create_sequences_optimized(data, seq_length, target_idx)

    split_ratio = 0.8
    split = int(split_ratio * len(X))

    X_train, X_test = X[:split], X[split:]
    y_train, y_test = y[:split], y[split:]

    model = Sequential()
    model.add(LSTM(300, return_sequences=True, input_shape=(seq_length, X_train.shape[2])))
    model.add(LSTM(200, return_sequences=False))
    model.add(Dense(25))
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mae')

    #early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1)

    best_model_path = 'best_model.keras'
    model_checkpoint = ModelCheckpoint(best_model_path, save_best_only=True, monitor='val_loss', mode='min', verbose=1)

    history = model.fit(X_train, y_train, epochs=30, batch_size=batch, validation_split=0.2,
                        verbose=1, callbacks=[model_checkpoint])

    model = load_model('best_model.keras')

    y_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)

    y_test_actual = scalers[target_column].inverse_transform(y_test.reshape(-1, 1)).flatten()
    y_pred_actual = scalers[target_column].inverse_transform(y_pred).flatten()
    y_train_actual = scalers[target_column].inverse_transform(y_train.reshape(-1, 1)).flatten()
    y_train_pred_actual = scalers[target_column].inverse_transform(y_train_pred).flatten()

    r2_train = r2_score(y_train_actual, y_train_pred_actual)
    r2_test = r2_score(y_test_actual, y_pred_actual)

    return model, history, y_test_actual, y_pred_actual, r2_train, r2_test

sequence = 30
batch = 48
target_column = 'Close'

model, history, y_test_actual, y_pred_actual, r2_train, r2_test = lstm_model(df, top_features, sequence, batch, target_column)

Epoch 1/30
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 252ms/step - loss: 0.0282
Epoch 1: val_loss improved from inf to 0.00806, saving model to best_model.keras
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 274ms/step - loss: 0.0281 - val_loss: 0.0081
Epoch 2/30
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 243ms/step - loss: 0.0064
Epoch 2: val_loss did not improve from 0.00806
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 270ms/step - loss: 0.0064 - val_loss: 0.0139
Epoch 3/30
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 244ms/step - loss: 0.0057
Epoch 3: val_loss improved from 0.00806 to 0.00639, saving model to best_model.keras
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 263ms/step - loss: 0.0057 - val_loss: 0.0064
Epoch 4/30
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 242ms/step - loss: 0.0052
Epoch 4: val_loss improved from 0.00639

In [12]:
print(f'R² Score For Train Data: {round(r2_train * 100, 2)}%')
print(f'R² Score For Test Data: {round(r2_test * 100, 2)}%')

R² Score For Train Data: 99.84%
R² Score For Test Data: 98.55%


In [13]:
def plot_actual_vs_predicted(y_test, y_pred):

    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=list(range(len(y_test))),
        y=y_test.flatten(),
        mode='lines',
        name='Actual'
    ))

    fig.add_trace(go.Scatter(
        x=list(range(len(y_pred))),
        y=y_pred.flatten(),
        mode='lines',
        name='Predicted',
        line=dict(dash='dash')
    ))

    fig.update_layout(
        title=dict(text='Actual vs Predicted', x=0.5),
        xaxis=dict(title='Index', showgrid=True),
        yaxis=dict(title='Value', showgrid=True),
        showlegend=True,
        width=1200,
        height=600
    )

    pio.show(fig)

plot_actual_vs_predicted(y_test_actual, y_pred_actual)