In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import glob

In [3]:
# Collect every per-ticker CSV in the dataset folder. glob returns paths in an
# OS-dependent order, so sort them to make the row order of the combined frame
# reproducible across machines and re-runs.
csv_files = sorted(glob.glob('stock-time-series-20050101-to-20171231/*.csv'))
if not csv_files:
    # Fail early with an actionable message instead of pd.concat's
    # "No objects to concatenate" error.
    raise FileNotFoundError(
        "No CSV files found under 'stock-time-series-20050101-to-20171231/'; "
        "check that the dataset is extracted next to this notebook."
    )

# Read each file into its own frame, then stack them into one long table with a
# fresh 0..n-1 index.
df_list = [pd.read_csv(file) for file in csv_files]
combined_df = pd.concat(df_list, ignore_index=True)

In [4]:
# Columns to keep in the per-file frames.
common_columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume']
# Narrow the frames that were already loaded instead of reading every CSV from
# disk a second time; the resulting list is the same.
# NOTE(review): combined_df is not rebuilt from this list, so it keeps its
# 'Name' column.
df_list = [frame[common_columns] for frame in df_list]

In [5]:
# Turn the Date column into real timestamps so it sorts chronologically
# rather than lexically.
combined_df['Date'] = pd.to_datetime(combined_df['Date'])
# Order all rows by date and renumber the index 0..n-1 in the same step.
combined_df = combined_df.sort_values(by='Date', ignore_index=True)

In [6]:
# Print column dtypes and non-null counts to see which columns have gaps.
combined_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195005 entries, 0 to 195004
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   Date    195005 non-null  datetime64[ns]
 1   Open    194931 non-null  float64       
 2   High    194976 non-null  float64       
 3   Low     194946 non-null  float64       
 4   Close   195005 non-null  float64       
 5   Volume  195005 non-null  int64         
 6   Name    195005 non-null  object        
dtypes: datetime64[ns](1), float64(4), int64(1), object(1)
memory usage: 10.4+ MB


In [7]:
# Forward-fill missing values *within each ticker*. The frame is sorted by Date
# with all tickers interleaved, so a plain DataFrame.ffill() would copy the
# previous row's value -- usually a different company's price -- into the gap.
gap_cols = [
    col for col in combined_df.columns
    if col != 'Name' and combined_df[col].isna().any()
]
if gap_cols:
    # groupby(...).ffill() keeps the original row index, so the result can be
    # assigned straight back. A gap at the very start of a ticker's history has
    # no earlier value to carry forward and therefore stays NaN.
    combined_df[gap_cols] = combined_df.groupby('Name', sort=False)[gap_cols].ffill()
#combined_df.isnull().sum()


In [8]:
# Preview the first five rows of the combined frame.
combined_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Name
0,2006-01-03,39.69,41.22,38.79,40.91,24232729,AABA
1,2006-01-03,126.7,129.44,124.23,128.87,6188700,GS
2,2006-01-03,17.21,17.49,17.18,17.45,55432166,CSCO
3,2006-01-03,40.39,41.45,39.77,41.24,8960100,HD
4,2006-01-03,82.45,82.55,80.81,82.06,11715200,IBM


In [9]:
# Date is already datetime64[ns] at this point (see the .info() output above),
# so this conversion leaves the column unchanged.
combined_df['Date'] = pd.to_datetime(combined_df['Date'])

In [10]:
# Preview the first five rows again after the Date conversion.
combined_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Name
0,2006-01-03,39.69,41.22,38.79,40.91,24232729,AABA
1,2006-01-03,126.7,129.44,124.23,128.87,6188700,GS
2,2006-01-03,17.21,17.49,17.18,17.45,55432166,CSCO
3,2006-01-03,40.39,41.45,39.77,41.24,8960100,HD
4,2006-01-03,82.45,82.55,80.81,82.06,11715200,IBM


In [11]:
#sns.lineplot(data=combined_df, x='Date', y='Close')
#plt.title('Stock Closing Prices Over Time')

In [12]:
#sns.lineplot(data=combined_df[combined_df['Name'] == 'IBM'], x='Date', y='Close')
#plt.title('Stock Closing Prices Over Time (IBM)')
#plt.show()

In [13]:
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import MinMaxScaler

In [14]:
# Work on an independent copy of the IBM rows, ordered chronologically so the
# positional slices below map to contiguous time ranges.
IBM_df = combined_df[combined_df['Name'] == 'IBM'].copy()
IBM_df['Date'] = pd.to_datetime(IBM_df['Date'])

IBM_df = IBM_df.sort_values('Date').reset_index(drop=True)

# X holds the remaining numeric columns; y is the closing price to forecast.
X = IBM_df.drop(columns=['Date', 'Close','Name'])
y = IBM_df['Close']

# Chronological 70 / 15 / 15 split (no shuffling, so validation and test data
# always come after the training period).
n=len(IBM_df)
train_size = int(n * 0.7)
val_size = int(n * 0.15)
test_size = n - train_size - val_size
val_end = train_size + val_size

X_train = X.iloc[:train_size]
y_train = y.iloc[:train_size]
X_val = X.iloc[train_size:val_end]
y_val = y.iloc[train_size:val_end]
X_test = X.iloc[val_end:]
y_test = y.iloc[val_end:]

print(f"Train size: {len(X_train)}, Validation size: {len(X_val)}, Test size: {len(X_test)}")
print(f"Validation set: {len(X_val)} samples ({X_val.index[0]} to {X_val.index[-1]})")
print(f"Test set: {len(X_test)} samples ({X_test.index[0]} to {X_test.index[-1]})")

# Scale the Close series itself. Flattening the multi-column X with
# reshape(-1, 1) would interleave Open/High/Low with Volume in a single column:
# Volume would dominate the min-max range and Close would never reach the
# model. The *_scaled names are kept because the windowing step reads them;
# each array has shape (n_rows, 1).
# The scaler is fitted on the training period only, so no information from the
# validation/test periods leaks into the scaling.
scaler = MinMaxScaler(feature_range=(0, 1))
X_train_scaled = scaler.fit_transform(y_train.to_numpy().reshape(-1, 1))
X_val_scaled = scaler.transform(y_val.to_numpy().reshape(-1, 1))
X_test_scaled = scaler.transform(y_test.to_numpy().reshape(-1, 1))


Train size: 4403, Validation size: 943, Test size: 945
Validation set: 943 samples (4403 to 5345)
Test set: 945 samples (5346 to 6290)


In [16]:
def create_sequences(data, seq_length):
    """Slice a time-ordered series into (window, next value) training pairs.

    Window ``i`` is ``data[i : i + seq_length]`` and its target is the element
    that immediately follows it, ``data[i + seq_length]``.

    Parameters
    ----------
    data : array-like
        Observations ordered in time along the first axis; any trailing
        axes (e.g. a feature axis) are carried through unchanged.
    seq_length : int
        Number of consecutive time steps in each input window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(xs, ys)`` where ``xs`` has shape
        ``(n_windows, seq_length, *data.shape[1:])`` and ``ys`` has shape
        ``(n_windows, *data.shape[1:])``. When ``data`` has no more than
        ``seq_length`` rows, both arrays are empty but keep those trailing
        dimensions so downstream reshapes still work.

    Raises
    ------
    ValueError
        If ``seq_length`` is negative.
    """
    if seq_length < 0:
        raise ValueError(f"seq_length must be non-negative, got {seq_length}")

    values = np.asarray(data)
    feature_shape = values.shape[1:]
    n_windows = len(values) - seq_length

    if n_windows <= 0:
        # Too little data for even one window: return correctly shaped empties
        # rather than flat (0,) arrays.
        return (
            np.empty((0, seq_length) + feature_shape, dtype=values.dtype),
            np.empty((0,) + feature_shape, dtype=values.dtype),
        )

    xs = np.stack([values[start:start + seq_length] for start in range(n_windows)])
    # The target of window i is values[i + seq_length], i.e. the series shifted
    # by seq_length; copy so the result does not alias the input.
    ys = values[seq_length:].copy()
    return xs, ys

# Number of consecutive time steps in every input window.
SEQ_LENGTH = 60


def _with_feature_axis(windows):
    """Reshape windows to (batch_size, seq_length, 1)."""
    n_windows, window_length = windows.shape[0], windows.shape[1]
    return windows.reshape(n_windows, window_length, 1)


# Build (window, next value) pairs separately for each split so that no window
# straddles a split boundary.
X_train_seq, y_train_seq = create_sequences(data=X_train_scaled, seq_length=SEQ_LENGTH)
X_val_seq, y_val_seq = create_sequences(data=X_val_scaled, seq_length=SEQ_LENGTH)
X_test_seq, y_test_seq = create_sequences(data=X_test_scaled, seq_length=SEQ_LENGTH)

# Reshape for PyTorch LSTM: (batch_size, seq_length, num_features)
# If only using 'Close' price, num_features = 1
X_train_seq = _with_feature_axis(X_train_seq)
X_val_seq = _with_feature_axis(X_val_seq)
X_test_seq = _with_feature_axis(X_test_seq)

In [17]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Copy each NumPy split into a float32 tensor.
X_train_tensor = torch.tensor(X_train_seq, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_seq, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val_seq, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val_seq, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_seq, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_seq, dtype=torch.float32)

# Mini-batch size shared by the training and validation loaders.
batch_size = 32

# Training batches are drawn in a shuffled order.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Validation batches keep their original order.
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
