In [45]:
import pandas as pd
import datetime
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
from sklearn.preprocessing import LabelEncoder

In [46]:
# Load the raw stock-index price history from the CSV in the working directory
data = pd.read_csv(filepath_or_buffer='stocks.csv')

In [47]:
# Dimensions of the dataset as a (number of rows, number of columns) tuple
data.shape

(112457, 8)

In [48]:
# Preview the dataset: first five rows, then last five rows, as plain text
print(data.head(5), data.tail(5), sep='\n')

  Index        Date        Open  ...       Close   Adj Close  Volume
0   NYA  1965-12-31  528.690002  ...  528.690002  528.690002     0.0
1   NYA  1966-01-03  527.210022  ...  527.210022  527.210022     0.0
2   NYA  1966-01-04  527.840027  ...  527.840027  527.840027     0.0
3   NYA  1966-01-05  531.119995  ...  531.119995  531.119995     0.0
4   NYA  1966-01-06  532.070007  ...  532.070007  532.070007     0.0

[5 rows x 8 columns]
       Index        Date         Open  ...        Close    Adj Close       Volume
112452  N100  2021-05-27  1241.119995  ...  1247.069946  1247.069946  379696400.0
112453  N100  2021-05-28  1249.469971  ...  1256.599976  1256.599976  160773400.0
112454  N100  2021-05-31  1256.079956  ...  1248.930054  1248.930054   91173700.0
112455  N100  2021-06-01  1254.609985  ...  1258.579956  1258.579956  155179900.0
112456  N100  2021-06-02  1258.489990  ...  1263.619995  1263.619995  148465000.0

[5 rows x 8 columns]


In [49]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns. NOTE(review): the reported count is lower than the row count shown
# by data.shape, so some rows have missing values in these columns.
data.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,110253.0,110253.0,110253.0,110253.0,110253.0,110253.0
mean,7658.515222,7704.372961,7608.000422,7657.545872,7657.351729,1273975000.0
std,9011.478913,9066.638548,8954.506981,9011.510444,9011.6089,4315783000.0
min,54.869999,54.869999,54.869999,54.869999,54.869999,0.0
25%,1855.030029,1864.51001,1843.97998,1855.060059,1854.179565,0.0
50%,5194.089844,5226.279785,5154.049805,5194.75,5194.75,432900.0
75%,10134.29981,10207.82031,10060.34961,10134.83008,10134.83008,173431400.0
max,68775.0625,69403.75,68516.99219,68775.0625,68775.0625,94403740000.0


In [50]:
# Number of rows recorded for each stock index (grouped on the 'Index' column)
print(
    data.groupby(by='Index').size()
)

Index
000001.SS     5963
399001.SZ     5928
GDAXI         8606
GSPTSE       10776
HSI           8750
IXIC         12690
J203.JO       2387
KS11          6181
N100          5507
N225         14500
NSEI          3381
NYA          13948
SSMI          7830
TWII          6010
dtype: int64


In [51]:
# Replace the raw 'Date' column with numeric 'year' and 'month' columns.
#
# The guard makes this cell safe to re-run: `data` is rebound without 'Date'
# below, so on a second execution the frame already carries 'year'/'month'
# and there is nothing left to derive (previously this raised KeyError).
if 'Date' in data.columns:
    # Parse the date strings once and reuse the result for both components
    # instead of building a DatetimeIndex twice.
    parsed_dates = pd.DatetimeIndex(data['Date'])
    data = data.assign(
        year=parsed_dates.year,
        month=parsed_dates.month,
    ).drop(columns=['Date'])

# Show the first rows to confirm the new columns
data.head()

Unnamed: 0,Index,Open,High,Low,Close,Adj Close,Volume,year,month
0,NYA,528.690002,528.690002,528.690002,528.690002,528.690002,0.0,1965,12
1,NYA,527.210022,527.210022,527.210022,527.210022,527.210022,0.0,1966,1
2,NYA,527.840027,527.840027,527.840027,527.840027,527.840027,0.0,1966,1
3,NYA,531.119995,531.119995,531.119995,531.119995,531.119995,0.0,1966,1
4,NYA,532.070007,532.070007,532.070007,532.070007,532.070007,0.0,1966,1


In [52]:
# Keep only rows whose year is after 2000 (i.e. 2001 onwards, the 21st century),
# then print how many rows remain for each year
data = data.loc[data['year'].gt(2000)]
print(data.groupby(by='year').size())

year
2001    3106
2002    3114
2003    3114
2004    3124
2005    3081
2006    3006
2007    3053
2008    3259
2009    3243
2010    3251
2011    3243
2012    3467
2013    3477
2014    3492
2015    3497
2016    3501
2017    3491
2018    3487
2019    3461
2020    3493
2021    1414
dtype: int64


In [54]:
# Report, per column, whether any value is missing before cleaning
print(data.isnull().any())

# Discard every row that has at least one missing value
data = data.dropna(axis=0, how='any')

# Report again to confirm that no column has missing values left
print(data.isnull().any())

Index        False
Open          True
High          True
Low           True
Close         True
Adj Close     True
Volume        True
year         False
month        False
dtype: bool
Index        False
Open         False
High         False
Low          False
Close        False
Adj Close    False
Volume       False
year         False
month        False
dtype: bool
