In [1]:
# import necessary libraries
import pandas as pd
from collections import deque
from statistics import mean
import numpy as np
import datetime
np.random.seed(1337)

In [2]:
# try using TA lib (Technical Analysis library)
# import ta
from ta import add_all_ta_features

I recommend that you just give it the last year's worth of data. The processing below will output rows without NaN values within the rows, and will make sure to avoid outputting zeros (unless it's actually a calculated value).

In [3]:
# load the raw price data from GME.csv and preview the first rows
gme_data = pd.read_csv('GME.csv')
gme_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2021-01-04,19.0,19.1,17.15,17.25,17.25,10022500
1,2021-01-05,17.35,18.08,17.23,17.370001,17.370001,4961500
2,2021-01-06,17.34,18.98,17.33,18.360001,18.360001,6056200
3,2021-01-07,18.469999,19.450001,18.02,18.08,18.08,6129300
4,2021-01-08,18.18,18.299999,17.08,17.690001,17.690001,6482000


# Data Augmentation

One problem with the data is that from January 1 to July 25 there are only 200 or so days, and even fewer if we don't include weekends. In this section we will perform some basic data augmentation for the OHLC data using linear interpolation to enlarge the dataset. Note: the idea of adding a bit of noise to simulate typical price movement is not implemented in the code below, which performs plain linear interpolation only.

We want around 10000 data points. Check how big the current data set is:

In [4]:
gme_data.shape[0]

140

If there are n rows, then there are n-1 gaps to fill with linearly interpolated data (139 for this specific case). To get around 10000 rows, the equation to solve is:

$10000 = 140 + 139x$

where x is the size of each linearly interpolated gap. Solving the equation, and rounding up to the nearest integer gives x = 71, which we will use as the number of elements between each currently existing price point. The result should be a 10009 row dataframe 

now, create an empty data frame with the appropriate number of NaN rows and column titles which will be filled in later

In [5]:
gme_data.dtypes

Date          object
Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object

In [6]:
# Placeholder frame for the augmented data: 10009 rows that are entirely NaN.
# The original rows are copied in afterwards and the gaps between them are
# filled by forward filling / linear interpolation.
augmented_gme_data = pd.DataFrame(
    np.nan,
    index=list(range(10009)),
    columns=['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'],
)
augmented_gme_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,,,,,,,
1,,,,,,,
2,,,,,,,
3,,,,,,,
4,,,,,,,


In [7]:
# inspect the dtypes of the placeholder frame: every column, including Date and
# Volume, is float64 at this point (nothing is converted in this cell)
augmented_gme_data.dtypes

Date         float64
Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume       float64
dtype: object

In [8]:
# Row positions in the augmented frame that will hold the original rows.
# The stride is 72 rather than 71 because each stride contains the original
# row itself plus the 71 rows that follow it.
augmented_og_data_positions = list(range(0, 10009, 72))
# Matching row positions in the original frame (0 .. number of rows - 1).
og_data_positions = list(range(len(gme_data)))
gme_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2021-01-04,19.0,19.1,17.15,17.25,17.25,10022500
1,2021-01-05,17.35,18.08,17.23,17.370001,17.370001,4961500
2,2021-01-06,17.34,18.98,17.33,18.360001,18.360001,6056200
3,2021-01-07,18.469999,19.450001,18.02,18.08,18.08,6129300
4,2021-01-08,18.18,18.299999,17.08,17.690001,17.690001,6482000


In [9]:
epoch_start = datetime.datetime(1970, 1, 1)
for target_row, source_row in zip(augmented_og_data_positions, og_data_positions):
    # Dates are stored as seconds since the Unix epoch so they fit the float
    # column of the placeholder frame.
    original_date = pd.to_datetime(gme_data.at[source_row, 'Date'])
    augmented_gme_data.at[target_row, 'Date'] = (original_date - epoch_start).total_seconds()
    # Copy the remaining values of the original row unchanged; the rows in
    # between are filled in later.
    for column in ('Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'):
        augmented_gme_data.at[target_row, column] = gme_data.at[source_row, column]
    
    

In [10]:
# preview rows 0-72: the first original row (position 0), the still-empty rows
# after it, and the second original row at position 72
augmented_gme_data.head(73)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1.609718e+09,19.00,19.10,17.15,17.250000,17.250000,10022500.0
1,,,,,,,
2,,,,,,,
3,,,,,,,
4,,,,,,,
...,...,...,...,...,...,...,...
68,,,,,,,
69,,,,,,,
70,,,,,,,
71,,,,,,,


In [11]:
# Date and Volume are carried forward rather than interpolated: every filled
# row takes the value of the closest original row above it.
# (Volume's value isn't necessary for the technical indicators we're using.)
for carried_column in ('Date', 'Volume'):
    augmented_gme_data.loc[:, carried_column] = augmented_gme_data.loc[:, carried_column].ffill()

In [12]:
augmented_gme_data.head(73)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1.609718e+09,19.00,19.10,17.15,17.250000,17.250000,10022500.0
1,1.609718e+09,,,,,,10022500.0
2,1.609718e+09,,,,,,10022500.0
3,1.609718e+09,,,,,,10022500.0
4,1.609718e+09,,,,,,10022500.0
...,...,...,...,...,...,...,...
68,1.609718e+09,,,,,,10022500.0
69,1.609718e+09,,,,,,10022500.0
70,1.609718e+09,,,,,,10022500.0
71,1.609718e+09,,,,,,10022500.0


# Linear Interpolation

In [13]:
# Fill the remaining NaN values by linear interpolation down the rows.
# Date and Volume were already forward filled, so only the price columns
# still contain gaps at this point.
augmented_gme_data = augmented_gme_data.interpolate(
    method='linear',
    limit_direction='forward',
    axis=0,
)

In [14]:
augmented_gme_data.head(73)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1.609718e+09,19.000000,19.100000,17.150000,17.250000,17.250000,10022500.0
1,1.609718e+09,18.977083,19.085833,17.151111,17.251667,17.251667,10022500.0
2,1.609718e+09,18.954167,19.071667,17.152222,17.253333,17.253333,10022500.0
3,1.609718e+09,18.931250,19.057500,17.153333,17.255000,17.255000,10022500.0
4,1.609718e+09,18.908333,19.043333,17.154444,17.256667,17.256667,10022500.0
...,...,...,...,...,...,...,...
68,1.609718e+09,17.441667,18.136667,17.225556,17.363334,17.363334,10022500.0
69,1.609718e+09,17.418750,18.122500,17.226667,17.365001,17.365001,10022500.0
70,1.609718e+09,17.395833,18.108333,17.227778,17.366668,17.366668,10022500.0
71,1.609718e+09,17.372917,18.094167,17.228889,17.368334,17.368334,10022500.0


# Labelling and Technical Indicators

In [15]:
# Create the class label "up": 1 when the NEXT row's close is higher than this
# row's close, 0 when it is lower or unchanged.
closing_prices = augmented_gme_data['Close']

# shift(-1) lines every row up with the close that follows it, replacing the
# row-by-row Python loop. The final row has no successor, so its comparison is
# False and it receives 0 as a placeholder label (dealt with at the end of the
# notebook).
price_labels = (closing_prices.shift(-1) > closing_prices).astype('int64').tolist()

# add the labels as the last column of the frame they belong to
augmented_gme_data.insert(len(augmented_gme_data.columns), "up", price_labels)

# creating technical indicator values

In [16]:
# Indicator windows are expressed in rows of the augmented frame. There is no
# data for weekends, so a "day" here is a weekday, and each day spans 72 rows.
fourteen_days, thirty_days = 14 * 72, 30 * 72

## RSI

background: RSI stands for Relative Strength Index. Its value is between 0 and 100. It is typically used as an overbought and oversold indicator. When RSI values move above 70, the asset is considered overbought, and should therefore decline in value. When RSI is below 30, the asset is considered oversold and could rally, and therefore should increase in value.

These assumptions are dangerous, so traders typically wait for the indicator to rise above 70, then drop below before selling, or drop below 30 and rise back above before buying.

https://www.investopedia.com/top-7-technical-analysis-tools-4773275

In [17]:
from ta.momentum import RSIIndicator

# RSI of the closing price over a window of fourteen_days rows
rsi_values = RSIIndicator(
    close=augmented_gme_data['Close'],
    window=fourteen_days,
).rsi()
len(rsi_values)

10009

## SMA

background: SMA stands for Simple Moving Average. It calculates the average of a selected range of prices, typically closing prices, by a number of periods within that range. In the case of this problem, we are using 14 days as the range.

https://www.investopedia.com/terms/s/sma.asp

In [18]:
from ta.trend import SMAIndicator

# simple moving average of the closing price over a window of fourteen_days rows
sma_values = SMAIndicator(
    close=augmented_gme_data['Close'],
    window=fourteen_days,
).sma_indicator()
len(sma_values)

10009

## LMA

background: LMA stands for Long Moving Average. It uses the same formula as the simple moving average, but over a longer time period than the SMA. In this case, we'll arbitrarily use 30 days for calculating LMA values.

In [19]:
# long moving average: the same SMA calculation, over the longer thirty_days window
lma_values = SMAIndicator(
    close=augmented_gme_data['Close'],
    window=thirty_days,
).sma_indicator()
len(lma_values)

10009

## ADX

background: ADX stands for Average Directional index. It is used to quantify the strength of a trend. For this value, we will be using the ta package for calculations

ta library python:
https://technical-analysis-library-in-python.readthedocs.io/en/latest/

background for indicator
https://www.investopedia.com/articles/trading/07/adx-trend-indicator.asp

In [20]:
from ta.trend import ADXIndicator

In [21]:
# ADX indicator built from the High, Low and Close columns.
# The library's default window size is 14; it is overridden here with
# fourteen_days so the window is measured in rows of the augmented frame.
adx_indicator = ADXIndicator(
    high=augmented_gme_data['High'],
    low=augmented_gme_data['Low'],
    close=augmented_gme_data['Close'],
    window=fourteen_days
)

In [22]:
# compute the ADX series from the indicator object
# NOTE(review): the recorded output echoes the library's dip/din division lines,
# which looks like a runtime warning raised during this call — confirm
adx_values = adx_indicator.adx()

  dip[i] = 100 * (self._dip[i] / self._trs[i])
  din[i] = 100 * (self._din[i] / self._trs[i])


# finish off technical indicators

In [23]:
# Add the technical indicators to the augmented frame. Each one is inserted
# just before the current last column, so the "up" label stays last and the
# indicators end up in the order RSI, SMA, LMA, ADX.
indicator_columns = (
    ("RSI", rsi_values),
    ("SMA", sma_values),
    ("LMA", lma_values),
    ("ADX", adx_values),
)
for indicator_name, indicator_series in indicator_columns:
    augmented_gme_data.insert(len(augmented_gme_data.columns) - 1, indicator_name, indicator_series)

In [24]:
augmented_gme_data.head(75)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,RSI,SMA,LMA,ADX,up
0,1.609718e+09,19.000000,19.100000,17.150000,17.250000,17.250000,10022500.0,,,,0.0,1
1,1.609718e+09,18.977083,19.085833,17.151111,17.251667,17.251667,10022500.0,,,,0.0,1
2,1.609718e+09,18.954167,19.071667,17.152222,17.253333,17.253333,10022500.0,,,,0.0,1
3,1.609718e+09,18.931250,19.057500,17.153333,17.255000,17.255000,10022500.0,,,,0.0,1
4,1.609718e+09,18.908333,19.043333,17.154444,17.256667,17.256667,10022500.0,,,,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
70,1.609718e+09,17.395833,18.108333,17.227778,17.366668,17.366668,10022500.0,,,,0.0,1
71,1.609718e+09,17.372917,18.094167,17.228889,17.368334,17.368334,10022500.0,,,,0.0,1
72,1.609805e+09,17.350000,18.080000,17.230000,17.370001,17.370001,4961500.0,,,,0.0,1
73,1.609805e+09,17.349861,18.092500,17.231389,17.383751,17.383751,4961500.0,,,,0.0,1


In [25]:
# First row with a non-NaN LMA value: the LMA window is thirty_days rows long,
# so the first complete window ends at position thirty_days - 1.
augmented_gme_data.iloc[[thirty_days - 1]]

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,RSI,SMA,LMA,ADX,up
2159,1613434000.0,49.810139,51.222082,44.622223,45.989582,45.989582,8175000.0,45.968632,111.187089,78.873193,26.659787,0


# post processing

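There will be missing values for every row before position 2159 (30 × 72 − 1). This is because LMA values are calculated over a window of 30 days × 72 rows per day = 2160 rows, so the LMA is NaN (not zero) until a full window of data is available. These warm-up rows are dropped below.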

In [26]:
# drop the first thirty_days rows, i.e. the warm-up period of the LMA window
# NOTE(review): row thirty_days - 1 already has an LMA value (see the cell
# above), so this slice also discards one fully populated row — confirm intended
augmented_gme_data = augmented_gme_data.iloc[thirty_days:]

In [27]:
augmented_gme_data.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,RSI,SMA,LMA,ADX,up
10004,1626912000.0,181.238889,181.938339,173.968329,180.276112,180.276112,1413000.0,43.119461,183.402915,201.013925,16.964182,1
10005,1626912000.0,181.179167,181.853756,173.936246,180.297085,180.297085,1413000.0,43.13101,183.380498,200.993778,16.961743,1
10006,1626912000.0,181.119445,181.769173,173.904163,180.318057,180.318057,1413000.0,43.142566,183.358122,200.97417,16.959332,1
10007,1626912000.0,181.059722,181.684589,173.872079,180.339029,180.339029,1413000.0,43.154129,183.335789,200.955099,16.956948,1
10008,1626998000.0,181.0,181.600006,173.839996,180.360001,180.360001,1312900.0,43.165699,183.313497,200.936567,16.954591,0


Also, we won't know whether the price in the final row goes up or down, since there won't be a subsequent row to compare its closing value with. Because of that, we delete the final row.

Much earlier in the notebook, this row was given a placeholder label of 0 that does not reflect a real price movement. Now, just delete the row.

In [28]:
# Remove the final row: its "up" label is only a placeholder, because there is
# no following close to compare against.
# Take an explicit copy so that the column assignments made in later cells act
# on an independent frame rather than on a slice of the earlier one (which
# pandas may otherwise flag with SettingWithCopyWarning).
augmented_gme_data = augmented_gme_data.iloc[:-1].copy()

In [29]:
augmented_gme_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,RSI,SMA,LMA,ADX,up
2160,1613520000.0,49.77,51.189999,44.560001,45.939999,45.939999,9186800.0,45.96259,110.887913,78.886475,26.642816,0
2161,1613520000.0,49.752222,51.157777,44.505695,45.867082,45.867082,9186800.0,45.953698,110.590784,78.899723,26.625878,0
2162,1613520000.0,49.734445,51.125555,44.45139,45.794166,45.794166,9186800.0,45.9448,110.295705,78.912937,26.608974,0
2163,1613520000.0,49.716667,51.093332,44.397084,45.721249,45.721249,9186800.0,45.935898,110.002673,78.926115,26.592103,0
2164,1613520000.0,49.698889,51.06111,44.342779,45.648332,45.648332,9186800.0,45.92699,109.71169,78.93926,26.575264,0


In [30]:
augmented_gme_data.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,RSI,SMA,LMA,ADX,up
10003,1626912000.0,181.298611,182.022922,174.000413,180.25514,180.25514,1413000.0,43.107918,183.425374,201.034609,16.966648,1
10004,1626912000.0,181.238889,181.938339,173.968329,180.276112,180.276112,1413000.0,43.119461,183.402915,201.013925,16.964182,1
10005,1626912000.0,181.179167,181.853756,173.936246,180.297085,180.297085,1413000.0,43.13101,183.380498,200.993778,16.961743,1
10006,1626912000.0,181.119445,181.769173,173.904163,180.318057,180.318057,1413000.0,43.142566,183.358122,200.97417,16.959332,1
10007,1626912000.0,181.059722,181.684589,173.872079,180.339029,180.339029,1413000.0,43.154129,183.335789,200.955099,16.956948,1


In [31]:
# Date currently holds seconds since the Unix epoch; convert it to pandas
# datetime values (the column keeps its position in the frame).
augmented_gme_data = augmented_gme_data.assign(
    Date=pd.to_datetime(augmented_gme_data['Date'], unit='s')
)
augmented_gme_data.head(73)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,RSI,SMA,LMA,ADX,up
2160,2021-02-17,49.770000,51.189999,44.560001,45.939999,45.939999,9186800.0,45.962590,110.887913,78.886475,26.642816,0
2161,2021-02-17,49.752222,51.157777,44.505695,45.867082,45.867082,9186800.0,45.953698,110.590784,78.899723,26.625878,0
2162,2021-02-17,49.734445,51.125555,44.451390,45.794166,45.794166,9186800.0,45.944800,110.295705,78.912937,26.608974,0
2163,2021-02-17,49.716667,51.093332,44.397084,45.721249,45.721249,9186800.0,45.935898,110.002673,78.926115,26.592103,0
2164,2021-02-17,49.698889,51.061110,44.342779,45.648332,45.648332,9186800.0,45.926990,109.711690,78.939260,26.575264,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2228,2021-02-17,48.561113,48.998888,40.867224,40.981666,40.981666,9186800.0,45.345650,95.349299,79.708673,25.565332,0
2229,2021-02-17,48.543335,48.966666,40.812919,40.908749,40.908749,9186800.0,45.336391,95.191458,79.719573,25.550595,0
2230,2021-02-17,48.525557,48.934443,40.758613,40.835832,40.835832,9186800.0,45.327127,95.035665,79.730438,25.535889,0
2231,2021-02-17,48.507780,48.902221,40.704308,40.762916,40.762916,9186800.0,45.317857,94.881920,79.741269,25.521215,0


# final thoughts before using the data

The data may need to be normalized for use within specific algorithms to improve performance. At the same time, the current plan is to use bullish/bearish sentiment for reddit post data to aid in price prediction. This may be included at a later time

# Sentiment Analysis

Use NLTK's Vader Sentiment Analyzer to classify reddit post data as positive, negative, or neutral.

NLTK library: https://www.nltk.org/api/nltk.sentiment.html?highlight=vader#module-nltk.sentiment.vader

In [32]:
import nltk
import pickle
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/apple/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [33]:
# Import reddit data from the pickle file
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# load this file if it comes from a trusted source.
# NOTE(review): later code indexes reddit_data by the values of the Date column
# and iterates over each entry, so this is presumably a mapping from date to a
# list of post texts — confirm against the code that wrote the pickle.
with open('date_with_posts.pickle', 'rb') as handle:
    reddit_data = pickle.load(handle)

In [34]:
sia = SentimentIntensityAnalyzer()
scores = []

# The same date appears on many consecutive rows of the augmented frame, so the
# average for each distinct date is computed once and reused, instead of
# re-running the analyzer over the same posts for every row.
average_score_by_date = {}

for date in augmented_gme_data['Date'].tolist():
    if date not in average_score_by_date:
        posts = reddit_data[date]
        if len(posts) == 0:
            # Days without data will have score 0
            average_score_by_date[date] = 0
        else:
            # Sum the compound score of every post in order, then average.
            # (A plain running total keeps the result identical to scoring
            # the posts row by row.)
            total = 0
            for post in posts:
                total += sia.polarity_scores(post)["compound"]
            average_score_by_date[date] = total / len(posts)
    # one score per row, in row order
    scores.append(average_score_by_date[date])

In [35]:
# NOTE(review): this displays the entire list (one entry per row of the frame);
# consider previewing a slice such as scores[:5] instead
scores

[-0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.1353066666666667,
 -0.135306

In [36]:
# Attach the per-row average compound score to the dataset
augmented_gme_data['Score'] = scores

# Numerical encoding of the sentiment derived from the score:
#   score >  0.1 ->  1 (positive)
#   score < -0.1 -> -1 (negative)
#   otherwise    ->  0 (neutral)
score_values = augmented_gme_data['Score'].to_numpy()
augmented_gme_data['Sentiment'] = np.select(
    [score_values > 0.1, score_values < -0.1],
    [1, -1],
    default=0,
).astype('int64')

In [37]:
augmented_gme_data.head(73)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,RSI,SMA,LMA,ADX,up,Score,Sentiment
2160,2021-02-17,49.770000,51.189999,44.560001,45.939999,45.939999,9186800.0,45.962590,110.887913,78.886475,26.642816,0,-0.135307,-1
2161,2021-02-17,49.752222,51.157777,44.505695,45.867082,45.867082,9186800.0,45.953698,110.590784,78.899723,26.625878,0,-0.135307,-1
2162,2021-02-17,49.734445,51.125555,44.451390,45.794166,45.794166,9186800.0,45.944800,110.295705,78.912937,26.608974,0,-0.135307,-1
2163,2021-02-17,49.716667,51.093332,44.397084,45.721249,45.721249,9186800.0,45.935898,110.002673,78.926115,26.592103,0,-0.135307,-1
2164,2021-02-17,49.698889,51.061110,44.342779,45.648332,45.648332,9186800.0,45.926990,109.711690,78.939260,26.575264,0,-0.135307,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2228,2021-02-17,48.561113,48.998888,40.867224,40.981666,40.981666,9186800.0,45.345650,95.349299,79.708673,25.565332,0,-0.135307,-1
2229,2021-02-17,48.543335,48.966666,40.812919,40.908749,40.908749,9186800.0,45.336391,95.191458,79.719573,25.550595,0,-0.135307,-1
2230,2021-02-17,48.525557,48.934443,40.758613,40.835832,40.835832,9186800.0,45.327127,95.035665,79.730438,25.535889,0,-0.135307,-1
2231,2021-02-17,48.507780,48.902221,40.704308,40.762916,40.762916,9186800.0,45.317857,94.881920,79.741269,25.521215,0,-0.135307,-1


In [38]:
augmented_gme_data['Sentiment'].value_counts()

 0    4320
 1    3384
-1     144
Name: Sentiment, dtype: int64

As we can see, 4320 rows of data are neutral, 3384 rows are positive, and 144 rows are negative.

# output data to csv

In [32]:
# write the processed frame to GME_augmented_processed.csv, leaving out the row index
augmented_gme_data.to_csv('GME_augmented_processed.csv', index=False)

In [33]:
len(augmented_gme_data)

7848