## Can we actually predict market movements by analysing Reddit's /r/wallstreetbets?
### Introduction:
- In this project, I first analyze a large dataset of over 2.5 million entries from Reddit's /r/wallstreetbets, and then analyze data scraped from Reddit using Pushshift and PRAW (the Python Reddit API Wrapper).
- We will attempt to understand the effects of /r/wallstreetbets comments on the market as a whole.
- We will also scrape comments from the /r/wallstreetbets Daily Discussion Threads, because they contain the most reliable and useful (not just spam) information from users.
- We will overlay the graphs with the price and volume trends of the filtered tickers and market indexes.
- Lastly, we'll shift the close price of the Dow Jones by 1, 2 and 3 days and see if we can predict market movements based on the text we've analyzed.

##### Cleaning and set up

In [None]:
# First thing, let's load in our data: the file holds one JSON document per line

file_path = 'YOUR_FILE_PATH'

empty = []
# The context manager closes the file handle even if a line fails to parse,
# and the explicit encoding avoids depending on the platform default.
with open(file_path, 'r', encoding='utf-8') as json_lines:
    for line in json_lines:
        # Skip blank lines (e.g. a trailing newline) instead of failing on them.
        if line.strip():
            empty.append(json.loads(line))

In [None]:
#Cast it to a dataframe

# Each parsed JSON record becomes one row; its keys become the columns.
df = pd.DataFrame(empty)
df.head()

In [None]:
#Begin cleaning the data
#Drop columns that serve no purpose in our analysis

# created_utc is read as integer epoch seconds (unit='s'); build the timestamp
# column first, because the raw column is dropped on the next line.
df['date_created'] = pd.to_datetime(df['created_utc'].astype(int), unit='s')
df.drop(columns=['created_utc','archived', 'controversiality','retrieved_on','downs','ups','subreddit'], inplace=True)

In [None]:
# Keep only the calendar date of each comment, then drop the full timestamp
# together with the remaining identifier / author / flair columns.
df['date'] = df['date_created'].dt.date
df.drop(columns=['date_created','gilded','link_id','id', 
                 'score_hidden', 'name', 'author', 'subreddit_id', 'parent_id', 
                 'author_flair_text', 'author_flair_css_class','distinguished'], inplace=True)

In [None]:
# Discard removed comments: keep only the rows whose body, read as a string,
# is not the "[deleted]" placeholder.

df = df[df['body'].astype(str) != "[deleted]"]

##### Use VADER on the body text

In [None]:
# Score every comment body with VADER. polarity_scores() is called once per
# comment and all three scores are read from that single result.
analyser = SentimentIntensityAnalyzer()

compound_scores = []
positive_scores = []
negative_scores = []

for item in tqdm(df['body']):
    try:
        polarity = analyser.polarity_scores(item)
    except TypeError:
        # A body that is not text cannot be scored; record it as all zeros.
        polarity = {'pos': 0, 'neg': 0, 'compound': 0}

    positive_scores.append(polarity['pos'])
    negative_scores.append(polarity['neg'])
    compound_scores.append(polarity['compound'])


df['compound_score'] = compound_scores
df['positive_score'] = positive_scores
df['negative_score'] = negative_scores

In [None]:
## Time to standardize the scoring column as a new column. 
## Create score times the sentiment score column as well

In [None]:
scaler = StandardScaler()

# Standardize the raw 'score' column into a new 'standardized_upvotes' column.
# The double brackets pass a one-column frame, the 2-D input the scaler expects.
df['standardized_upvotes'] = scaler.fit_transform(df[['score']])

In [None]:
# Create score multiplied by upvote columns

# Weight each sentiment score by the comment's standardized upvotes.
# NOTE(review): standardized upvotes are negative for below-average comments,
# which flips the sign of their sentiment — confirm this is intended.
df['compound_score_upvotes'] = df.compound_score * df.standardized_upvotes
df['positive_score_upvotes'] = df.positive_score * df.standardized_upvotes
df['negative_score_upvotes'] = df.negative_score * df.standardized_upvotes

In [None]:
# Reset the index and fix up the date

# drop=True discards the old row labels instead of keeping them as an 'index'
# column, which would otherwise be summed together with the real features
# when the frame is aggregated by day.
df.reset_index(drop=True, inplace=True)

In [None]:
# Convert the date column to pandas timestamps and order the rows by date
# (sort_values returns a new frame, hence the reassignment).
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df = df.sort_values('date')

In [None]:
# Sum up all the sentiments along the date

df.set_index('date', inplace=True)
# Aggregate to one row per calendar day ('D').
# NOTE(review): how sum() treats non-numeric columns such as 'body' depends on
# the pandas version — confirm which columns survive this step.
df = df.resample('D').sum()

##### Time to grab market data from Yahoo finance

In [None]:
import yfinance as yf

# Download Dow Jones price history for the analysis window.
# NOTE(review): Yahoo Finance usually lists the Dow Jones Industrial Average
# index under "^DJI" — confirm that "DJI" returns the intended series.
dataDJI = yf.download("DJI", start="2012-04-11", end="2018-10-31")

##### Merge the dataframes

In [None]:
# Join the daily sentiment totals with the downloaded price history on their
# date indexes; how='inner' keeps only the dates present in both frames.
# The price frame is stored under the name dataDJI when it is downloaded.
merged_df = pd.merge(df, dataDJI, how='inner', left_index=True, right_index=True)

In [None]:
##### Plot the dataframe to check the graphs and relationships

merged_df.plot(secondary_y='Close', figsize=(20,12))

In [None]:
# Standardise the Close column

merged_df['standardized_close'] = scaler.fit_transform(merged_df[['Close']])

##### Set up our target variable 'up'. 1 for positive change, 0 for negative.  

In [None]:
# 'up' is 1 when a row's Close is higher than the previous row's Close, else 0.
# The first row has no predecessor: its diff is NaN, so it is labelled 0.
merged_df['up'] = (merged_df.Close.diff() > 0)*1

In [None]:
# Add 3 new features, the Close column shifted 1, 2 and 3 times

merged_df['Close_shift_1'] = merged_df.Close.shift(1)
merged_df['Close_shift_2'] = merged_df.Close.shift(2)
merged_df['Close_shift_3'] = merged_df.Close.shift(3)

# The raw daily upvote total lives in 'score' (the column that
# 'standardized_upvotes' is derived from); no 'upvotes' column is ever
# created, so selecting one raised KeyError.
# NOTE(review): 'up' is defined as Close > previous Close, so keeping both
# 'Close' and 'Close_shift_1' as features lets a model read the label
# directly — confirm whether same-day Close should be a feature.
X = merged_df[['up','Close', 'Close_shift_1', 'Close_shift_2', 'Close_shift_3',
                   'standardized_close','negative_score_upvotes','positive_score_upvotes',
                   'compound_score_upvotes', 'standardized_upvotes', 'negative_score', 'positive_score', 'compound_score', 'score']].copy()


# The shifted columns are NaN on the first three rows; drop those rows.
X.dropna(inplace=True)
X.head()

##### Set up the target variable

In [None]:
y = X.pop('up')

In [None]:
# Check the shape

X.shape

##### Check the baseline

In [None]:
y.value_counts(normalize=True)

##### Train-test split the data by index 

In [None]:
# Chronological split by position: the first n rows train, the rest test.
n = 1405
X_train, X_test = X.iloc[:n], X.iloc[n:]
y_train, y_test = y.iloc[:n], y.iloc[n:]

##### TimeSeriesSplit

In [None]:
# Time-ordered cross-validation splitter with 7 folds; the (train, test)
# index pairs are also materialised into a list for inspection.
ts = TimeSeriesSplit(n_splits=7)

splits = list(ts.split(X_train))

##### Time to begin modelling

In [None]:
#RandomForest

# Fit a 10,000-tree forest, then print its score on the training rows and on
# the held-out rows.
model = RandomForestClassifier(n_estimators=10000)
model.fit(X_train, y_train)
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(model.score(features, labels))

# Cross-validate the same estimator on the training rows with the
# time-series splitter, printing the per-fold scores and their mean.
scores = cross_val_score(model, X_train, y_train, cv=ts)
print(scores, scores.mean(), sep='\n')

#### Optional step to clean up the body text, run Tf-IDF on the cleaned text and add them as features

##### Use spaCy to clean up the text

In [None]:
# Lookup table from an English contraction to its expanded form.
# spacy_cleaner looks up each space-separated token verbatim, so the keys are
# case-sensitive (hence both "I'm" and "i'm").
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", 
                   "can't've": "cannot have", "'cause": "because", "could've": "could have", 
                   "couldn't": "could not", "couldn't've": "could not have","didn't": "did not", 
                   "doesn't": "does not", "don't": "do not", "hadn't": "had not", 
                   "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not", 
                   "he'd": "he would", "he'd've": "he would have", "he'll": "he will", 
                   "he'll've": "he will have", "he's": "he is", "how'd": "how did", 
                   "how'd'y": "how do you", "how'll": "how will", "how's": "how is", 
                   "I'd": "I would", "I'd've": "I would have", "I'll": "I will", 
                   "I'll've": "I will have","I'm": "I am", "I've": "I have", 
                   "i'd": "i would", "i'd've": "i would have", "i'll": "i will", 
                   "i'll've": "i will have","i'm": "i am", "i've": "i have", 
                   "isn't": "is not", "it'd": "it would", "it'd've": "it would have", 
                   "it'll": "it will", "it'll've": "it will have","it's": "it is", 
                   "let's": "let us", "ma'am": "madam", "mayn't": "may not", 
                   "might've": "might have","mightn't": "might not","mightn't've": "might not have", 
                   "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", 
                   "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", 
                   "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
                   "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", 
                   "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", 
                   "she's": "she is", "should've": "should have", "shouldn't": "should not", 
                   "shouldn't've": "should not have", "so've": "so have","so's": "so as", 
                   "this's": "this is",
                   "that'd": "that would", "that'd've": "that would have","that's": "that is", 
                   "there'd": "there would", "there'd've": "there would have","there's": "there is", 
                       "here's": "here is",
                   "they'd": "they would", "they'd've": "they would have", "they'll": "they will", 
                   "they'll've": "they will have", "they're": "they are", "they've": "they have", 
                   "to've": "to have", "wasn't": "was not", "we'd": "we would", 
                   "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", 
                   "we're": "we are", "we've": "we have", "weren't": "were not", 
                   "what'll": "what will", "what'll've": "what will have", "what're": "what are", 
                   "what's": "what is", "what've": "what have", "when's": "when is", 
                   "when've": "when have", "where'd": "where did", "where's": "where is", 
                   "where've": "where have", "who'll": "who will", "who'll've": "who will have", 
                   "who's": "who is", "who've": "who have", "why's": "why is", 
                   "why've": "why have", "will've": "will have", "won't": "will not", 
                   "won't've": "will not have", "would've": "would have", "wouldn't": "would not", 
                   "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
                   "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                   "you'd": "you would", "you'd've": "you would have", "you'll": "you will", 
                   "you'll've": "you will have", "you're": "you are", "you've": "you have" } 

In [None]:
# Function to clean up the text

def spacy_cleaner(text):
    """Normalise one comment into a space-separated string of cleaned tokens.

    Steps: transliterate to ASCII, expand contractions found in
    ``contraction_mapping``, parse with the spaCy pipeline ``nlp``, drop
    punctuation, whitespace, number-like, URL-like and ``@``-prefixed tokens,
    keep pronouns as written, strip non-letters from every other lemma
    (discarding results shorter than two characters), and finally squeeze
    any run of a repeated character down to two.

    Args:
        text: Raw comment text.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    try:
        decoded = unidecode.unidecode(codecs.decode(text, 'unicode_escape'))
    except Exception:
        # The escape-decoding pass can fail on the raw text; fall back to
        # transliterating it as-is. Catching Exception instead of using a
        # bare except lets Ctrl-C still interrupt a long cleaning run.
        decoded = unidecode.unidecode(text)
    apostrophe_handled = re.sub("’", "'", decoded)
    expanded = ' '.join(contraction_mapping.get(t, t) for t in apostrophe_handled.split(" "))
    parsed = nlp(expanded)
    final_tokens = []
    for t in parsed:
        if t.is_punct or t.is_space or t.like_num or t.like_url or str(t).startswith('@'):
            continue
        # NOTE(review): '-PRON-' is the pronoun placeholder lemma this code
        # expects; confirm the installed spaCy version still emits it.
        if t.lemma_ == '-PRON-':
            final_tokens.append(str(t))
            continue
        sc_removed = re.sub("[^a-zA-Z]", '', str(t.lemma_))
        if len(sc_removed) > 1:
            final_tokens.append(sc_removed)
    joined = ' '.join(final_tokens)
    # Collapse elongated spellings: any character repeated 2+ times keeps two.
    return re.sub(r'(.)\1+', r'\1\1', joined)

In [None]:
# Wrap the input in tqdm so the bar advances as each comment is cleaned, and
# so cleaned_text is a plain list rather than a tqdm wrapper object.
cleaned_text = [spacy_cleaner(t) for t in tqdm(df.body)]

In [None]:
# Add it back to the dataframe 

df['cleaned_text'] = cleaned_text

In [None]:
# Change the dtype to string

df.cleaned_text = df.cleaned_text.astype('str')

In [None]:
# TF-IDF on the cleaned text

tvec = TfidfVectorizer()
# Fit the vocabulary on the first 1405 rows only and reuse it for the rest.
# NOTE(review): 1405 matches the train/test split position `n` used for X,
# but X is built from merged_df while these rows come from df — confirm the
# two line up row for row before stacking them together.
tfed_train = tvec.fit_transform(df.cleaned_text[:1405])
tfed_test = tvec.transform(df.cleaned_text[1405:])

In [None]:
#Scale my X_train and X_test

sc = StandardScaler(with_mean=False)
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
#Create a sparse matrix for my X_train and X_test so I can combine it with the Tf-idf'd features

# np.asarray accepts both a DataFrame and the plain array that the scaler's
# fit_transform/transform return, whereas `.values` exists only on the former.
X_sparse = scipy.sparse.csr_matrix(np.asarray(X_train))
X_sparse_test = scipy.sparse.csr_matrix(np.asarray(X_test))

In [None]:
#New X_train and X_test 

X_train = scipy.sparse.hstack((X_sparse, tfed_train))
X_test = scipy.sparse.hstack((X_sparse_test, tfed_test))

In [None]:
#Split the features again

# Rebuild the 7-fold time-ordered splitter for the stacked feature matrix and
# materialise its (train, test) index pairs for inspection.
ts = TimeSeriesSplit(n_splits=7)

splits = list(ts.split(X_train))

In [None]:
#Get the baseline

y.value_counts(normalize=True)

In [None]:
# Test a RandomForest again 

# Same 10,000-tree forest as before, now on the features stacked with TF-IDF:
# print the score on the training rows and on the held-out rows.
model = RandomForestClassifier(n_estimators=10000)
model.fit(X_train, y_train)
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(model.score(features, labels))

# Per-fold cross-validation scores and their mean.
scores = cross_val_score(model, X_train, y_train, cv=ts)
print(scores, scores.mean(), sep='\n')

--------------------------------

### Part 2: Scraping Reddit Using PushShift and PRAW to analyse market returns

This time we are going to scrape data from Reddit using Pushshift and PRAW. Then we'll run similar analysis as above with some additional features catered to our new data. 

In [None]:
#function to get data from pushshift api

def getPushshiftData(query, after, before, sub):
    """Fetch one page (up to 1000 entries) of submissions from Pushshift.

    Args:
        query: Text to match against submission titles.
        after: Lower bound of the creation-time window, in epoch seconds.
        before: Upper bound of the creation-time window, in epoch seconds.
        sub: Name of the subreddit to search.

    Returns:
        The list stored under the ``'data'`` key of the JSON response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    from urllib.parse import urlencode

    # urlencode escapes spaces and reserved characters in the query text.
    params = urlencode({'title': query, 'size': 1000, 'after': after,
                        'before': before, 'subreddit': sub})
    url = 'https://api.pushshift.io/reddit/search/submission/?' + params
    print(url)
    # A timeout keeps a stalled connection from hanging the scrape forever.
    r = requests.get(url, timeout=60)
    # Fail with a clear HTTP error rather than a JSON parse error on an
    # error page.
    r.raise_for_status()
    return r.json()['data']

#get relevant data from data extracted using previous function

def collectSubData(subm):
    """Append one submission's summary row to the module-level ``subStats``.

    The row is ``[id, title, url, creation date, flair]``; the flair is the
    string ``"NaN"`` when the submission has no ``link_flair_text`` key.

    Args:
        subm: Submission dict as returned by ``getPushshiftData``.
    """
    # created_utc is a UTC epoch, so convert it in UTC; a local-time
    # conversion would move posts near midnight onto a neighbouring date
    # depending on the machine's timezone.
    created = datetime.datetime.fromtimestamp(subm['created_utc'], tz=datetime.timezone.utc)
    flair = subm.get('link_flair_text', "NaN")
    subStats.append([subm['id'], subm['title'], subm['url'], created.date(), flair])
    
# Scrape configuration: subreddit, title query and the epoch-second window
sub = 'wallstreetbets'
query = "Daily Discussion Thread"
after = "1541030400"   # Nov 1 2018
before = "1596240000"  # August 1 2020

# Accumulators filled while paging: running submission count and the rows
# appended by collectSubData
subCount = 0
subStats = []

# Page through the results: every request resumes from the creation time of
# the last submission on the previous page, until a page comes back empty.
data = getPushshiftData(query, after, before, sub)
while data:
    for submission in data:
        collectSubData(submission)
    subCount += len(data)

    last_created = data[-1]['created_utc']
    print(len(data))
    print(str(datetime.datetime.fromtimestamp(last_created)))
    after = last_created
    data = getPushshiftData(query, after, before, sub)
    

# Organize the collected rows into a dataframe, one column per field.
# Each row of subStats is [id, title, url, date, flair].
ids = [stat[0] for stat in subStats]
titles = [stat[1] for stat in subStats]
urls = [stat[2] for stat in subStats]
dates = [stat[3] for stat in subStats]
flairs = [stat[4] for stat in subStats]
data = {'id': ids, 'title': titles, 'url': urls, 'date': dates, 'flair': flairs}
df_1 = pd.DataFrame(data)
# Keep only the threads flaired 'Daily Discussion'.
df_1 = df_1.loc[df_1['flair'] == 'Daily Discussion']

In [None]:
#connect to reddit api
reddit = praw.Reddit(client_id="YOUR_ID", client_secret="YOUR_SECRET", user_agent="USER_AGENT")

#collect comments using praw
comments_by_day = []
for url in df_1['url'].tolist():
    try:
        submission = reddit.submission(url=url)
        # limit=0 strips the "load more comments" placeholders without
        # fetching them, so only real comment objects remain.
        submission.comments.replace_more(limit=0)
        comments = [comment.body for comment in submission.comments]
    except Exception as err:
        # Best effort: a thread that cannot be fetched is recorded as None so
        # the list stays aligned with the rows of df_1. Report the failure
        # instead of hiding it, and let Ctrl-C still stop the loop.
        print(f"Could not fetch comments for {url}: {err}")
        comments = None
    comments_by_day.append(comments)

In [None]:
# Add the comments as a column

df_1['comments'] = comments_by_day

In [None]:
## This gives me the overall vibe of the day's comments. This can be used to analyze the full market (index) rather than individual tickers. 

analyser = SentimentIntensityAnalyzer()

compound_scores = []
positive_scores = []
negative_scores = []

for comments in tqdm(comments_by_day):
    positive_score = 0
    negative_score = 0
    compound_score = 0
    try:
        for comment in comments:
            # Score each comment once and read all three values from it.
            polarity = analyser.polarity_scores(comment)
            positive_score += polarity['pos']
            negative_score += polarity['neg']
            compound_score += polarity['compound']
    except TypeError:
        # comments is None when the thread could not be fetched (iterating
        # None raises TypeError); score the whole day as zero.
        positive_score = 0
        negative_score = 0
        compound_score = 0

    positive_scores.append(positive_score)
    negative_scores.append(negative_score)
    compound_scores.append(compound_score)

# Each value is the sum over the day's comments, not an average.
df_1['compound_score'] = compound_scores
df_1['positive_score'] = positive_scores
df_1['negative_score'] = negative_scores

###### Now we need to get the titles of posts from reddit and just run a bull/bear analysis on it. 
###### No need for praw here again, we'll just use pushshift

In [None]:
## Use pushshift again to get the titles

def getPushshiftData(query, after, before, sub):
    """Fetch one page (up to 1000 entries) of submissions from Pushshift.

    Args:
        query: Text to match against submission titles; an empty string
            places no restriction on the title.
        after: Lower bound of the creation-time window, in epoch seconds.
        before: Upper bound of the creation-time window, in epoch seconds.
        sub: Name of the subreddit to search.

    Returns:
        The list stored under the ``'data'`` key of the JSON response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    from urllib.parse import urlencode

    # urlencode escapes spaces and reserved characters in the query text.
    params = urlencode({'title': query, 'size': 1000, 'after': after,
                        'before': before, 'subreddit': sub})
    url = 'https://api.pushshift.io/reddit/search/submission/?' + params
    print(url)
    # A timeout keeps a stalled connection from hanging the scrape forever.
    r = requests.get(url, timeout=60)
    # Fail with a clear HTTP error rather than a JSON parse error on an
    # error page.
    r.raise_for_status()
    return r.json()['data']

def collectSubData(subm):
    """Append one submission's summary row to the module-level ``subStats``.

    The row is ``[id, title, url, creation date, flair]``; the flair is the
    string ``"NaN"`` when the submission has no ``link_flair_text`` key.

    Args:
        subm: Submission dict as returned by ``getPushshiftData``.
    """
    # created_utc is a UTC epoch, so convert it in UTC; a local-time
    # conversion would move posts near midnight onto a neighbouring date
    # depending on the machine's timezone.
    created = datetime.datetime.fromtimestamp(subm['created_utc'], tz=datetime.timezone.utc)
    flair = subm.get('link_flair_text', "NaN")
    subStats.append([subm['id'], subm['title'], subm['url'], created.date(), flair])
    
import time

#Subreddit to query
sub='wallstreetbets'
#before and after dates
before = "1596240000" #August 1 2020
after = "1541030400" #Nov 1 2018
# An empty title query matches every submission in the window
query = ''
subCount = 0
subStats = []

# Give up on a page after this many consecutive failed requests
MAX_RETRIES = 5

data = getPushshiftData(query, after, before, sub)
# Will run until all posts have been gathered 
# from the 'after' date up until before date
while data:
    for submission in data:
        collectSubData(submission)
        subCount+=1
    # Calls getPushshiftData() with the created date of the last submission
    print(len(data))
    print(str(datetime.datetime.fromtimestamp(data[-1]['created_utc'])))
    after = data[-1]['created_utc']

    # Retry only the request. Swallowing the error and looping again would
    # leave `data` unchanged, so the same page would be collected a second
    # time and the loop could never end while the API keeps failing.
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            data = getPushshiftData(query, after, before, sub)
            break
        except Exception as err:
            print(f"Pushshift request failed (attempt {attempt}/{MAX_RETRIES}): {err}")
            time.sleep(2 * attempt)
    else:
        # Every attempt failed: stop paging and keep what was gathered.
        print("Giving up; the scrape is incomplete after", after)
        data = []
      
# Organize the collected rows into a dataframe, one column per field.
# Each row of subStats is [id, title, url, date, flair].
ids = [stat[0] for stat in subStats]
titles = [stat[1] for stat in subStats]
urls = [stat[2] for stat in subStats]
dates = [stat[3] for stat in subStats]
flairs = [stat[4] for stat in subStats]
data = {'id': ids, 'title': titles, 'url': urls, 'date': dates, 'flair': flairs}
df_2 = pd.DataFrame(data)

In [None]:
##### Score each post title as bullish or bearish from keyword matches

titles = [title.lower() for title in df_2['title'].tolist()]

bull_words=['call', 'long', 'all in', 'moon', 'going up', 'rocket', 'buy', 'long term', 'green']
bear_words=['put', 'short', 'going down', 'drop', 'bear', 'sell', 'red', 'leave']


def _keyword_pattern(words):
    """Compile a regex matching any of *words* as a whole word or phrase.

    A plain substring test counts 'red' inside 'reddit' or 'put' inside
    'computer'. Word boundaries prevent that, while the optional suffix still
    accepts simple inflections such as 'calls', 'shorted', 'selling' or
    'bearish'.
    """
    alternatives = '|'.join(re.escape(word) for word in words)
    return re.compile(r'\b(?:' + alternatives + r')(?:s|ed|ing|ish)?\b')


bull_pattern = _keyword_pattern(bull_words)
bear_pattern = _keyword_pattern(bear_words)
# 1-4 digits followed by a standalone 'c' / 'p', with or without a space
# (presumably option shorthand such as "300c" for calls and "300p" for puts).
call_strike = re.compile(r'\b\d{1,4} ?c\b')
put_strike = re.compile(r'\b\d{1,4} ?p\b')

bull_scores = []
bear_scores = []
for title in titles:
    bull = bool(bull_pattern.search(title) or call_strike.search(title))
    bear = bool(bear_pattern.search(title) or put_strike.search(title))
    # A title only scores when it is unambiguous: exactly one side matched.
    bull_scores.append(int(bull and not bear))
    bear_scores.append(int(bear and not bull))

df_2['bull_score'] = bull_scores
df_2['bear_score'] = bear_scores

In [None]:
# Keep only the posts whose flair is one of the listed values.
# A boolean mask replaces the index loop, whose counter `n` overwrote the
# global `n` that holds the train/test split position.
kept_flairs = ['DD', 'Discussion', 'YOLO', 'Fundamentals', 'Stocks']
df_2 = df_2[df_2['flair'].isin(kept_flairs)]

In [None]:
## Standardize scores using total scores for day

# The daily mean of the 0/1 flags is the share of that day's posts scored
# bullish / bearish — the same value as sum divided by count, computed in one
# groupby over just the two score columns (the text columns are left out).
scores_df = df_2.groupby('date')[['bull_score', 'bear_score']].mean()

##### This time, instead of the Dow Jones price, gather the SPY price (tracks the S&P 500)

In [None]:
import yfinance as yf

spy_price = yf.download("SPY", start="2018-11-01", end="2020-08-01")
spy_price.head()

In [None]:
df_2 = yf.download("SPY", start='2018-11-01')
# .copy() makes the slice an independent frame so columns can be added to it.
df_2 = df_2.loc[:'2020-08-01'].copy()

# Align the daily title scores to the trading days. scores_df is keyed by
# plain date objects, so each trading day is looked up by its date; a trading
# day with no scored post gets 0.0 instead of raising KeyError.
trading_dates = [stamp.date() for stamp in df_2.index]
daily_scores = scores_df[['bull_score', 'bear_score']].reindex(trading_dates).fillna(0.0)

bull_vals = daily_scores['bull_score'].astype(float).tolist()
bear_vals = daily_scores['bear_score'].astype(float).tolist()

df_2['bull_score'] = bull_vals
df_2['bear_score'] = bear_vals

##### Graph the bull score with the SPY price

In [None]:
df_2[['Close', 'bull_score']].plot(secondary_y='bull_score', color=['b','c'], figsize=(16, 10));

##### Graph the bear score with the SPY price

In [None]:
df_2[['Close', 'bear_score']].plot(secondary_y='bear_score', color=['b','y'], figsize=(16, 10));

##### Graph the VADER compound score and the SPY price

In [None]:
# Plot Close against the compound sentiment score on a secondary axis.
# NOTE(review): merged_df is only rebuilt from the SPY prices in a later
# cell; if this runs before that merge, 'Close' here is still the Dow Jones
# series from the first part — confirm the intended run order.
merged_df[['Close', 'compound_score']].plot(secondary_y='compound_score', color=['b','r'], figsize=(16, 10));

##### At this point, we can see a trend of some sort but we need to transform the graphs using Fourier transformation to create new features and visualize trends 

In [None]:
## fourier transform

def _low_pass(values, keep):
    """Rebuild *values* from the first and last `keep` terms of its FFT.

    Every other term of the spectrum is zeroed before inverting, which
    leaves a smoothed version of the input series.

    Args:
        values: Sequence of real numbers.
        keep: Number of FFT terms kept at each end of the spectrum.

    Returns:
        A float array with the same length as *values*.
    """
    spectrum = np.fft.fft(np.asarray(values, dtype=float))
    spectrum[keep:-keep] = 0
    # The input is real, so any imaginary part of the inverse transform is
    # only residue from the truncation; keep the real part so the column is
    # float rather than complex.
    return np.fft.ifft(spectrum).real


for side in ('bull', 'bear'):
    for num_ in [10, 30]:
        df_2['fourier ' + side + ' ' + str(num_)] = _low_pass(df_2[side + '_score'].tolist(), num_)

##### Plot the Fourier bull scores

In [None]:
df_2[['bull_score', 'fourier bull 10', 'fourier bull 30']].plot(color=['y','b','r'], figsize=(16, 10));

##### Plot the bear scores

In [None]:
df_2[['bear_score', 'fourier bear 10', 'fourier bear 30']].plot(color=['y','k','m'],figsize=(16, 10));

##### Normalize 

In [None]:
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler(feature_range=(0, 1))

# Each column is rescaled to [0, 1] on its own: the scaler is refit per column.
df_2['norm_price'] = sc.fit_transform(df_2['Close'].to_numpy().reshape(-1, 1))
# Daily log return ln(Close_t / Close_{t-1}); NaN on the first row.
df_2['Close_log'] = np.log(df_2['Close'] / df_2['Close'].shift(1))
df_2['norm_bull'] = sc.fit_transform(df_2['bull_score'].to_numpy().reshape(-1, 1))
df_2['norm_bear'] = sc.fit_transform(df_2['bear_score'].to_numpy().reshape(-1, 1))

for num_ in (10, 30):
    for side in ('bull', 'bear'):
        # np.real takes the real part of the whole column at once, whether it
        # is stored as complex (raw inverse FFT) or already as float.
        smoothed = np.real(df_2['fourier ' + side + ' ' + str(num_)].to_numpy())
        df_2['norm_fourier_' + side + '_' + str(num_)] = sc.fit_transform(smoothed.reshape(-1, 1))

##### Plot the bull score with the new normalized features

In [None]:
df_2[['norm_price', 'norm_fourier_bull_10', 'norm_fourier_bull_30']].plot(color=['k','c','m'], figsize=(16, 10));

##### Create another merged dataframe

In [None]:
# df_1 carries its thread date in the 'date' column and is otherwise labelled
# by row number, while spy_price is indexed by trading day. Index df_1 by
# that date (as timestamps) first; joining row numbers against dates would
# match nothing.
df_1_by_day = df_1.set_index(pd.to_datetime(df_1['date'])).rename_axis(None)
merged_df = pd.merge(df_1_by_day, spy_price, how='inner', left_index=True, right_index=True)
merged_df.head()

##### Transform the sentiment scores as well

In [None]:
#Fourier transform the sentiment score

spectrum = np.fft.fft(np.asarray(merged_df['compound_score'].tolist(), dtype=float))

for num_ in [5, 10, 15, 20]:
    # Keep only the first and last num_ terms of the spectrum, then invert.
    truncated = spectrum.copy()
    truncated[num_:-num_] = 0
    # The input is real, so any imaginary part of the inverse transform is
    # only residue from the truncation; store the real part so the column is
    # float and can be plotted directly.
    merged_df['fourier '+str(num_)] = np.fft.ifft(truncated).real

merged_df[['compound_score', 'fourier 5', 'fourier 10', 'fourier 15', 'fourier 20']].plot(color=['y','c','m','g','k'],figsize=(16, 10));

##### Pre-processing for modelling

In [None]:
df_2.head()
# df_2 keeps its dates in the index (earlier cells iterate df_2.index as
# timestamps), so there may be no 'Date' column to read. Handle both layouts
# so the frame always ends up with a datetime index named 'date'.
if 'Date' in df_2.columns:
    df_2['date'] = pd.to_datetime(df_2['Date'], format='%Y-%m-%d')
    df_2.set_index('date', inplace=True)
    df_2.drop(columns=['Date'], inplace=True)
else:
    df_2.index = pd.to_datetime(df_2.index)
    df_2.index.name = 'date'

In [None]:
##### Create the Robinhood dataframe

robinhood = pd.read_csv('FILEPATH_TO_ROBINTRACK_DATA')

In [None]:
robinhood['clean_date'] = pd.to_datetime(robinhood['date'], format='%Y-%m-%d')

In [None]:
robinhood.drop(columns=['date'], inplace=True)

In [None]:
robinhood.set_index('clean_date', inplace=True)

##### Merge the dataframes

In [None]:
merged_final = pd.merge(df_2, robinhood, how='inner', left_index=True, right_index=True)

In [None]:
merged_final.head()

In [None]:
# New temporary dataframe 

# Drop the thread metadata and the smoothed sentiment columns before joining.
new_df = merged_df.drop(columns=['id','title','url','flair','fourier 5','fourier 10', 'fourier 15', 'fourier 20'])

# NOTE(review): both inputs appear to carry the SPY price columns
# (Open/High/Low/Close/...); pandas suffixes clashing names with _x/_y in
# that case — confirm the column names expected downstream.
super_final = pd.merge(merged_final, new_df, how='inner', left_index=True, right_index=True)

##### Begin setting up our modelling 

In [None]:
# Target variable 'up' as defined before

super_final['up'] = ((super_final).Close.diff() > 0)*1

In [None]:
super_final['Close_shift_1'] = super_final.Close.shift(1)
super_final['Close_shift_2'] = super_final.Close.shift(2)
super_final['Close_shift_3'] = super_final.Close.shift(3)

In [None]:
super_final.dropna(inplace=True)

In [None]:
#Target variable

y = super_final.up

In [None]:
super_final.rename(columns={'norm_price_y':'norm_price'}, inplace=True)

In [None]:
# Select the model features.
# NOTE(review): the target 'up' is defined as Close.diff() > 0, and this set
# includes 'Close', 'Close_shift_1' and the log return 'Close_log', whose
# sign appears to encode the same information — confirm that these same-day
# price columns are meant to be features.
X = super_final[['bull_score', 'bear_score', 'Close_log', 'norm_bull',
       'norm_bear', 'norm_fourier_bull_10', 'norm_fourier_bear_10',
       'norm_fourier_bull_30', 'norm_fourier_bear_30', 'users_holding',
       'compound_score', 'positive_score', 'negative_score', 'Open', 'High',
       'Low', 'Close', 'Adj Close', 'Volume', 'norm_price', 'Close log',
       'norm_sentiment', 'norm_fourier5', 'norm_fourier10', 'norm_fourier15',
       'norm_fourier20', 'Close_shift_1', 'Close_shift_2',
       'Close_shift_3']]

##### Train-test split at our index again

In [None]:
# Chronological split by position: the first n rows train, the rest test.
n = 180
X_train, X_test = X.iloc[:n], X.iloc[n:]
y_train, y_test = y.iloc[:n], y.iloc[n:]

##### Standardized

In [None]:
sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# TimeSeriesSplit

# 7-fold time-ordered splitter; the (train, test) index pairs are also
# materialised into a list for inspection.
ts = TimeSeriesSplit(n_splits=7)

splits = list(ts.split(X_train))

In [None]:
#Baseline

y.value_counts(normalize=True)

##### Let's start modelling by using this fantastic function
This essentially runs a grid search across LogisticRegression, DecisionTreeClassifier, RandomForestClassifier and KNeighborsClassifier.

In [None]:
#Big Boy Testing

def test_models(models, X=None, y=None, split_data=True, scaler_type=StandardScaler()):
    """Fit every search object in *models* on one train/test split.

    Args:
        models: Mapping of name -> unfitted search estimator that exposes
            ``fit`` and, once fitted, ``cv_results_`` (e.g. ``GridSearchCV``).
        X: Feature matrix when ``split_data`` is true; otherwise the pair
            ``(X_train, X_test)``.
        y: Target when ``split_data`` is true; otherwise the pair
            ``(y_train, y_test)``.
        split_data: Whether to make a random 80/20 split of ``X`` and ``y``
            (fixed ``random_state=1``) instead of using the given pairs.
        scaler_type: Scaler fitted on the training features before each
            model is fitted, or ``None`` to fit on the features as given.

    Returns:
        A tuple ``(results, fitted_models, train_test_sets)`` of dicts keyed
        by model name: the ``cv_results_`` table as a DataFrame, the fitted
        search object, and the train/test sets. The stored sets are the
        unscaled ones, so scale them the same way before scoring a model
        that was fitted with a scaler.
    """
    import copy

    results = {}
    fitted_models = {}
    train_test_sets = {}
    for i in models:
        print(f"{i} model is currently running...")
        # split data
        if split_data:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
        else:
            X_train, X_test, y_train, y_test = X[0], X[1], y[0], y[1]
        train_test_sets[i] = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test}
        # scale X if relevant
        X_fit = X_train
        if scaler_type is not None:
            # Fit a private copy: the default scaler instance is created once
            # when the function is defined and would otherwise be refit in
            # place, as would a scaler object passed in by the caller.
            X_fit = copy.deepcopy(scaler_type).fit_transform(X_train)
        # fit models
        train_model = models[i].fit(X_fit, y_train)
        print(f'{i} model fitted successfully.')
        results[i] = pd.DataFrame(train_model.cv_results_)
        fitted_models[i] = train_model

    return results, fitted_models, train_test_sets

In [None]:
# Hyper-parameter grids, one per estimator
logit_params = {'C': np.logspace(-5, 5, 15),
          'penalty': ['l1', 'l2'],
          # liblinear handles both penalties in the grid; the default solver
          # does not support 'l1', so those candidates would fail to fit.
          'solver': ['liblinear'],
          'fit_intercept': [True, False],
          'max_iter': [100000],
          'verbose': [1],
          'random_state': [7]}

knn_params = {'n_neighbors': [1, 3, 5, 10, 15, 20, 25],
        }

cart_params = {
    'max_depth': list(range(1, 21))+[None],
    'max_features': [None, 1, 2, 3],
    'min_samples_split': [2, 3, 4, 5, 10, 15, 20, 25, 30, 40, 50],
    'ccp_alpha': [0, 0.001, 0.005, 0.01],
            }

random_params = {
    'n_estimators': [5, 10, 25, 40],
    'max_depth': [3, 5, 9],
}

# Settings shared by every grid search; cv=ts applies the time-series splitter
kwargs = {
    'cv': ts,
    'n_jobs':2,
    'return_train_score':True,
    'verbose': True
}

models = {
    'logit': GridSearchCV(LogisticRegression(), param_grid=logit_params, **kwargs),
    'cart': GridSearchCV(DecisionTreeClassifier(), param_grid=cart_params, **kwargs),
    'knn' : GridSearchCV(KNeighborsClassifier(), param_grid=knn_params, **kwargs),
    'random_forest': GridSearchCV(RandomForestClassifier(), param_grid=random_params, **kwargs)
}

# The data is already split and scaled, so hand test_models the
# (train, test) pairs and switch off its own splitting and scaling.
# Note that X and y are rebound to those pairs from here on.
X, y = (X_train, X_test), (y_train, y_test)
results, gs, train_test_sets = test_models(models, X=X, y=y, split_data=False, scaler_type=None)

In [None]:
# Print the same summary for every fitted grid search.
for k, v in gs.items():
    print(k)
    print('Best Parameters:')
    print(v.best_params_)
    print('Best estimator mean cross validated training score:')
    print(v.best_score_)
    print('Best estimator score on the full training set:')
    print(v.score(train_test_sets[k]['X_train'], train_test_sets[k]['y_train']))
    print('Best estimator score on the test set:')
    print(v.score(train_test_sets[k]['X_test'], train_test_sets[k]['y_test']))
    # The models dict uses the lower-case key 'logit'; only the logistic
    # model has coefficients to report.
    if k == 'logit':
        print('Best estimator coefficients:')
        coefs = v.best_estimator_.coef_[0]
        train_features = train_test_sets[k]['X_train']
        # Use the column names when the training set still has them;
        # a plain array (e.g. after scaling) falls back to positions.
        feature_names = list(getattr(train_features, 'columns', range(len(coefs))))
        logr_model1_coefs = pd.DataFrame(list(zip(feature_names, coefs)), columns=['feature', 'coef']).sort_values(by='coef')
        # A bare expression shows nothing inside a loop, so print the table.
        print(logr_model1_coefs)
    print('\n')

##### LogisticRegression was our best model. So now let's extract the confusion matrix, ROC/AUC curves and classification reports for that model. 

In [None]:
model = LogisticRegression()

In [None]:
# Rebuild the feature matrix for the logistic model, this time without the
# same-day price columns of the earlier feature set (Open, High, Low, Close,
# Adj Close, Volume, Close_log, 'Close log', norm_price).
X = super_final[['bull_score', 'bear_score', 'norm_bull',
       'norm_bear', 'norm_fourier_bull_10', 'norm_fourier_bear_10',
       'norm_fourier_bull_30', 'norm_fourier_bear_30', 'users_holding',
       'compound_score', 'positive_score', 'negative_score', 
       'norm_sentiment', 'norm_fourier5', 'norm_fourier10', 'norm_fourier15',
       'norm_fourier20', 'Close_shift_1', 'Close_shift_2',
       'Close_shift_3']]

In [None]:
y = super_final.up

In [None]:
y.value_counts(normalize=True)

In [None]:
# Chronological split by position: the first n rows train, the rest test.
n = 180
X_train, X_test = X.iloc[:n], X.iloc[n:]
y_train, y_test = y.iloc[:n], y.iloc[n:]

In [None]:
sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# 3-fold time-ordered splitter; the (train, test) index pairs are also
# materialised into a list for inspection.
ts = TimeSeriesSplit(n_splits=3)

splits = list(ts.split(X_train))

In [None]:
# Grid for the logistic model: regularisation strength and intercept on/off
params = {
          'C': np.logspace(-4, 4, 10),
          'penalty': ['l2'],
          'fit_intercept': [True, False],
          'max_iter': [100000],
          'verbose': [1]}

gs = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=ts,
    scoring='accuracy',
    return_train_score=True,
)

gs.fit(X_train, y_train)

# extract the grid search results: each heading is printed above its value

summary = [
    ('Best Parameters:', gs.best_params_),
    ('Best estimator C:', gs.best_estimator_.C),
    ('Best estimator mean cross validated training score:', gs.best_score_),
    ('Best estimator score on the full training set:', gs.score(X_train, y_train)),
    ('Best estimator score on the test set:', gs.score(X_test, y_test)),
    ('Best estimator coefficients:', gs.best_estimator_.coef_),
]
for heading, value in summary:
    print(heading)
    print(value)

In [None]:
#Classification Report
predictions_train = gs.predict(X_train)
predictions_test = gs.predict(X_test)

print(classification_report(y_train, predictions_train))

In [None]:
print(classification_report(y_test, predictions_test))

In [None]:
probabilities_train = gs.predict_proba(X_train)
probabilities_test = gs.predict_proba(X_test)

In [None]:
#ROC Curve
# Plot the ROC curves from the predicted class probabilities on the test set.
# NOTE(review): `cmap` is not assigned in any cell shown here — confirm it is
# defined before this cell runs.
skplt.metrics.plot_roc(y_test, probabilities_test, cmap=cmap)
plt.show()

In [None]:
#Confusion matrix
skplt.metrics.plot_confusion_matrix(y_test, predictions_test, cmap='Wistia',labels=[1, 0], figsize=(6, 6))
plt.show()

In [None]:
#Gather feature importance
best_features = pd.DataFrame(gs.best_estimator_.coef_, columns=X.columns).transpose()
best_features.rename(columns={0:'Feature Importance'}, inplace=True)
best_features.sort_values(by='Feature Importance', ascending=False)