<a href="https://colab.research.google.com/github/vijaynadimpalli/numer_ai_experiments/blob/master/My_WSB_to_Numerai_Signals_CLEANED.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install praw
!pip install vaderSentiment
!pip install ffn
!pip install numerapi


import gc
import re
import csv
import json
import time
import datetime
import requests
from tqdm.auto import tqdm

import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from dateutil.relativedelta import relativedelta, FR

import praw #reddit data api
import ffn #for loading financial data
import numerapi #for numerai tickers

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer #VADER sentiment model

!pip install flair

from flair.models import TextClassifier
from flair.data import Sentence
from flair.models.text_classification_model import TARSClassifier



In [None]:
%matplotlib inline

In [None]:
import tensorflow as tf
from tensorflow import keras
# Displays the GPU device name TensorFlow reports ('/device:GPU:0' in the recorded run).
tf.test.gpu_device_name() #run to make sure tensorflow is connected to gpu (if applicable)

'/device:GPU:0'

# Data Collection

### Tickers we (Numerai) want

In [None]:
# Build the Bloomberg -> Yahoo ticker lookup for the current Numerai Signals universe

napi = numerapi.SignalsAPI()

# Universe of tickers Numerai currently accepts, as Bloomberg-style names
eligible_tickers = pd.Series(napi.ticker_universe(), name="bloomberg_ticker")
print(f"Number of eligible tickers : {len(eligible_tickers)}")
print(eligible_tickers.head(10))

# Public mapping table published by Numerai
TICKER_MAP_URL = 'https://numerai-signals-public-data.s3-us-west-2.amazonaws.com/signals_ticker_map_w_bbg.csv'
ticker_map = pd.read_csv(TICKER_MAP_URL)
print(len(ticker_map))

#Yahoo <-> Bloomberg mapping; eligible tickers without a Yahoo symbol are dropped
bloomberg_to_yahoo = dict(zip(ticker_map["bloomberg_ticker"], ticker_map["yahoo"]))
yfinance_tickers = eligible_tickers.map(bloomberg_to_yahoo).dropna()

# Every Bloomberg ticker in the mapping table (not only the eligible ones)
bloomberg_tickers = ticker_map["bloomberg_ticker"]
print(f"Number of eligible, mapped tickers: {len(yfinance_tickers)}")

Number of eligible tickers : 5380
0    SVW AU
1    GEM AU
2    AZJ AU
3    NXT AU
4    TWE AU
5    SGR AU
6    CKF AU
7    BGA AU
8    QUB AU
9    MMS AU
Name: bloomberg_ticker, dtype: object
5380
Number of eligible, mapped tickers: 5328


## Get comments from Reddit using Pushshift and PRAW

In [None]:
def getPushshiftData(query, after, before, sub):
    '''
    Reddit Data collector from pushshift

    Searches submissions whose title matches `query` in subreddit `sub`,
    created between the `after` and `before` timestamps.

    Parameters
    ----------
    query : text to search for in submission titles
    after, before : lower / upper bound of the creation time (the caller passes unix timestamps)
    sub : subreddit name

    Returns
    -------
    The value stored under the 'data' key of the JSON response.

    Raises
    ------
    requests.HTTPError on a non-2xx response, requests.RequestException on
    connection problems or timeout, ValueError if the body is not valid JSON,
    KeyError if the response has no 'data' key.
    '''
    # Let requests build the query string so spaces and special characters in
    # `query` are URL-encoded instead of being pasted into the URL verbatim.
    params = {
        'title': query,
        'size': 1000,  # NOTE(review): the recorded run returned 100 rows per call, so the server appears to cap this
        'after': after,
        'before': before,
        'subreddit': sub,
    }
    response = requests.get(
        'https://api.pushshift.io/reddit/search/submission/',
        params=params,
        timeout=30,  # do not hang the notebook forever on a stalled connection
    )
    # Surface HTTP errors (rate limiting, outages) explicitly instead of failing
    # later with an opaque JSON / KeyError on an error page.
    response.raise_for_status()
    return response.json()['data']


def collectSubData(subm, stats):
    '''
    Append one row of submission metadata to `stats` if the submission is
    flaired 'Daily Discussion'; otherwise do nothing.

    The appended row is [id, title, url, creation date, flair].
    '''
    # A submission without a flair key gets the "NaN" sentinel, which the
    # check below rejects.
    thread_flair = subm.get('link_flair_text', "NaN")

    if thread_flair == 'Daily Discussion':
        # NOTE: fromtimestamp() converts the epoch value using the machine's local timezone.
        stats.append([
            subm['id'],
            subm['title'],
            subm['url'],
            datetime.datetime.fromtimestamp(subm['created_utc']).date(),
            thread_flair,
        ])


# Collection window: the 365 days ending on the most recent Friday (today included)
DATE_FORMAT = '%d/%m/%Y'
last_friday = datetime.datetime.today() + relativedelta(weekday=FR(-1))
date_1 = (last_friday - datetime.timedelta(days=1 * 365)).strftime(DATE_FORMAT)
date_2 = last_friday.strftime(DATE_FORMAT)
print(date_1, date_2)

# Convert both bounds to unix time, as strings.
# Going through the day-only text form drops the time of day; mktime() interprets
# the result as local time.
date_1, date_2 = (
    str(int(time.mktime(datetime.datetime.strptime(day_text, DATE_FORMAT).timetuple())))
    for day_text in (date_1, date_2)
)

subStats = [] # Holds submission metadata

23/07/2020 23/07/2021


In [None]:
temp_data = []
# Pushshift returns a limited number of submissions per request (100 in the recorded run),
# so page forward: after each page, restart the search from the creation time of the
# last submission received, until an empty page comes back.
MAX_FETCH_RETRIES = 3   # consecutive failed requests tolerated before giving up
failed_attempts = 0
while True:
    try:
        temp_data = getPushshiftData("Daily Discussion Thread", date_1, date_2, 'wallstreetbets')
        # Data collected from date_1 to date_2
    except Exception as e:
        # On failure temp_data still holds the PREVIOUS page; falling through would
        # re-collect it as duplicates and, if the error persists, loop forever.
        # Retry a bounded number of times instead, then stop.
        print(e)
        failed_attempts += 1
        if failed_attempts >= MAX_FETCH_RETRIES:
            print(f"Giving up after {failed_attempts} failed requests; collected data may be incomplete")
            break
        time.sleep(2)
        continue
    failed_attempts = 0

    if len(temp_data) == 0:
        break

    for submission in temp_data:
        collectSubData(submission, subStats)
    # Next request starts from the created date of the last submission
    print(len(temp_data))
    print(str(datetime.datetime.fromtimestamp(temp_data[-1]['created_utc'])))
    date_1 = temp_data[-1]['created_utc']

100
2020-11-27 11:10:41
100
2021-02-11 11:00:23
100
2021-05-26 11:57:47
40
2021-07-22 10:00:18


In [None]:
# Turn the collected rows ([id, title, url, date, flair]) into a column-oriented
# dict, then into a dataframe with one row per daily thread.
column_names = ['id', 'title', 'url', 'date', 'flair']
data = {
    column: [row[position] for row in subStats]
    for position, column in enumerate(column_names)
}

df_1 = pd.DataFrame(data)

## Download data from Reddit using praw

[How to set-up PRAW](https://towardsdatascience.com/scraping-reddit-data-1c0af3040768)

In [None]:
#connect to reddit api
# Credentials are read from environment variables so they never live in the notebook.
# A missing variable raises KeyError here, before any request is made.
# NOTE: the client secret and password previously hard-coded in this cell were
# published with the notebook and should be revoked/rotated.
import os

reddit = praw.Reddit(client_id=os.environ['REDDIT_CLIENT_ID'],
                     client_secret=os.environ['REDDIT_CLIENT_SECRET'],
                     user_agent='CakeDayBot',
                     username=os.environ['REDDIT_USERNAME'],
                     password=os.environ['REDDIT_PASSWORD'], check_for_async=False)

In [None]:
#This part is the crux of the data downloading module, here we download comments using praw.
# For every daily thread in df_1 we keep two parallel lists: the comment bodies
# (punctuation stripped) and the comment scores. Entry k of each list belongs to
# row k of df_1, so a failed download must still produce an (empty) entry;
# skipping it would shift every later day by one.
daily_comments_text = []
daily_comments_upvotes = []
for url in tqdm(df_1['url'].tolist()):
    comments, upvotes = [], []
    try:
        reddit_submission = reddit.submission(url=url) # Downloading submission data
        # limit=0 drops the "load more comments" placeholders instead of fetching them
        reddit_submission.comments.replace_more(limit=0)
        thread_comments = list(reddit_submission.comments)
        # Removing punctuation: keep only word characters and whitespace
        comments = [re.sub(r'[^\w\s]', '', comment.body) for comment in thread_comments]
        upvotes = [comment.score for comment in thread_comments]
    except Exception as exc:
        # Report the failure (a bare `except` would also swallow KeyboardInterrupt)
        # and fall back to an empty day so alignment with df_1 is preserved.
        print(f"Could not download comments for {url}: {exc!r}")
        comments, upvotes = [], []

    daily_comments_text.append(comments)
    daily_comments_upvotes.append(upvotes)

HBox(children=(FloatProgress(value=0.0, max=244.0), HTML(value='')))






## Symbol filtering

Use some stop words that might create ambiguity with stock names in comments

In [None]:
#Downloading list of stopwords
!wget https://gist.githubusercontent.com/ZohebAbai/513218c3468130eacff6481f424e4e64/raw/b70776f341a148293ff277afa0d0302c8c38f7e2/gist_stopwords.txt

gist_file = open("gist_stopwords.txt", "r")
try:
    content = gist_file.read()
    stop_words = content.split(",")
finally:
    gist_file.close()

#Add more stop words that are used in the daily discussions
stop_words += ['ATH', 'SAVE', 'US', 'ALL', 'LOVE', 'FOR', 'ME', 
               'GET', "BEAT", 'JACK', "PUMP", "BIG", "KIDS", 'STAY', 
               'TRUE', 'EDIT','PLAY', "ROCK", "NICE", "DIE", "COST", 
               "WORK", "MF"]

stop_words = set(stop_words)

--2021-07-28 09:04:01--  https://gist.githubusercontent.com/ZohebAbai/513218c3468130eacff6481f424e4e64/raw/b70776f341a148293ff277afa0d0302c8c38f7e2/gist_stopwords.txt
Resolving gist.githubusercontent.com (gist.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
Connecting to gist.githubusercontent.com (gist.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6271 (6.1K) [text/plain]
Saving to: ‘gist_stopwords.txt.1’


2021-07-28 09:04:01 (46.6 MB/s) - ‘gist_stopwords.txt.1’ saved [6271/6271]



In [None]:
# Bare symbol = first space-separated token of the Bloomberg ticker ("AAPL US" -> "AAPL")
ticks = bloomberg_tickers.map(lambda bloomberg_name: bloomberg_name.split(" ")[0])

# Keep symbols that are at least 2 characters long ...
ticks = ticks[ticks.str.len() >= 2].values
# ... and are neither purely numeric nor a stop word (as written or lower-cased)
ticks = [
    symbol
    for symbol in ticks
    if not (symbol.isdigit() or symbol in stop_words or symbol.lower() in stop_words)
]

2021-07-28 09:04:01,786 INFO numexpr.utils: NumExpr defaulting to 2 threads.


In [None]:
# Optional ban list for symbols that should never be scored
# exclude_ticks = ['GME','TSLA']
# ticks = [t for t in ticks if t not in exclude_ticks]

# Sanity check: no remaining symbol should equal an upper-cased stop word
upper_stop_words = [s.upper() for s in stop_words]
print(np.intersect1d(ticks, upper_stop_words))

len(ticks)

[]


3932

In [None]:
# !pip install transformers
# !pip install sentencepiece

In [None]:
# # Trying this https://huggingface.co/facebook/bart-large-mnli


# # pose sequence as a NLI premise and label as a hypothesis
# from transformers import AutoModelForSequenceClassification, AutoTokenizer
# nli_model = AutoModelForSequenceClassification.from_pretrained('joeddav/xlm-roberta-large-xnli')
# tokenizer = AutoTokenizer.from_pretrained('joeddav/xlm-roberta-large-xnli')

# premise = "I'm gonna sell GOOG"
# hypothesis = 'positive'

# # run through model pre-trained on MNLI
# x = tokenizer.encode(premise, hypothesis, return_tensors='pt',
#                      truncation_strategy='only_first')
# logits = nli_model(x.to(device))[0]

# # we throw away "neutral" (dim 1) and take the probability of
# # "entailment" (2) as the probability of the label being true 
# entail_contradiction_logits = logits[:,[0,2]]
# probs = entail_contradiction_logits.softmax(dim=1)
# prob_label_is_true = probs[:,1]

In [None]:
#Try flair here https://github.com/flairNLP/flair



In [None]:
# Pre-trained flair sentiment classifier
classifier = TextClassifier.load('en-sentiment')

def flair_scorer(comment):
  """Return a signed sentiment score for `comment` using the flair classifier.

  The score is the confidence of the first predicted label, negated when that
  label is "NEGATIVE". Returns 0 when the classifier attaches no label, the
  same fallback flair_score_noshot uses.
  """
  sentence = Sentence(comment)
  classifier.predict(sentence)  # attaches the predicted label(s) to `sentence`
  #TODO : Try using multi_class_prob here, result will be list of two labels instead of 1.

  # Guard against an empty prediction instead of raising IndexError below
  if len(sentence.labels) == 0:
    return 0

  top_label = sentence.labels[0]
  sign = -1 if top_label.value == "NEGATIVE" else 1

  return sign * top_label.score



# Pre-trained English TARS model, used for zero-shot classification
tars = TARSClassifier.load('tars-base')

def flair_score_noshot(comment):
  """Return a signed zero-shot sentiment score for `comment`.

  The comment is classified against the labels "positive" and "negative".
  The result is the score of the first predicted label, negated when that
  label is "negative", or 0 when no label is predicted.
  """
  sentence = Sentence(comment)

  # Candidate classes are described by plain-language names
  tars.predict_zero_shot(sentence, ["positive", "negative"])

  predicted = sentence.labels
  if not predicted:
    return 0

  # NOTE(review): uses the first label as-is; presumably the most confident one — confirm
  first = predicted[0]
  if first.value == "negative":
    return -1 * first.score
  return 1 * first.score


# Shared VADER analyzer instance, created once and used by SIA() below
sia_analyser = SentimentIntensityAnalyzer()
def SIA(comment):
    """Return the VADER "compound" polarity score of `comment`.

    Uses the module-level `sia_analyser`. The original body computed the score
    but never returned it, so every call yielded None.
    """
    return sia_analyser.polarity_scores(comment)["compound"]

2021-07-28 09:04:01,904 loading file /root/.flair/models/sentiment-en-mix-distillbert_4.pt
2021-07-28 09:04:11,154 loading file /root/.flair/models/tars-base-v8.pt
init TARS


In [None]:
flair_score_noshot('Flair is pretty great!')

0.9336798191070557

In [None]:
flair_scorer('Flair is pretty great!')

0.9987422823905945

In [None]:
def sort_list(list1, list2):
    """Return the items of `list1` ordered by their `list2` keys, largest key first.

    Items are paired positionally (extra items in the longer list are ignored).
    Pairs with equal keys are ordered by the `list1` item itself, also descending.
    """
    keyed_items = list(zip(list2, list1))
    keyed_items.sort(reverse=True)
    return [item for _key, item in keyed_items]

In [None]:
# Score all comments for a day based on sentiment
# Log all tickers mentioned in those comments
# Assign the daily sentiment to tickers involved


## run the sentiment model (currently the zero-shot flair scorer) over every daily thread

sentiment_scores = [] #For entire market
daily_tick_sentiments = [] #array of dictionaries with daily ticker sentiments

# Set copy of `ticks` for O(1) membership tests inside the word loop;
# `ticks` itself is left untouched for the other cells.
tick_lookup = set(ticks)

# tqdm wraps the list (not the enumerate object) so the progress bar knows the total.
for i, comments in enumerate(tqdm(daily_comments_text)):
    # Looping through each day

    sentiment_score = 0 # Initializing sentiment score each day
    ticks_sent = {tick:0 for tick in ticks}          #daily sentiments for tickers
    try:

        # comments = sort_list(comments, daily_comments_upvotes[i])[:20]
        # Taking top 20 comments sorted by upvotes

        for j, comment in enumerate(comments):
            if daily_comments_upvotes[i][j] <= 1: # Only taking comments with score greater than 1
                continue

            if len(comment) == 0:
                continue

            #Scanning for ticks mentioned in the comment (exact, case-sensitive word match)
            ticks_in_comment = [
                word for word in comment.split()
                if word in tick_lookup and (word.lower() not in stop_words) and (word not in stop_words)
            ]

            #comment_score = SIA(comment) #general score

            #Trying out the flair models here
            #comment_score = flair_scorer(comment)
            comment_score = flair_score_noshot(comment)

            if len(ticks_in_comment) == 1:
                # Only using the comments with a single tick, to avoid muddying the waters.
                # NOTE(review): a comment repeating the same ticker has several entries and is skipped too.
                tick = ticks_in_comment[0]
                ticks_sent[tick] = comment_score + ticks_sent[tick]

            sentiment_score = sentiment_score + comment_score

    except TypeError:
        # Unusable day (e.g. a None entry): score it as neutral everywhere
        sentiment_score = 0
        ticks_sent = {tick:0 for tick in ticks}

    # Append exactly one entry per day to BOTH lists, even on failure, so they stay
    # aligned with each other and with the rows of df_1.
    daily_tick_sentiments.append(ticks_sent)
    sentiment_scores.append(sentiment_score)

df_1["sentiment score"] = sentiment_scores

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [None]:
# One column per daily thread, one row per ticker symbol.
# `axis` must be passed by keyword: pandas 2.x rejects it as a positional argument.
daily_ticks_df = pd.concat([pd.Series(day) for day in daily_tick_sentiments], axis=1)
daily_ticks_df.columns = df_1.date.values


# Select the 50 tickers with the largest total absolute sentiment over the last 30 threads
tickers = daily_ticks_df.iloc[:, -30:].abs().sum(axis=1).sort_values(ascending=False)[:50].index

# 14-thread rolling sum of scores, then summed over the 7 most recent threads
rolling_scores = daily_ticks_df.T.rolling(window=14).sum().T
rolling_scores = rolling_scores.iloc[:, -7:].sum(axis=1)
# Restrict to the selected tickers first, then drop zero scores, so the boolean
# mask has the same index as the series it filters.
rolling_scores = rolling_scores.loc[tickers]
rolling_scores = rolling_scores[rolling_scores != 0]

# Percentile rank in (0, 1] is the submitted signal
top_scores = rolling_scores.rank(pct=True).sort_values(ascending=False).reset_index()
top_scores.columns = ["bloomberg_ticker", "signal"]


# Mapping symbols to bloomberg symbols for numerai submission

mapping = pd.Series(
    bloomberg_tickers.values, index=bloomberg_tickers.apply(lambda x: x.split(" ")[0])
)

def _first_listing(symbol):
    # A symbol listed on several exchanges yields a Series; take its first entry.
    # NOTE(review): "first" follows the order of the ticker map file, so the chosen
    # exchange is arbitrary (the recorded output shows e.g. "AMA AU") — confirm this is intended.
    match = mapping[symbol]
    return match if isinstance(match, str) else match.values[0]

top_scores["bloomberg_ticker"] = top_scores["bloomberg_ticker"].apply(_first_listing)
top_scores = top_scores.set_index("bloomberg_ticker")
top_scores.to_csv("Signal_SUBMISSION.csv", index=True)

print(top_scores)

                    signal
bloomberg_ticker          
AAPL US           1.000000
CLF US            0.976744
AMA AU            0.953488
NOKIA FH          0.930233
MSFT US           0.906977
AMD US            0.883721
DKNG US           0.860465
TWTR US           0.837209
PLUG US           0.813953
BBBY US           0.790698
ATT SS            0.767442
KBH US            0.744186
WEED CN           0.720930
CORE US           0.697674
BRO US            0.674419
TSLA US           0.651163
PYPL US           0.627907
ABC US            0.604651
SENS SW           0.581395
MAN US            0.558140
LOW US            0.534884
NFLX US           0.511628
VIAC US           0.488372
CCP AU            0.465116
AMZN US           0.441860
LZB US            0.418605
ASO US            0.395349
RKT US            0.372093
GME US            0.348837
TEAM US           0.325581
CAT US            0.302326
BOX US            0.279070
MRNA US           0.255814
CASH US           0.232558
FORM US           0.209302
B

In [None]:
# Reload the file that was just written to inspect its layout.
# The rename only affects this in-memory frame, not the CSV on disk.
final_df = pd.read_csv("Signal_SUBMISSION.csv").rename(columns={'bloomberg_ticker': 'ticker'})

##MAKE SURE THIS FORMAT IS CORRECT

final_df.head()

Unnamed: 0,ticker,signal
0,AAPL US,1.0
1,CLF US,0.976744
2,AMA AU,0.953488
3,NOKIA FH,0.930233
4,MSFT US,0.906977


In [None]:
# NOTE: API keys must not be stored in the notebook; the key pair previously pasted here should be revoked.
# public_id = os.environ["NUMERAI_PUBLIC_ID"]
# secret_key = os.environ["NUMERAI_SECRET_KEY"]
# model_id = "0fdc3056-2d36-4d04-970d-655b9f7b2021"
# napi = numerapi.SignalsAPI(public_id=public_id, secret_key=secret_key)
# napi.upload_predictions("Signal_SUBMISSION.csv", model_id=model_id)

In [None]:
# FLAIR SUBMISSION (flair_scorer)

# NOTE: API keys must not be stored in the notebook; the key pair previously pasted here should be revoked.
# public_id = os.environ["NUMERAI_PUBLIC_ID"]
# secret_key = os.environ["NUMERAI_SECRET_KEY"]
# model_id = "a0436466-72b1-415e-ae8e-1336dbe53e0a"
# napi = numerapi.SignalsAPI(public_id=public_id, secret_key=secret_key)
# napi.upload_predictions("Signal_SUBMISSION.csv", model_id=model_id)

In [None]:
# FLAIR SUBMISSION (flair_score_noshot)

# NOTE: API keys must not be stored in the notebook; the key pair previously pasted here should be revoked.
# public_id = os.environ["NUMERAI_PUBLIC_ID"]
# secret_key = os.environ["NUMERAI_SECRET_KEY"]
# model_id = "cd006ccf-4f69-4bbd-9dcb-20a270ebff66"
# napi = numerapi.SignalsAPI(public_id=public_id, secret_key=secret_key)
# napi.upload_predictions("Signal_SUBMISSION.csv", model_id=model_id)

In [None]:
# Plotting with S&P 500 price
spy=ffn.get('spy', start='2010-01-01') # Downloading daily 'spy' prices
spy_dates=[]  # price on each thread's date (None when there is no price row for that date)
for date in tqdm(df_1['date'].astype(str).values):
    try:
        price = spy.loc[date]
        # Pull the single value out explicitly: calling float() on a one-element
        # Series is deprecated in recent pandas.
        spy_dates.append(float(price.iloc[0] if isinstance(price, pd.Series) else price))
    except KeyError:
        # No price row for this date
        spy_dates.append(None)
        
df_1['spy']=spy_dates

df_plot_data=df_1[['date','sentiment score','spy']].set_index('date') # Data used for plotting
# .copy() makes the filtered frame independent, so the columns added to it in
# later cells do not trigger SettingWithCopyWarning.
df_plot_data=df_plot_data[df_plot_data['spy'].notna()].copy()

In [None]:
# All columns of df_plot_data on one chart, with the sentiment score on a secondary y-axis
df_plot_data.plot(secondary_y='sentiment score', figsize=(16, 10))

In [None]:
## Looking at fourier transforms

# FFT of the daily sentiment score series
close_fft = np.fft.fft(np.asarray(df_plot_data['sentiment score'].tolist()))
fft_df = pd.DataFrame({'fft':close_fft})
# Magnitude and phase of every component, computed on the whole array at once
fft_df['absolute'] = np.abs(close_fft)
fft_df['angle'] = np.angle(close_fft)
fft_list = np.asarray(fft_df['fft'].tolist())

# Low-pass reconstructions: keep only the first and last `num_` FFT coefficients,
# zero everything in between, and transform back.
for num_ in [5, 10, 15, 20]:
    low_pass = np.copy(fft_list)
    low_pass[num_:-num_] = 0
    # ifft returns complex values; store only the real part so the column is a
    # plain float series (plotting and float() otherwise discard the imaginary
    # part with a warning).
    df_plot_data['fourier '+str(num_)] = np.fft.ifft(low_pass).real

In [None]:
# Raw sentiment score against its four Fourier reconstructions
df_plot_data[['sentiment score', 'fourier 5', 'fourier 10', 'fourier 15', 'fourier 20']].plot(figsize=(16, 10))

In [None]:
# 'spy' price against the 20-coefficient reconstruction (on a secondary y-axis)
df_plot_data[['spy', 'fourier 20']].plot(secondary_y='fourier 20', figsize=(16, 10))

In [None]:
# Rescale price, sentiment and the Fourier reconstructions to [0, 1] so they can
# share one axis; also add the log return of the price.
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler(feature_range=(0,1))

# The scaler is re-fitted for every column; reshape(-1, 1) gives it a single-feature 2-D input
price_column = df_plot_data['spy'].to_numpy().reshape(-1, 1)
df_plot_data['norm_price'] = sc.fit_transform(price_column)

# Log return relative to the previous row
previous_price = df_plot_data['spy'].shift(1)
df_plot_data['spy log'] = np.log(df_plot_data['spy'] / previous_price)

sentiment_column = df_plot_data['sentiment score'].to_numpy().reshape(-1, 1)
df_plot_data['norm_sentiment'] = sc.fit_transform(sentiment_column)

for n_components in (5, 10, 15, 20):
    reconstruction = df_plot_data['fourier ' + str(n_components)].to_numpy()
    # float() converts every element to a plain real number before scaling
    real_column = np.asarray([float(value) for value in reconstruction]).reshape(-1, 1)
    df_plot_data['norm_fourier' + str(n_components)] = sc.fit_transform(real_column)

In [None]:
# Normalized price, sentiment and two reconstructions on a common [0, 1] scale; trailing ';' hides the repr
df_plot_data[['norm_price', 'norm_sentiment', 'norm_fourier5', 'norm_fourier20']].plot(figsize=(16, 10));

---