In [64]:
#imports
import os, json
import pandas as pd
from datetime import timedelta
import yfinance as yf

In [18]:
#options
# Widen text columns and raise the row cap so long message bodies and large
# samples are shown without truncation.
pd.options.display.max_colwidth = 100
pd.options.display.max_rows = 1000

In [67]:
#utilities
def append_all_json_data(folderPath: str, list_of_jsons) -> pd.DataFrame:
    """Read several JSON files from one folder and stack them into one frame.

    Args:
        folderPath (str): Directory that contains the JSON files.
        list_of_jsons (iterable of str): File names relative to ``folderPath``.

    Returns:
        pd.DataFrame: Rows of all files in the given order. Each file keeps its
        own row labels (callers typically follow up with ``reset_index``).
        An empty frame is returned when ``list_of_jsons`` is empty.
    """
    # Collect the frames first and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the accumulated data on every call.
    frames = [
        pd.read_json(os.path.join(folderPath, element))
        for element in list_of_jsons
    ]
    if not frames:
        # pd.concat raises on an empty list; keep the "empty in, empty out" contract.
        return pd.DataFrame()
    return pd.concat(frames)

def get_raw_sentiment(sentimentPath):
    """Return the ``"basic"`` entry of a sentiment mapping, or None.

    Args:
        sentimentPath (mapping or None): Object indexed with the key
            ``"basic"``; ``None`` means no sentiment is available.

    Returns:
        The value stored under ``"basic"``, or ``None`` when ``sentimentPath``
        is ``None``.
    """
    # Identity check: `== None` would defer to a custom __eq__ on the argument.
    if sentimentPath is None:
        return None
    return sentimentPath["basic"]

def subtract_days_from_date(date, days):
    """Move a date backwards by a number of days.

    Args:
        date (string): Date to start from, e.g. in YYYY-MM-DD format; it is
            parsed with ``pd.to_datetime``.
        days (int): How many days to go back.

    Returns:
        str: The shifted date formatted as YYYY-MM-DD.
    """
    shifted = pd.to_datetime(date) - timedelta(days=days)
    return shifted.strftime("%Y-%m-%d")

def get_stockprice_at_time(day, max_lookback=366):
    """Look up the closing price for a day, falling back to earlier days.

    Reads the module-level ``hist`` frame (must be defined before calling).
    When ``hist`` has no row for ``day`` the lookup steps back one calendar
    day at a time until a row is found.

    Args:
        day (string): Date string understood by ``hist.loc`` and
            ``subtract_days_from_date`` (e.g. YYYY-MM-DD).
        max_lookback (int): Maximum number of days to step back before giving
            up. Bounds the search so a date that precedes all available data
            fails with a clear error instead of exhausting the recursion limit.

    Returns:
        The ``Close`` value of the first matching day, rounded to 3 decimals.

    Raises:
        KeyError: If neither ``day`` nor any of the ``max_lookback`` preceding
            days is present in ``hist``.
    """
    current = day
    for _ in range(max_lookback + 1):
        try:
            val = hist.loc[current].Close
        except KeyError:
            # No row for this day: try the previous calendar day.
            current = subtract_days_from_date(current, 1)
        else:
            return round(val, 3)
    raise KeyError(
        f"No closing price found for {day} or the {max_lookback} days before it"
    )

In [78]:
#Bag of words for sentiment prediction
#https://www.kaggle.com/datasets/sbhatti/financial-sentiment-analysis
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# --- Load the labelled sentences and keep only positive / negative rows ---
dataset = pd.read_csv('bagofwords_data.csv')
# na=True marks rows with a missing label as "neutral" so they are dropped too,
# matching the previous `== False` comparison.
is_neutral = dataset["Sentiment"].str.contains("neutral", na=True)
dataset = dataset[~is_neutral].reset_index(drop=True)
sentiment = {'positive': 1,'negative': 0}
dataset.Sentiment = [sentiment[item] for item in dataset.Sentiment]

# --- Build the cleaned corpus ---
# The stemmer and stop-word set do not change between rows, so create them once.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')  # keep negations: "not" carries sentiment
_stopword_set = set(all_stopwords)

corpus = []
# Iterate over every remaining sentence instead of a hard-coded row count, so
# the corpus always has exactly as many entries as there are labels.
for sentence in dataset['Sentence']:
    review = re.sub('[^a-zA-Z]',' ',sentence).lower().split()
    review = ' '.join(ps.stem(word) for word in review if word not in _stopword_set)
    corpus.append(review)

# --- Vectorise: bag of words limited to the 1500 most frequent tokens ---
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
x = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:,-1].values  # labels are read from the last column

# --- Train / test split ---
from sklearn.model_selection import train_test_split
#train, test = train_test_split(dataset, test_size = 0.33, random_state=42)
# NOTE(review): test_size=0.79 leaves only 21% of the rows for training —
# confirm this is intended (the commented-out split above used 0.33).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.79, random_state=42)

# --- Fit the classifier and predict on the held-out rows ---
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(x_train,y_train)
y_pred = classifier.predict(x_test)

def get_bow_sentiment(msg):
    """Classify a message as 'Bullish' or 'Bearish' with the bag-of-words model.

    The message is cleaned the same way as the training corpus (letters only,
    lower-cased, English stop words except "not" removed, Porter-stemmed),
    vectorised with the module-level ``cv`` and scored with the module-level
    ``classifier``.

    Args:
        msg (str): Raw message text.

    Returns:
        str: 'Bearish' when the predicted class is 0, otherwise 'Bullish'.
    """
    ps = PorterStemmer()
    # Build the stop-word set once per call instead of once per word.
    stop_set = set(stopwords.words('english'))
    stop_set.discard('not')  # keep negations: "not" carries sentiment

    tokens = re.sub('[^a-zA-Z]',' ',msg).lower().split()
    cleaned = ' '.join(ps.stem(word) for word in tokens if word not in stop_set)

    features = cv.transform([cleaned]).toarray()
    prediction = classifier.predict(features)
    return 'Bearish' if prediction[0] == 0 else 'Bullish'

from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix on the held-out split, followed by overall accuracy
# (the accuracy is the cell's displayed value).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Developer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[[ 363  325]
 [ 306 1149]]


0.705552963135791

In [49]:
path_to_json = 'Data/AAPL'
# os.listdir returns entries in arbitrary order; sort so the row order of the
# combined frame is the same on every run and platform.
json_files = sorted(
    file_name for file_name in os.listdir(path_to_json) if file_name.endswith('.json')
)

In [50]:
# Sanity check: number of .json files found in path_to_json.
len(json_files)

1374

In [60]:
# Load every JSON file into one frame, then give the rows a clean 0..n-1 index
# (each file contributes its own row labels).
raw_json_data = append_all_json_data(path_to_json,json_files)
# Reset the index of the frame that was just loaded; the previous code called
# reset_index on `df`, a name this notebook never defines.
raw_json_data = raw_json_data.reset_index(drop=True)
raw_json_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41220 entries, 0 to 41219
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   id               41220 non-null  int64              
 1   body             41220 non-null  object             
 2   created_at       41220 non-null  datetime64[ns, UTC]
 3   user             41220 non-null  object             
 4   source           41220 non-null  object             
 5   symbols          41220 non-null  object             
 6   conversation     14000 non-null  object             
 7   likes            24928 non-null  object             
 8   mentioned_users  41220 non-null  object             
 9   entities         41220 non-null  object             
 10  links            5446 non-null   object             
 11  reshare_message  1348 non-null   object             
 12  reshares         1131 non-null   object             
 13  owned_symbols   

In [65]:
# Fetch the AAPL price history through yfinance (period="max" requests the
# longest range available). `hist` is read by get_stockprice_at_time.
appl = yf.Ticker("AAPL")
hist = appl.history(period="max")

In [79]:
# Assemble one record per message: author, text, user-tagged sentiment,
# model-derived sentiment and the closing price for the message's day.
_columns = ["time_created","username","message","sentiment","derived_sentiment","stock_price"]
_records = []
_price_by_day = {}  # many messages share a day; look each price up only once

# Iterate over raw_json_data itself (the previous loop sized itself from an
# undefined `df`) and collect rows in a list: assigning to .loc[i] row by row
# re-allocates the frame on every insert.
for created_at, user, body, entities in zip(
    raw_json_data["created_at"],
    raw_json_data["user"],
    raw_json_data["body"],
    raw_json_data["entities"],
):
    day = created_at.strftime("%Y-%m-%d")  # zero-padded YYYY-MM-DD
    if day not in _price_by_day:
        _price_by_day[day] = get_stockprice_at_time(day)
    _records.append([
        created_at,
        user["username"],
        body,
        get_raw_sentiment(entities["sentiment"]),
        get_bow_sentiment(body),
        _price_by_day[day],
    ])

new_dataframe = pd.DataFrame(_records, columns=_columns)

In [81]:
# Summarise column dtypes and non-null counts of the assembled frame.
new_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41220 entries, 0 to 41219
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   time_created       41220 non-null  datetime64[ns, UTC]
 1   username           41220 non-null  object             
 2   message            41220 non-null  object             
 3   sentiment          20101 non-null  object             
 4   derived_sentiment  41220 non-null  object             
 5   stock_price        41220 non-null  float64            
dtypes: datetime64[ns, UTC](1), float64(1), object(4)
memory usage: 2.2+ MB


In [84]:
# Show a random selection of rows. The seed makes the displayed sample
# reproducible across runs; the size is capped so small frames do not raise.
new_dataframe.sample(n=min(100, len(new_dataframe)), random_state=42)

Unnamed: 0,time_created,username,message,sentiment,derived_sentiment,stock_price
6684,2021-12-17 14:28:48+00:00,CptKangaroo,$AAPL Had a great run. Time to back and fill. Perfectly healthy. Will be back to new highs soon.,,Bearish,170.922
3668,2021-12-16 00:52:01+00:00,JDSoCal,@HollywoodWolf777 Apple has 271 stores in the USA dumb bunny. \n$AAPL,Bullish,Bearish,172.041
22803,2022-01-05 19:30:06+00:00,hoyasaxa1978,"$AAPL 🍏 Just, Breathe. Keep your eyes on the Indices. Here is where they are, now:",,Bullish,174.697
28425,2022-01-12 18:35:38+00:00,DrD_Trades,$AAPL 177 $ soon,Bullish,Bearish,175.307
7187,2021-12-17 16:40:58+00:00,Barren_Wufffett,$AAPL \n\nSomeone help these damn Bulls. They are sinking. Hahahahaha,Bearish,Bullish,170.922
39686,2022-01-25 20:51:04+00:00,investclem1682,$AAPL \n\nYikes short never stop attacking .. they are stubborn as fuck to make red to go on.,,Bearish,159.577
1304,2021-11-09 16:37:29+00:00,BoilingPoint,$COIN $AAPL Then Apple loses! Don’t worry Tim will have a change of heart when his users start ...,,Bearish,150.618
7950,2021-12-18 16:28:35+00:00,Rome777,$AAPL $AMZN $GOOG $GOOGL $FB https://newsfilter.io/articles/kickstarter-discord-and-others-are-r...,,Bullish,170.922
14137,2021-12-27 18:12:25+00:00,sionele,$AAPL huge 14k wall at 179.5,Bullish,Bullish,180.101
40344,2022-01-26 09:38:12+00:00,moongoal,$AAPL glad I didn&#39;t panic sell. Holding for retirement,Bullish,Bearish,159.487
