In [8]:
#imports
import os, json
import pandas as pd
from datetime import timedelta
import yfinance as yf

In [9]:
#options
# Let long message bodies render up to 100 characters per cell and allow up
# to 1000 rows before pandas truncates the displayed frame.
pd.options.display.max_colwidth = 100
pd.options.display.max_rows = 1000

In [10]:
#utilities
def append_all_json_data(folderPath: str, list_of_jsons) -> pd.DataFrame:
    """Load several JSON files from one folder into a single DataFrame.

    Args:
        folderPath (str): Directory that contains the JSON files.
        list_of_jsons (iterable of str): File names relative to ``folderPath``.

    Returns:
        pd.DataFrame: The rows of all files stacked in the given order. Each
        file keeps its own row labels (no re-indexing), so callers that need a
        unique index should call ``reset_index(drop=True)``. An empty
        DataFrame is returned when ``list_of_jsons`` is empty.
    """
    # Read everything first and concatenate once: DataFrame.append was removed
    # in pandas 2.0 and re-copied the accumulated frame on every iteration.
    frames = [pd.read_json(folderPath + "/" + element) for element in list_of_jsons]
    if not frames:
        # pd.concat raises on an empty list; keep the original empty result.
        return pd.DataFrame()
    return pd.concat(frames)

def get_raw_sentiment(sentimentPath):
    """Extract the "basic" sentiment label from a message's sentiment entry.

    Args:
        sentimentPath (dict or None): The sentiment entry of a message, or
            None when the message carries no sentiment.

    Returns:
        The value stored under the "basic" key, or None when no sentiment
        entry was given.
    """
    # Identity check: "== None" can be hijacked by objects that overload
    # __eq__, whereas "is None" only matches the None singleton.
    if sentimentPath is None:
        return None
    return sentimentPath["basic"]

def subtract_days_from_date(date, days):
    """Return the calendar date that lies ``days`` days before ``date``.

    Args:
        date (string): Date string in YYYY-MM-DD format.
        days (int): Number of days to go back.

    Returns:
        str: The shifted date, formatted as YYYY-MM-DD.
    """
    shifted = pd.to_datetime(date) - timedelta(days=days)
    return shifted.strftime("%Y-%m-%d")

def get_stockprice_at_time(day, history=None, max_lookback_days=30):
    """Return the closing price for ``day``, falling back to earlier days.

    When ``day`` has no row in the price history (the lookup raises
    KeyError), the previous calendar day is tried instead, and so on.

    Args:
        day (str): Date to look up, e.g. "2021-12-16".
        history (pd.DataFrame, optional): Price history with a ``Close``
            column, looked up with ``.loc[day]``. Defaults to the
            module-level ``hist`` frame.
        max_lookback_days (int): How many earlier days may be tried before
            giving up.

    Returns:
        The ``Close`` value of the first day found, rounded to 3 decimals.

    Raises:
        KeyError: If neither ``day`` nor any of the ``max_lookback_days``
            days before it is present in the price history.
    """
    prices = hist if history is None else history
    lookup_day = day
    # Iterate instead of recursing: a date that precedes all available history
    # previously recursed until Python raised RecursionError.
    for _ in range(max_lookback_days + 1):
        try:
            val = prices.loc[lookup_day].Close
        except KeyError:
            lookup_day = subtract_days_from_date(lookup_day, 1)
        else:
            return round(val, 3)
    raise KeyError(
        f"No closing price found for {day} or the {max_lookback_days} days before it"
    )

In [11]:
#Bag of words for sentiment prediction
#https://www.kaggle.com/datasets/sbhatti/financial-sentiment-analysis
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Load the labelled sentences and drop every row whose label contains
# "neutral", leaving a binary positive/negative problem.
dataset = pd.read_csv('bagofwords_data.csv')
# "== False" (rather than "~") is kept on purpose: rows with a missing label
# compare as not-False and are dropped together with the neutral ones.
dataset = dataset[dataset["Sentiment"].str.contains("neutral") == False]
dataset = dataset.reset_index(drop=True)
sentiment = {'positive': 1, 'negative': 0}
dataset.Sentiment = [sentiment[item] for item in dataset.Sentiment]

# Preprocessing tools are loop-invariant, so build them once instead of once
# per sentence (and the stop-word set once instead of once per word).
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')  # "not" stays in the text instead of being filtered out
stopword_set = set(all_stopwords)

# Iterate over the actual rows instead of a hard-coded range(0, 2712), which
# raised KeyError for a shorter file and silently truncated a longer one.
corpus = []
for sentence in dataset['Sentence']:
    review = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
    review = [ps.stem(word) for word in review if word not in stopword_set]
    corpus.append(' '.join(review))

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
x = cv.fit_transform(corpus).toarray()
# Labels are read from the last column — presumably "Sentiment"; verify
# against the CSV layout.
y = dataset.iloc[:,-1].values

from sklearn.model_selection import train_test_split
# NOTE(review): test_size=0.79 holds out 79% of the rows and trains on only
# 21% — confirm this split is intended.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.79, random_state=42)

from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(x_train,y_train)
y_pred = classifier.predict(x_test)

def get_bow_sentiment(msg):
    """Classify a message as 'Bullish' or 'Bearish' with the bag-of-words model.

    The text is cleaned the same way as the training corpus: non-letters
    become spaces, the text is lower-cased, English stop words except "not"
    are removed, and the remaining words are Porter-stemmed. The result is
    vectorised with the module-level ``cv`` and scored by the module-level
    ``classifier``.

    Args:
        msg (str): Raw message text.

    Returns:
        str: 'Bearish' when the classifier predicts 0, otherwise 'Bullish'.
    """
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    # Build the lookup set once per call; it used to be rebuilt for every
    # single word of the message.
    stopword_set = set(all_stopwords)

    words = re.sub('[^a-zA-Z]', ' ', msg).lower().split()
    msg_review = ' '.join(ps.stem(word) for word in words if word not in stopword_set)

    new_x_test = cv.transform([msg_review]).toarray()
    new_y_pred = classifier.predict(new_x_test)
    return 'Bearish' if new_y_pred[0] == 0 else 'Bullish'

from sklearn.metrics import confusion_matrix, accuracy_score
# Evaluate the classifier on the held-out split: print the confusion matrix,
# then leave the accuracy as the cell's displayed value.
cm = confusion_matrix(y_test,y_pred)
print(cm)
accuracy_score(y_test,y_pred)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Developer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[[ 363  325]
 [ 306 1149]]


0.705552963135791

In [12]:
path_to_json = 'Data/AAPL'
# os.listdir returns names in arbitrary order; sort them so the row order of
# the combined data is the same on every run and every machine.
json_files = sorted(name for name in os.listdir(path_to_json) if name.endswith('.json'))

In [13]:
# Number of .json files found in path_to_json.
len(json_files)

1374

In [15]:
# Stack every JSON file into one frame and swap the per-file row labels for a
# clean 0..n-1 index, then show the resulting schema.
raw_json_data = append_all_json_data(path_to_json, json_files).reset_index(drop=True)
raw_json_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41220 entries, 0 to 41219
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   id               41220 non-null  int64              
 1   body             41220 non-null  object             
 2   created_at       41220 non-null  datetime64[ns, UTC]
 3   user             41220 non-null  object             
 4   source           41220 non-null  object             
 5   symbols          41220 non-null  object             
 6   conversation     14000 non-null  object             
 7   likes            24928 non-null  object             
 8   mentioned_users  41220 non-null  object             
 9   entities         41220 non-null  object             
 10  links            5446 non-null   object             
 11  reshare_message  1348 non-null   object             
 12  reshares         1131 non-null   object             
 13  owned_symbols   

In [1]:
# Download the full AAPL price history. `hist` is the module-level frame that
# get_stockprice_at_time reads, so this cell must run after the imports cell
# (it needs `yf`) and before any price lookup.
appl = yf.Ticker("AAPL")
hist = appl.history(period="max")


NameError: name 'yf' is not defined

In [18]:
# Collect the rows in a plain list and build the DataFrame once at the end:
# enlarging a frame with .loc[i] = ... copies data on every insert, which is
# very slow for tens of thousands of messages.
price_by_day = {}  # many messages share a day, so look each day up only once
records = []
for ts, user, body, entities in zip(
    raw_json_data["created_at"],
    raw_json_data["user"],
    raw_json_data["body"],
    raw_json_data["entities"],
):
    # NOTE(review): the day is taken from the UTC timestamp — confirm this
    # matches the calendar the price history is indexed by.
    day = ts.strftime("%Y-%m-%d")
    if day not in price_by_day:
        price_by_day[day] = get_stockprice_at_time(day)
    records.append(
        {
            "time_created": ts,
            "username": user["username"],
            "message": body,
            "sentiment": get_raw_sentiment(entities["sentiment"]),
            "derived_sentiment": get_bow_sentiment(body),
            "stock_price": price_by_day[day],
        }
    )

new_dataframe = pd.DataFrame(
    records,
    columns=["time_created","username","message","sentiment","derived_sentiment","stock_price"],
)

In [19]:
# Show column dtypes and non-null counts of the wrangled frame.
new_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41220 entries, 0 to 41219
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   time_created       41220 non-null  datetime64[ns, UTC]
 1   username           41220 non-null  object             
 2   message            41220 non-null  object             
 3   sentiment          20101 non-null  object             
 4   derived_sentiment  41220 non-null  object             
 5   stock_price        41220 non-null  float64            
dtypes: datetime64[ns, UTC](1), float64(1), object(4)
memory usage: 2.2+ MB


In [20]:
# Display 100 randomly drawn rows; no random_state is passed, so the rows
# shown differ between runs.
new_dataframe.sample(n=100)

Unnamed: 0,time_created,username,message,sentiment,derived_sentiment,stock_price
3709,2021-12-16 01:03:08+00:00,DrunkHulk,"$AAPL WHAT A FUCKING DAY, LMAO. Day’s gain, by the way…",Bullish,Bullish,172.041
1921,2021-12-15 15:16:47+00:00,OptionsTrigger,$MARA 🧸 $AAPL 🧸 $UVXY 🚀 $MSTR 🧸\n\nWinning in life.,,Bullish,179.072
29445,2022-01-13 19:51:56+00:00,Bal33,$AAPL this will either be flat or ugly tomorrow. I don’t see how this could rally back tomorrow.,Bearish,Bullish,171.971
36992,2022-01-24 13:34:39+00:00,lrgnike20,$AAPL 😂✅👀🛰🛰🛰🛰🚗🚗✈️✈️🚗📈📈,Bullish,Bearish,161.414
4300,2021-12-16 14:56:28+00:00,tengis06,$AAPL Another great day. Follow me to follow my trades.,Bearish,Bullish,172.041
15411,2021-12-28 16:00:32+00:00,Epic_Economics,$AAPL oh no. More made in China. \n https://www.tradingview.com/chart/AAPL/rvCgDRdv-Apple-Ponzie...,Bearish,Bullish,179.062
32378,2022-01-19 06:40:29+00:00,Dinwiddie23,$AAPL,Bullish,Bearish,166.018
28424,2022-01-12 18:36:11+00:00,California_bull,$AAPL Rest of the market is beautiful. Apple is losing steam though!,,Bearish,175.307
5720,2021-12-16 20:22:52+00:00,EveryTimeICash,$QQQ Buyers stepped back in near 3 day lows again \n \n$TSLA $AAPL $AMZN $AFRM,,Bullish,172.041
16989,2021-12-30 04:35:43+00:00,nuggets,$AAPL Bullish pattern forming today. Follow for more,Bullish,Bullish,177.973


In [21]:
# Write the wrangled frame to disk. No index argument is passed, so pandas
# also writes the row index as the first column (its default behaviour).
new_dataframe.to_csv("wrangled_data.csv")