In [34]:
import os, json
import pandas as pd
from datetime import timedelta
import yfinance as yf
from sklearn import linear_model 

In [8]:
# Fetch the full available price history (period="max") for the AAPL ticker
# through yfinance. `hist` is read as a module-level global by the
# price/volume lookup helpers defined further down.
appl = yf.Ticker("AAPL")
hist = appl.history(period="max")

In [17]:
def subtract_days_from_date(date, days):
    """Return the date lying `days` days before `date`.

    Args:
        date (string): Date string in YYYY-MM-DD format.
        days (int): Number of days to step back from `date`.

    Returns:
        str: The shifted date, formatted as YYYY-MM-DD.
    """
    shifted = pd.to_datetime(date) - timedelta(days=days)
    return shifted.strftime("%Y-%m-%d")

def append_all_json_data(folderPath: str, list_of_jsons) -> pd.DataFrame:
    """Read several JSON files from one folder and stack them into a single frame.

    Args:
        folderPath: Directory that contains the JSON files.
        list_of_jsons: Iterable of file names (relative to `folderPath`).

    Returns:
        A DataFrame with the rows of every file, in the order the files were
        given. Each file keeps its own row index (no renumbering), so callers
        that need a unique index should call `reset_index(drop=True)`.
        An empty DataFrame is returned when `list_of_jsons` is empty.
    """
    # Read everything first and concatenate once: DataFrame.append was removed
    # in pandas 2.0 and re-copied the whole frame on every iteration.
    frames = [
        pd.read_json(os.path.join(folderPath, element))
        for element in list_of_jsons
    ]
    if not frames:
        # pd.concat raises on an empty list; keep the old "empty frame" result.
        return pd.DataFrame()
    return pd.concat(frames)

def get_raw_sentiment(sentimentPath):
    """Extract the raw sentiment label from a message's sentiment entry.

    Args:
        sentimentPath: The sentiment entry of a message, or None when the
            message carries no sentiment. A non-None value must support
            lookup by the key "basic".

    Returns:
        The value stored under "basic", or None when `sentimentPath` is None.
    """
    # Identity check: `== None` would dispatch to the object's own __eq__.
    if sentimentPath is None:
        return None
    return sentimentPath["basic"]
    
def _get_hist_value_at_time(d, column, history=None):
    """Look up `column` for day `d`, falling back to the closest earlier day.

    Days without a row in the price history (weekends, holidays) are resolved
    by stepping back one calendar day at a time until a row is found.

    Args:
        d: Date (string or anything `pd.to_datetime` accepts).
        column: Name of the history column to read, e.g. "Close".
        history: Price history frame to search; defaults to the module-level
            `hist`. Assumed to be indexed by a DatetimeIndex -- TODO confirm
            for frames coming from sources other than yfinance.

    Returns:
        The value found, rounded to 3 decimals.

    Raises:
        KeyError: If the history is empty or contains no row on or before `d`.
            (The previous recursive implementation never terminated in this
            case and died with a RecursionError.)
    """
    if history is None:
        history = hist
    if history.empty:
        raise KeyError("price history is empty")

    # Stop stepping back once we are before the first available row.
    earliest = history.index.min().date()
    day = pd.to_datetime(d)
    while day.date() >= earliest:
        # Same un-padded key format the lookups have always used.
        key = str(day.year) + "-" + str(day.month) + "-" + str(day.day)
        try:
            row = history.loc[key]
        except KeyError:
            day -= timedelta(days=1)
        else:
            return round(row[column], 3)

    raise KeyError(f"no '{column}' value on or before {d} in the price history")


def get_stockprice_at_time(d, history=None):
    """Return the closing price on day `d` (or the closest earlier trading day).

    Args:
        d: Date to look up.
        history: Optional price history frame; defaults to the global `hist`.

    Returns:
        The "Close" value rounded to 3 decimals.

    Raises:
        KeyError: If no row exists on or before `d`.
    """
    return _get_hist_value_at_time(d, "Close", history)


def get_openprice_at_time(d, history=None):
    """Return the opening price on day `d` (or the closest earlier trading day).

    Args:
        d: Date to look up.
        history: Optional price history frame; defaults to the global `hist`.

    Returns:
        The "Open" value rounded to 3 decimals.

    Raises:
        KeyError: If no row exists on or before `d`.
    """
    return _get_hist_value_at_time(d, "Open", history)


def get_volume_at_time(d, history=None):
    """Return the traded volume on day `d` (or the closest earlier trading day).

    Args:
        d: Date to look up.
        history: Optional price history frame; defaults to the global `hist`.

    Returns:
        The "Volume" value rounded to 3 decimals.

    Raises:
        KeyError: If no row exists on or before `d`.
    """
    return _get_hist_value_at_time(d, "Volume", history)

In [26]:
#Bag of words for sentiment prediction
#https://www.kaggle.com/datasets/sbhatti/financial-sentiment-analysis
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# --- Train the bag-of-words sentiment classifier -----------------------------
# Load the labelled sentences and keep only the positive/negative rows.
dataset = pd.read_csv('bagofwords_data.csv')
dataset= dataset[dataset["Sentiment"].str.contains("neutral")==False]
dataset=dataset.reset_index(drop=True)
# Encode the labels numerically: positive -> 1, negative -> 0.
sentiment = {'positive': 1,'negative': 0}
dataset.Sentiment = [sentiment[item] for item in dataset.Sentiment]

# Build the stemmer and stopword set once instead of once per sentence/word.
# 'not' is kept as a token because it flips the meaning of a sentence.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
stopword_set = set(all_stopwords)

# Normalise every sentence: letters only, lower-case, stopwords removed,
# remaining words stemmed. Iterating the column (instead of a hard-coded
# range(0, 2712)) keeps corpus and labels the same length when the CSV changes.
corpus = []
for sentence in dataset['Sentence']:
    review = re.sub('[^a-zA-Z]',' ',sentence)
    review = review.lower().split()
    review = [ps.stem(word) for word in review if word not in stopword_set]
    review = ' '.join(review)
    corpus.append(review)

from sklearn.feature_extraction.text import CountVectorizer
# Vocabulary capped at the 1500 most frequent terms.
cv = CountVectorizer(max_features=1500)
x = cv.fit_transform(corpus).toarray()
# Labels are read from the last column -- presumably "Sentiment"; TODO confirm
# against the CSV layout.
y = dataset.iloc[:,-1].values

from sklearn.model_selection import train_test_split
#train, test = train_test_split(dataset, test_size = 0.33, random_state=42)
# NOTE(review): test_size=0.79 leaves only 21% of the rows for training --
# confirm this split is intentional.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.79, random_state=42)

from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(x_train,y_train)
y_pred = classifier.predict(x_test)

def get_bow_sentiment(msg):
    """Classify a message with the trained bag-of-words model.

    The text gets the same preprocessing as the training corpus (letters only,
    lower-cased, English stopwords except 'not' removed, Porter-stemmed) and is
    then vectorised with the fitted `cv` and scored by `classifier`.

    Args:
        msg (str): Raw message text.

    Returns:
        int: 0 when the classifier predicts class 0 ('Bearish'), otherwise 1
        ('Bullish').
    """
    stemmer = PorterStemmer()
    # Build the lookup set once per call; the previous code rebuilt it for
    # every single token.
    stop_words = set(stopwords.words('english'))
    stop_words.discard('not')

    tokens = re.sub('[^a-zA-Z]',' ',msg).lower().split()
    cleaned = ' '.join(stemmer.stem(word) for word in tokens if word not in stop_words)

    features = cv.transform([cleaned]).toarray()
    prediction = classifier.predict(features)
    return 0 if prediction[0] == 0 else 1

from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix of the classifier's predictions (y_pred) against the
# held-out labels (y_test).
cm = confusion_matrix(y_test,y_pred)
# Uncomment to inspect the confusion matrix and the accuracy on the test split:
#print(cm)
#accuracy_score(y_test,y_pred)
def categoryize_sentiment(sentiment):
    """Translate a sentiment label into its numeric score.

    Args:
        sentiment: Label to translate; the recognised values are the strings
            "Bullish", "Bearish" and "None".

    Returns:
        1 for "Bullish", 0 for "Bearish", 0.5 for the string "None".
        Any other value -- including the Python object None -- yields None.
    """
    label_scores = (("Bullish", 1), ("Bearish", 0), ("None", 0.5))
    for label, score in label_scores:
        if sentiment == label:
            return score
    # Unrecognised label: fall through exactly like a missing branch would.
    return None

def avg_sentiment(s1,s2):
    """Average two sentiment scores, treating a missing score as neutral.

    Args:
        s1: First score, or None when unavailable.
        s2: Second score, or None when unavailable.

    Returns:
        The arithmetic mean of the two scores, with None replaced by 0.5.
    """
    # `is None` is the reliable missing-value test; `== None` defers to the
    # operand's __eq__ (e.g. numpy scalars returned by a classifier).
    first = 0.5 if s1 is None else s1
    second = 0.5 if s2 is None else s2
    return (first + second) / 2

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Developer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# Folder that holds the raw JSON files for AAPL.
path_to_json = 'stocks/AAPL/Raw/'
# os.listdir returns entries in arbitrary order; sort the names so the rows of
# the combined frame come out in the same order on every run and machine.
json_files = sorted(pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json'))

In [15]:
# Combine every raw JSON file into one frame, then renumber the rows 0..n-1 so
# that later cells can address rows by position.
raw_json_data = append_all_json_data(path_to_json, json_files).reset_index(drop=True)

In [27]:
# Build one wrangled row per raw message: who wrote what and when, the three
# sentiment scores, and the market data for the day the message was created.
new_dataframe = pd.DataFrame(columns=["time_created","username","message","old_sentiment","derived_sentiment","sentiment","open","close","volume"])
for i in range(raw_json_data.shape[0]):
    ts = raw_json_data["created_at"][i]
    body = raw_json_data["body"][i]
    # Un-padded Y-M-D key, the format expected by the price lookup helpers.
    day = str(ts.year)+"-"+str(ts.month)+"-"+str(ts.day)

    # Compute each score once; previously the label extraction and the
    # classifier both ran twice per row (once for the column, once for the
    # average).
    old_sentiment = categoryize_sentiment(get_raw_sentiment(raw_json_data["entities"][i]["sentiment"]))
    derived_sentiment = get_bow_sentiment(body)

    new_dataframe.loc[i] = [
        ts,
        raw_json_data["user"][i]["username"],
        body,
        old_sentiment,
        derived_sentiment,
        avg_sentiment(old_sentiment, derived_sentiment),
        get_openprice_at_time(day),
        get_stockprice_at_time(day),
        get_volume_at_time(day),
    ]

In [30]:
# Collapse the per-message rows into one row per calendar day of
# `time_created`, averaging the columns within each day.
# NOTE(review): this presumably relies on an older pandas silently dropping
# columns that cannot be averaged (username, message, ...); pandas >= 2.0
# raises instead -- confirm the pinned pandas version before upgrading.
ndf = new_dataframe.groupby(by=new_dataframe['time_created'].dt.date).mean()

In [39]:
# Fit a linear model that explains the daily close from the day's average
# sentiment and opening price.
# NOTE: `y` replaces the label array of the bag-of-words classifier cell.
X = ndf[['sentiment', 'open']]
y = ndf['close']
regr = linear_model.LinearRegression()
regr.fit(X, y)
# Predict for all rows in one vectorized call instead of one predict() per row
# inside apply(); passing the same frame used for fitting also avoids sklearn's
# "X does not have valid feature names" warning.
# These are in-sample predictions: the model is scored on its training rows.
ndf['predicted_closing'] = regr.predict(X)

Unnamed: 0_level_0,sentiment,open,close,volume,predicted_closing
time_created,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-11-06,0.694444,151.697,151.088,65463900.0,152.660605
2021-11-07,0.703039,151.697,151.088,65463900.0,152.79366
2021-11-08,0.621941,151.217,150.249,55020900.0,151.055925
2021-11-09,0.631768,150.009,150.618,56787900.0,149.994537
2021-12-15,0.580139,174.887,179.072,131063300.0,174.186859
2021-12-16,0.570624,179.052,172.041,150185800.0,178.22357
2021-12-17,0.564965,169.714,170.922,195432700.0,168.755313
2021-12-18,0.64717,169.714,170.922,195432700.0,170.027997
2021-12-19,0.654321,169.714,170.922,195432700.0,170.13871
2021-12-20,0.591241,168.066,169.534,107499100.0,167.506583


In [38]:
# Persist the per-day frame for downstream use.
# NOTE(review): the execution counts (In [38] here, In [39] for the regression
# cell) suggest this cell last ran before `predicted_closing` was added --
# re-run the notebook top to bottom to make sure the CSV matches the frame.
ndf.to_csv("stocks/AAPL/Wrangled/wrangled_data.csv")