In [483]:
import warnings

import io

import multiprocessing
import statsmodels.api as sm
import string 

import pandas as pd
import glob
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.porter import *

from typing import List

from sklearn import utils
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils import resample

import re

import xgboost
from xgboost import XGBClassifier

from keras import optimizers
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Dropout, Activation
from keras.utils.np_utils import to_categorical

from gensim.models import word2vec
from gensim.models import KeyedVectors
from gensim.corpora.dictionary import Dictionary
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec


from tqdm import tqdm
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()

from tqdm import notebook
import math


from numpy.random import randn
from numpy.random import seed
from scipy.stats import pearsonr
warnings.filterwarnings('ignore')

# Part 1: Sentiment Analysis

__Prepare the data__

In [484]:
text = []
for filename in glob.glob('*.txt'):
    text.append(filename)
text

['Mandy.txt', 'Mayank.txt', 'Teju.txt', 'Willa.txt']

In [485]:
def open_txt_file(file):
    '''
    Get the content in the txt file
    '''
    with open(file, 'r') as f:
        content = f.read()
    return content

In [486]:
headlines = pd.read_csv('news_full_list.csv', encoding = 'unicode_escape')
headlines.reset_index(inplace = True)
headlines = headlines[['index', 'company', 'titles', 'release_date']]
headlines.head()

Unnamed: 0,index,company,titles,release_date
0,0,AAPL,Relative Strength Alert For Apple,2/27/2020
1,1,AAPL,Why Computer Stocks Fell Today,2/27/2020
2,2,AAPL,2 Key Trends to Watch in Music Streaming,2/27/2020
3,3,AAPL,Apple (AAPL) Down 9.8% Since Last Earnings Rep...,2/27/2020
4,4,AAPL,Apple's Coronavirus Weakness Could Mean Invest...,2/27/2020


In [487]:
def parse_sentiment(df, titles):
    sentiment_list = titles.split('\n')
    for sen in sentiment_list:
        last_word = sen.split(' ')[-1]
        try:
            if int(last_word) in [0, 1, -1]:
                index = int(sen.split('\t')[0])
                df.loc[index, 'sentiment'] = int(last_word)
        except:
            continue
    return df

In [488]:
for txt in ['Willa.txt', 'Mandy.txt', 'Teju.txt', 'Mayank.txt']:
    titles = open_txt_file(txt)
    headlines = parse_sentiment(headlines, titles)

headlines = headlines.fillna('Unknown')

In [16]:
def headlines_with_label(df):
    '''
    Retrive headlines with sentiment labelled
    '''
    label = df[df['sentiment'] != 'Unknown']
    return label

In [17]:
stopword = set(stopwords.words('english'))
labelled_lines = headlines_with_label(headlines)
labelled_lines.reset_index(drop = True, inplace = True)
labelled_lines['sentiment'].value_counts()

 0.0    1759
 1.0     721
-1.0     372
Name: sentiment, dtype: int64

In [215]:
# neu = labelled_lines[labelled_lines['sentiment'] == 0].iloc[:400]
# pos = labelled_lines[labelled_lines['sentiment'] == 1].iloc[:400]
# neg = labelled_lines[labelled_lines['sentiment'] == -1]
# labelled = pd.concat([neu, pos, neg])
# labelled.reset_index(drop = True, inplace = True)
# labelled['sentiment'].value_counts()

__Incorporate pre-trained vectors__

In [216]:
# def load_vectors(fname):
#     fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
#     n, d = map(int, fin.readline().split())
#     word_vector = {}
#     for line in fin:
#         tokens = line.rstrip().split(' ')
#         word_vector[tokens[0]] = np.array(list(map(float, tokens[1:])))
#     return word_vector

In [218]:
# word_vector = load_vectors('wiki-news-300d-1M.vec')

In [None]:
# len(word_vector)

__Incorporate Doc2Vec__

In [18]:
def remove_punctuation(line):
    line = str(line)
    if line.strip()=='':
        return ''
    rule = re.compile(u"[^a-zA-Z0-9\u4E00-\u9FA5]")
    line = rule.sub(' ',line)
    return line

In [19]:
headlines_copy = headlines[headlines['sentiment'] != 'Unknown']
headlines_copy['clean_titles'] = headlines_copy['titles'].progress_apply(remove_punctuation)

HBox(children=(FloatProgress(value=0.0, max=2852.0), HTML(value='')))




In [20]:
def lowercase_remove_stopwords(text):
    '''
    Remove stop words
    '''
    text_without_stopwords = [word for word in text.lower().split(' ') if word not in stop_words]
    return ' '.join(text_without_stopwords).strip()

In [21]:
words = stopwords.words('english')
punctuations = list(string.punctuation)
stop_words = words + punctuations
headlines_copy['clean_titles'] = headlines_copy['clean_titles'].progress_apply(lowercase_remove_stopwords)

HBox(children=(FloatProgress(value=0.0, max=2852.0), HTML(value='')))




In [22]:
stemmer = PorterStemmer()
def stemming(text):
    '''
    Stem all words by Porter
    '''
    plurals = text.split()
    singles = [stemmer.stem(plural) for plural in plurals]
    return ' '.join(singles)

In [23]:
headlines_copy['clean_titles'] = headlines_copy['clean_titles'].progress_apply(stemming)
headlines_copy['clean_titles'] = headlines_copy['clean_titles'].apply(lambda x: x.split(' '))

HBox(children=(FloatProgress(value=0.0, max=2852.0), HTML(value='')))




__DBOW__

In [24]:
df_pos = headlines_copy[headlines_copy['sentiment'] == 1]
df_neg = headlines_copy[headlines_copy['sentiment'] == -1]
df_majority = headlines_copy[headlines_copy['sentiment'] == 0]
df_pos_unsampled = resample(df_pos, replace = True, n_samples = len(df_pos) * 2, random_state = 123)
df_neg_unsampled = resample(df_neg, replace = True, n_samples = len(df_neg) * 5, random_state = 123)
headlines_oversample = pd.concat([df_majority, df_pos_unsampled, df_neg_unsampled])

In [25]:
headlines_oversample['sentiment'].value_counts()

-1.0    1860
 0.0    1759
 1.0    1442
Name: sentiment, dtype: int64

In [26]:
train, test = train_test_split(headlines_oversample, test_size=0.3, random_state=42,stratify = headlines_oversample.sentiment.values)

train_tagged = train.apply(
    lambda r: TaggedDocument(words=r['clean_titles'], tags=[r['sentiment']]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=r['clean_titles'], tags=[r['sentiment']]), axis=1)

In [27]:
cores = multiprocessing.cpu_count()

model_dbow = Doc2Vec(dm=0,  negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

100%|█████████████████████████████████████████████████████████████████████████| 3542/3542 [00:00<00:00, 1044080.73it/s]


In [28]:
%%time
for epoch in range(10):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|█████████████████████████████████████████████████████████████████████████| 3542/3542 [00:00<00:00, 1307190.92it/s]
100%|█████████████████████████████████████████████████████████████████████████| 3542/3542 [00:00<00:00, 3554120.76it/s]
100%|█████████████████████████████████████████████████████████████████████████| 3542/3542 [00:00<00:00, 3550722.94it/s]
100%|█████████████████████████████████████████████████████████████████████████| 3542/3542 [00:00<00:00, 3419154.15it/s]
100%|█████████████████████████████████████████████████████████████████████████| 3542/3542 [00:00<00:00, 3549026.46it/s]
100%|█████████████████████████████████████████████████████████████████████████| 3542/3542 [00:00<00:00, 3547331.61it/s]
100%|█████████████████████████████████████████████████████████████████████████| 3542/3542 [00:00<00:00, 3444522.32it/s]
100%|█████████████████████████████████████████████████████████████████████████| 3542/3542 [00:00<00:00, 3551571.78it/s]
100%|███████████████████████████████████

Wall time: 1.33 s


In [29]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors
 
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

In [30]:
logreg = LogisticRegression(n_jobs=3, C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.771560236998025
Testing F1 score: 0.7708124610365406


__DM__

In [31]:
model_dmm = Doc2Vec(dm=1, dm_mean=1, window=10, negative=5, min_count=1, workers=5, alpha=0.065, min_alpha=0.065)
model_dmm.build_vocab([x for x in tqdm(train_tagged.values)])

100%|█████████████████████████████████████████████████████████████████████████| 3542/3542 [00:00<00:00, 1775149.33it/s]


In [32]:
%%time
for epoch in range(10):
    model_dmm.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dmm.alpha -= 0.002
    model_dmm.min_alpha = model_dmm.alpha

100%|█████████████████████████████████████████████████████████████████████████| 3542/3542 [00:00<00:00, 1777272.97it/s]
100%|█████████████████████████████████████████████████████████████████████████| 3542/3542 [00:00<00:00, 3435759.66it/s]
100%|█████████████████████████████████████████████████████████████████████████| 3542/3542 [00:00<00:00, 1776635.35it/s]
100%|█████████████████████████████████████████████████████████████████████████| 3542/3542 [00:00<00:00, 3551571.78it/s]
100%|█████████████████████████████████████████████████████████████████████████| 3542/3542 [00:00<00:00, 3443723.87it/s]
100%|█████████████████████████████████████████████████████████████████████████| 3542/3542 [00:00<00:00, 3437349.55it/s]
100%|█████████████████████████████████████████████████████████████████████████| 3542/3542 [00:00<00:00, 1774301.30it/s]
100%|█████████████████████████████████████████████████████████████████████████| 3542/3542 [00:00<00:00, 3554120.76it/s]
100%|███████████████████████████████████

Wall time: 2.03 s





In [33]:
y_train, X_train = vec_for_learning(model_dmm, train_tagged)
y_test, X_test = vec_for_learning(model_dmm, test_tagged)
 
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
 
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.7419354838709677
Testing F1 score: 0.7385351855572431


__Generate a new model__

In [34]:
model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])

In [35]:
def get_vectors(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors
 
y_train, X_train = get_vectors(new_model, train_tagged)
y_test, X_test = get_vectors(new_model, test_tagged)

In [36]:
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
 
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.825543120473996
Testing F1 score: 0.8236678971155529


__Test for the result__

In [37]:
non_labelled = headlines[headlines['sentiment'] == 'Unknown']
non_labelled['clean_titles'] = non_labelled['titles'].progress_apply(remove_punctuation)
non_labelled['clean_titles'] = non_labelled['titles'].progress_apply(lowercase_remove_stopwords)
non_labelled['clean_titles'] = non_labelled['titles'].progress_apply(stemming)
non_labelled['clean_titles'] = non_labelled['clean_titles'].apply(lambda x: x.split(' '))

HBox(children=(FloatProgress(value=0.0, max=2148.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2148.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2148.0), HTML(value='')))




In [38]:
tagged = non_labelled.apply(lambda r: TaggedDocument(words=r['clean_titles'], tags=[r['sentiment']]), axis=1)
test = vec_for_learning(new_model, tagged)[1]
pred = logreg.predict(test)

non_labelled['sentiment'] = pred
non_labelled.reset_index(drop = True, inplace = True)
non_labelled.head()

Unnamed: 0,index,company,titles,release_date,sentiment,clean_titles
0,0,AAPL,Relative Strength Alert For Apple,2/27/2020,0.0,"[rel, strength, alert, for, appl]"
1,48,AAPL,Microsoft Stock Looks Ready for a Correction T...,2/24/2020,-1.0,"[microsoft, stock, look, readi, for, a, correc..."
2,115,AAPL,Better Buy: Apple vs. Google,2/18/2020,-1.0,"[better, buy:, appl, vs., googl]"
3,122,AAPL,Apple Cuts Its Revenue Guidance for Fiscal Q2,2/17/2020,0.0,"[appl, cut, it, revenu, guidanc, for, fiscal, Q2]"
4,127,AAPL,Glu Mobile Surged on Earnings: Is It Time to B...,2/15/2020,0.0,"[glu, mobil, surg, on, earnings:, Is, It, time..."


In [39]:
headlines_copy.reset_index(drop = True, inplace = True)

headlines_sentiment = pd.concat([non_labelled, headlines_copy])
headlines_sentiment = headlines_sentiment.sort_values('index')
headlines_sentiment['sentiment'] = headlines_sentiment['sentiment'].apply(lambda x: int(x))
headlines_sentiment.reset_index(drop = True, inplace = True)
headlines_sentiment.head()

Unnamed: 0,index,company,titles,release_date,sentiment,clean_titles
0,0,AAPL,Relative Strength Alert For Apple,2/27/2020,0,"[rel, strength, alert, for, appl]"
1,1,AAPL,Why Computer Stocks Fell Today,2/27/2020,0,"[comput, stock, fell, today]"
2,2,AAPL,2 Key Trends to Watch in Music Streaming,2/27/2020,0,"[2, key, trend, watch, music, stream]"
3,3,AAPL,Apple (AAPL) Down 9.8% Since Last Earnings Rep...,2/27/2020,-1,"[appl, aapl, 9, 8, sinc, last, earn, report, r..."
4,4,AAPL,Apple's Coronavirus Weakness Could Mean Invest...,2/27/2020,1,"[appl, coronaviru, weak, could, mean, investor..."


In [383]:
# headlines_sentiment.to_csv('Headlines with sentiments.csv', index = False)

__xgboost__

In [334]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors
 
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

In [335]:
X_train_df = pd.DataFrame(X_train)
y_train = list(y_train)
X_test_df = pd.DataFrame(X_test)
y_test = list(y_test)

In [336]:
model = XGBClassifier()
model.fit(X_train_df, y_train)

y_pred = model.predict(X_test_df)
predictions = [round(value) for value in y_pred]

accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 76.96%


__Create word2vec__

In [69]:
news_df = pd.read_csv('full_list_wweight.csv', encoding = 'unicode_escape')
news_df = news_df.dropna()

all_headlines = list(headlines['titles']) + list(news_df['article'])
prg_title = [] 
  
# iterate through each sentence in the file 
for i in all_headlines: 
    temp = [] 
      
    # tokenize the sentence into words 
    for j in word_tokenize(i): 
        if j.isalpha():
            temp.append(j.lower()) 
  
    prg_title.append(temp) 
    
headlines_model = word2vec.Word2Vec(prg_title, size=100,min_count=10,window=5)
headlines_model.wv.save_word2vec_format("news_w2v.bin", binary=True)

In [70]:
# news_df['article'][0]
# Strip some unnecessary words

In [71]:
headlines_model.most_similar('fall')

[('rebound', 0.8655657768249512),
 ('drop', 0.855480432510376),
 ('rise', 0.8441327810287476),
 ('decline', 0.8367825746536255),
 ('surge', 0.8360257744789124),
 ('gain', 0.8251054883003235),
 ('jump', 0.8194429278373718),
 ('kick', 0.8179096579551697),
 ('miss', 0.8090846538543701),
 ('double', 0.7971330881118774)]

__LSTM__

In [72]:
def load_w2v(w2v_file):
    """
    load w2v file and return gensim.models.word2vec.Word2Vec object
    """
    
    return KeyedVectors.load_word2vec_format(w2v_file, binary=True)

In [73]:
def gen_w2ix(w2v_model):
    """
    Create a dictionary
    """
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(w2v_model.wv.vocab.keys(), allow_update=True)
    w2ix = {v: k + 1 for k, v in gensim_dict.items()}
    
    return w2ix

In [74]:
def get_ix_vec(sents: List[List[str]], w2ix):
    """
    Transfer a file to index array
    """
    new_sentences = []
    for sen in sents:
        new_sen = []
        for word in sen:
            try:
                new_sen.append(w2ix[word])
            except:
                new_sen.append(0)
        new_sentences.append(np.array(new_sen))

    return np.array(new_sentences)

In [75]:
def gen_w2ix_weight(index_dic, w2v_model):
    """
    Generate weights from w2v corresponding to the dictionary
    """
    weights = np.zeros((len(index_dic)+1, w2v_model.vector_size))
    for w, index in index_dic.items():
        weights[index, :] = w2v_model[w]
    
    return weights

In [76]:
# Define the layer of LSTM
def train_lstm(embedding_weights, x_train, y_train, x_test, y_test, **kwargs):    
    print (u'Creating a model...')
    model = Sequential()  # or Graph or whatever
    model.add(Embedding(output_dim = 128,
                        input_dim = W2IX_DIM,
                        mask_zero = True,
                        weights = [embedding_weights],
                        input_length = INPUT_LEN,
                       ))  # Adding Input Length
    model.add(LSTM(input_dim=128, 
                   output_dim = kwargs.get('lstm_out_dim', 64), 
                   activation = kwargs.get('lstm_actv', 'tanh'),
                   dropout=kwargs.get('lstm_drop_out', .2)))
    model.add(Dropout(kwargs.get('drop_out', .3)))
    model.add(Dense(output_dim=N_CLASS, activation=kwargs.get('dens_actv', 'softmax')))
    model.add(Activation('tanh'))

    print (u'Compiling...')
    sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss = 'categorical_crossentropy',
                  optimizer = sgd,
                  metrics = ['accuracy'])

    print (u"Training...")
    model.fit(x_train, y_train, batch_size = BATCH_SIZE, nb_epoch = EPOCH, validation_data = (x_test, y_test))

    print (u"Evalualting...")
    score, acc = model.evaluate(x_test, y_test, batch_size = BATCH_SIZE)
    print ('Test score: %.3f' % score)
    print ('Test accuracy: %.3f' % acc)
    return model, kwargs.get('dens_actv', 'softmax')

In [77]:
all_txt = list(labelled_lines['titles'].apply(lambda x: x.lower()))
all_lable = list(labelled_lines['sentiment'].apply(lambda x: str(int(x))))

In [78]:
w2v = load_w2v("news_w2v.bin")
w2ix = gen_w2ix(w2v)

In [79]:
# Define global variables
N_CLASS = 3
W2IX_DIM = len(w2ix) + 1
EPOCH = 15
BATCH_SIZE = 16

In [107]:
len(weights)

4556

In [80]:
# Prepare data
titles = get_ix_vec(all_txt, w2ix)
weights = gen_w2ix_weight(w2ix, w2v.wv)

INPUT_LEN = max([len(l) for l in titles])

train_x, test_x, train_y, test_y = train_test_split(titles, np.array(all_lable))
train_x = sequence.pad_sequences(train_x, INPUT_LEN)
test_x = sequence.pad_sequences(test_x, INPUT_LEN)
train_y = to_categorical(train_y,num_classes = N_CLASS)
test_y = to_categorical(test_y,num_classes = N_CLASS)

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(len(train_x)), "\nTest set: \t\t{}".format(len(test_x)))

			Feature Shapes:
Train set: 		2139 
Test set: 		713


In [407]:
# Train the model
model = train_lstm(weights, train_x, train_y, test_x, test_y, dens_actv='sigmoid')[0]

Creating a model...
Compiling...
Training...
Train on 1356 samples, validate on 452 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Evalualting...
Test score: 0.911
Test accuracy: 0.608


In [414]:
t_txt = headlines['titles'][1210]
test = sequence.pad_sequences([get_ix_vec([t_txt], w2ix)[0]], INPUT_LEN)
headlines['titles'][1210], model.predict_classes(test)

('Validea Martin Zweig Strategy Daily Upgrade Report - 1/28/2020', array([0]))

# Part 2: Predictive power of sentiments

Since using Doc2vec (combined) & MNL generates the highest accuracy, we stick with this method, and try to investigate the prediction power of the sentiments on the following day's stock price.

In [497]:
# read stock prices
price = pd.read_excel('api_full.xlsx', encoding = 'cp1252')
price.columns = ['date', 'open', 'high', 'low', 'close', 'adjusted close', 'vloume', 'dividend amount', 'split coef', 'company']
price.head()

Unnamed: 0,date,open,high,low,close,adjusted close,vloume,dividend amount,split coef,company
0,2020-03-10,277.14,286.44,269.37,285.34,285.34,70721316,0.0,1,AAPL
1,2020-03-09,263.75,278.09,263.0,266.17,266.17,71686208,0.0,1,AAPL
2,2020-03-06,282.0,290.82,281.23,289.03,289.03,56544246,0.0,1,AAPL
3,2020-03-05,295.52,299.55,291.41,292.92,292.92,46893219,0.0,1,AAPL
4,2020-03-04,296.44,303.4,293.13,302.74,302.74,54794568,0.0,1,AAPL


In [498]:
headlines_sentiment.head()

Unnamed: 0,index,company,titles,release_date,sentiment,clean_titles
0,0,AAPL,Relative Strength Alert For Apple,2/27/2020,0,"[rel, strength, alert, for, appl]"
1,1,AAPL,Why Computer Stocks Fell Today,2/27/2020,0,"[comput, stock, fell, today]"
2,2,AAPL,2 Key Trends to Watch in Music Streaming,2/27/2020,0,"[2, key, trend, watch, music, stream]"
3,3,AAPL,Apple (AAPL) Down 9.8% Since Last Earnings Rep...,2/27/2020,-1,"[appl, aapl, 9, 8, sinc, last, earn, report, r..."
4,4,AAPL,Apple's Coronavirus Weakness Could Mean Invest...,2/27/2020,1,"[appl, coronaviru, weak, could, mean, investor..."


In [530]:
price_company = list(price.company.unique())
delta_company_prices = pd.DataFrame()
for i in notebook.tqdm(price_company):
    delta_company_price = price.loc[price['company'] == i, 'low'].diff(periods = -1).reset_index(drop = True)
    delta_company_price = delta_company_price/price.loc[price['company'] == i, 'low'].reset_index(drop = True)
    delta_company_prices = pd.concat([delta_company_prices, delta_company_price], axis = 1)

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))




In [531]:
delta_company_prices.columns = price_company
delta_company_prices = pd.concat([delta_company_prices, price.loc[0:100, 'date']], axis = 1)
delta_company_prices = delta_company_prices.drop([99,100], axis = 0)

In [532]:
delta_company_prices.columns = ['AAPL', 'ADBE', 'AMZN','COSTCO', 'EXPEDIA', 'FB', 'GOOGL', 'NDAQ', 'NFLX', 'SBUX', 'TSLA', 'date']
# the time range for price is 2019-10-17 ~ 2020-03-10

__Calculate every day's sentiment__

In [533]:
def add_weekend_news_to_Friday(company_model):
    new_company_model = pd.DataFrame()
    for i in range(1, len(company_model)):
        new_level = company_model.iloc[i-1:i, :]
        if math.isnan(new_level.iloc[:, 3]):
            if len(new_company_model) != 0:
                new_company_model.iloc[-1, 1] = new_company_model.iloc[-1, 1] + new_level.iloc[0,1]
                new_company_model.iloc[-1, 2] = new_company_model.iloc[-1, 2] + new_level.iloc[0,2]
        else:
            new_company_model = pd.concat([new_company_model, new_level], axis = 0)
    return(new_company_model)

Create a function that loops through all the companies

In [534]:
def create_company_model (company_name):
    company_sentiment = headlines_sentiment.loc[(headlines_sentiment['company'] == company_name) & (headlines_sentiment['release_date'] != 'Unknown'), ['sentiment','release_date']].groupby(['release_date']).agg({'sentiment':['sum', 'count']})
    company_sentiment.reset_index(level=0, inplace=True)
    company_date = pd.to_datetime(company_sentiment.loc[company_sentiment['release_date'] != ('Unknown'),'release_date'])
    company_sentiment = pd.concat([company_date, company_sentiment['sentiment']], axis = 1)
    company_sentiment = company_sentiment.sort_values(by = 'release_date')
    company_price = delta_company_prices.loc[:,['date', company_name, 'NDAQ']].sort_values(by = 'date')
    company_model = company_sentiment.set_index('release_date').join(company_price.set_index('date'))
    company_model.reset_index(level=0, inplace=True)
    company_model_full = add_weekend_news_to_Friday(company_model)
    company_model_full['sentiment'] = company_model_full['sum']/company_model_full['count']
    company_model_full.columns = ['date', 'sum', 'count', 'company', 'market', 'sentiment']
    company_model_full['sentiment_yesterd'] = company_model_full['sentiment'].shift(1)
    return company_model_full

In [535]:
AAPL_model

Unnamed: 0,date,sum,count,company,market,sentiment,sentiment_yesterd
0,2020-01-15,1,13,-0.006561,0.008779,0.076923,
1,2020-01-16,1,23,0.000634,0.003958,0.043478,0.076923
2,2020-01-17,0,26,0.009538,0.006675,0.0,0.043478
6,2020-01-21,5,11,0.000878,0.007081,0.454545,0.0
7,2020-01-22,-2,22,0.003031,-0.006396,-0.090909,0.454545
8,2020-01-23,-1,13,-0.001346,0.002097,-0.076923,-0.090909
9,2020-01-24,4,27,0.01166,0.005531,0.148148,-0.076923
12,2020-01-27,-1,12,-0.037079,-9.1e-05,-0.083333,0.148148
13,2020-01-28,1,29,0.020823,0.011739,0.034483,-0.083333
14,2020-01-29,4,21,0.028824,0.036106,0.190476,0.034483


In [536]:
AAPL_model = create_company_model('AAPL')
ADBE_model = create_company_model('ADBE')
AMZN_model = create_company_model('AMZN')
COST_model = create_company_model('COSTCO')
EXPE_model = create_company_model('EXPEDIA')
FB_model = create_company_model('FB')
GOOGL_model = create_company_model('GOOGL')
NFLX_model = create_company_model('NFLX')
SBUX_model = create_company_model('SBUX')
TSLA_model = create_company_model('TSLA')

In [537]:
full_model = pd.concat([AAPL_model, ADBE_model, AMZN_model, COST_model, EXPE_model, FB_model, GOOGL_model, NFLX_model, SBUX_model, TSLA_model], axis = 0)
full_model_nona = full_model.dropna()

In [538]:
full_model_nona

Unnamed: 0,date,sum,count,company,market,sentiment,sentiment_yesterd
1,2020-01-16,1,23,0.008139,0.012804,0.043478,0.076923
2,2020-01-17,0,26,0.009238,0.002314,0.000000,0.043478
6,2020-01-21,5,11,0.003165,0.008626,0.454545,0.000000
7,2020-01-22,-2,22,0.004128,-0.006930,-0.090909,0.454545
8,2020-01-23,-1,13,-0.005259,0.002397,-0.076923,-0.090909
...,...,...,...,...,...,...,...
92,2020-02-20,3,9,-0.047771,-0.016590,0.333333,0.222222
93,2020-02-21,2,9,0.023295,0.005878,0.222222,0.333333
96,2020-02-24,-1,7,-0.070847,-0.014236,-0.142857,0.222222
97,2020-02-25,-2,5,-0.044727,-0.004828,-0.400000,-0.142857


In [539]:
Y = full_model_nona['company']
X = full_model_nona['sentiment_yesterd']
X_ = sm.add_constant(X)
model = sm.OLS(Y, X_).fit()
model.summary()

0,1,2,3
Dep. Variable:,company,R-squared:,0.016
Model:,OLS,Adj. R-squared:,0.015
Method:,Least Squares,F-statistic:,9.402
Date:,"Sun, 15 Mar 2020",Prob (F-statistic):,0.00227
Time:,21:35:17,Log-Likelihood:,1246.1
No. Observations:,567,AIC:,-2488.0
Df Residuals:,565,BIC:,-2480.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0002,0.001,0.194,0.846,-0.002,0.003
sentiment_yesterd,0.0094,0.003,3.066,0.002,0.003,0.015

0,1,2,3
Omnibus:,664.974,Durbin-Watson:,1.975
Prob(Omnibus):,0.0,Jarque-Bera (JB):,155395.513
Skew:,-5.123,Prob(JB):,0.0
Kurtosis:,83.453,Cond. No.,2.75
