In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
import pysentiment2 as ps
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import gensim
from sklearn.feature_extraction.text import CountVectorizer
import yfinance as yf


def sentiment_vader(var):
    #pip install vaderSentiment
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 
    sentiment = SentimentIntensityAnalyzer() 
    out_score = sentiment.polarity_scores(var)
    return out_score
def sentiment_ps(var):
    lm = ps.LM()
    tokens = lm.tokenize(var)
    score = lm.get_score(tokens)
    return score
def topic_distribution(string_input):
    string_input = [string_input]
    # Fit and transform
    X = vect.transform(string_input)
 
    # Convert sparse matrix to gensim corpus.
    corpus = gensim.matutils.Sparse2Corpus(X, documents_columns=False)
 
    output = list(ldamodel[corpus])[0]
 
    return output
def topic_prediction(my_document):
    string_input = [my_document]
    X = vect.transform(string_input)
    # Convert sparse matrix to gensim corpus.
    corpus = gensim.matutils.Sparse2Corpus(X, documents_columns=False)
    output = list(ldamodel[corpus])[0]
    topics = sorted(output,key=lambda x:x[1],reverse=True)
    return topics[0][0]
def normalize(arr):
     return (arr-min(arr))/(max(arr)-min(arr))
def normalize_2(arr):
     return np.interp(arr, (arr.min(), arr.max()), (-1, +1))

In [2]:
df = pd.read_pickle("sentiment.pkl")
df['Year']= pd.to_datetime(df['Year'])
df = df.set_index('Year')
df['Year'] = df.index

In [3]:
# Get Stock Market Data: VIX index
import yfinance as yf
stock_info = yf.download("^VIX", start="1993-02-01", end="2021-02-20")
stock_info.index = pd.to_datetime(stock_info.index)
stock_info

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1993-02-01,12.510000,12.920000,12.180000,12.330000,12.330000,0
1993-02-02,12.470000,12.890000,12.220000,12.250000,12.250000,0
1993-02-03,11.980000,12.340000,11.790000,12.120000,12.120000,0
1993-02-04,11.860000,12.840000,11.690000,12.290000,12.290000,0
1993-02-05,12.560000,13.450000,12.490000,12.900000,12.900000,0
1993-02-08,13.150000,13.940000,12.200000,13.220000,13.220000,0
1993-02-09,13.630000,13.760000,13.430000,13.480000,13.480000,0
1993-02-10,13.580000,14.260000,13.000000,13.430000,13.430000,0
1993-02-11,13.050000,13.650000,12.630000,12.690000,12.690000,0
1993-02-12,13.580000,13.660000,12.320000,12.380000,12.380000,0


In [4]:
# stock market volatibility within 5 days after the Fed meetings
import numpy as np
price_changes = []
price_change_percentages = []
labels = []
window = 5
# workaround for rolling widow
for index, row in df.iterrows():
    minutes_date = row['Year']
    try:
        start = stock_info.index.get_loc(minutes_date)
    except:
        start = stock_info.index.get_loc(minutes_date + pd.DateOffset(days=1))
    stock_closing_sum = stock_info.iloc[start+window]['Close']
    start_price = stock_info['Close'].iloc[start]
    price_change = stock_closing_sum - start_price
    price_change_percent = (stock_closing_sum - start_price) / start_price
    price_changes.append(price_change)
    price_change_percentages.append(price_change_percent)
    

df['price_change'] = price_changes
df['price_change_percent'] = price_change_percentages

#label =1 means high volatility, label = 1 means low volatility. High volatility is identified if VIX price change percent is far from its median (greater than 75% or smaller than 25%)
df['label'] = df['price_change_percent'].apply(lambda x: 1 if x> df["price_change_percent"].quantile(0.75) or x < df["price_change_percent"].quantile(0.25) else 0)

In [5]:
df

Unnamed: 0_level_0,Statement,polarity,positive,negative,subjectivity,Year,price_change,price_change_percent,label
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1993-02-03,A meeting of the Federal Open Market Committee...,-0.134503,74,97,0.080546,1993-02-03,1.310000,0.108086,1
1993-03-23,A meeting of the Federal Open Market Committee...,-0.302857,61,114,0.080571,1993-03-23,-0.790001,-0.060676,0
1993-05-18,A meeting of the Federal Open Market Committee...,-0.259615,77,131,0.093990,1993-05-18,-0.150000,-0.010571,0
1993-07-07,A meeting of the Federal Open Market Committee...,-0.156627,35,48,0.038444,1993-07-07,-2.270000,-0.173946,1
1993-08-17,A meeting of the Federal Open Market Committee...,-0.224299,83,131,0.097717,1993-08-17,0.210000,0.018119,0
1993-09-21,A meeting of the Federal Open Market Committee...,-0.173913,76,108,0.084949,1993-09-21,-5.110000,-0.295376,1
1993-11-16,A meeting of the Federal Open Market Committee...,0.023810,86,82,0.076642,1993-11-16,-0.839999,-0.055592,0
1993-12-21,A meeting of the Federal Open Market Committee...,0.024876,103,98,0.089096,1993-12-21,0.380000,0.037698,0
1994-02-04,A meeting of the Federal Open Market Committee...,-0.227723,78,124,0.088830,1994-02-04,-0.790000,-0.051803,0
1994-03-22,A meeting of the Federal Open Market Committee...,-0.153061,83,113,0.088608,1994-03-22,3.099999,0.231516,1


In [6]:
#Start feature dataset
X = df
X = np.array(X.drop(['label','Statement','Year','positive','negative','price_change','price_change_percent'],1))
y = np.array(df['label'])

In [7]:
#Split(80% training, 20% test)
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size= 0.4, random_state = 0)

In [8]:
# Create and train model
model = LinearDiscriminantAnalysis().fit(x_train, y_train)

In [9]:
predictions = model.predict(x_test)
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.60      0.53      0.56        45
           1       0.55      0.62      0.58        42

    accuracy                           0.57        87
   macro avg       0.58      0.58      0.57        87
weighted avg       0.58      0.57      0.57        87



In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

In [11]:
randomclassifier=RandomForestClassifier(n_estimators=200,criterion='entropy')
m = randomclassifier.fit(x_train,y_train)
predictions_2 = m.predict(x_test)
score = accuracy_score(y_test,predictions)
report = classification_report(y_test,predictions)
print(score)
print(report)

0.5747126436781609
              precision    recall  f1-score   support

           0       0.60      0.53      0.56        45
           1       0.55      0.62      0.58        42

    accuracy                           0.57        87
   macro avg       0.58      0.58      0.57        87
weighted avg       0.58      0.57      0.57        87

