In [1]:
import json
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.kernel_ridge import KernelRidge
import numpy as np

# Creating Corpus

In [3]:
# Daily market data, one row per trading day, with a 'Date' column (YYYY-MM-DD).
market_data = pd.read_csv('market_data.csv')

# Headlines grouped by date. Read as UTF-8 explicitly so that decoding does not
# depend on the platform's default locale encoding.
with open('readable.json', encoding='utf-8') as json_file:
    heads = json.load(json_file)

In [4]:
# Preview the first rows. Columns suffixed _x are VIX, columns suffixed _y are DJI.
market_data.head()

Unnamed: 0,Date,Open_x,High_x,Low_x,Close_x,Adj Close_x,Volume_x,Open_y,High_y,Low_y,Close_y,Adj Close_y,Volume_y
0,2000-01-03,24.360001,26.15,23.98,24.209999,24.209999,0,11501.849609,11522.009766,11305.69043,11357.509766,11357.509766,169750000
1,2000-01-04,24.940001,27.18,24.799999,27.01,27.01,0,11349.75,11350.05957,10986.450195,10997.929688,10997.929688,178420000
2,2000-01-05,27.98,29.0,25.85,26.41,26.41,0,10989.370117,11215.099609,10938.669922,11122.650391,11122.650391,203190000
3,2000-01-06,26.68,26.709999,24.700001,25.73,25.73,0,11113.370117,11313.450195,11098.450195,11253.259766,11253.259766,176550000
4,2000-01-07,25.139999,25.17,21.719999,21.719999,21.719999,0,11247.05957,11528.139648,11239.919922,11522.55957,11522.55957,184900000


In [6]:
head_dates = list(heads.keys())  # every date that has headlines (keys carry no dashes)

# Headlines are also written on weekends, when the markets are closed, so keep
# only the trading days in market_data for which headlines exist.
# One boolean mask is built once and reused, instead of re-scanning the list of
# dates for every row.
has_headlines = (
    market_data['Date']
    .map(lambda d: d.replace("-", "") in heads)
    .astype(bool)
)
dates = market_data.loc[has_headlines, 'Date'].tolist()

# Select closing prices by column name rather than by position, so an extra or
# reordered column in the CSV cannot silently pick the wrong series.
vix_close = market_data.loc[has_headlines, 'Close_x'].tolist()  # VIX closing price
dji_close = market_data.loc[has_headlines, 'Close_y'].tolist()  # DJI closing price

In [7]:
# Build one document per trading day by joining that day's headlines with ". ".
# `heads` is keyed by the date without dashes, hence the replace().
corpus = [
    '. '.join(heads[trading_day.replace("-", "")])
    for trading_day in dates
]

# Create tf-idf Scores

In [8]:
# Fit the tf-idf vocabulary on the corpus and score every document.
# min_df / max_df are passed as floats (document-frequency proportions);
# norm=None leaves the per-document score vectors unnormalised.
vectorizer = TfidfVectorizer(
    min_df=0.01,
    max_df=0.2,
    norm=None,
)
tfidf_scores = vectorizer.fit_transform(corpus)

In [9]:
tfidf_matrix = tfidf_scores.toarray()  # densify the sparse tf-idf scores

# get_feature_names() was removed in scikit-learn 1.2; use the replacement
# get_feature_names_out() when it exists and fall back on older versions.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One row per document (trading day), one column per vocabulary term.
df_tfidf_scores = pd.DataFrame(tfidf_matrix, columns=feature_names)
df_tfidf_scores.head()

Unnamed: 0,000,08,100,101,11,12,13,14,15,150,...,youtube,yuan,yukos,yum,zealand,zero,zika,zimbabwe,zone,zurich
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.477608,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,3.239483,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,2.704055,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Modeling

## Kernel Ridge with VIX

In [16]:
# Target: VIX closing price for each trading day in the corpus.
y = vix_close

# 70/30 split with a fixed seed for reproducibility.
# NOTE(review): train_test_split shuffles by default, so train and test days are
# interleaved in time -- consider a chronological split for this kind of data.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_matrix, y, test_size=0.3, random_state=0
)

clf = KernelRidge(alpha=1, kernel="poly")
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

# mean_squared_error expects (y_true, y_pred); compute it once and reuse it.
mse = mean_squared_error(y_test, pred)
print("R^2 Score: " + str(clf.score(X_test, y_test)))
print("MSE: " + str(mse))
print("Sqrt(MSE): " + str(math.sqrt(mse)))

R^2 Score: 0.3671007717845833
MSE: 45.99317472403121
Sqrt(MSE): 6.781826798439431


## Kernel Ridge with DJI

In [17]:
# Target: DJI closing price for each trading day in the corpus.
y = dji_close

# 70/30 split with a fixed seed for reproducibility.
# NOTE(review): train_test_split shuffles by default, so train and test days are
# interleaved in time -- consider a chronological split for this kind of data.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_matrix, y, test_size=0.3, random_state=0
)

clf = KernelRidge(alpha=1, kernel="poly")
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

# mean_squared_error expects (y_true, y_pred); compute it once and reuse it.
mse = mean_squared_error(y_test, pred)
print("R^2 Score: " + str(clf.score(X_test, y_test)))
print("MSE: " + str(mse))
print("Sqrt(MSE): " + str(math.sqrt(mse)))

R^2 Score: 0.6956996617893683
MSE: 8295060.98485426
Sqrt(MSE): 2880.1147520288596
