In [2]:
import json
import numpy as np
import pickle
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

In [3]:
# Load the train / test splits.
# JSON text is UTF-8 by specification, so the encoding is pinned explicitly
# instead of relying on the platform default (which is not UTF-8 everywhere
# and would mis-decode any non-ASCII characters in the tweets).
with open("./training_set.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)
print("number of training instances:", len(train_data))

with open("./test_set.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)
print("number of testing instances:", len(test_data))

number of training instances: 1396
number of testing instances: 634


In [4]:
# Preview the first five training records. Each record is a dict with the keys
# 'sentiment', 'snippet', 'target' and 'tweet'; note in the output below that
# 'snippet' is sometimes a single string and sometimes a list of strings.
train_data[:5]

[{'sentiment': -0.463,
  'snippet': 'downgrade',
  'target': '$PCAR',
  'tweet': 'downgrades $SON $ARI $GG $FLTX $WMC $MFA $IVR $CMI $PCAR $QLIK $AFOP $UNFI #stocks #investing #tradeideas'},
 {'sentiment': 0.678,
  'snippet': ['looking sexy this morning', 'break on volume'],
  'target': '$AMZN',
  'tweet': "$AMZN looking sexy this morning...$600 break on volume and it's gone."},
 {'sentiment': 0.377,
  'snippet': 'still long term fan!',
  'target': '$SBUX',
  'tweet': "@GerberKawasaki stock hasn't moved much since first few weeks after split but still long term fan! $sbux"},
 {'sentiment': 0.129,
  'snippet': '$TFM will have a way to go price wise to compete with Kroger. $KR',
  'target': '$KR',
  'tweet': 'Whole foods $WFM may feel price competition but $TFM will have a way to go price wise to compete with Kroger. $KR https://t.co/XBxJVG94mx'},
 {'sentiment': 0.395,
  'snippet': 'iPhone SE Could Be Doing Better Than Expected',
  'target': '$AAPL',
  'tweet': "Apple's iPhone SE Could B

In [5]:
# Preview the first five test records (same record layout as the training set).
test_data[:5]

[{'sentiment': 0.323,
  'snippet': 'ooks pretty bullish for now',
  'target': '$ATVI',
  'tweet': "$ATVI ooks pretty bullish for now. from a short-term perspective, it's got a good chance of maybe sliding back to 33.70 #stocks #investing"},
 {'sentiment': 0.579,
  'snippet': ['looks really interesting on drop',
   'grabbed some options and stock',
   'enough tme before earnings to grow the stock',
   'growth all sectors'],
  'target': '$CSCO',
  'tweet': '$CSCO looks really interesting on drop, grabbed some options and stock, enough tme before earnings to grow the stock, growth all sectors'},
 {'sentiment': 0.294,
  'snippet': 'covered some shorts',
  'target': '$TSLA',
  'tweet': '$TSLA : covered some shorts @ 246.00 for +9.22pts'},
 {'sentiment': 0.028,
  'snippet': 'triple top forming.',
  'target': '$TSLA',
  'tweet': 'Watching $TSLA W triple top forming.'},
 {'sentiment': -0.076,
  'snippet': 'Whole Foods shareholders vote down activist initiatives',
  'target': '$WFM',
  'tweet':

In [6]:
# load NTUSD
# Load the NTUSD-Fin word-level lexicon.
# The encoding is pinned to UTF-8 (the JSON standard encoding) so the result
# does not depend on the platform's default text encoding.
with open("./NTUSD-Fin/NTUSD_Fin_word_v1.0.json", "r", encoding="utf-8") as f:
    NTUSD = json.load(f)

In [7]:
# Map each lexicon token to its market-sentiment score.
# If a token occurs more than once in NTUSD, the last occurrence wins.
word_sent_dict = {
    entry["token"]: entry["market_sentiment"]
    for entry in NTUSD
}

In [8]:
# Sanity check: look up the lexicon score of a single token.
word_sent_dict["downgrades"]

-2.766749238286149

In [9]:
# Inputs are the lower-cased tweet texts; targets are the gold sentiment scores.
# The builtin `float` is used as dtype: `np.float` was only an alias for it and
# has been removed from NumPy (AttributeError on NumPy >= 1.24).
train_X = [item["tweet"].lower() for item in train_data]
train_y = np.array([item["sentiment"] for item in train_data], dtype=float)

test_X = [item["tweet"].lower() for item in test_data]
test_y = np.array([item["sentiment"] for item in test_data], dtype=float)

In [10]:
def sentByDict(sent, lexicon=None):
    """Score a sentence as the mean lexicon sentiment of its known tokens.

    The sentence is split on whitespace and every token is looked up verbatim
    in the lexicon (a token with punctuation attached only matches if that
    exact string is a lexicon key). Tokens missing from the lexicon are ignored.

    Parameters
    ----------
    sent : str
        Sentence to score.
    lexicon : mapping of str -> float, optional
        Token-to-sentiment mapping. Defaults to the notebook-level
        ``word_sent_dict``.

    Returns
    -------
    float
        Average sentiment of the tokens found in the lexicon, or ``0.0`` when
        no token of the sentence is in the lexicon.
    """
    if lexicon is None:
        # Resolved at call time; if the lexicon cell was never run this raises
        # NameError instead of silently scoring every sentence as 0, which is
        # what the previous bare `except: pass` did.
        lexicon = word_sent_dict

    scores = [lexicon[token] for token in sent.split() if token in lexicon]
    if not scores:
        return 0.0
    return np.average(scores)
# Lexicon baseline: score every tweet, then min-max rescale the scores to [-1, 1]
# (presumably to put them on the same scale as the sentiment labels - confirm).
# NOTE: each split is rescaled with its own min/max, and a split whose scores
# are all identical would divide by zero here.
# The builtin `float` replaces `np.float`, which was removed in NumPy 1.24.
train_pred = np.array([sentByDict(sentence) for sentence in train_X], dtype=float)
train_nor_pred = (((train_pred - train_pred.min()) / (train_pred.max() - train_pred.min())) - 0.5) * 2.

test_pred = np.array([sentByDict(sentence) for sentence in test_X], dtype=float)
test_nor_pred = (((test_pred - test_pred.min()) / (test_pred.max() - test_pred.min())) - 0.5) * 2.

In [11]:
# Mean squared error of the rescaled lexicon baseline against the gold labels.
train_mse = mean_squared_error(train_y, train_nor_pred)
test_mse = mean_squared_error(test_y, test_nor_pred)
print("train mse:", train_mse)
print("test mse:", test_mse)

train mse: 0.28521670923266274
test mse: 0.2614622213278173


In [93]:
# Load the RNN-based predictions (one value per line, kept as strings here and
# converted to float when they are stacked with the lexicon scores).
# Blank lines are dropped: a trailing newline at the end of the file would
# otherwise yield an empty entry that cannot be converted to float and that
# makes the list one element longer than the number of tweets.
with open("rnn_pred.txt", "r") as f:
    rnn_test_pred = [line for line in f.read().splitlines() if line.strip()]
with open("rnn_train_pred.txt", "r") as f:
    rnn_train_pred = [line for line in f.read().splitlines() if line.strip()]

In [94]:
# Build an (n_samples, 2) feature matrix: column 0 is the raw lexicon score,
# column 1 is the RNN prediction.
# The RNN values are parsed to float *before* stacking, so the lexicon scores
# are no longer round-tripped through strings, and the builtin `float` replaces
# `np.float` (removed in NumPy 1.24).
combine_train_pred = np.vstack((train_pred, np.array(rnn_train_pred, dtype=float))).T
combine_test_pred = np.vstack((test_pred, np.array(rnn_test_pred, dtype=float))).T

In [95]:
# Shape check: one row per training tweet, one column per stacked predictor
# (lexicon score, RNN prediction).
combine_train_pred.shape

(1396, 2)

In [96]:
# Learn a linear mapping from the two stacked predictions to the gold
# sentiment on the training split, then apply it to the test split.
# (`fit` returns the estimator itself, so the calls can be chained.)
lr = LinearRegression().fit(combine_train_pred, train_y)
test_lr_pred = lr.predict(combine_test_pred)

In [97]:
# Test-set MSE of the combined (lexicon + RNN) linear model.
print("test mse:", mean_squared_error(y_true=test_y, y_pred=test_lr_pred))

test mse: 0.1102878817801669


1. Create features
    * average word embedding
    * average NTUSD word sentiment
2. XGBoost


In [74]:
# Total number of tweets across the training and test splits.
len(train_X) + len(test_X)

2030

In [57]:
# output .txt for rnn model prediction
def _write_tweet_file(path, tweets):
    """Write ``tweets`` to ``path`` as an ``id,text`` listing, one tweet per line.

    The first line is the header ``id,text``. Each following line is the
    zero-based position of the tweet, a comma, and the tweet text with embedded
    newlines replaced by spaces. No newline is written after the last tweet.

    NOTE(review): the text is written unquoted, so tweets that contain commas
    produce lines with more than one comma; whatever reads this file has to
    split on the first comma only.

    Parameters
    ----------
    path : str
        Destination file; overwritten if it already exists.
    tweets : iterable of str
        Tweet texts, in the order they should be numbered.
    """
    rows = [
        "{},{}".format(idx, text.replace("\n", " "))
        for idx, text in enumerate(tweets)
    ]
    # UTF-8 is pinned so that non-ASCII characters in tweets are written the
    # same way on every platform instead of depending on the locale default.
    with open(path, "w", encoding="utf-8") as f:
        f.write("id,text\n")
        f.write("\n".join(rows))


_write_tweet_file("fin_tweet.txt", test_X)
_write_tweet_file("fin_tweet_train.txt", train_X)