In [1]:
# load model
import datetime
import json
import os
import time

import nltk
import numpy as np
import pandas as pd
import requests
#nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.preprocessing import MaxAbsScaler
from tensorflow.keras.models import load_model

# Module-level VADER analyzer; data_preparation calls sia.polarity_scores on
# each news item's title + body. Requires the 'vader_lexicon' NLTK resource
# (see the commented-out nltk.download line in the import cell).
sia = SentimentIntensityAnalyzer()



In [2]:
# Run parameters for the prediction pipeline (passed to data_preparation/pipeline).
sequence_length = 50          # window size handed to time_series_to_supervised
ticker_num = 24               # number of consecutive predictions to produce
coin = 'BTC'                  # fsym used in the price URL
coinID = '1182'               # coinId used in the social-stats URL
timeperiod = 'histohour'      # price endpoint name
timefrequency = 'hour'        # social endpoint granularity
agg = 1                       # "aggregate" query parameter of the social endpoint
# The API key is a secret and must not live in the notebook: read it from the
# environment instead. An empty key is tolerated here so the cell still runs,
# but the API requests made later will be rejected.
apiKey = os.environ.get('CRYPTOCOMPARE_API_KEY', '')
if not apiKey:
    print('CRYPTOCOMPARE_API_KEY is not set; CryptoCompare requests will fail.')
cat = 'BTC'                   # news category filter
lang = 'EN'                   # news language filter

In [3]:
def price_to_return(df, target_col):
    """Turn an OHLC price frame into log-return style features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'open', 'high', 'low', 'close',
        'volumefrom', 'volumeto' and ``target_col``.
    target_col : str
        Column whose row-over-row log return becomes 'price change'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the raw OHLC and 'volumefrom' columns removed,
        'volumeto' renamed to 'volumn' (spelling kept: it is the column name
        used downstream), the five log-ratio features appended, and every row
        containing NaN dropped (at least the first row, whose return is
        undefined).

    The input frame is left untouched; all work happens on a copy.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()

    # Log return of the target column relative to the previous row.
    target = df[target_col]
    df['price change'] = np.log(target / target.shift())

    # Intra-bar features derived from OHLC, all as log ratios.
    df['high/open'] = np.log(df['high'] / df['open'])
    df['low/open'] = np.log(df['low'] / df['open'])
    df['close/high'] = np.log(df['close'] / df['high'])
    df['close/low'] = np.log(df['close'] / df['low'])

    df = df.drop(['close', 'high', 'low', 'open', 'volumefrom'], axis=1)
    df = df.rename(columns={'volumeto': 'volumn'})
    return df.dropna()

In [4]:
def get_ticker_sentiment(df,news):
    """Attach trailing-window mean news scores to every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'time' column. The window arithmetic subtracts 86400,
        so the values are presumably Unix timestamps in seconds (that would
        make the window 24 hours).
    news : pandas.DataFrame
        Must contain 'published_on' in the same unit as ``df['time']``. The
        LAST FOUR numeric columns are the ones that get averaged (the caller
        in this notebook puts the four VADER scores there).

    Returns
    -------
    pandas.DataFrame
        ``df`` merged (on 'time') with one row of window means per timestamp.
        Windows containing no news yield NaN scores.
    """
    holder = []
    # Iterate over the column values directly: the frame may carry a
    # non-integer index, in which case ``df.time[i]`` depends on the
    # deprecated positional fallback of Series.__getitem__.
    for to_t in df['time']:
        from_t = to_t - 86400
        in_window = (news['published_on'] >= from_t) & (news['published_on'] <= to_t)
        # numeric_only=True keeps the historical "skip non-numeric columns"
        # behaviour instead of raising on newer pandas versions.
        score = news.loc[in_window].mean(numeric_only=True).iloc[-4:]
        score['time'] = to_t
        holder.append(score)
    score_df = pd.concat(holder,axis=1).T
    # The Series above is float-typed; restore integer timestamps so the merge
    # key matches ``df['time']``.
    score_df['time'] = score_df['time'].apply(int)
    complete_df = pd.merge(df, score_df, on = 'time')
    return complete_df

In [5]:
def get_ticker_social(df,coin_social):
    """Join log growth rates of the social statistics onto ``df``.

    ``coin_social`` is expected to be indexed by 'time'. Each column is turned
    into log(1 + row-over-row percentage change); the index is then moved back
    into a 'time' column and the result is inner-joined with ``df`` on 'time'.
    """
    growth_ratio = coin_social.pct_change() + 1
    log_growth = np.log(growth_ratio).reset_index()
    return pd.merge(df, log_growth, how='inner', on='time')

In [141]:
def time_series_to_supervised(df,sequence_length):
    '''Slice a frame into overlapping input windows for real-time inference.

    No label (actual y) is needed in the real-time pipeline, so each window
    covers rows ``start + 1`` up to (not including) ``start + sequence_length``,
    i.e. ``sequence_length - 1`` rows. One window is produced for every
    ``start`` in ``range(len(df) - sequence_length + 1)``.

    Returns a numpy array built from the list of windows.
    '''
    rows = df.values.tolist()
    window_count = len(rows) - sequence_length + 1
    windows = [
        rows[start + 1:start + sequence_length]
        for start in range(window_count)
    ]
    return np.array(windows)

In [148]:
def data_preparation(coin, coinID, time_period, time_frequency, agg, apiKey, cat, lang, ticker_num, sequence_length):
    """Download price, news and social data and build the model feature frame.

    Parameters
    ----------
    coin, coinID : str
        Symbol for the price endpoint and coin id for the social endpoint.
    time_period, time_frequency : str
        Path segments of the price and social endpoints respectively.
    agg : int
        "aggregate" query parameter of the social endpoint.
    apiKey : str
        CryptoCompare API key (sent as a query parameter).
    cat, lang : str
        News category and language filters.
    ticker_num, sequence_length : int
        ``ticker_num + sequence_length - 1`` is used as the "limit" of the
        price and social requests.

    Returns
    -------
    (price_benchmark, timestamp_benchmark, coin_complete)
        ``price_benchmark`` is the close at position ``sequence_length - 1``,
        ``timestamp_benchmark`` the 'time' values from that position onwards,
        and ``coin_complete`` the merged price/news/social features indexed by
        a formatted time string.

    Raises
    ------
    ValueError
        If the news endpoint returns no articles at all.
    """
    # Never wait forever on a stalled connection.
    request_timeout = 30  # seconds

    def get_news_data_spec(cat, timestamp, apiKey, lang):
        # One page of news; ``timestamp`` is passed as the lTs query parameter.
        news_url = "https://min-api.cryptocompare.com/data/v2/news/?categories={}&lang={}&lTs={}&api_key={}".format(cat, lang, timestamp ,apiKey)
        r = requests.get(news_url, timeout=request_timeout)
        ipdata = r.json()
        return ipdata

    # get current time
    timestamp = int(time.time())

    prediction_length = ticker_num + sequence_length -1

    # APIs
    price_url = "https://min-api.cryptocompare.com/data/{}?fsym={}&tsym=USD&limit={}&toTs={}&api_key={}".format(time_period, coin, prediction_length, timestamp,apiKey)

    social_url = "https://min-api.cryptocompare.com/data/social/coin/histo/{}?coinId={}&aggregate={}&limit={}&toTs={}&api_key={}"\
        .format(time_frequency, coinID, agg, prediction_length, timestamp ,apiKey)

    # get data
    price = requests.get(price_url, timeout=request_timeout).json()
    social = requests.get(social_url, timeout=request_timeout).json()

    # deal with price
    price_df = pd.DataFrame(price['Data'])
    # Benchmarks are taken before the index is replaced, i.e. positionally.
    price_benchmark = price_df['close'].iloc[sequence_length-1]
    timestamp_benchmark = price_df['time'].iloc[sequence_length-1:]
    price_df['date/hour'] = pd.to_datetime(price_df['time'], unit='s')
    price_df.set_index('date/hour', inplace=True)
    return_df = price_to_return(price_df,'close')

    # deal with news: page backwards until the window start is covered
    # (prediction_length hours plus the 86400-second sentiment look-back).
    news_ts = timestamp
    from_t = news_ts - (prediction_length * 3600) - 86400
    holder = []
    while news_ts > from_t:
        news_data = get_news_data_spec(cat, news_ts, apiKey, lang)
        news_df = pd.DataFrame(news_data['Data'])
        if news_df.empty:
            # No older articles available: stop instead of failing on the
            # missing 'published_on' column.
            break
        holder.append(news_df)
        oldest = news_df['published_on'].min()
        if oldest >= news_ts:
            # The page did not move back in time; continuing would request
            # the same page forever.
            break
        news_ts = oldest
    if not holder:
        raise ValueError('news endpoint returned no articles for category {!r}'.format(cat))
    df = pd.concat(holder, axis = 0)
    # .copy() so the column assignment below acts on an independent frame.
    df = df[df['published_on']>from_t].copy()
    # NOTE(review): fromtimestamp uses the machine's local time zone; this
    # date only becomes the index and is not read by get_ticker_sentiment.
    df['time'] = df['published_on'].apply(lambda x: datetime.datetime.fromtimestamp(x).strftime("%Y-%m-%d"))
    news = df.set_index('time')
    # get news sentiment
    news['news_content'] = news['title'] + news['body']
    news['vader_polarity'] = news['news_content'].apply(lambda x: sia.polarity_scores(x))
    news['vader_compound'] = news['vader_polarity'].apply(lambda x: x['compound'])
    news['vader_neg'] = news['vader_polarity'].apply(lambda x: x['neg'])
    news['vader_neu'] = news['vader_polarity'].apply(lambda x: x['neu'])
    news['vader_pos'] = news['vader_polarity'].apply(lambda x: x['pos'])
    # Column order matters: get_ticker_sentiment averages the last four columns.
    news = news[['id','published_on','vader_compound','vader_neg','vader_neu','vader_pos']]
    return_news_df = get_ticker_sentiment(return_df, news)

    # deal with social
    social_df = pd.DataFrame(social['Data'])
    social_df.set_index('time',inplace = True)
    coin_complete = get_ticker_social(return_news_df,social_df)

    # complete df: index by a human-readable time string
    coin_complete['time'] = pd.to_datetime(coin_complete['time'], unit='s')
    coin_complete['time'] = coin_complete['time'].dt.strftime('%Y-%m-%d %r')
    coin_complete.set_index('time', inplace=True)

    return price_benchmark, timestamp_benchmark, coin_complete
    

In [153]:

def pipeline(coin_complete, sequence_length, model):
    """Scale the feature frame, window it and run the model.

    Parameters
    ----------
    coin_complete : pandas.DataFrame
        Feature frame; infinities and NaN are replaced by 0 before scaling.
    sequence_length : int
        Passed through to ``time_series_to_supervised``.
    model :
        Object exposing ``predict(x)``.

    Returns
    -------
    (y_predict, y_predict_inverse)
        Raw model output and the same output multiplied by the scale of the
        'price change' column.

    NOTE(review): the scaler is fitted on the inference data itself; confirm
    this matches how the model was trained.
    """
    coin_complete = coin_complete.replace([np.inf, -np.inf], np.nan).fillna(0)

    # scale data, keeping the original column names and index
    max_abs_scaler = MaxAbsScaler()
    scaled = pd.DataFrame(
        max_abs_scaler.fit_transform(coin_complete),
        columns=coin_complete.columns,
        index=coin_complete.index,
    )

    x = time_series_to_supervised(scaled, sequence_length)

    y_predict = model.predict(x)

    # inverse scaler: locate the target column by name rather than trusting a
    # fixed position; position 1 (the historical value) is the fallback.
    column_names = list(coin_complete.columns)
    if 'price change' in column_names:
        target_pos = column_names.index('price change')
    else:
        target_pos = 1
    y_predict_inverse = y_predict * max_abs_scaler.scale_[target_pos]

    return y_predict, y_predict_inverse

In [150]:
def denormalized_to_actual_price(y_predict_inverse, price_benchmark, timestamp_benchmark):
    """Compound predicted log returns into prices and pair them with times.

    The predictions are exponentiated, ``price_benchmark`` is placed in front
    of them and a cumulative product is taken; the leading benchmark entry is
    then discarded. ``timestamp_benchmark`` loses its first element and is
    shifted by 3600 before being zipped with the prices.

    Returns a list of ``(timestamp, price)`` tuples.
    """
    growth_factors = np.exp(y_predict_inverse)

    # np.insert flattens the array and keeps its dtype.
    price_path = np.insert(growth_factors, 0, price_benchmark).cumprod()
    predicted_prices = price_path[1:]

    shifted_times = timestamp_benchmark[1:] + 3600

    return list(zip(shifted_times, predicted_prices))
    

In [175]:
# Fetch live data (network calls) and build the feature frame plus the
# price/timestamp benchmarks used later to turn predictions into prices.
price_benchmark, timestamp_benchmark, coin_complete = data_preparation(coin, coinID, timeperiod, timefrequency, agg, apiKey, cat, lang, ticker_num, sequence_length)

In [176]:
# Load the saved Keras model; the path is relative to the notebook's working directory.
coin_model = load_model('coin_model_1.h5')

In [177]:
# Scale the features, window them and run the model; y_predict_inverse is the
# prediction multiplied back by the scaler's factor.
y_predict, y_predict_inverse = pipeline(coin_complete, sequence_length, coin_model)

In [178]:
# Compound the predicted log returns from the benchmark price into
# (timestamp, price) pairs.
predict_price = denormalized_to_actual_price(y_predict_inverse, price_benchmark, timestamp_benchmark)

In [179]:
predict_price

[(1554620400, 5117.1875),
 (1554624000, 5116.0747),
 (1554627600, 5113.63),
 (1554631200, 5111.742),
 (1554634800, 5109.4478),
 (1554638400, 5106.435),
 (1554642000, 5102.2505),
 (1554645600, 5098.5596),
 (1554649200, 5095.1484),
 (1554652800, 5091.728),
 (1554656400, 5088.646),
 (1554660000, 5085.3086),
 (1554663600, 5082.603),
 (1554667200, 5079.6187),
 (1554670800, 5076.786),
 (1554674400, 5073.6006),
 (1554678000, 5070.482),
 (1554681600, 5068.2866),
 (1554685200, 5066.4604),
 (1554688800, 5065.0776),
 (1554692400, 5063.246),
 (1554696000, 5062.452),
 (1554699600, 5060.334),
 (1554703200, 5059.1353)]

In [180]:
# Convert to JSON-friendly [timestamp * 1000, price] pairs (seconds -> milliseconds).
# NOTE: this rebinds predict_price, so re-running the cell multiplies the
# timestamps by 1000 again; re-run the denormalisation cell first.
predict_price = [[ts * 1000, float(price)] for ts, price in predict_price]

In [181]:
coin_complete.tail()

Unnamed: 0_level_0,volumn,price change,high/open,low/open,close/high,close/low,vader_compound,vader_neg,vader_neu,vader_pos,...,reddit_posts_per_day,reddit_posts_per_hour,reddit_subscribers,total_page_views,trades_page_views,twitter_favourites,twitter_followers,twitter_following,twitter_lists,twitter_statuses
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-04-08 01:00:00 AM,35281620.65,0.003397,0.01094,-0.001019,-0.007543,0.004416,0.35224,0.02465,0.898083,0.077217,...,0.021612,0.021391,1.7e-05,2.7e-05,1.6e-05,0.0,0.0,0.0,0.0,0.0
2019-04-08 02:00:00 AM,31393726.54,0.002398,0.012501,-0.000333,-0.010103,0.002731,0.361437,0.025444,0.894286,0.080222,...,0.002532,0.002642,1.7e-05,3.3e-05,2.6e-05,0.0,0.0,0.0,0.0,0.0
2019-04-08 03:00:00 AM,13434092.86,-0.00766,0.00013,-0.010766,-0.007791,0.003105,0.3532,0.025855,0.894548,0.079548,...,0.060463,0.0614,7e-06,3.4e-05,3.5e-05,0.0,0.0,0.0,0.0,0.0
2019-04-08 04:00:00 AM,9689713.13,0.012065,0.012516,0.0,-0.000451,0.012065,0.348951,0.026279,0.894344,0.079328,...,-0.009774,-0.009975,2.4e-05,2.7e-05,1e-05,0.0,0.0,0.0,0.0,0.0
2019-04-08 05:00:00 AM,1942244.09,-0.003948,0.00015,-0.003948,-0.004098,0.0,0.344016,0.026,0.894492,0.07946,...,-0.036174,-0.035718,1.8e-05,3.1e-05,2e-05,0.0,0.0,0.0,0.0,0.0


# prediction


In [182]:
import pika
import json
import numpy as np
import time
import datetime as dt

In [183]:

# Publish the forecast as JSON to the 'predict-24' RabbitMQ queue.
# Broker credentials are secrets: they are read from the environment
# (RABBITMQ_USER / RABBITMQ_PASSWORD) instead of being stored in the notebook.
# RABBITMQ_HOST optionally overrides the default broker host.
cred = pika.credentials.PlainCredentials(
    os.environ['RABBITMQ_USER'], os.environ['RABBITMQ_PASSWORD'])
connection = pika.BlockingConnection(
    pika.ConnectionParameters(
        host=os.environ.get('RABBITMQ_HOST', 'ec2-3-19-67-238.us-east-2.compute.amazonaws.com'),
        credentials=cred))
try:
    channel = connection.channel()

    channel.queue_declare(queue='predict-24')
    channel.basic_publish(exchange='', routing_key='predict-24', body=json.dumps(predict_price))
finally:
    # Closing the connection also closes its channels; closing only the
    # channel would leave the connection to the broker open.
    connection.close()

In [161]:
json.dumps(predict_price)

'[[1554620400000, 5117.1875], [1554624000000, 5116.07470703125], [1554627600000, 5113.6298828125], [1554631200000, 5111.7421875], [1554634800000, 5109.44775390625], [1554638400000, 5106.43505859375], [1554642000000, 5102.25048828125], [1554645600000, 5098.5595703125], [1554649200000, 5095.1484375], [1554652800000, 5091.72802734375], [1554656400000, 5088.64599609375], [1554660000000, 5085.30859375], [1554663600000, 5082.60302734375], [1554667200000, 5079.61865234375], [1554670800000, 5076.7861328125], [1554674400000, 5073.6005859375], [1554678000000, 5070.48193359375], [1554681600000, 5068.28662109375], [1554685200000, 5066.46044921875], [1554688800000, 5065.07763671875], [1554692400000, 5063.24609375], [1554696000000, 5062.4521484375], [1554699600000, 5060.333984375], [1554703200000, 5059.0751953125]]'

In [167]:
dt.datetime.fromtimestamp(1554703200)

datetime.datetime(2019, 4, 7, 23, 0)