In [189]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

import warnings
warnings.filterwarnings('ignore')

import matplotlib.dates as mdates
from mpl_finance import candlestick_ohlc

In [190]:
import nltk
#nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

In [489]:
# load BTC data

import json
import requests

def get_data_spec(coin, date, time_period,apiKey):
    """ Query the API for 2000 units historical price data starting from "date". """
    url = "https://min-api.cryptocompare.com/data/{}?fsym={}&tsym=USD&limit=2000&toTs={}&api_key={}".format(time_period, coin, date,apiKey)
    r = requests.get(url)
    ipdata = r.json()
    return ipdata

def get_news_data_spec(category, timestamp, apiKey, language):
    """ Query the API for 2000 units historical price data starting from "date". """
    url = "https://min-api.cryptocompare.com/data/v2/news/?categories={}&lang={}&lTs={}&api_key={}".format(category, language, timestamp ,apiKey)
    r = requests.get(url)
    ipdata = r.json()
    return ipdata

def get_social_data_spec(coin, date, time_frequency, aggregate, apiKey):
    url = "https://min-api.cryptocompare.com/data/social/coin/histo/{}?coinId={}&aggregate={}&limit=2000&toTs={}&api_key={}"\
        .format(time_frequency, coin, aggregate,date,apiKey)
    r = requests.get(url)
    ipdata = r.json()
    return ipdata

def get_df_spec(from_date, to_date, time_period, coin, apiKey):
    """ Get historical price data between two dates. If further apart than query limit then query multiple times. """
    date = to_date
    holder = []
    while date > from_date:
        # Now we use the new function to query specific coins
        data = get_data_spec(coin, date, time_period,apiKey)
        holder.append(pd.DataFrame(data['Data']))
        date = data['TimeFrom']
    df = pd.concat(holder, axis = 0)
    df = df[df['time']>from_date]
    df['date/hour'] = pd.to_datetime(df['time'], unit='s') 
    df.set_index('date/hour', inplace=True)
    df.sort_index(ascending=False, inplace=True)
    # And just keep the close price, with the column heading as the name of the coin. 
    #df.rename(columns={'close':coin}, inplace=True)
    return df

def get_social_df_spec(from_date, to_date, time_frequency, coin, aggregate, apiKey):
    """ Get historical price data between two dates. If further apart than query limit then query multiple times. """
    date = to_date
    holder = []
    while date > from_date:
        # Now we use the new function to query specific coins
        data = get_social_data_spec(coin, date, time_frequency, aggregate, apiKey)
        data_df = pd.DataFrame(data['Data'])
        holder.append(data_df)
        date = data_df['time'].min()
    df = pd.concat(holder, axis = 0)
    df = df[df['time']>from_date]
    #df['date/hour'] = pd.to_datetime(df['time'], unit='s') 
    df.set_index('time', inplace=True)
    df.sort_index(ascending=True, inplace=True)
    # And just keep the close price, with the column heading as the name of the coin. 
    #df.rename(columns={'close':coin}, inplace=True)
    return df

In [402]:
# # parameters
fromdate = int(datetime.date(2017,01,01).strftime("%s"))
todate = datetime.date(2019,04,01).strftime("%s") #today
coin = 'BTC'
coinID = '1182'
timeperiod = 'histoday'
timefrequency = 'day'
agg = 1
apiKey = 'bf7c04a024b244dea99e95798fa8e102b7c9738c0933795253c2c8f39f2d160c'
cat = 'BTC'
lang = 'EN'

coin_price = get_df_spec(fromdate, todate, timeperiod, coin, apiKey)

In [403]:
def price_to_return(df, target_col):
    # get price change
    s_test = df[target_col]
    log_return = np.log(s_test/s_test.shift())
    df['price change'] = log_return

    # feature extraction from OHLC
    # '''high/open'''
    df['high/open'] = np.log(df['high']/df['open'])
    # '''low/open'''
    df['low/open'] = np.log(df['low']/df['open'])
    # '''close/high'''
    df['close/high'] = np.log(df['close']/df['high'])
    # '''close/low'''
    df['close/low'] = np.log(df['close']/df['low'])
    df = df.drop(['close','high','low','open','volumefrom'],axis=1)
    df.rename(columns = {'volumeto':'volumn'},inplace=True)
    df = df.dropna()
    return df

coin_return = price_to_return(coin_price,'close')


## NEWS DATA

In [251]:
news = pd.read_csv('/Users/michaelyang/PycharmProjects/all_news.csv')
news['time'] = news['published_on'].apply(lambda x: datetime.datetime.fromtimestamp(x).strftime("%Y-%m-%d"))
news = news.set_index('time')

# get vader sentiment score on news content
news['vader_polarity'] = news['news_content'].apply(lambda x: sia.polarity_scores(x))
news['vader_compound'] = news['vader_polarity'].apply(lambda x: x['compound'])
news['vader_neg'] = news['vader_polarity'].apply(lambda x: x['neg'])
news['vader_neu'] = news['vader_polarity'].apply(lambda x: x['neu'])
news['vader_pos'] = news['vader_polarity'].apply(lambda x: x['pos'])

In [253]:
news = news[['id','published_on','vader_compound','vader_neg','vader_neu','vader_pos']]

In [334]:
def get_ticker_sentiment(df,news):
    holder = []
    for i in range(df.shape[0]):
        to_t = df.time[i]
        from_t = to_t - 86400
        news_interval = news[(news['published_on'] >= from_t) & (news['published_on'] <= to_t)]
        score = news_interval.mean()[-4:]
        score['time'] = to_t
        holder.append(score)
    score_df = pd.concat(holder,axis=1).T
    score_df['time'] = score_df['time'].apply(lambda x: int(x))
    complete_df = pd.merge(df,score_df,on='time')
    return complete_df

In [335]:
coin_return_news = get_ticker_sentiment(coin_return)

## SOCIAL DATA

In [505]:
coin_social = get_social_df_spec(fromdate,todate,timefrequency,coinID,agg,apiKey)

In [506]:
def get_ticker_social(df,coin_social):
    coin_social = coin_social.pct_change()
    coin_social = coin_social.add(1)
    coin_social = np.log(coin_social)
    coin_social.reset_index()
    complete_df = pd.merge(df,coin_social,on='time')
    return complete_df


In [557]:
coin_complete = get_ticker_social(coin_return_news,coin_social)
coin_complete['time'] = pd.to_datetime(coin_complete['time'], unit='s')
coin_complete['time'] = coin_complete['time'].dt.date
coin_complete.set_index('time', inplace=True)

## IMPUTE NULL VALUE

In [558]:
coin_complete = coin_complete.replace([np.inf, -np.inf], np.nan)
coin_complete = coin_complete.fillna(0)
np.any(np.isinf(coin_complete))

False

## Normalize

In [559]:
from sklearn.preprocessing import MaxAbsScaler

# def normalized_data(df):
#     min_max_scaler = MinMaxScaler()
#     min_max_scaler.fit_transform(df)
    
max_abs_scaler = MaxAbsScaler()
temp = max_abs_scaler.fit_transform(coin_complete)
temp = pd.DataFrame(temp,columns=coin_complete.columns)
coin_complete = temp.set_index(coin_complete.index)

## Time Series to Supervised Learning

In [563]:
window = 10

coin_complete.values

array([[ 0.00908259, -0.04107339,  0.0109322 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.01639528,  0.00580737,  0.03430021, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.02736511, -0.00720698,  0.08453377, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.02731219,  0.60496525,  0.42183146, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.00897987, -0.41401069,  0.07160909, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.01057331, -0.06963988,  0.14266232, ...,  0.        ,
         0.        ,  0.        ]])

In [369]:
# social data

import datetime

url = 'https://min-api.cryptocompare.com/data/social/coin/histo/day'

payload = {
    "api_key": apiKey,
    "coinID": "1182",
    'aggregate':1,
    'limit':100,
    'toTs':todate
}

res_BTC = requests.get(url, params=payload)
social_BTC = pd.DataFrame(json.loads(res_BTC.content)['Data'])
social_BTC['time'] = social_BTC['time'].apply(lambda x: datetime.datetime.fromtimestamp(x).strftime("%Y-%m-%d"))
social_BTC = social_BTC.set_index('time')

In [371]:
social_BTC.tail().T

time,2019-03-27,2019-03-28,2019-03-29,2019-03-30,2019-03-31
analysis_page_views,973999.0,974276.0,974512.0,974771.0,975153.0
charts_page_views,7682467.0,7685117.0,7687473.0,7689343.0,7692327.0
code_repo_closed_issues,5324.0,5323.0,5323.0,5324.0,5327.0
code_repo_closed_pull_issues,12184.0,12191.0,12196.0,12200.0,12207.0
code_repo_forks,25830.0,25833.0,25828.0,25836.0,25843.0
code_repo_open_issues,931.0,936.0,937.0,939.0,938.0
code_repo_open_pull_issues,346.0,345.0,346.0,347.0,353.0
code_repo_stars,45283.0,45295.0,45312.0,45325.0,45336.0
code_repo_subscribers,4168.0,4168.0,4169.0,4174.0,4174.0
comments,247241.0,247453.0,247559.0,247824.0,248072.0


In [341]:
# social data

import datetime

url = 'https://min-api.cryptocompare.com/data/social/coin/histo/day'

payload = {
    "api_key": apiKey,
    "coinID": "1182",
    'aggregate':1,
    'limit':100,
    'toTs':1543622400
}

res_BTC = requests.get(url, params=payload)
r = res_BTC.json()

In [348]:
r['Data']

1534982400