In [1]:
import numpy as np
import matplotlib.pyplot as plt
import copy
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [2]:
# Linear regression model trained with batch gradient descent
class LinearRegression(object):
    """Multi-target linear regression fitted by batch gradient descent.

    Attributes set by ``fit``:
        input, output: deep copies of the training ``X`` and ``Y`` as passed in.
        n_features, n_targets: number of feature / target columns.
        coef_: ndarray of shape ``(n_features, n_targets)``.
        intercept_: ndarray of shape ``(n_targets,)`` (one bias per target).
    """

    def __init__(self):
        self.input = None
        self.output = None
        self.n_features = None
        self.n_targets = None

        self.coef_ = None
        self.intercept_ = None

    def cos_function(self, predictions, lables):
        """Return half the mean squared error, averaged over every element."""
        return 0.5 * np.square(lables - predictions).mean()

    # training linear regression model
    def fit(self, X, Y, max_iters=1000000, lr=0.01, verbose=False, print_freq=20, tol=1e-6):
        """Fit weights and intercepts with gradient descent.

        Args:
            X: array-like of shape ``(n_samples, n_features)``; a 1-D input is
                treated as a single feature column.
            Y: array-like of shape ``(n_samples,)`` or ``(n_samples, n_targets)``.
            max_iters: maximum number of gradient steps.
            lr: initial learning rate. From the third step on it decays as
                ``lr / (1 + 0.9 * (step - 2))``.
            verbose: print the loss at every checkpoint. It only controls
                printing and never changes the fitted parameters.
            print_freq: checkpoint interval in steps. The convergence test runs
                at each checkpoint and on the final step.
            tol: stop when the loss improved by less than ``tol`` since the
                previous checkpoint (this also triggers if the loss went up).
        """
        self.input = copy.deepcopy(X)
        self.output = copy.deepcopy(Y)

        n_samples = np.shape(X)[0]
        # Plain ndarrays instead of np.matrix: np.mat no longer exists in NumPy 2.
        X = np.asarray(X, dtype=float).reshape(n_samples, -1)
        Y = np.asarray(Y, dtype=float).reshape(n_samples, -1)

        self.n_features = X.shape[1]
        self.n_targets = Y.shape[1]

        # One weight column and one intercept per target.
        self.coef_ = np.zeros((self.n_features, self.n_targets))
        self.intercept_ = np.zeros(self.n_targets)

        lr0 = lr
        decay_rate = 0.9
        decay_period = 2

        pre_loss = float('inf')
        for step in range(1, max_iters + 1):
            # learning rate decay
            if step > decay_period:
                lr = lr0 / (1 + decay_rate * (step - decay_period))

            Y_pred = self.predict(X)
            residual = Y_pred - Y
            # The loss is measured before this step's parameter update.
            loss = self.cos_function(Y_pred, Y)

            self.coef_ = self.coef_ - lr * np.dot(X.T, residual) / n_samples
            # Column-wise mean so each target gets its own intercept update.
            self.intercept_ = self.intercept_ - lr * residual.mean(axis=0)

            if step % print_freq == 0 or step == max_iters:
                if verbose:
                    print('iteration:%d\t loss:%lf' % (step, loss))
                # Convergence is checked regardless of verbose.
                if pre_loss - loss < tol:
                    break
                pre_loss = loss

    def predict(self, X):
        """Return ``X @ coef_ + intercept_`` as an ndarray."""
        return np.dot(np.asarray(X, dtype=float), self.coef_) + self.intercept_

    # 'R2 score'
    def score(self, X, Y):
        """Return R^2 of the predictions for ``X`` against ``Y``.

        Residual and total sums of squares are pooled over all targets, and the
        total uses the single global mean of ``Y``.
        """
        Y = np.asarray(Y, dtype=float).reshape(-1, self.n_targets)
        Y_pred = self.predict(X)
        u = np.sum(np.square(Y - Y_pred))
        v = np.sum(np.square(Y - np.mean(Y)))
        return 1 - (u / v)
        
    

In [4]:
# Create synthetic regression data to test the linear regression model.
# random_state pins the generated sample so the score below is reproducible.
from sklearn import datasets
x, y = datasets.make_regression(n_samples=1000, n_features=3, n_targets=4, noise=13, random_state=1)

In [5]:
# Sanity check: fit the hand-written model on the synthetic data and report
# R^2 on that same data (there is no held-out split in this cell).
LR = LinearRegression()
LR.fit(x, y, lr=1, max_iters=500)
LR.score(x, y)

0.981428547596185

In [6]:
# Load the FinTech daily data; the path is relative to the notebook's working directory.
path_data = './data/Fintech_daily_data.csv'
df = pd.read_csv(path_data)

In [7]:
# Summary statistics of the raw frame; differing `count` values reveal columns with missing entries.
df.describe()

Unnamed: 0,DXYUSD_Open,DXYUSD_High,DXYUSD_Low,DXYUSD_Close,DXYUSD_AdjClose,DXYUSD_Volume,World_Index_Open,World_Index_High,World_Index_Low,World_Index_Close,...,TNXTbill_Open,TNXTbill_High,TNXTbill_Low,TNXTbill_Close,TNXTbill_AdjClose,TNXTbill_Volume,BTC_Price,BTC_market_cap,BTC_volume,Sentiment_24h
count,1917.0,1917.0,1917.0,1917.0,1917.0,1917.0,1933.0,1933.0,1933.0,1933.0,...,1917.0,1917.0,1917.0,1917.0,1917.0,1917.0,2803.0,2802.0,2803.0,2805.0
mean,93.021101,93.325806,92.748701,93.043193,93.043193,12149.19,80.024273,80.360797,79.595173,79.982856,...,2.145511,2.169513,2.121338,2.145574,2.145574,0.0,4125.471133,72309830000.0,7606231000.0,0.418845
std,6.427827,6.443005,6.370153,6.41071,6.41071,385624.5,11.289516,11.293338,11.294457,11.299139,...,0.639394,0.637413,0.639804,0.637501,0.637501,0.0,4721.972529,85437780000.0,12705740000.0,0.396992
min,79.139999,79.239998,78.910004,79.139999,79.139999,0.0,59.0,59.0,58.98,59.0,...,0.484,0.538,0.398,0.499,0.499,0.0,67.809,771368100.0,0.0,-1.083872
25%,90.25,90.57,90.010002,90.32,90.32,0.0,71.129997,71.379997,70.709999,71.040001,...,1.817,1.842,1.786,1.815,1.815,0.0,416.802,5835826000.0,64122860.0,0.148308
50%,95.169998,95.5,94.910004,95.150002,95.150002,0.0,76.5,76.970001,76.0,76.620003,...,2.271,2.293,2.243,2.268,2.268,0.0,1082.016823,17100970000.0,1275837000.0,0.214014
75%,97.410004,97.68,97.160004,97.419998,97.419998,0.0,89.190002,89.489998,88.739998,89.190002,...,2.61,2.631,2.585,2.605,2.605,0.0,7534.740259,133072700000.0,6834460000.0,0.62086
max,103.260002,103.82,103.029999,103.290001,103.290001,14290000.0,112.940002,112.940002,112.290001,112.410004,...,3.239,3.248,3.233,3.234,3.234,0.0,29022.418395,539438000000.0,81406690000.0,4.667784


In [8]:
# Keep only the rows in which every column has a value, then re-inspect the summary.
df = df.dropna()
df.describe()

Unnamed: 0,DXYUSD_Open,DXYUSD_High,DXYUSD_Low,DXYUSD_Close,DXYUSD_AdjClose,DXYUSD_Volume,World_Index_Open,World_Index_High,World_Index_Low,World_Index_Close,...,TNXTbill_Open,TNXTbill_High,TNXTbill_Low,TNXTbill_Close,TNXTbill_AdjClose,TNXTbill_Volume,BTC_Price,BTC_market_cap,BTC_volume,Sentiment_24h
count,1914.0,1914.0,1914.0,1914.0,1914.0,1914.0,1914.0,1914.0,1914.0,1914.0,...,1914.0,1914.0,1914.0,1914.0,1914.0,1914.0,1914.0,1914.0,1914.0,1914.0
mean,93.027957,93.332435,92.755319,93.049765,93.049765,12168.23,79.997738,80.335481,79.567205,79.956865,...,2.145385,2.169404,2.121249,2.145509,2.145509,0.0,4099.635902,71831830000.0,7692876000.0,0.421149
std,6.427832,6.443079,6.370284,6.410779,6.410779,385926.4,11.252597,11.256787,11.257036,11.262245,...,0.639717,0.637739,0.640107,0.637803,0.637803,0.0,4673.56624,84525040000.0,12856170000.0,0.395252
min,79.139999,79.239998,78.910004,79.139999,79.139999,0.0,59.0,59.0,58.98,59.0,...,0.484,0.538,0.398,0.499,0.499,0.0,76.3915,870082000.0,0.0,-0.126262
25%,90.269997,90.589996,90.062498,90.324999,90.324999,0.0,71.122501,71.379997,70.702497,71.045,...,1.81625,1.84225,1.7865,1.815,1.815,0.0,416.865,5806414000.0,70053870.0,0.145776
50%,95.174999,95.510002,94.915001,95.155003,95.155003,0.0,76.490002,76.940002,75.990002,76.61,...,2.271,2.2935,2.244,2.268,2.268,0.0,1076.18429,17064440000.0,1257546000.0,0.210078
75%,97.4175,97.68,97.160004,97.419998,97.419998,0.0,89.187502,89.487499,88.699997,89.157504,...,2.61,2.631,2.58425,2.605,2.605,0.0,7596.436701,133072700000.0,7143630000.0,0.618245
max,103.260002,103.82,103.029999,103.290001,103.290001,14290000.0,112.940002,112.940002,112.290001,112.410004,...,3.239,3.248,3.233,3.234,3.234,0.0,28837.288529,535967300000.0,81406690000.0,4.652801


In [9]:
# Features: every column except the three BTC_* columns and Date.
# Target: BTC_Price, kept as a one-column frame.
# (The name `df_ouput` is misspelt but is referenced by later cells.)
df_input = df.drop(['BTC_Price', 'BTC_market_cap', 'BTC_volume', 'Date'], axis=1)
df_ouput = df[['BTC_Price']]
df_input.shape, df_ouput.shape

((1914, 61), (1914, 1))

In [10]:
# 75% of the rows form the training set and 25% the test set (fixed seed for reproducibility).
# NOTE(review): the rows look like daily observations; a shuffled split would mix
# past and future dates between train and test — confirm this is intended.
X_train, X_test, Y_train, Y_test = train_test_split(df_input.values, df_ouput.values, test_size=0.25, random_state=1)
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((1435, 61), (1435, 1), (479, 61), (479, 1))

In [11]:
# Standardise features and target; both scalers are fitted on the training split only.
X_scaler = StandardScaler().fit(X_train)
Y_scaler = StandardScaler().fit(Y_train)

X_train, X_test = X_scaler.transform(X_train), X_scaler.transform(X_test)
Y_train, Y_test = Y_scaler.transform(Y_train), Y_scaler.transform(Y_test)

In [12]:
# Global mean and standard deviation over all scaled training features (expected near 0 and 1).
# NOTE(review): a std below 1 may come from zero-variance columns — verify.
X_train.mean(), X_train.std()

(-5.519724228215778e-16, 0.983469935866927)

In [13]:
# Train the hand-written model on the standardised training split and
# report R^2 on the held-out test split.
LR = LinearRegression()
LR.fit(X_train, Y_train, verbose=False, max_iters=1000, lr=1, print_freq=50)
LR.score(X_test, Y_test)

0.8550510070936541

In [14]:
# Tabulate the fitted weight of every input column next to its magnitude.
df_coef = pd.DataFrame({
    'variable': df_input.columns,
    'coefficient': np.asarray(LR.coef_).ravel(),
})
df_coef['abs_coefficient'] = df_coef['coefficient'].abs()
# Show every row of the ranking, largest absolute weight first.
pd.set_option('display.max_rows',None)
df_coef.sort_values(by=['abs_coefficient'], ascending=False)

Unnamed: 0,variable,coefficient,abs_coefficient
9,World_Index_Close,0.111292,0.111292
8,World_Index_Low,0.110267,0.110267
7,World_Index_High,0.109183,0.109183
6,World_Index_Open,0.107353,0.107353
60,Sentiment_24h,0.104639,0.104639
10,World_Index_AdjClose,0.096154,0.096154
2,DXYUSD_Low,-0.076076,0.076076
3,DXYUSD_Close,-0.075532,0.075532
4,DXYUSD_AdjClose,-0.075532,0.075532
0,DXYUSD_Open,-0.074656,0.074656
