In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlalchemy as db

# Processing Data

In [2]:
def get_all_stock(db_url="sqlite:///C:/Users/zheji/Desktop/TradingTools/dailyDB.sqlite"):
    """Load every table of the daily-price SQLite database into one DataFrame.

    Each table is read in full, tagged with its table name in a ``ticker``
    column, and extended with derived columns:

    * ``1dayret%``     - percent change of ``Adj Close`` versus the previous row.
    * ``nextdayret%``  - percent change of ``Adj Close`` from this row to the next.
    * ``nextdayrange`` - next row's ``High - Low`` as a percent of this row's ``Close``.
    * ``relativeVol``  - ``Volume`` divided by its 20-row rolling mean.

    The shift/rolling arithmetic works on row order, so it presumably expects
    each table to be stored in ascending date order - TODO confirm.

    Parameters
    ----------
    db_url : str, optional
        SQLAlchemy database URL. Defaults to the location this notebook
        originally hard-coded; pass another URL to run on a different machine.

    Returns
    -------
    pandas.DataFrame
        The per-table frames concatenated in table order. Each table keeps its
        own row index, so index values repeat across tickers.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when the database contains no tables.
    """
    con = db.create_engine(db_url)
    dfs = []
    # Engine.table_names() was removed in SQLAlchemy 2.0; the inspector is the
    # supported way to list tables.
    for table in db.inspect(con).get_table_names():
        tmp = pd.read_sql(table, con)
        tmp['ticker'] = table

        adj_close = tmp['Adj Close']
        prev_adj_close = adj_close.shift(1)
        tmp['1dayret%'] = (adj_close - prev_adj_close) / prev_adj_close * 100
        tmp['nextdayret%'] = (adj_close.shift(-1) - adj_close) / adj_close * 100

        # NOTE(review): this column was previously assigned an empty tuple,
        # which raises ValueError for any non-empty table. It is interpreted
        # here as the next row's high-low range relative to this row's close,
        # in percent - confirm that this is the intended definition.
        tmp['nextdayrange'] = (tmp['High'].shift(-1) - tmp['Low'].shift(-1)) / tmp['Close'] * 100

        tmp['relativeVol'] = tmp['Volume'] / tmp['Volume'].rolling(20).mean()
        dfs.append(tmp)
    return pd.concat(dfs)
stock_df = get_all_stock()

In [3]:
# Read the DARK and Lit tables from the local shortDB SQLite file and drop
# rows that are exact duplicates.
short_db_url = 'sqlite:///' + 'shortDB' + '.sqlite'

con1 = db.create_engine(short_db_url)
con2 = db.create_engine(short_db_url)

ts = pd.read_sql_query('SELECT * from DARK', con1).drop_duplicates()
ts_lit = pd.read_sql_query('SELECT * from Lit', con2).drop_duplicates()

In [4]:
# Key columns identifying one symbol on one day in the short-volume tables.
short_keys = ['Date', 'Symbol']

# cboe + finra
short = ts.merge(ts_lit, how='inner', left_on=short_keys, right_on=short_keys)
# cboe + finra + stock (the price frame stores the symbol in 'ticker')
view = short.merge(stock_df, how='inner', left_on=short_keys, right_on=['Date', 'ticker'])

In [5]:
# Express each venue's reported total volume as a fraction of the stock's
# daily Volume column, then preview the first rows.
daily_volume = view['Volume']
view['Lit_VolumeRatio'] = view['CBOE_TotalVolume'].div(daily_volume)
view['Dark_VolumeRatio'] = view['NMS_TotalVolume'].div(daily_volume)
view.head()

Unnamed: 0,index_x,Symbol,NASDAQCAR_ShortVolume,NASDAQCAR_TotalVolume,NYSE_ShortVolume,NYSE_TotalVolume,Date,NMS_ShortVolume,NMS_TotalVolume,NASDAQCAR_ShortRatio,...,High,Low,Close,Adj Close,Volume,ticker,1dayret%,nextdayret%,Lit_VolumeRatio,Dark_VolumeRatio
0,0,A,89508.0,412346.0,36071.0,146290.0,2019-02-20,125579.0,558636.0,0.21707,...,78.800003,77.330002,78.559998,77.697357,2076500.0,A,1.341609,-0.852852,0.125743,0.269028
1,5,AAL,844109.0,1372781.0,319835.0,584804.0,2019-02-20,1163944.0,1957585.0,0.61489,...,35.240002,34.77,35.060001,34.596169,6969900.0,AAL,-1.155894,-0.22819,0.16514,0.280863
2,9,AAP,272139.0,469307.0,18423.0,54741.0,2019-02-20,290562.0,524048.0,0.579874,...,167.029999,162.589996,166.0,165.279968,1990200.0,AAP,-0.682054,-2.825314,0.123362,0.263314
3,10,AAPL,2626567.0,5520623.0,1587174.0,3256997.0,2019-02-20,4213741.0,8777620.0,0.475774,...,173.320007,170.990005,172.029999,169.807388,26114400.0,AAPL,0.643545,-0.56385,0.121747,0.336122
4,18,ABBV,435128.0,1587068.0,39094.0,260434.0,2019-02-20,474222.0,1847502.0,0.274171,...,80.599998,79.139999,79.989998,73.424461,5564100.0,ABBV,-0.658219,-1.550194,0.118325,0.33204


In [58]:
# Keep only rows whose same-day and next-day returns are both strictly inside
# +/-50%; rows where either return is NaN fail the comparison and are dropped.
within_1d = view['1dayret%'].abs() < 50
within_next = view['nextdayret%'].abs() < 50
view = view[within_1d & within_next]

# Machine Learning Experiment

In [59]:
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# Model inputs, target column, and the date separating train from test.
features = ['NMS_ShortRatio', 'CBOE_ShortRatio', 'Lit_VolumeRatio', 'Dark_VolumeRatio', '1dayret%']
label = 'classification'
SPLIT_DATE = '2020-02-15'

print(view.shape)
# Drop every row containing NaN or +/-inf in any column. The axis is passed by
# keyword because pandas 2.x no longer accepts it positionally, and .copy()
# gives an independent frame so the column assignment below does not write
# into a slice of the previous `view` (SettingWithCopyWarning).
view = view[~view.isin([np.nan, np.inf, -np.inf]).any(axis=1)].copy()
print(view.shape)

# Three-class target: 1 when the next-day return is above +1%, -1 when it is
# below -1%, and 0 otherwise.
view['classification'] = np.where(view['nextdayret%']>1, 1, np.where(view['nextdayret%']<-1, -1, 0))

# Chronological split. The test side is inclusive so that rows dated exactly
# SPLIT_DATE are not silently excluded from both sets.
train = view[view['Date'] < SPLIT_DATE]
test = view[view['Date'] >= SPLIT_DATE]

train_X = train[features]
#train_X = preprocessing.scale(train_X)
train_Y = train[label]
test_X = test[features]
#test_X = preprocessing.scale(test_X)
test_Y = test[label]

(138448, 43)
(138448, 43)


In [60]:
# Baseline: least-squares regression fitted directly on the class labels.
# The displayed value is the R^2 score on the test split.
model = LinearRegression().fit(train_X, train_Y)
model.score(test_X, test_Y)

-0.0017284496993419385

In [61]:
# 200-tree random forest on the same features. random_state is fixed so the
# accuracy below and the confusion matrix that follows are reproducible across
# kernel restarts; without it every run trains a different forest.
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(train_X, train_Y)
# Mean accuracy on the test split.
model.score(test_X, test_Y)

0.2732590020106013

In [62]:
# Rows are the actual classes and columns the predicted classes.
test_pred = model.predict(test_X)
confusion_matrix(test_Y, test_pred)

array([[ 1663, 11227,  2315],
       [  598,  6748,   900],
       [ 1564, 11228,  2054]], dtype=int64)

In [63]:
# One row per test observation: true class, predicted class, and the realised
# next-day return. `ret` is a Series and is aligned to the test rows by index;
# the prediction array is attached positionally.
result = test_Y.to_frame(name='actual').assign(
    predict=model.predict(test_X),
    ret=view['nextdayret%'],
)
result.head()

Unnamed: 0,actual,predict,ret
100159,0,0,0.660446
100160,-1,0,-1.047849
100161,-1,0,-1.191737
100162,1,-1,1.448274
100163,0,0,0.566176


In [64]:
# Average realised next-day return for each predicted class.
result.groupby('predict')['ret'].mean()

predict
-1   -0.102048
 0    0.024909
 1   -0.078132
Name: ret, dtype: float64

In [71]:
# Dispersion of the realised next-day return within each predicted class.
result.groupby('predict')['ret'].std()

predict
-1    6.948320
 0    4.896743
 1    5.989853
Name: ret, dtype: float64

In [66]:
# Number of test rows assigned to each predicted class.
result.groupby('predict')['ret'].count()

predict
-1     3825
 0    29203
 1     5269
Name: ret, dtype: int64

In [67]:
# Largest realised next-day return within each predicted class.
result.groupby('predict')['ret'].max()

predict
-1    42.192346
 0    41.097036
 1    41.015989
Name: ret, dtype: float64

In [68]:
# Smallest realised next-day return within each predicted class.
result.groupby('predict')['ret'].min()

predict
-1   -46.852121
 0   -37.755270
 1   -44.645802
Name: ret, dtype: float64

In [69]:
# 5th percentile of the realised next-day return within each predicted class.
result.groupby('predict')['ret'].quantile(q=0.05)

predict
-1   -11.161714
 0    -7.552218
 1    -9.260840
Name: ret, dtype: float64

In [70]:
# 95th percentile of the realised next-day return within each predicted class.
result.groupby('predict')['ret'].quantile(q=0.95)

predict
-1    11.082019
 0     7.718538
 1     9.413094
Name: ret, dtype: float64

# Quantile Research On DIX

In [None]:
# Per-ticker summaries keyed by ticker symbol; each value is a Series indexed
# by NMS_ShortRatio quintile (0 = lowest fifth, 4 = highest fifth).
dix_bin_ret = {}   # std of the next-day return within each quintile
dp_vol = {}        # mean darkPoolRatio within each quintile
tickers = ['FB', 'AAPL', 'NFLX', 'CAT', 'MMM', 'JNJ', 'NVDA', 'AMD', 'AMZN']
bin_labels = list(range(5))

for ticker in tickers:
    # `aapl` holds the frame for whichever ticker is being processed.
    aapl = ts[ts['Symbol']==ticker]
    # The price frame stores the symbol in the lowercase 'ticker' column (the
    # same column the earlier merge joins on); 'Ticker' is not a column that
    # get_all_stock creates.
    aapl = pd.merge(aapl, stock_df[stock_df['ticker']==ticker], on ='Date', how='inner')
    aapl = aapl.set_index('Date', drop=False)
    aapl['nextday_ret'] = (aapl['Close'].shift(-1) - aapl['Close']) / aapl['Close'] * 100
    aapl['darkPoolRatio'] = aapl['NMS_TotalVolume'] / aapl['Volume']
    aapl['bin'] = pd.qcut(aapl['NMS_ShortRatio'], q=5, labels=bin_labels)
    # observed=False keeps all five quintile labels in the output and pins the
    # behaviour against the changing pandas default for categorical groupers.
    by_bin = aapl.groupby('bin', observed=False)
    dp_vol[ticker] = by_bin.darkPoolRatio.mean()
    dix_bin_ret[ticker] = by_bin.nextday_ret.std()
#     aapl['NMS_ShortRatio'].plot(figsize=(20,10), grid=True)
#     plt.show()
#     aapl['nextday_ret'].plot(figsize=(20,10), grid=True)