In [42]:
import pandas as pd
import numpy as np
import _pickle as cPickle
from datetime import timedelta
from tqdm import tqdm, tqdm_notebook
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from itertools import chain
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
%matplotlib inline

In [43]:
df1 = pd.read_csv('../custom_data/Feature_Engineering/for_cloud.csv')
df1.dropna(axis = 0, how = 'any', inplace = True)
df1.isna().sum()

df1['shifted_close'] = df1.groupby('ticker')['Close'].shift(-1)
df1['tgt'] = (df1.shifted_close - df1.Close).div(df1.shifted_close) * 100
df1['target'] = 0
df1.loc[(df1['tgt'] >= 0), 'target'] = 1
df1.loc[(df1['tgt'] < 0), 'target'] = 0
df1.drop(['shifted_close', 'tgt'], axis = 1 , inplace = True)

In [44]:
df1['Date_dt'] = pd.to_datetime(df1['Date'], format='%Y%m%d', errors='ignore')
df1['year'] = pd.DatetimeIndex(df1['Date']).year
df1['month'] = pd.DatetimeIndex(df1['Date']).month
df1.drop(['Date', 'Name'], axis = 1, inplace = True)

In [45]:
df1.head()

Unnamed: 0,ticker,Adj Close,Close,High,Low,Open,Volume,Sector,target,Date_dt,year,month
0,AAL,7.405228,7.73,7.87,7.48,7.48,4194100.0,Industrials,1,2008-12-31,2008,12
1,AAL,8.037499,8.39,8.48,7.67,7.73,5167000.0,Industrials,0,2009-01-02,2009,1
2,AAL,7.980019,8.33,8.39,7.96,8.38,3457100.0,Industrials,1,2009-01-05,2009,1
3,AAL,8.679349,9.06,9.21,8.13,8.15,5731000.0,Industrials,1,2009-01-06,2009,1
4,AAL,8.698509,9.08,9.47,8.66,8.66,5468900.0,Industrials,1,2009-01-07,2009,1


In [46]:
embedding_lengths = [100, 2, 2, 3]
encodings = {}
categorical_cols = ['ticker', 'Sector', 'year', 'month']

In [47]:
def get_xy(df):
    y = df['target']
    x = get_x(df)
    
    return x, y

def label_encode(series, min_counts=2):
    vc = series.value_counts()
    #reserve 0 for unknown
    le = {c : i+1 for i, c in enumerate(vc.index[vc > min_counts])}
    le['UNKN'] = 0
    return le

def get_encodings(df, cat_cols):
    if len(encodings) == 0:
        for col in cat_cols:
            encodings[col] = label_encode(df[col])
    return encodings

def map_encodings(df, cat_cols, encs):
    for col in cat_cols:
        df[col] = df[col].map(encs[col]).fillna(0).astype(int)
        
def get_x(df, isTrain=True):
    encodings = get_encodings(df, categorical_cols)
    map_encodings(df, categorical_cols, encodings) 
    if isTrain:
        cols_to_drop = ['target', 'Date_dt']
    else: 
        cols_to_drop = ['time']
    
    df.drop(columns=cols_to_drop, inplace=True)
        
    return df

In [48]:
X, y = get_xy(df1)

In [50]:
#get all the numeric columns
num_cols = [x for x in X.columns if x not in categorical_cols]

#remove assetCode from num_cols
num_cols = [x for x in num_cols if x not in ['ticker']]

In [51]:
#scale numeric cols
def scale_numeric(df):
    df[num_cols] = df[num_cols].fillna(0)

    scaler = StandardScaler()
    
    #need to do this due to memory contraints
    for i in range(0, len(num_cols), 4):
        cols = num_cols[i:i + 3]
        df[cols] = scaler.fit_transform(df[cols].astype(float))
        
scale_numeric(X)

In [52]:
n_train = int(X.shape[0] * 0.8)

X_train, y_train = X.iloc[:n_train], y.iloc[:n_train]
X_valid, y_valid = X.iloc[n_train:], y.iloc[n_train:]

In [53]:
def get_cat_num_split(df):
    X = {} 
    X['num'] = df.loc[:, num_cols].values
    X['num'] = np.reshape(X['num'], (X['num'].shape[0], 1, X['num'].shape[1]))
    for cat in categorical_cols:
        X[cat] = df.loc[:, cat].values
    return X

In [54]:
X_train_split = get_cat_num_split(X_train)
X_valid_split = get_cat_num_split(X_valid)

In [59]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Concatenate, Flatten, LSTM, Dropout, Reshape
from keras.losses import binary_crossentropy, mse
from keras.regularizers import l2
import keras.backend as K

DROPOUT_RATE = 0.2

cat_inputs = [Input(shape=[1], name=cat) for cat in categorical_cols]
embeddings = [Embedding(encoding_len[cat], embedding_lengths[i])(cat_inputs[i]) for i, cat in enumerate(categorical_cols)]
categorical_logits = Concatenate()([(cat_emb) for cat_emb in embeddings])
categorical_logits = LSTM(128, activation='relu', input_shape=(1, len(categorical_cols)), return_sequences=True,
                         kernel_regularizer=l2(1e-5), kernel_initializer='random_uniform')(categorical_logits)

numerical_inputs = Input(shape=(1, len(num_cols)), name='num')
numerical_logits = LSTM(256, activation='relu', input_shape=(1, len(num_cols)), return_sequences=True,
                        kernel_regularizer=l2(1e-5), kernel_initializer='random_uniform')(numerical_inputs)
numerical_logits = Dropout(DROPOUT_RATE)(numerical_logits)

logits = Concatenate()([numerical_logits,categorical_logits])
logits = LSTM(256, activation='relu', kernel_initializer='random_uniform')(logits)
out = Dense(1, activation='sigmoid', name='confidence_level')(logits)

model = Model(inputs = cat_inputs + [numerical_inputs], outputs=out)
model.compile(loss='binary_crossentropy', optimizer='adam')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [60]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
ticker (InputLayer)             (None, 1)            0                                            
__________________________________________________________________________________________________
Sector (InputLayer)             (None, 1)            0                                            
__________________________________________________________________________________________________
year (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
month (InputLayer)              (None, 1)            0                                            
__________________________________________________________________________________________________
num (Input

In [62]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

check_point = ModelCheckpoint('model.hdf5',verbose=True, save_best_only=True)
early_stop = EarlyStopping(patience=1,verbose=True)
model.fit(X_train_split, y_train,
          validation_data=(X_valid_split, y_valid),
          epochs=1,
          verbose=True,
          callbacks=[early_stop,check_point]) 

Train on 934781 samples, validate on 233696 samples
Epoch 1/1


  9760/934781 [..............................] - ETA: 34:39:26 - loss: 0.697 - ETA: 17:36:53 - loss: 0.694 - ETA: 11:55:13 - loss: 0.696 - ETA: 9:02:59 - loss: 0.697 - ETA: 7:21:09 - loss: 0.69 - ETA: 6:12:35 - loss: 0.69 - ETA: 5:23:48 - loss: 0.69 - ETA: 4:47:02 - loss: 0.69 - ETA: 4:18:33 - loss: 0.69 - ETA: 3:55:35 - loss: 0.69 - ETA: 3:37:03 - loss: 0.70 - ETA: 3:21:30 - loss: 0.70 - ETA: 3:09:04 - loss: 0.70 - ETA: 2:57:52 - loss: 0.70 - ETA: 2:48:38 - loss: 0.70 - ETA: 2:40:03 - loss: 0.70 - ETA: 2:32:24 - loss: 0.71 - ETA: 2:25:45 - loss: 0.71 - ETA: 2:19:43 - loss: 0.70 - ETA: 2:14:19 - loss: 0.70 - ETA: 2:09:21 - loss: 0.70 - ETA: 2:04:50 - loss: 0.70 - ETA: 2:01:06 - loss: 0.70 - ETA: 1:57:43 - loss: 0.70 - ETA: 1:54:40 - loss: 0.70 - ETA: 1:51:44 - loss: 0.70 - ETA: 1:49:07 - loss: 0.70 - ETA: 1:46:23 - loss: 0.70 - ETA: 1:43:59 - loss: 0.70 - ETA: 1:41:31 - loss: 0.70 - ETA: 1:39:35 - loss: 0.70 - ETA: 1:37:39 - loss: 0.70 - ETA: 1:36:05 - loss: 0.70 - ETA: 1:34:27 - loss:

 29856/934781 [..............................] - ETA: 35:26 - loss: 0.69 - ETA: 35:26 - loss: 0.69 - ETA: 35:25 - loss: 0.69 - ETA: 35:24 - loss: 0.69 - ETA: 35:24 - loss: 0.69 - ETA: 35:23 - loss: 0.69 - ETA: 35:22 - loss: 0.69 - ETA: 35:22 - loss: 0.69 - ETA: 35:22 - loss: 0.69 - ETA: 35:21 - loss: 0.69 - ETA: 35:20 - loss: 0.69 - ETA: 35:19 - loss: 0.69 - ETA: 35:18 - loss: 0.69 - ETA: 35:17 - loss: 0.69 - ETA: 35:16 - loss: 0.69 - ETA: 35:16 - loss: 0.69 - ETA: 35:15 - loss: 0.69 - ETA: 35:14 - loss: 0.69 - ETA: 35:13 - loss: 0.69 - ETA: 35:12 - loss: 0.69 - ETA: 35:11 - loss: 0.69 - ETA: 35:10 - loss: 0.69 - ETA: 35:10 - loss: 0.69 - ETA: 35:09 - loss: 0.69 - ETA: 35:08 - loss: 0.69 - ETA: 35:08 - loss: 0.69 - ETA: 35:07 - loss: 0.69 - ETA: 35:06 - loss: 0.69 - ETA: 35:05 - loss: 0.69 - ETA: 35:05 - loss: 0.69 - ETA: 35:04 - loss: 0.69 - ETA: 35:04 - loss: 0.69 - ETA: 35:04 - loss: 0.69 - ETA: 35:03 - loss: 0.69 - ETA: 35:02 - loss: 0.69 - ETA: 35:01 - loss: 0.69 - ETA: 35:01 - lo

 37184/934781 [>.............................] - ETA: 33:26 - loss: 0.69 - ETA: 33:25 - loss: 0.69 - ETA: 33:25 - loss: 0.69 - ETA: 33:24 - loss: 0.69 - ETA: 33:24 - loss: 0.69 - ETA: 33:24 - loss: 0.69 - ETA: 33:24 - loss: 0.69 - ETA: 33:24 - loss: 0.69 - ETA: 33:24 - loss: 0.69 - ETA: 33:24 - loss: 0.69 - ETA: 33:24 - loss: 0.69 - ETA: 33:24 - loss: 0.69 - ETA: 33:23 - loss: 0.69 - ETA: 33:23 - loss: 0.69 - ETA: 33:23 - loss: 0.69 - ETA: 33:22 - loss: 0.69 - ETA: 33:22 - loss: 0.69 - ETA: 33:21 - loss: 0.69 - ETA: 33:21 - loss: 0.69 - ETA: 33:20 - loss: 0.69 - ETA: 33:20 - loss: 0.69 - ETA: 33:20 - loss: 0.69 - ETA: 33:20 - loss: 0.69 - ETA: 33:20 - loss: 0.69 - ETA: 33:20 - loss: 0.69 - ETA: 33:20 - loss: 0.69 - ETA: 33:20 - loss: 0.69 - ETA: 33:19 - loss: 0.69 - ETA: 33:19 - loss: 0.69 - ETA: 33:18 - loss: 0.69 - ETA: 33:18 - loss: 0.69 - ETA: 33:17 - loss: 0.69 - ETA: 33:17 - loss: 0.69 - ETA: 33:16 - loss: 0.69 - ETA: 33:16 - loss: 0.69 - ETA: 33:15 - loss: 0.69 - ETA: 33:15 - lo

KeyboardInterrupt: 