## **Technical Indicators**

In [None]:
# Build the TA-Lib C library from source, then install the `TA-Lib` pip package
# (presumably the Python wrapper imported as `talib` in the next cell).
# Step 1: download the 0.4.0 source tarball.
!wget http://prdownloads.sourceforge.net/ta-lib/ta-lib-0.4.0-src.tar.gz
# Step 2: unpack it (the build step below expects a `ta-lib/` directory).
!tar -xzf ta-lib-0.4.0-src.tar.gz
# Step 3: configure with a /usr prefix, compile and install. The `cd` only
# applies to this one line because each `!` command runs in its own shell.
!cd ta-lib && ./configure --prefix=/usr && make && make install
# Step 4: install the pip package now that the C library is in place.
!pip install TA-Lib

In [None]:
import pandas as pd
import numpy as np
import talib as ta

class TechnicalIndicators:
    """Add technical-indicator columns to a market-data DataFrame.

    The wrapped frame is modified in place: every ``add_*`` method writes new
    columns onto ``self.data``. The methods read the 'Close', 'High', 'Low'
    and 'Volume' columns, so those must already exist on the frame.
    """

    # ATR look-back windows used when the caller does not supply any.
    DEFAULT_ATR_PERIODS = (1, 2, 5, 10, 20)

    def __init__(self, data):
        """Keep a reference to ``data`` (not a copy); columns are added to it."""
        self.data = data

    def add_momentum_indicators(self):
        """Add RSI(14), MACD(12, 26, 9) and STOCH(fastk=14, slowk=3, slowd=3).

        Columns written: 'RSI', 'MACD', 'MACD_signal', 'MACD_hist',
        'Stoch_k', 'Stoch_d'.
        """
        close = self.data['Close']
        self.data['RSI'] = ta.RSI(close, timeperiod=14)
        self.data['MACD'], self.data['MACD_signal'], self.data['MACD_hist'] = ta.MACD(
            close, fastperiod=12, slowperiod=26, signalperiod=9)
        self.data['Stoch_k'], self.data['Stoch_d'] = ta.STOCH(
            self.data['High'], self.data['Low'], close,
            fastk_period=14, slowk_period=3, slowd_period=3)

    def add_volume_indicators(self):
        """Add 'OBV', computed by TA-Lib from 'Close' and 'Volume'."""
        self.data['OBV'] = ta.OBV(self.data['Close'], self.data['Volume'])

    def add_volatility_indicators(self, atr_periods=None):
        """Add 20-period Bollinger Bands and one ATR column per look-back.

        Args:
            atr_periods: Iterable of ATR look-back lengths. Each value ``n``
                produces a column named ``ATR_<n>``. Defaults to
                ``DEFAULT_ATR_PERIODS`` (1, 2, 5, 10, 20).

        Columns written: 'Upper_BB', 'Middle_BB', 'Lower_BB', 'ATR_<n>'.
        """
        if atr_periods is None:
            atr_periods = self.DEFAULT_ATR_PERIODS

        close = self.data['Close']
        self.data['Upper_BB'], self.data['Middle_BB'], self.data['Lower_BB'] = ta.BBANDS(
            close, timeperiod=20)

        high, low = self.data['High'], self.data['Low']
        for period in atr_periods:
            self.data[f'ATR_{period}'] = ta.ATR(high, low, close, timeperiod=period)

    def add_trend_indicators(self):
        """Add ADX(14), +DI(14), -DI(14) and CCI(5).

        Columns written: 'ADX', '+DI', '-DI', 'CCI'.
        """
        high, low, close = self.data['High'], self.data['Low'], self.data['Close']
        self.data['ADX'] = ta.ADX(high, low, close, timeperiod=14)
        self.data['+DI'] = ta.PLUS_DI(high, low, close, timeperiod=14)
        self.data['-DI'] = ta.MINUS_DI(high, low, close, timeperiod=14)
        self.data['CCI'] = ta.CCI(high, low, close, timeperiod=5)

    def add_other_indicators(self):
        """Add indicators computed with pandas/numpy only.

        Columns written:
            'DLR':  natural log of Close divided by the previous Close
                    (the first row is NaN because it has no predecessor).
            'TWAP': expanding (running) mean of Close.
            'VWAP': cumulative sum of Volume * (High + Low) / 2 divided by
                    cumulative Volume.
        """
        close = self.data['Close']
        volume = self.data['Volume']
        self.data['DLR'] = np.log(close / close.shift(1))
        self.data['TWAP'] = close.expanding().mean()
        weighted_mid = volume * (self.data['High'] + self.data['Low']) / 2
        self.data['VWAP'] = weighted_mid.cumsum() / volume.cumsum()

    def add_all_indicators(self):
        """Run every ``add_*`` method and return the (mutated) frame."""
        self.add_momentum_indicators()
        self.add_volume_indicators()
        self.add_volatility_indicators()
        self.add_trend_indicators()
        self.add_other_indicators()
        return self.data

In [None]:
# Load the raw records for the session.
data = pd.read_csv('xnas-itch-20230703.tbbo.csv')

# The three price columns are divided by 1e9 before use
# (presumably they are stored as fixed-point integers -- confirm against the feed spec).
for px_col in ('price', 'bid_px_00', 'ask_px_00'):
    data[px_col] = data[px_col] / 1e9

# Derive the OHLCV-style columns that TechnicalIndicators reads.
# High/Low are the larger/smaller of the two quote prices on each row, and
# Open is the previous row's Close (the first row falls back to its own Close).
quotes = data[['bid_px_00', 'ask_px_00']]
data['Close'] = data['price']
data['Volume'] = data['size']
data['High'] = quotes.max(axis=1)
data['Low'] = quotes.min(axis=1)
prev_close = data['Close'].shift(1)
data['Open'] = prev_close.fillna(data['Close'])

# Compute every indicator; `data` itself receives the new columns.
ti = TechnicalIndicators(data)
df_with_indicators = ti.add_all_indicators()

# Discard the first 35 rows (presumably the indicator warm-up period, where
# the longer look-backs are still NaN -- confirm).
market_features_df = df_with_indicators[35:]

In [None]:
# Write the feature table to disk. index=False keeps the row index out of the
# file; otherwise it is read back as a spurious 'Unnamed: 0' column by the
# later pd.read_csv('market_features_df.csv') call.
market_features_df.to_csv('market_features_df.csv', index=False)
# Colab-only helper: triggers a browser download of the file just written.
from google.colab import files
files.download("market_features_df.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## **Add New Indicators**

In [None]:
import numpy as np
import pandas as pd
import requests
from textblob import TextBlob
from datetime import timedelta
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense, Input

class MyIndicators:
    """Augment a market-features DataFrame with sentiment and model-based columns.

    The DataFrame passed in is modified in place: columns are added to it and
    ``prepare_data_for_modeling`` forward-fills it and drops rows that still
    contain NaN.
    """

    # Half-width of the window around each event in which an article counts.
    SENTIMENT_WINDOW = timedelta(minutes=5)
    # Number of consecutive scaled Close values fed to the models per sample.
    TIME_STEP = 10

    def __init__(self, ticker, data):
        """Store the inputs and derive the news query window.

        Args:
            ticker: Search term sent to the GDELT API.
            data: DataFrame with a 'ts_event' column that
                ``pd.to_datetime(..., unit='ns')`` can convert, plus the
                columns read by the ``add_*`` methods ('Close', 'Volume',
                'Open', 'High', 'Low').
        """
        self.ticker = ticker
        self.market_features_df = data
        # Replace the column outright. Writing datetimes through
        # `.loc[:, 'ts_event']` into an integer column is an incompatible-dtype
        # setitem, which recent pandas versions deprecate.
        self.market_features_df['ts_event'] = pd.to_datetime(self.market_features_df['ts_event'], unit='ns')
        self.min_timestamp = self.market_features_df['ts_event'].min().strftime('%Y%m%d%H%M%S')
        self.max_timestamp = self.market_features_df['ts_event'].max().strftime('%Y%m%d%H%M%S')
        self.all_news_articles = None
        self.X = None
        self.y = None
        self.model = None

    def fetch_all_news_gdelt(self):
        """Query the GDELT doc API for articles about ``self.ticker``.

        The query covers ``self.min_timestamp`` to ``self.max_timestamp`` and
        asks for at most 250 records.

        Returns:
            The list under the 'articles' key of the JSON response, or an
            empty list when the request fails, the status is not 200, or the
            body is not a JSON object.
        """
        params = {
            'query': self.ticker,
            'mode': 'artlist',
            'startdatetime': self.min_timestamp,
            'enddatetime': self.max_timestamp,
            'maxrecords': 250,
            'format': 'json',
        }
        try:
            # `params=` URL-encodes the query; the timeout stops the cell from
            # hanging indefinitely on an unresponsive server.
            response = requests.get('https://api.gdeltproject.org/api/v2/doc/doc', params=params, timeout=30)
        except requests.RequestException as exc:
            print(f'GDELT request failed: {exc}')
            return []
        if response.status_code != 200:
            return []
        try:
            payload = response.json()
        except ValueError:
            # A 200 response whose body is not valid JSON.
            return []
        if not isinstance(payload, dict):
            return []
        return payload.get('articles', [])

    @staticmethod
    def _article_seen_at(article):
        """Parse an article's 'seendate' ('%Y%m%dT%H%M%SZ') into a naive Timestamp."""
        return pd.to_datetime(article['seendate'], format='%Y%m%dT%H%M%SZ').replace(tzinfo=None)

    @staticmethod
    def _article_polarity(article):
        """Return the TextBlob polarity of the article's title plus description."""
        # `or ''` also covers keys that are present but hold None.
        title = article.get('title') or ''
        description = article.get('description') or ''
        return TextBlob(title + ' ' + description).sentiment.polarity

    def get_sentiment_score(self, articles, timestamp, time_window=SENTIMENT_WINDOW):
        """Average polarity of the articles seen within ``time_window`` of ``timestamp``.

        Args:
            articles: Iterable of article dicts; entries without a 'seendate'
                key are ignored.
            timestamp: Centre of the window.
            time_window: Half-width of the (inclusive) window.

        Returns:
            Mean polarity of the matching articles, or 0 when none match.
        """
        lower, upper = timestamp - time_window, timestamp + time_window
        relevant_articles = [
            article for article in articles
            if 'seendate' in article and lower <= self._article_seen_at(article) <= upper
        ]
        if not relevant_articles:
            return 0
        total = sum(self._article_polarity(article) for article in relevant_articles)
        return total / len(relevant_articles)

    def add_sentiment_indicators(self):
        """Fetch news once and add a 'sentiment_score' column (0 where no news).

        Each row gets the mean polarity of the articles whose 'seendate' lies
        within ``SENTIMENT_WINDOW`` of the row's 'ts_event'. Article dates and
        polarities are computed once up front rather than once per row.
        """
        self.all_news_articles = self.fetch_all_news_gdelt()

        dated = [article for article in self.all_news_articles if 'seendate' in article]
        seen = np.array([self._article_seen_at(a).to_datetime64() for a in dated], dtype='datetime64[ns]')
        polarity = np.array([self._article_polarity(a) for a in dated], dtype=float)

        window = pd.Timedelta(self.SENTIMENT_WINDOW).to_timedelta64()
        event_times = self.market_features_df['ts_event'].to_numpy(dtype='datetime64[ns]')

        sentiment_scores = np.zeros(len(event_times))
        if len(seen):
            for pos, event_time in enumerate(event_times):
                in_window = (seen >= event_time - window) & (seen <= event_time + window)
                if in_window.any():
                    sentiment_scores[pos] = polarity[in_window].mean()
        self.market_features_df['sentiment_score'] = sentiment_scores

    def prepare_data_for_modeling(self):
        """Build the sliding-window arrays ``self.X`` / ``self.y`` from 'Close'.

        Forward-fills the frame, drops rows that still contain NaN (both in
        place), min-max scales 'Close' to [0, 1], and stores:
            self.X: shape (n_rows - TIME_STEP, TIME_STEP, 1), each sample the
                    TIME_STEP scaled values preceding a row.
            self.y: shape (n_rows - TIME_STEP,), the scaled value of that row.

        Raises:
            ValueError: If fewer than TIME_STEP + 1 rows remain.
        """
        # Fill missing values and drop any rows with NaNs.
        # (`fillna(method='ffill')` is deprecated; `ffill` is the replacement.)
        self.market_features_df.ffill(inplace=True)
        self.market_features_df.dropna(inplace=True)

        # Scale the feature
        scaler = MinMaxScaler(feature_range=(0, 1))
        scaled_close = scaler.fit_transform(self.market_features_df[['Close']])[:, 0]

        time_step = self.TIME_STEP
        n_rows = len(scaled_close)
        if n_rows <= time_step:
            raise ValueError(
                f'Need more than {time_step} rows to build model windows, got {n_rows}'
            )

        # One window of `time_step` values per target row
        windows = np.array([scaled_close[i - time_step:i] for i in range(time_step, n_rows)])
        targets = scaled_close[time_step:].copy()

        # Store the prepared data
        self.X = windows.reshape(windows.shape[0], time_step, 1)
        self.y = targets

    def _assign_right_aligned(self, column, values):
        """Write ``values`` into ``column`` aligned to the end of the frame.

        The leading rows that have no corresponding value are set to NaN.
        """
        values = np.asarray(values, dtype=float)
        n_missing = len(self.market_features_df) - len(values)
        padded = np.pad(values, (n_missing, 0), mode='constant', constant_values=np.nan)
        self.market_features_df[column] = padded

    def add_lstmtrend_indicators(self):
        """Fit a two-layer LSTM on the windows and add 'predicted_trend'.

        The model is trained on ``self.X`` / ``self.y`` and then asked to
        predict the same ``self.X``, so the column holds in-sample predictions
        in scaled units. The fitted model is kept on ``self.model``.
        """
        # Ensure data is prepared
        if self.X is None or self.y is None:
            self.prepare_data_for_modeling()

        # Build LSTM model
        model = Sequential()
        model.add(LSTM(units=50, return_sequences=True, input_shape=(self.X.shape[1], 1)))
        model.add(LSTM(units=50, return_sequences=False))
        model.add(Dense(units=1))

        model.compile(optimizer='adam', loss='mean_squared_error')
        self.model = model

        # Train LSTM model
        self.model.fit(self.X, self.y, epochs=8, batch_size=32)

        # Predict trends and store them against the rows they were predicted for
        predictions = self.model.predict(self.X).flatten()
        self._assign_right_aligned('predicted_trend', predictions)

    def add_anomaly_indicators(self):
        """Fit an LSTM autoencoder on the windows and add 'anomaly_score'.

        The score is the mean absolute reconstruction error of each window.
        """
        # Ensure data is prepared
        if self.X is None:
            self.prepare_data_for_modeling()

        n_samples, n_steps = self.X.shape[0], self.X.shape[1]
        # 2-D view of the windows, matching the decoder's (samples, steps) output.
        flat_windows = self.X.reshape(n_samples, n_steps)

        # Build Autoencoder model
        input_layer = Input(shape=(n_steps, 1))
        encoded = LSTM(64, activation='relu')(input_layer)
        decoded = Dense(n_steps, activation='sigmoid')(encoded)
        autoencoder = Model(inputs=input_layer, outputs=decoded)
        autoencoder.compile(optimizer='adam', loss='mean_squared_error')

        # Train against a target with the same rank as the output instead of
        # depending on implicit rank handling inside the loss.
        autoencoder.fit(self.X, flat_windows, epochs=8, batch_size=32)

        # Calculate reconstruction error
        reconstructed = autoencoder.predict(self.X).reshape(n_samples, n_steps)
        reconstruction_error = np.mean(np.abs(reconstructed - flat_windows), axis=1)

        # Add anomaly score to the dataframe
        self._assign_right_aligned('anomaly_score', reconstruction_error)

    def add_isolation_forest_outlier(self):
        """Fit an IsolationForest on the windows and add 'isolation_forest_outlier'.

        The column holds ``decision_function`` values for each window.
        """
        # Ensure data is prepared
        if self.X is None:
            self.prepare_data_for_modeling()

        # Flatten the X array for Isolation Forest
        X_flat = self.X.reshape(self.X.shape[0], -1)

        # Train Isolation Forest model
        isolation_forest = IsolationForest(contamination=0.01, random_state=42)
        isolation_forest.fit(X_flat)

        # Score every window and add the result to the dataframe
        outlier_scores = isolation_forest.decision_function(X_flat)
        self._assign_right_aligned('isolation_forest_outlier', outlier_scores)

    def add_pca_feature(self, n_components=2):
        """Add ``pca_feature_1`` .. ``pca_feature_<n_components>`` columns.

        PCA is fitted on the min-max scaled 'Close', 'Volume', 'Open', 'High'
        and 'Low' columns, using only rows where all five are present; other
        rows are left as NaN.
        """
        # Select the features for PCA
        features = ['Close', 'Volume', 'Open', 'High', 'Low']  # Adjust the features as needed
        complete_rows = self.market_features_df[features].dropna()

        # Scale the features
        scaled_data = MinMaxScaler().fit_transform(complete_rows)

        # Perform PCA
        pca_features = PCA(n_components=n_components).fit_transform(scaled_data)

        # Add PCA features to the dataframe
        for i in range(n_components):
            column = f'pca_feature_{i+1}'
            self.market_features_df[column] = np.nan
            self.market_features_df.loc[complete_rows.index, column] = pca_features[:, i]

    def add_all_new_indicators(self):
        """Run every indicator step in order, printing progress, and return the frame."""
        steps = [
            ('Sentiment Indicators', self.add_sentiment_indicators),
            ('LSTM Trend Indicators', self.add_lstmtrend_indicators),
            ('Anomaly Detection', self.add_anomaly_indicators),
            ('Isolation Forest Outlier Detection', self.add_isolation_forest_outlier),
            ('PCA Feature', self.add_pca_feature),
        ]
        for label, step in steps:
            print(f'Adding: {label}')
            step()
            print(f'Added: {label}')

        return self.market_features_df


# Create an instance of MyIndicators
# The ticker is used as the news search term; the feature table is reloaded
# from the CSV written by the earlier export cell rather than reused from memory.
my_ticker = 'AAPL'  # Example ticker
market_features_df = pd.read_csv('market_features_df.csv')
my_indicators = MyIndicators(my_ticker, market_features_df)

# Add all new indicators
# The returned frame is the one held by `my_indicators`, with the new columns added.
new_df = my_indicators.add_all_new_indicators()


Adding: Sentiment Indicators
Added: Sentiment Indicators
Adding: LSTM Trend Indicators
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Added: LSTM Trend Indicators
Adding: Anomaly Detection
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Added: Anomaly Detection
Adding: Isolation Forest Outlier Detection
Added: Isolation Forest Outlier Detection
Adding: PCA Feature
Added: PCA Feature


In [None]:
# Display the dataframe with new indicators
# The first 10 rows are skipped: the model-based columns are left-padded with
# NaN for the rows that precede the first full 10-step window
# (time_step = 10 in MyIndicators.prepare_data_for_modeling).
market_features_df_new = new_df[10:]
market_features_df_new.head(40)

Unnamed: 0.1,Unnamed: 0,ts_recv,ts_event,rtype,publisher_id,instrument_id,action,side,depth,price,...,CCI,DLR,TWAP,VWAP,sentiment_score,predicted_trend,anomaly_score,isolation_forest_outlier,pca_feature_1,pca_feature_2
10,45,1688371230451995982,2023-07-03 08:00:30.451829005,1,2,32,T,A,0,194.0,...,-83.333333,0.0,194.03087,194.059686,0.0,0.883612,0.024932,-0.064118,1.065775,0.045113
11,46,1688371230451995982,2023-07-03 08:00:30.451829005,1,2,32,T,A,0,194.0,...,-55.555556,0.0,194.030213,194.059705,0.0,0.883739,0.026503,-0.061145,1.065775,0.045113
12,47,1688371230451995982,2023-07-03 08:00:30.451829005,1,2,32,T,A,0,194.0,...,-41.666667,0.0,194.029583,194.059766,0.0,0.883615,0.028294,-0.05917,1.065775,0.045113
13,48,1688371230566546422,2023-07-03 08:00:30.566381995,1,2,32,T,N,0,194.09,...,166.666667,0.000464,194.030816,194.059805,0.0,0.882839,0.023028,-0.051324,1.083569,0.038385
14,49,1688371237858109689,2023-07-03 08:00:37.857944791,1,2,32,T,B,0,194.12,...,90.643275,0.000155,194.0326,194.059824,0.0,0.918513,0.019332,-0.053779,1.103463,0.012729
15,50,1688371242324266534,2023-07-03 08:00:42.324101963,1,2,32,T,B,0,194.15,...,107.142857,0.000155,194.034902,194.059871,0.0,0.931284,0.017358,-0.052743,1.121249,0.018376
16,51,1688371247317894640,2023-07-03 08:00:47.317729998,1,2,32,T,B,0,194.1,...,18.518519,-0.000258,194.036154,194.059909,0.0,0.940394,0.017043,-0.061026,1.115553,-0.004265
17,52,1688371257325756491,2023-07-03 08:00:57.325590403,1,2,32,T,A,0,194.1,...,83.333333,0.0,194.037358,194.059918,0.0,0.9179,0.020487,-0.060038,1.117744,-0.001772
18,53,1688371257325756491,2023-07-03 08:00:57.325590403,1,2,32,T,A,0,194.1,...,55.555556,0.0,194.038519,194.059928,0.0,0.916302,0.02231,-0.060038,1.117744,-0.001772
19,54,1688371259762706298,2023-07-03 08:00:59.762541862,1,2,32,T,B,0,194.12,...,89.74359,0.000103,194.04,194.060393,0.0,0.918122,0.022649,-0.060532,1.121698,-0.003267


In [None]:
# Print the column names, non-null counts and dtypes of the trimmed frame.
market_features_df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59226 entries, 10 to 59235
Data columns (total 54 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Unnamed: 0                59226 non-null  int64         
 1   ts_recv                   59226 non-null  int64         
 2   ts_event                  59226 non-null  datetime64[ns]
 3   rtype                     59226 non-null  int64         
 4   publisher_id              59226 non-null  int64         
 5   instrument_id             59226 non-null  int64         
 6   action                    59226 non-null  object        
 7   side                      59226 non-null  object        
 8   depth                     59226 non-null  int64         
 9   price                     59226 non-null  float64       
 10  size                      59226 non-null  int64         
 11  flags                     59226 non-null  int64         
 12  ts_in_delta      

In [None]:
# Write the augmented feature table to disk. index=False keeps the row index
# out of the file so it does not reappear as an extra 'Unnamed: ...' column
# the next time the CSV is loaded.
market_features_df_new.to_csv('market_features_df_new.csv', index=False)
# Colab-only helper: triggers a browser download of the file just written.
from google.colab import files
files.download("market_features_df_new.csv")