In [68]:
import pandas as pd
import os
import glob

# Load the cleaned dataset; the path is resolved relative to the working directory.
pandas_df = pd.read_csv(filepath_or_buffer='cleaned_optiver_data.csv')


In [69]:
import pandas as pd

# Turn the integer day offsets stored in 'date_id' into calendar dates by
# counting days from the anchor date 2022-01-01 (offset 0 -> 2022-01-01).
# The conversion is skipped when the column is already datetime so that
# re-running this cell is harmless: pd.to_timedelta rejects datetime64 input.
if not pd.api.types.is_datetime64_any_dtype(pandas_df['date_id']):
    pandas_df['date_id'] = pd.to_datetime('2022-01-01') + pd.to_timedelta(pandas_df['date_id'], unit='D')

print(pandas_df.head())

   stock_id    date_id  seconds_in_bucket  imbalance_size  \
0        12 2022-01-01                  0     11739945.44   
1        12 2022-01-01                 10     10193058.04   
2        12 2022-01-01                 20     10168196.92   
3        12 2022-01-01                 30     10168196.92   
4        12 2022-01-01                 40      9954725.98   

   imbalance_buy_sell_flag  reference_price  matched_size  far_price  \
0                        1         0.999794   13597118.70        0.0   
1                        1         1.000882   15123925.96        0.0   
2                        1         1.001007   15123925.96        0.0   
3                        1         1.001133   15123925.96        0.0   
4                        1         1.001216   15337396.90        0.0   

   near_price  bid_price  bid_size  ask_price  ask_size       wap    target  \
0         0.0   0.999794  33221.00   1.000296  47824.00  1.000000  8.399487   
1         0.0   1.000673   1674.47   1.000

In [70]:
pandas_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5237980 entries, 0 to 5237979
Data columns (total 17 columns):
 #   Column                   Dtype         
---  ------                   -----         
 0   stock_id                 int64         
 1   date_id                  datetime64[ns]
 2   seconds_in_bucket        int64         
 3   imbalance_size           float64       
 4   imbalance_buy_sell_flag  int64         
 5   reference_price          float64       
 6   matched_size             float64       
 7   far_price                float64       
 8   near_price               float64       
 9   bid_price                float64       
 10  bid_size                 float64       
 11  ask_price                float64       
 12  ask_size                 float64       
 13  wap                      float64       
 14  target                   float64       
 15  time_id                  int64         
 16  row_id                   object        
dtypes: datetime64[ns](1), float

In [71]:
# This plot lets you select multiple stocks and monitor their attributes daily, weekly or monthly. 
# You can select the day, the week or the month that you want to look into.

import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px
import pandas as pd

# Ensure 'date_id' is a datetime column before it is used as a grouping key
pandas_df['date_id'] = pd.to_datetime(pandas_df['date_id'], format='%Y-%m-%d')

# Mean of each dashboard attribute per (date, stock) pair.
# Extend the list below to expose more columns in the dashboard.
daily_stats = (
    pandas_df
    .groupby(['date_id', 'stock_id'])
    .agg(dict.fromkeys(
        [
            'imbalance_size',
            'reference_price',
            'far_price',
            'matched_size',
            'near_price',
            'bid_price',
            'ask_price',
            'bid_size',
            'ask_size',
            'wap',
        ],
        'mean',
    ))
    .reset_index()
)

app = dash.Dash(__name__)

# Every stock that appears in the daily aggregates
stock_ids = daily_stats['stock_id'].unique()

# Multi-select stock picker, pre-set to the first stock
stock_dropdown = dcc.Dropdown(
    id='stock-dropdown',
    options=[dict(label=str(sid), value=sid) for sid in stock_ids],
    value=[stock_ids[0]],
    multi=True,
)

# Axis pickers: any column of the daily aggregates can go on either axis
x_attribute_dropdown = dcc.Dropdown(
    id='x-attribute-dropdown',
    options=[dict(label=name, value=name) for name in daily_stats.columns],
    value='date_id',
    multi=False,
)

y_attribute_dropdown = dcc.Dropdown(
    id='y-attribute-dropdown',
    options=[dict(label=name, value=name) for name in daily_stats.columns],
    value='imbalance_size',
    multi=False,
)

# Granularity picker (daily by default)
time_granularity_dropdown = dcc.Dropdown(
    id='time-granularity-dropdown',
    options=[
        dict(label=caption, value=code)
        for caption, code in (('Daily', 'D'), ('Weekly', 'W'), ('Monthly', 'M'))
    ],
    value='D',
    multi=False,
)

# Day/week/month picker; starts disabled and has no options of its own here --
# the callbacks further down fill in its options and toggle 'disabled'.
time_specification_dropdown = dcc.Dropdown(
    id='time-specification-dropdown',
    disabled=True,
    multi=False,
)

# Page layout: title, each control preceded by its caption, then the graph
app.layout = html.Div(children=[
    html.H1("Stock Analysis Dashboard"),
    *(
        element
        for caption, control in (
            ("Select Stock ID(s):", stock_dropdown),
            ("Select X-Axis Attribute:", x_attribute_dropdown),
            ("Select Y-Axis Attribute:", y_attribute_dropdown),
            ("Select Time Granularity:", time_granularity_dropdown),
            ("Specify Day/Week/Month:", time_specification_dropdown),
        )
        for element in (html.Label(caption), control)
    ),
    dcc.Graph(id='stock-graph'),
])

# Callback to update the time specification dropdown based on time granularity
@app.callback(
    Output('time-specification-dropdown', 'options'),
    [Input('time-granularity-dropdown', 'value')]
)
def update_time_specification_dropdown(time_granularity):
    """Return the option list for the day/week/month selector.

    'D' gives days 1-31, 'W' gives weeks 1-53 and 'M' gives months 1-12.
    Any other value gives an empty list.
    """
    # (granularity code, label template, highest selectable number)
    layouts = (
        ('D', '{}', 31),
        ('W', 'Week {}', 53),
        ('M', 'Month {}', 12),
    )
    for code, template, highest in layouts:
        if time_granularity == code:
            return [
                {'label': template.format(number), 'value': number}
                for number in range(1, highest + 1)
            ]
    return []

# Callback to update the graph based on user input
@app.callback(
    Output('stock-graph', 'figure'),
    [Input('stock-dropdown', 'value'),
     Input('x-attribute-dropdown', 'value'),
     Input('y-attribute-dropdown', 'value'),
     Input('time-granularity-dropdown', 'value'),
     Input('time-specification-dropdown', 'value')]
)
def update_graph(selected_stocks, x_attribute, y_attribute, time_granularity, time_specification):
    """Redraw the line chart from the current dropdown selections.

    Args:
        selected_stocks: stock ids to plot (list; None is treated as empty).
        x_attribute: column of ``daily_stats`` for the x-axis. Replaced by
            'date_id' when a week or month is being filtered.
        y_attribute: column of ``daily_stats`` for the y-axis.
        time_granularity: 'D', 'W' or 'M'.
        time_specification: day of month, week number or month number, or
            None to skip the time filter.

    Returns:
        A Plotly Express line figure with one line per stock.
    """
    # isin() needs a list-like, so treat a missing selection as "no stocks".
    filtered_data = daily_stats[daily_stats['stock_id'].isin(selected_stocks or [])]

    if time_specification is not None:
        period = int(time_specification)
        dates = filtered_data['date_id']

        if time_granularity == 'D':
            # Keep every date whose day-of-month matches, across all months.
            filtered_data = filtered_data[dates.dt.day == period]
        elif time_granularity == 'W':
            # Weeks are counted in 7-day steps from the first date of the selected stocks.
            start_date = dates.min() + pd.DateOffset(weeks=period - 1)
            end_date = start_date + pd.DateOffset(weeks=1) - pd.DateOffset(days=1)
            filtered_data = filtered_data[(dates >= start_date) & (dates <= end_date)]
            x_attribute = 'date_id'  # Change x-axis to date for weekly granularity
        elif time_granularity == 'M':
            if filtered_data.empty or not 1 <= period <= 12:
                # Nothing to filter, or the value is a leftover day/week number
                # that cannot be a month: show no data rather than fail while
                # building the month's start date.
                filtered_data = filtered_data.iloc[0:0]
            else:
                # The month is taken in the earliest year present in the selection.
                start_date = pd.Timestamp(year=int(dates.dt.year.min()), month=period, day=1)
                end_date = start_date + pd.offsets.MonthEnd()
                filtered_data = filtered_data[(dates >= start_date) & (dates <= end_date)]
            x_attribute = 'date_id'  # Change x-axis to date for monthly granularity

    # Create line plot using Plotly Express
    fig = px.line(filtered_data, x=x_attribute, y=y_attribute, color='stock_id',
                  title=f'Stock Analysis: {y_attribute} vs {x_attribute}')

    return fig

# Callback to enable/disable time specification dropdown based on time granularity
@app.callback(
    Output('time-specification-dropdown', 'disabled'),
    [Input('time-granularity-dropdown', 'value')]
)
def update_time_specification_dropdown_state(time_granularity):
    """Return the 'disabled' flag for the day/week/month selector.

    The selector is disabled (True) only when no granularity is chosen.
    """
    return not time_granularity

if __name__ == '__main__':
    # `run_server` is no longer available in Dash 3; use `run` when the installed
    # release provides it and fall back to `run_server` on older releases.
    launch = app.run if hasattr(app, 'run') else app.run_server
    launch(debug=True)


Run the following cells one at a time; each cell launches a different Plotly Dash dashboard.

In [51]:
# This is a plot to see the frequency distribution of various attributes.

import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px
import pandas as pd

# Dash application for the feature-distribution dashboard
app = dash.Dash(__name__)

# Every stock that appears in the raw data
stock_ids = pandas_df['stock_id'].unique()

# Single-select stock picker, pre-set to the first stock
stock_dropdown = dcc.Dropdown(
    id='stock-dropdown',
    options=[dict(label=str(sid), value=sid) for sid in stock_ids],
    value=stock_ids[0],
    multi=False,
)

# Dropdown for feature selection: each option maps a display label to the
# pandas_df column that the box plot and histogram callbacks read.
# (A stray ',' line that followed this statement was a SyntaxError and kept the
# whole cell from running; it has been removed.)
feature_dropdown = dcc.Dropdown(
    id='feature-dropdown',
    options=[
        {'label': 'Imbalance Size', 'value': 'imbalance_size'},
        {'label': 'Match Size', 'value': 'matched_size'},
        {'label': 'Ask Size', 'value': 'ask_size'},
        {'label': 'Bid Size', 'value': 'bid_size'},
        {'label': 'Reference Price', 'value': 'reference_price'},
        {'label': 'Far Price', 'value': 'far_price'},
        {'label': 'Near Price', 'value': 'near_price'},
        {'label': 'Bid Price', 'value': 'bid_price'},
    ],
    multi=False,
    value='imbalance_size'  # Default selection
)
# Page layout: title, the two captioned pickers, then both graphs
app.layout = html.Div(children=[
    html.H1("Feature Analysis Dashboard"),
    *(
        element
        for caption, control in (
            ("Select Stock ID:", stock_dropdown),
            ("Select Feature:", feature_dropdown),
        )
        for element in (html.Label(caption), control)
    ),
    dcc.Graph(id='frequency-box-plot'),
    dcc.Graph(id='feature-histogram'),
])

# Callback to update the frequency box plot based on user input
@app.callback(
    Output('frequency-box-plot', 'figure'),
    [Input('stock-dropdown', 'value'), Input('feature-dropdown', 'value')]
)
def update_frequency_box_plot(selected_stock, selected_feature):
    """Return a box plot of ``selected_feature`` for the rows of ``selected_stock``.

    All individual points are drawn alongside the box (``points='all'``).
    """
    is_selected_stock = pandas_df['stock_id'] == selected_stock
    plot_title = f'Frequency Box Plot: {selected_feature} for Stock {selected_stock}'
    return px.box(
        pandas_df[is_selected_stock],
        x=selected_feature,
        points='all',
        title=plot_title,
    )

# Callback to update the feature histogram based on user input
@app.callback(
    Output('feature-histogram', 'figure'),
    [Input('stock-dropdown', 'value'), Input('feature-dropdown', 'value')]
)
def update_feature_histogram(selected_stock, selected_feature):
    """Return a histogram of ``selected_feature`` for the rows of ``selected_stock``."""
    is_selected_stock = pandas_df['stock_id'] == selected_stock
    plot_title = f'Histogram: {selected_feature} for Stock {selected_stock}'
    return px.histogram(pandas_df[is_selected_stock], x=selected_feature, title=plot_title)

if __name__ == '__main__':
    # `run_server` is no longer available in Dash 3; use `run` when the installed
    # release provides it and fall back to `run_server` on older releases.
    launch = app.run if hasattr(app, 'run') else app.run_server
    launch(debug=True)


In [65]:
# This plot is specifically for analysing attribute movements within a single day.
# You can select the stock, the date and the attribute to be visualized.

import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px
import pandas as pd

# Ensure 'date_id' is a datetime column so it can be compared with the picked date
pandas_df['date_id'] = pd.to_datetime(pandas_df['date_id'], format='%Y-%m-%d')

# Dash application for the single-day dashboard
app = dash.Dash(__name__)

# Every stock that appears in the raw data
stock_ids = pandas_df['stock_id'].unique()

# Single-select stock picker, pre-set to the first stock
stock_dropdown = dcc.Dropdown(
    id='second-stock-dropdown',
    options=[dict(label=str(sid), value=sid) for sid in stock_ids],
    value=stock_ids[0],
    multi=False,
)

# Date picker for selecting a specific day; defaults to the earliest date in the dataset.
# The default is formatted as a plain 'YYYY-MM-DD' string: str(Timestamp) would
# append ' 00:00:00', and that text would then appear in the graph title, which
# is built from the picker's value.
_first_date = pandas_df['date_id'].min()
date_dropdown = dcc.DatePickerSingle(
    id='date-picker',
    display_format='YYYY-MM-DD',
    # No default when the column has no valid date (min() is NaT).
    date=None if pd.isna(_first_date) else _first_date.strftime('%Y-%m-%d')
)

# Y-axis attribute picker: every column after the first two of pandas_df
# (the original note says this excludes 'date_id' and 'stock_id')
attribute_dropdown = dcc.Dropdown(
    id='second-y-attribute-dropdown',
    options=[dict(label=name, value=name) for name in pandas_df.columns[2:]],
    value='imbalance_size',
    multi=False,
)

# Page layout: title, each control preceded by its caption, then the graph
app.layout = html.Div(children=[
    html.H1("Second Graph with Fixed X-Axis"),
    *(
        element
        for caption, control in (
            ("Select Date:", date_dropdown),
            ("Select Stock ID:", stock_dropdown),
            ("Select Y-Axis Attribute:", attribute_dropdown),
        )
        for element in (html.Label(caption), control)
    ),
    dcc.Graph(id='second-stock-graph'),
])

# Callback to update the second graph based on user input
@app.callback(
    Output('second-stock-graph', 'figure'),
    [Input('date-picker', 'date'),
     Input('second-stock-dropdown', 'value'),
     Input('second-y-attribute-dropdown', 'value')]
)
def update_second_graph(selected_date, selected_stock, second_y_attribute):
    """Return a line plot of one attribute against 'seconds_in_bucket'.

    Only rows of ``pandas_df`` matching both ``selected_date`` and
    ``selected_stock`` are plotted.
    """
    on_selected_date = pandas_df['date_id'] == selected_date
    is_selected_stock = pandas_df['stock_id'] == selected_stock
    day_rows = pandas_df[on_selected_date & is_selected_stock]

    plot_title = f'Second Graph: {second_y_attribute} vs seconds_in_bucket for Stock {selected_stock} on {selected_date}'
    return px.line(day_rows, x='seconds_in_bucket', y=second_y_attribute, title=plot_title)

if __name__ == '__main__':
    # `run_server` is no longer available in Dash 3; use `run` when the installed
    # release provides it and fall back to `run_server` on older releases.
    launch = app.run if hasattr(app, 'run') else app.run_server
    launch(debug=True)


In [63]:
# This is a plot to visualize multiple attributes for a single stock.
# You can select the stock, multiple attributes and the time granularity (day, week or month).

import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px
import pandas as pd

# Ensure 'date_id' is a datetime column before it is used as a grouping key
pandas_df['date_id'] = pd.to_datetime(pandas_df['date_id'], format='%Y-%m-%d')

# Mean of each dashboard attribute per (date, stock) pair
daily_stats = (
    pandas_df
    .groupby(['date_id', 'stock_id'])
    .agg(dict.fromkeys(
        [
            'imbalance_size',
            'reference_price',
            'far_price',
            'matched_size',
            'near_price',
            'bid_price',
            'ask_price',
            'bid_size',
            'ask_size',
            'wap',
        ],
        'mean',
    ))
    .reset_index()
)

# Dash application for the multi-attribute dashboard
app = dash.Dash(__name__)

# Every stock that appears in the daily aggregates
stock_ids = daily_stats['stock_id'].unique()

# Single-select stock picker, pre-set to the first stock
stock_dropdown = dcc.Dropdown(
    id='stock-dropdown',
    options=[dict(label=str(sid), value=sid) for sid in stock_ids],
    value=stock_ids[0],
    multi=False,
)

# Multi-select attribute picker: every column after the first two of daily_stats
# (i.e. everything except 'date_id' and 'stock_id')
attribute_dropdown = dcc.Dropdown(
    id='attribute-dropdown',
    options=[dict(label=name, value=name) for name in daily_stats.columns[2:]],
    value=['near_price'],
    multi=True,
)

# Granularity picker (daily by default)
time_granularity_dropdown = dcc.Dropdown(
    id='time-granularity-dropdown',
    options=[
        dict(label=caption, value=code)
        for caption, code in (('Daily', 'D'), ('Weekly', 'W'), ('Monthly', 'M'))
    ],
    value='D',
    multi=False,
)

# Day/week/month picker; starts disabled and has no options of its own here --
# the callbacks further down fill in its options and toggle 'disabled'.
time_specification_dropdown = dcc.Dropdown(
    id='time-specification-dropdown',
    disabled=True,
    multi=False,
)

# Page layout: title, each control preceded by its caption, then the graph
app.layout = html.Div(children=[
    html.H1("Stock Analysis Dashboard"),
    *(
        element
        for caption, control in (
            ("Select Stock ID:", stock_dropdown),
            ("Select Columns:", attribute_dropdown),
            ("Select Time Granularity:", time_granularity_dropdown),
            ("Specify Day/Week/Month:", time_specification_dropdown),
        )
        for element in (html.Label(caption), control)
    ),
    dcc.Graph(id='stock-graph'),
])

# Callback to update the time specification dropdown based on time granularity
@app.callback(
    Output('time-specification-dropdown', 'options'),
    [Input('time-granularity-dropdown', 'value')]
)
def update_time_specification_dropdown(time_granularity):
    """Return the option list for the day/week/month selector.

    'D' gives days 1-31, 'W' gives weeks 1-53 and 'M' gives months 1-12.
    Any other value gives an empty list.
    """
    # (granularity code, label template, highest selectable number)
    layouts = (
        ('D', '{}', 31),
        ('W', 'Week {}', 53),
        ('M', 'Month {}', 12),
    )
    for code, template, highest in layouts:
        if time_granularity == code:
            return [
                {'label': template.format(number), 'value': number}
                for number in range(1, highest + 1)
            ]
    return []

# Callback to update the graph based on user input
@app.callback(
    Output('stock-graph', 'figure'),
    [Input('stock-dropdown', 'value'),
     Input('attribute-dropdown', 'value'),
     Input('time-granularity-dropdown', 'value'),
     Input('time-specification-dropdown', 'value')]
)
def update_graph(selected_stock, selected_attributes, time_granularity, time_specification):
    """Redraw the multi-attribute line chart for one stock.

    Args:
        selected_stock: stock id whose rows of ``daily_stats`` are plotted.
        selected_attributes: columns of ``daily_stats`` drawn on the y-axis.
        time_granularity: 'D', 'W' or 'M'.
        time_specification: day of month, week number or month number. A
            falsy value (e.g. None) skips the time filter.

    Returns:
        A Plotly Express line figure with 'date_id' on the x-axis.
    """
    # Filter data based on selected stock
    filtered_data = daily_stats[daily_stats['stock_id'] == selected_stock]

    # daily_stats already holds one row per (date, stock), so the filters below
    # only select rows; averaging the selection again would change nothing.
    if time_specification:
        period = int(time_specification)
        dates = filtered_data['date_id']

        if time_granularity == 'D':
            # Keep every date whose day-of-month matches, across all months.
            filtered_data = filtered_data[dates.dt.day == period]
        elif time_granularity == 'W':
            # Weeks are counted in 7-day steps from the stock's first date.
            start_date = dates.min() + pd.DateOffset(weeks=period - 1)
            end_date = start_date + pd.DateOffset(days=6)
            filtered_data = filtered_data[(dates >= start_date) & (dates <= end_date)]
        elif time_granularity == 'M':
            if filtered_data.empty or not 1 <= period <= 12:
                # Nothing to filter, or the value is a leftover day/week number
                # that cannot be a month: show no data rather than fail while
                # building the month's start date.
                filtered_data = filtered_data.iloc[0:0]
            else:
                # Use the first year present in the dataset instead of a
                # hard-coded year, so the filter follows the data's date range.
                first_year = int(daily_stats['date_id'].dt.year.min())
                start_date = pd.Timestamp(year=first_year, month=period, day=1)
                end_date = start_date + pd.offsets.MonthEnd(0)
                filtered_data = filtered_data[(dates >= start_date) & (dates <= end_date)]

    # Create line plot using Plotly Express
    fig = px.line(filtered_data, x='date_id', y=selected_attributes,
                  title=f'Stock Analysis for Stock {selected_stock} - Multiple Columns')
    return fig



# Callback to enable/disable time specification dropdown based on time granularity
@app.callback(
    Output('time-specification-dropdown', 'disabled'),
    [Input('time-granularity-dropdown', 'value')]
)
def update_time_specification_dropdown_state(time_granularity):
    """Return the 'disabled' flag for the day/week/month selector.

    The selector is disabled (True) only when no granularity is chosen.
    """
    return not time_granularity


if __name__ == '__main__':
    # `run_server` is no longer available in Dash 3; use `run` when the installed
    # release provides it and fall back to `run_server` on older releases.
    launch = app.run if hasattr(app, 'run') else app.run_server
    launch(debug=True)