<a href="https://www.kaggle.com/code/yaaangzhou/optiver-eda?scriptVersionId=144529121" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

**Created by Yang Zhou**

**[Optiver]📊EDA**

**20 Sep 2023**

# <center style="font-family: consolas; font-size: 32px; font-weight: bold;">[Optiver]📊EDA</center>
<p><center style="color:#949494; font-family: consolas; font-size: 20px;">Predict US stocks closing movements</center></p>

***

# 0. Imports

In [None]:
# Basic
import numpy as np
import pandas as pd
import random
pd.set_option('display.max_columns', None)

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Global look-and-feel for every figure drawn below.

# Matplotlib rc overrides passed to seaborn: light-grey figure and axes
# background, black axes/labels/ticks, serif font, semi-transparent grid.
rc = {
    "axes.facecolor": "#F8F8F8",
    "figure.facecolor": "#F8F8F8",
    "axes.edgecolor": "#000000",
    # 8-digit hex string; the trailing "30" is presumably an alpha channel
    # (#RRGGBBAA) -- TODO confirm
    "grid.color": "#EBEBE7" + "30",
    "font.family": "serif",
    "axes.labelcolor": "#000000",
    "xtick.color": "#000000",
    "ytick.color": "#000000",
    "grid.alpha": 0.4,
}
sns.set(rc=rc)

# Eight hex colours available to the plotting cells.
palette = [
    '#302c36',
    '#037d97',
    '#E4591E',
    '#C09741',
    '#EC5B6D',
    '#90A6B1',
    '#6ca957',
    '#D8E3E2',
]

from colorama import Style, Fore

# Bright ANSI foreground codes for printed output, plus the reset code.
blk, mgt, red, blu = (
    Style.BRIGHT + colour
    for colour in (Fore.BLACK, Fore.MAGENTA, Fore.RED, Fore.BLUE)
)
res = Style.RESET_ALL


# 1. Load Data

In [None]:
# All competition files sit under one Kaggle input directory; the example
# test files live in a sub-folder of it.
data_dir = '/kaggle/input/optiver-trading-at-the-close'
example_dir = data_dir + '/example_test_files'

# Training data and the example test set
train = pd.read_csv(data_dir + '/train.csv')
test = pd.read_csv(example_dir + '/test.csv')

# Companion files shipped alongside the example test set
revealed_target = pd.read_csv(example_dir + '/revealed_targets.csv')
submission = pd.read_csv(example_dir + '/sample_submission.csv')

In [None]:
# Preview the first rows of the training data to see the available columns.
train.head()

In [None]:
target = 'target'

# Label and identifier columns: they have many distinct values but are not
# features, so they are kept out of the numerical list.
id_cols = [target, 'time_id', 'row_id', 'stock_id', 'date_id']

# Distinct-value count per column, computed once and reused for both lists.
n_unique = train.nunique()

# Numerical features: more than 10 distinct values, identifiers excluded.
# Filtering (instead of list.remove) does not raise when an id column is absent.
num_var = [col for col in train.columns
           if n_unique[col] > 10 and col not in id_cols]

# Categorical features: at most 10 distinct values. Using "<=" closes the gap
# with the "> 10" rule above, so a column with exactly 10 distinct values is
# no longer left out of both lists.
cat_var = [col for col in train.columns if n_unique[col] <= 10]

# 2. EDA

Let's take a look at the data.

In [None]:
# Descriptive statistics, transposed so each row is one column of `train`;
# a bar is drawn on the mean and colour gradients on the std and the median.
(
    train.describe()
    .T
    .style
    .bar(subset=['mean'])
    .background_gradient(subset=['std'])
    .background_gradient(subset=['50%'])
)

In [None]:
def summary(df):
    """Build a per-column overview of ``df``.

    Args:
        df: The DataFrame to describe.

    Returns:
        A DataFrame indexed by the column names of ``df`` with the columns
        ``dtypes`` (dtype of each column), ``missing`` (number of NaN
        entries), ``missing%`` (NaN entries divided by the number of rows,
        i.e. a fraction and not multiplied by 100), ``uniques`` (result of
        ``df.nunique()``) and ``counts`` (result of ``df.count()``).
    """
    # Named `overview` rather than `sum` so the builtin is not shadowed.
    overview = df.dtypes.to_frame(name='dtypes')

    # Count missing values once and reuse it for both the count and the share.
    n_missing = df.isna().sum()
    overview['missing'] = n_missing
    overview['missing%'] = n_missing / len(df)

    overview['uniques'] = df.nunique().values
    overview['counts'] = df.count().values
    return overview
    
# Show the per-column overview of the training data with a blue colour gradient.
summary(train).style.background_gradient(cmap='Blues')

Some insights:
+ The `far_price` and `near_price` columns have a very large share of missing values: more than half of their entries are missing.
+ The data covers 200 distinct stocks in total, so we will sample a few of them to look at.

## Distribution of target for sample stocks

In [None]:
# One full-width line chart of the target for each of five random stocks.
all_stock_ids = sorted(train['stock_id'].unique())

for stock_id in random.sample(all_stock_ids, 5):
    # Target values of this stock, indexed by date_id
    is_stock = train['stock_id'] == stock_id
    df_stock = train.loc[is_stock].set_index('date_id')['target']

    # Plot Time Series
    fig, ax = plt.subplots(figsize=(24, 6), dpi=100)
    ax.plot(df_stock, linewidth=2)
    # Same 10-point tick-label padding on the x and y axes
    ax.tick_params(axis='both', pad=10)
    ax.set_title(f'stock_id {stock_id}: target')
    plt.show()

## Time Series Plots of `bid_price` and `ask_price` in a day

In [None]:
# Pick one stock and one day at random. random.choice returns a single id
# (random.sample(..., 1) returned a one-element list), so the query compares
# against a scalar and the title reads "Stock: 42" instead of "Stock: [42]".
sample_stock = random.choice(sorted(train['stock_id'].unique()))
sample_date = random.choice(sorted(train['date_id'].unique()))

# NOTE(review): assumes the randomly paired stock/day has rows in `train` -- confirm
df_sample = train.query(f'stock_id == {sample_stock} & date_id == {sample_date}')

# Bid and ask price against seconds_in_bucket for that stock and day
df_sample[['seconds_in_bucket','bid_price','ask_price']].set_index('seconds_in_bucket').plot(title=f'Stock: {sample_stock} in Day: {sample_date}');

In [None]:
# Draw a second random stock/day pair. random.choice returns a single id
# (random.sample(..., 1) returned a one-element list), so the query compares
# against a scalar and the title reads "Stock: 42" instead of "Stock: [42]".
sample_stock = random.choice(sorted(train['stock_id'].unique()))
sample_date = random.choice(sorted(train['date_id'].unique()))

# NOTE(review): assumes the randomly paired stock/day has rows in `train` -- confirm
df_sample = train.query(f'stock_id == {sample_stock} & date_id == {sample_date}')

# Bid and ask price against seconds_in_bucket for that stock and day
df_sample[['seconds_in_bucket','bid_price','ask_price']].set_index('seconds_in_bucket').plot(title=f'Stock: {sample_stock} in Day: {sample_date}');

Ok, so `ask_price` always sits above `bid_price`; the gap between the two is the bid-ask spread.

## Distribution of numerical variables for sample stocks

In [None]:
# For three random stocks, draw every column in num_var as a KDE curve (wide
# left panel) next to a boxplot (narrow right panel), one grid row per column.
for stock_id in random.sample(sorted(train['stock_id'].unique()), 3):
    df_stock = train[train['stock_id'] == stock_id]

    n_rows = len(num_var)
    fig, axes = plt.subplots(
        n_rows, 2,
        figsize=(16, n_rows * 4.2),
        gridspec_kw={'hspace': 0.35, 'wspace': 0.3, 'width_ratios': [0.80, 0.20]},
    )

    for i, col in enumerate(num_var):
        kde_ax = axes[i, 0]
        box_ax = axes[i, 1]

        # Left panel: kernel density estimate of the column
        sns.kdeplot(data=df_stock, x=col, ax=kde_ax, linewidth=2.1)
        kde_ax.set_title(f"Stock id: {stock_id}\n Column: {col}", fontsize=9, fontweight='bold')
        kde_ax.grid(visible=True, which='both', linestyle='--', color='lightgrey', linewidth=0.75)
        kde_ax.set(xlabel='', ylabel='')

        # Right panel: boxplot of the same column
        sns.boxplot(
            data=df_stock,
            y=col,
            width=0.25,
            saturation=0.90,
            linewidth=0.90,
            fliersize=2.25,
            color='#037d97',
            ax=box_ax,
        )
        box_ax.set(xlabel='', ylabel='')
        box_ax.set_title("Boxplot", fontsize=9, fontweight='bold')

    plt.tight_layout()
    plt.show()

Most of the data conforms to a normal distribution.

## Scatter Plots for features and target

In [None]:
# One scatter panel per numerical feature, stacked vertically, with the
# feature on the x-axis and the target on the y-axis.
n_features = len(num_var)
fig, ax = plt.subplots(n_features, 1, figsize=(8, 4 * n_features))

for i, feature in enumerate(num_var):
    panel = ax[i]
    sns.scatterplot(
        data=train,
        x=feature,
        y='target',
        alpha=0.6,
        edgecolor=None,
        ax=panel,
    )
    panel.set_title(f'{feature} vs Target', fontsize=14)
    panel.set_xlabel(feature, fontsize=12)
    panel.set_ylabel('Target', fontsize=12)

plt.tight_layout()
plt.show()

Can't see much correlation between those variables and the target.

## Distribution of target for all stocks

In [None]:
# Distribution of the target over the whole training set:
# KDE curve on the left, boxplot on the right.
fig, axes = plt.subplots(1, 2)

# kdeplot
ax = axes[0]
sns.kdeplot(data=train, x=target, ax=ax, linewidth=2.1)
ax.set_title("Target distribution", fontsize=9, fontweight='bold')
ax.grid(visible=True, which='both', linestyle='--', color='lightgrey', linewidth=0.75)
ax.set(xlabel='', ylabel='')

# boxplot
ax = axes[1]
sns.boxplot(data=train, y=target, width=0.25, saturation=0.90, linewidth=0.90,
            fliersize=2.25, color='#037d97', ax=ax)
ax.set(xlabel='', ylabel='')
# Title typo fixed: it previously read "Target Boplot".
ax.set_title("Target Boxplot", fontsize=9, fontweight='bold')

plt.tight_layout()
plt.show()