In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.preprocessing import StandardScaler

In [2]:
# Load the preprocessed training frame used by every cell below.
# The relative path is written with forward slashes so it resolves on Windows,
# Linux and macOS alike; a backslash-separated literal is only a valid path
# separator on Windows.
df = pd.read_parquet('janestreet/p10_train_prep_v2_nd_dropna_lf_row.parquet')

# Location of a raw test partition, kept for reference only (not loaded here):
# janestreet/jane-street-real-time-market-data-forecasting/test.parquet/date_id=0/part-0.parquet


In [6]:
# Preview the first five rows to sanity-check the loaded columns and values.
df.head(n=5)

Unnamed: 0,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,...,feature_67,feature_69,feature_70,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,responder_6
0,68,0,3.084694,0.389057,2.749725,0.718424,1.256622,0.588988,0.247641,-2.901283,...,0.496783,0.729775,0.048784,0.008254,-0.084836,-0.198538,-0.317852,-0.160541,-0.281694,0.208288
1,68,1,2.232906,-0.026452,2.632656,0.187209,0.593603,0.481453,0.143793,-2.271466,...,0.410599,0.600609,0.306555,-0.038054,-0.104323,-0.22707,-0.227107,-0.095917,-0.119115,-0.550641
2,68,2,2.404948,1.013794,1.809883,-0.376363,0.559347,1.132689,0.188735,-2.102561,...,-0.204304,0.229562,0.201693,0.421981,0.506014,0.097484,0.061537,0.208154,0.424226,0.609057
3,68,3,1.986533,1.113055,2.298128,0.588921,0.045917,0.91262,0.213604,-2.596459,...,-0.079997,0.155563,0.202217,-0.488699,-0.281139,-0.91478,-0.920151,-1.141477,-0.942694,-0.66387
4,68,4,2.742601,0.429002,2.764029,0.838564,0.291231,0.691983,0.176078,-1.577382,...,-0.328838,0.009004,-0.157728,-0.293743,-0.398692,-0.361992,-0.482711,-0.302175,-0.290704,-0.251347


## Feature distributions (per-column histograms)

In [None]:
# Feature plot: draw one histogram per numeric column of `df` on a grid of
# subplots, `ncols` panels wide.

# Grid width; the number of rows is derived from the column count below.
ncols = 3

# Only numeric columns are plotted: np.isfinite() and hist() below raise on
# string/object columns.
numeric_cols = df.select_dtypes(include='number').columns
num_cols = len(numeric_cols)

if num_cols == 0:
    # plt.subplots() rejects a grid with zero rows, so bail out explicitly.
    print('No numeric columns to plot.')
else:
    nrows = int(np.ceil(num_cols / ncols))

    # squeeze=False guarantees a 2-D array of Axes whatever the grid shape,
    # so flatten() is always valid (a 1x1 grid would otherwise be a bare Axes).
    fig, axes = plt.subplots(
        nrows=nrows, ncols=ncols, figsize=(5*ncols, 4*nrows), squeeze=False
    )
    axes = axes.flatten()

    # Plot a histogram for each column, pairing columns with panels in order.
    for ax, col in zip(axes, numeric_cols):
        # Observed data range of the column (NaN for an all-NaN column).
        col_min, col_max = df[col].min(), df[col].max()

        df[col].hist(ax=ax, bins=30, alpha=0.7, color='steelblue', edgecolor='black')

        # Clamp the x-axis to the data range. Skipped when the range is not
        # finite, and when the column is constant: identical low/high limits
        # make matplotlib warn about a singular transformation.
        if np.isfinite(col_min) and np.isfinite(col_max) and col_min < col_max:
            ax.set_xlim(col_min, col_max)

        ax.set_title(col, fontsize=12)
        ax.set_xlabel('Value')
        ax.set_ylabel('Frequency')

    # Hide the trailing panels left over when the column count is not a
    # multiple of ncols. Slicing avoids depending on a leaked loop index.
    for ax in axes[num_cols:]:
        ax.set_visible(False)

    fig.tight_layout()
    plt.show()

## Per-symbol feature time series

In [None]:
# Per-symbol feature traces: for every symbol_id in `df`, draw one figure with
# each feature_* column plotted against time_id.

# Step 1: Sort by symbol, then chronologically.
# Only the keys that actually exist in `df` are used: the frame previewed by
# df.head() has no 'date_id' column, and sort_values() raises KeyError when
# asked to sort by a missing column.
sort_keys = [key for key in ('symbol_id', 'date_id', 'time_id') if key in df.columns]
df_sorted = df.sort_values(by=sort_keys)

# Step 2: Collect the feature columns (every column whose name starts with "feature_").
feature_cols = [col for col in df_sorted.columns if col.startswith('feature_')]

# Step 3: Unique symbols, in order of first appearance in the sorted frame.
symbol_ids = df_sorted['symbol_id'].unique()

# Step 4: One figure per symbol.
# groupby() partitions the rows once, instead of re-scanning the whole frame
# with a boolean mask for every symbol; sort=False keeps first-appearance order.
# NOTE(review): if the frame spans several days and has no 'date_id', time_id
# values presumably repeat and the traces will overlap -- confirm.
for symbol, df_symbol in df_sorted.groupby('symbol_id', sort=False):
    fig, ax = plt.subplots(figsize=(12, 8))

    # Plot each feature column against time_id.
    for f in feature_cols:
        ax.plot(df_symbol['time_id'], df_symbol[f], label=f)

    ax.set_title(f'Features for symbol_id = {symbol}')
    ax.set_xlabel('Time ID')
    ax.set_ylabel('Feature Values')
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')  # Legend outside for readability
    fig.tight_layout()
    plt.show()

    # Release the figure so memory does not grow with the number of symbols.
    plt.close(fig)