In [None]:
import polars as pl
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tqdm.notebook import tqdm

In [None]:
# Read partition 0 of the training set: build the lazy scan, then materialise it
df = pl.scan_parquet('train.parquet/partition_id=0/part-0.parquet').collect()

# Split the column names into two groups by substring match on the name
feature_cols = list(filter(lambda name: 'feature' in name, df.columns))
responder_cols = list(filter(lambda name: 'responder' in name, df.columns))

# Size summary: frame shape plus the size of each column group
print(
    f'Dataset shape: {df.shape}',
    f'Number of features: {len(feature_cols)}',
    f'Number of responders: {len(responder_cols)}',
    sep='\n',
)

In [None]:
# Summary statistics (DataFrame.describe) for each column group
stats = df.select(feature_cols).describe()
responder_stats = df.select(responder_cols).describe()

# Render both summary tables: features first, then responders
display(stats, responder_stats)

In [None]:
# Histogram grid (3x3) covering the first nine feature columns
fig = make_subplots(rows=3, cols=3, subplot_titles=feature_cols[:9])

for idx, feature in enumerate(feature_cols[:9]):
    # Fill the grid row by row; subplot coordinates are 1-based
    row, col = divmod(idx, 3)
    row += 1
    col += 1

    values = df[feature].to_numpy()
    histogram = go.Histogram(x=values, name=feature, nbinsx=50)
    fig.add_trace(histogram, row=row, col=col)

# Titles on each subplot already identify the column, so the legend is hidden
fig.update_layout(
    title_text='Feature Distributions',
    showlegend=False,
    width=1200,
    height=900,
)
fig.show()

In [None]:
# Plot responder distributions
# Every responder column is plotted, so the grid is sized from the column
# count instead of a fixed 3x3 layout (which would have no cell for a tenth
# responder). With nine responders this is the same 3x3 / 900px figure.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(responder_cols) // n_grid_cols))  # ceiling division, at least one row
fig = make_subplots(rows=n_grid_rows, cols=n_grid_cols, subplot_titles=responder_cols)

for idx, responder in enumerate(responder_cols):
    # Fill the grid row by row; subplot coordinates are 1-based
    row = idx // n_grid_cols + 1
    col = idx % n_grid_cols + 1

    values = df.get_column(responder).to_numpy()

    fig.add_trace(
        go.Histogram(x=values, name=responder, nbinsx=50),
        row=row, col=col
    )

fig.update_layout(
    height=300 * n_grid_rows,  # 300px per grid row
    width=1200,
    showlegend=False,
    title_text='Responder Distributions'
)
fig.show()

# Responder correlations
responder_corr = df.select(responder_cols).corr()

# Axis labels are passed explicitly because a polars DataFrame has no row
# index to supply the y tick labels. zmin/zmax pin the diverging colour scale
# to the full correlation range so its neutral midpoint corresponds to 0.
fig = px.imshow(
    responder_corr.to_numpy(),
    x=responder_cols,
    y=responder_cols,
    labels=dict(x="Responders", y="Responders", color="Correlation"),
    title='Responder Correlation Matrix',
    width=800,
    height=800,
    color_continuous_scale='RdBu_r',
    zmin=-1,
    zmax=1
)
fig.show()

In [None]:
# Calculate correlation matrix for features
corr_matrix = df.select(feature_cols).corr()

# Plot correlation heatmap
# Axis labels are passed explicitly because a polars DataFrame has no row
# index to supply the y tick labels. zmin/zmax pin the diverging colour scale
# to the full correlation range so its neutral midpoint corresponds to 0.
fig = px.imshow(
    corr_matrix.to_numpy(),
    x=feature_cols,
    y=feature_cols,
    labels=dict(x="Features", y="Features", color="Correlation"),
    title='Feature Correlation Matrix',
    width=1000,
    height=1000,
    color_continuous_scale='RdBu_r',
    zmin=-1,
    zmax=1
)
fig.show()

# Find highly correlated features
def print_high_correlations(corr_matrix, threshold=0.8, names=None):
    """Print the variable pairs whose absolute correlation exceeds ``threshold``.

    Only the upper triangle (i < j) is scanned, so each pair is reported once
    and the diagonal is skipped. Pairs are listed by decreasing absolute
    correlation. NaN entries never satisfy the comparison and are omitted.

    Args:
        corr_matrix: 2-D matrix indexable as ``corr_matrix[i, j]``
            (e.g. a NumPy array).
        threshold: Cut-off applied to the absolute value of each entry.
        names: Labels for the matrix rows/columns, in matrix order. When
            omitted, the notebook-level ``feature_cols`` list is used, which
            keeps existing calls working.

    Returns:
        None. The report is written to stdout.
    """
    if names is None:
        # Backward-compatible default: label with the notebook's feature list
        names = feature_cols

    size = len(corr_matrix)
    high_corr = [
        (names[i], names[j], corr_matrix[i, j])
        for i in range(size)
        for j in range(i + 1, size)
        if abs(corr_matrix[i, j]) > threshold
    ]

    # The filter is on the absolute value, so the header says so explicitly
    print(f"\nFeature pairs with |correlation| > {threshold}:")
    for f1, f2, corr in sorted(high_corr, key=lambda x: abs(x[2]), reverse=True):
        print(f"{f1:12} -- {f2:12}: {corr:.3f}")

# Report feature pairs whose absolute correlation exceeds the default 0.8 threshold
print_high_correlations(corr_matrix.to_numpy())

In [None]:
# Analyze relationships between features and target responder
def plot_feature_target_relationships(df, features, target='responder_6', n_features=9,
                                      sample_size=1000, seed=42):
    """Scatter-plot a random row sample of each leading feature against ``target``.

    One subplot is drawn per feature, three per row; the number of grid rows
    and the figure height follow the number of features actually plotted.

    Args:
        df: polars DataFrame containing the feature columns and ``target``.
        features: Sequence of column names; only the first ``n_features``
            entries are plotted.
        target: Name of the column shown on the y axis of every subplot.
        n_features: Maximum number of features to plot.
        sample_size: Number of rows to sample for plotting; capped at the
            number of rows in ``df`` so small frames do not raise.
        seed: Seed passed to ``DataFrame.sample`` for a reproducible sample.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    selected = list(features[:n_features])
    n_grid_cols = 3
    n_grid_rows = max(1, -(-len(selected) // n_grid_cols))  # ceiling division, at least one row
    fig = make_subplots(rows=n_grid_rows, cols=n_grid_cols, subplot_titles=selected)

    # The sample does not depend on the feature being plotted, so it is drawn
    # once; every subplot therefore shows the same rows.
    sample_df = df.sample(n=min(sample_size, df.height), seed=seed)
    y_vals = sample_df.get_column(target).to_numpy()

    for idx, feature in enumerate(selected):
        # Fill the grid row by row; subplot coordinates are 1-based
        row, col = divmod(idx, n_grid_cols)
        x_vals = sample_df.get_column(feature).to_numpy()

        fig.add_trace(
            go.Scatter(
                x=x_vals,
                y=y_vals,
                mode='markers',
                marker=dict(size=3, opacity=0.5),
                name=feature
            ),
            row=row + 1, col=col + 1
        )

    fig.update_layout(
        height=300 * n_grid_rows,  # 300px per grid row
        width=1200,
        showlegend=False,
        title_text=f'Feature vs {target} Relationships'
    )
    fig.show()

plot_feature_target_relationships(df, feature_cols)

# Rank features by the magnitude of their correlation with responder_6.
# The target is appended as the last column, so dropping the final entry of
# its correlation column removes the target's correlation with itself.
corr_with_target = df.select(feature_cols + ['responder_6']).corr()
target_correlations = corr_with_target['responder_6'][:-1].abs()

importance_df = pl.DataFrame(
    {'feature': feature_cols, 'correlation': target_correlations}
).sort('correlation', descending=True)

# Bar chart of the ranking, strongest absolute correlation first
fig = px.bar(
    importance_df.to_pandas(),
    x='feature',
    y='correlation',
    title='Absolute Correlation with Target (responder_6)',
    height=500,
    width=1200,
)
fig.update_xaxes(tickangle=45)
fig.show()

In [None]:
# Analyze temporal patterns
def plot_time_series_patterns(df, cols, n_symbols=3):
    """Plot each column in ``cols`` against ``time_id`` for the first few symbols.

    One subplot row is created per column; within a row, one line is drawn
    per selected symbol.

    Args:
        df: polars DataFrame with ``symbol_id`` and ``time_id`` columns plus
            every column named in ``cols``.
        cols: Column names to plot, one subplot row each.
        n_symbols: Number of distinct ``symbol_id`` values to include, taken
            in order of first appearance in ``df``.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # unique() gives no ordering guarantee unless maintain_order=True; keeping
    # first-appearance order makes the choice of symbols reproducible.
    symbols = df['symbol_id'].unique(maintain_order=True)[:n_symbols].to_list()

    # Filter once per symbol instead of once per (column, symbol) pair
    rows_by_symbol = [
        (symbol, df.filter(pl.col('symbol_id') == symbol))
        for symbol in symbols
    ]

    fig = make_subplots(rows=len(cols), cols=1,
                       subplot_titles=cols,
                       vertical_spacing=0.1)

    # NOTE(review): values are plotted in frame order against time_id alone;
    # if time_id repeats within a symbol (e.g. it resets each day), the line
    # will double back on itself - confirm against the data.
    for i, col in enumerate(cols):
        for symbol, symbol_data in rows_by_symbol:
            fig.add_trace(
                go.Scatter(
                    x=symbol_data['time_id'].to_numpy(),
                    y=symbol_data[col].to_numpy(),
                    name=f'Symbol {symbol} - {col}',
                    mode='lines'
                ),
                row=i+1, col=1
            )

    fig.update_layout(height=300*len(cols), showlegend=True,
                     title_text='Time Series Patterns by Symbol')
    fig.show()

# Plot the target (responder_6) and the first two feature columns against
# time_id, using the function's default of three symbols
plot_time_series_patterns(df, ['responder_6'] + feature_cols[:2])