In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



In [5]:
# Mount Google Drive at /content/drive so files stored there can be read by path (Colab only).

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Location of the test-data parquet file on the mounted Google Drive.
test_data_link = '/content/drive/MyDrive/NK-securities/test_data.parquet'

In [7]:
# Load the parquet file referenced by test_data_link into a DataFrame.
test_df=pd.read_parquet(test_data_link)

In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
# Standardize the model inputs in place: every X0..X41 column that exists in
# test_df, followed by 'underlying'. The scaler is fitted on test_df itself.
feature_cols = [name for name in (f'X{i}' for i in range(42)) if name in test_df.columns]
feature_cols.append('underlying')
scaler = StandardScaler()
scaled_values = scaler.fit_transform(test_df[feature_cols])
test_df[feature_cols] = scaled_values


In [10]:

# Index of every numeric-dtype column in test_df.
numeric_cols = test_df.select_dtypes(include='number').columns



In [11]:
# Remove the model-input labels ('underlying', X0..X41) from the numeric column
# index; labels that are not present are skipped rather than raising.
columns_to_drop = ['underlying', *(f'X{i}' for i in range(42))]
numeric_cols = numeric_cols.drop(labels=columns_to_drop, errors='ignore')

In [12]:
# Preview the column index without 'timestamp'. The result is only displayed;
# numeric_cols itself is not reassigned here.
numeric_cols.drop('timestamp')

Index(['call_iv_24000', 'call_iv_24100', 'call_iv_24200', 'call_iv_24300',
       'call_iv_24400', 'call_iv_24500', 'call_iv_24600', 'call_iv_24700',
       'call_iv_24800', 'call_iv_24900', 'call_iv_25000', 'call_iv_25100',
       'call_iv_25200', 'call_iv_25300', 'call_iv_25400', 'call_iv_25500',
       'call_iv_25600', 'call_iv_25700', 'call_iv_25800', 'call_iv_25900',
       'call_iv_26000', 'call_iv_26100', 'call_iv_26200', 'call_iv_26300',
       'call_iv_26400', 'call_iv_26500', 'put_iv_23000', 'put_iv_23100',
       'put_iv_23200', 'put_iv_23300', 'put_iv_23400', 'put_iv_23500',
       'put_iv_23600', 'put_iv_23700', 'put_iv_23800', 'put_iv_23900',
       'put_iv_24000', 'put_iv_24100', 'put_iv_24200', 'put_iv_24300',
       'put_iv_24400', 'put_iv_24500', 'put_iv_24600', 'put_iv_24700',
       'put_iv_24800', 'put_iv_24900', 'put_iv_25000', 'put_iv_25100',
       'put_iv_25200', 'put_iv_25300', 'put_iv_25400', 'put_iv_25500'],
      dtype='object')

In [13]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import warnings
# Suppress all warning categories from this point on (no category or module filter is given).
warnings.filterwarnings('ignore')

In [14]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import StandardScaler


In [15]:
# Model inputs: the X0..X41 columns found in test_df, then 'underlying'.
feature_cols = [*(f'X{i}' for i in range(42) if f'X{i}' in test_df.columns), 'underlying']
# Columns whose name contains 'iv_'.
iv_cols = list(filter(lambda name: 'iv_' in name, test_df.columns))

In [16]:


# Fast version with pre-tuned parameters
def fast_extratrees_imputation(test_df_path, output_path='test_fast_extratrees_imputed.csv'):
    """Fill missing values in the 'iv_' columns with an ExtraTrees-driven IterativeImputer.

    The parquet file is loaded, the feature columns (every X0..X41 that is
    present, plus 'underlying') are standardized with a StandardScaler fitted
    on this same data, and an IterativeImputer is fitted on the features and
    IV columns together. Only the IV columns are overwritten with imputed
    values; the feature columns keep their standardized (not imputed) values.

    Parameters
    ----------
    test_df_path : str or path-like
        Parquet file to load with ``pd.read_parquet``. It must contain an
        'underlying' column.
    output_path : str or path-like or None, optional
        Where to write the resulting frame as CSV (without the index).
        Pass ``None`` or an empty string to skip writing.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with standardized feature columns and imputed IV
        columns.

    Raises
    ------
    ValueError
        If the imputer returns fewer columns than it was given, which would
        make the positional slice of the IV block point at the wrong columns.
    """
    test_df = pd.read_parquet(test_df_path)
    feature_cols = [f'X{i}' for i in range(42) if f'X{i}' in test_df.columns] + ['underlying']
    iv_cols = [col for col in test_df.columns if 'iv_' in col]

    # Normalize the features (the scaler is fitted on this same frame).
    scaler = StandardScaler()
    test_df[feature_cols] = scaler.fit_transform(test_df[feature_cols])

    # Configure the ExtraTrees-based imputer.
    imputer = IterativeImputer(
        estimator=ExtraTreesRegressor(
            n_estimators=180,       # number of trees in each fitted forest
            max_depth=50,           # cap on tree depth
            min_samples_split=5,    # a node needs at least 5 samples to be split
            min_samples_leaf=2,     # every leaf keeps at least 2 samples
            max_features=0.6,       # 60% of the features are considered per split
            max_samples=0.5,        # each tree is fitted on 50% of the rows
            bootstrap=True,         # required for max_samples to take effect
            n_jobs=-1,              # use all available cores
            random_state=42,
            verbose=1
        ),
        max_iter=30,                # upper bound on imputation rounds (sklearn default: 10)
        tol=1e-4,                   # early-stopping tolerance (sklearn default: 1e-3)
        initial_strategy='median',
        random_state=42,
        verbose=1
    )

    # Impute on features + IV columns together, then keep only the IV block.
    model_cols = feature_cols + iv_cols
    imputed = imputer.fit_transform(test_df[model_cols])
    # IterativeImputer can discard columns that are entirely missing; the
    # positional slice below is only valid when no column was dropped.
    if imputed.shape[1] != len(model_cols):
        raise ValueError(
            f"Imputer returned {imputed.shape[1]} columns for {len(model_cols)} inputs; "
            "at least one column was dropped (entirely missing?)."
        )
    test_df[iv_cols] = imputed[:, len(feature_cols):]

    # Save: honor output_path, which was previously accepted but never used.
    if output_path:
        test_df.to_csv(output_path, index=False)
    return test_df




In [17]:

# Run the imputation on the test parquet file; `result` is the DataFrame the function returns.
result = fast_extratrees_imputation(test_data_link)

[IterativeImputer] Completing matrix with shape (12065, 95)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:   10.6s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    8.7s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    6.4s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    9.1s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    1.8s
[

[IterativeImputer] Change: 2.5856794748082024, scaled tolerance: 0.0076515724706362085 


In [18]:
# Display the frame returned by the imputation.
result

Unnamed: 0,timestamp,underlying,call_iv_24000,call_iv_24100,call_iv_24200,call_iv_24300,call_iv_24400,call_iv_24500,call_iv_24600,call_iv_24700,...,X32,X33,X34,X35,X36,X37,X38,X39,X40,X41
0,0,-1.661569,0.280939,0.263978,0.256434,0.247066,0.242149,0.236287,0.232439,0.225718,...,0.663962,0.092911,-0.024374,-0.063603,-0.392946,0.046047,0.398702,-0.004258,0.006258,-0.559490
1,1,-1.332567,0.270276,0.267848,0.258893,0.251326,0.244136,0.238919,0.233548,0.227836,...,0.580920,0.161123,-0.041717,-0.235256,-0.066972,-0.053319,-0.755161,-0.370345,-0.672751,0.492133
2,2,0.058213,0.256283,0.251731,0.236413,0.224543,0.214869,0.204580,0.194604,0.188425,...,0.073646,-1.369281,0.118565,0.381197,0.849529,1.793902,0.509732,-0.004258,-0.023866,-0.275940
3,3,0.071031,0.241888,0.231735,0.220505,0.208839,0.198602,0.186190,0.175446,0.168064,...,-0.967149,-0.905692,-0.565335,0.031009,0.265579,-0.064622,-1.308013,-0.434649,-1.670228,-1.951433
4,4,0.160759,0.235328,0.229888,0.222983,0.214126,0.205946,0.199043,0.192603,0.185998,...,-1.925825,-0.310247,-0.051909,-1.557104,0.109752,-0.534139,0.275949,0.053375,1.608930,0.323592
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12060,12060,1.075128,0.242288,0.233427,0.226020,0.216826,0.207940,0.200276,0.192734,0.186856,...,-1.113447,0.251054,-0.021513,-0.655743,-0.310727,-1.606916,-1.334378,0.044356,-1.531660,-0.704304
12061,12061,1.376357,0.244896,0.236380,0.226972,0.218535,0.209751,0.200906,0.193655,0.188003,...,-0.034309,0.930739,-0.143664,-0.143778,-0.803349,0.250666,-0.379426,-0.270236,-1.495745,0.005801
12062,12062,0.061418,0.256475,0.243817,0.234267,0.221270,0.211039,0.201186,0.192209,0.185088,...,-0.934958,0.217993,0.224806,-0.145864,-0.278762,-2.200758,0.443591,-0.004258,1.417929,-0.376145
12063,12063,1.539790,0.248897,0.239825,0.231201,0.221517,0.212628,0.204863,0.197538,0.190833,...,0.155252,2.438728,0.071953,0.965181,1.761538,0.010256,1.575231,0.048929,1.225759,-0.118858


In [19]:
# Strip the model inputs ('underlying' and X0..X41) from the imputed frame.
# Names that are not present in the frame are ignored instead of raising.
columns_to_drop = ['underlying', *(f'X{i}' for i in range(42))]
result = result.drop(columns=columns_to_drop, errors='ignore')


In [20]:
# Display the frame after the feature columns have been dropped.
result

Unnamed: 0,timestamp,call_iv_24000,call_iv_24100,call_iv_24200,call_iv_24300,call_iv_24400,call_iv_24500,call_iv_24600,call_iv_24700,call_iv_24800,...,put_iv_24600,put_iv_24700,put_iv_24800,put_iv_24900,put_iv_25000,put_iv_25100,put_iv_25200,put_iv_25300,put_iv_25400,put_iv_25500
0,0,0.280939,0.263978,0.256434,0.247066,0.242149,0.236287,0.232439,0.225718,0.222997,...,0.232334,0.226402,0.222576,0.227301,0.234169,0.244900,0.250422,0.258376,0.272546,0.282229
1,1,0.270276,0.267848,0.258893,0.251326,0.244136,0.238919,0.233548,0.227836,0.224748,...,0.233871,0.228209,0.224479,0.229333,0.238298,0.249402,0.253319,0.261363,0.270612,0.283930
2,2,0.256283,0.251731,0.236413,0.224543,0.214869,0.204580,0.194604,0.188425,0.183423,...,0.194612,0.188052,0.183089,0.180828,0.181346,0.185148,0.190750,0.196542,0.204673,0.212237
3,3,0.241888,0.231735,0.220505,0.208839,0.198602,0.186190,0.175446,0.168064,0.162963,...,0.175011,0.166394,0.161561,0.162173,0.165668,0.172032,0.181664,0.189560,0.197478,0.206107
4,4,0.235328,0.229888,0.222983,0.214126,0.205946,0.199043,0.192603,0.185998,0.181563,...,0.192438,0.185994,0.181495,0.177920,0.175699,0.176000,0.177529,0.178948,0.182314,0.186271
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12060,12060,0.242288,0.233427,0.226020,0.216826,0.207940,0.200276,0.192734,0.186856,0.181764,...,0.193100,0.186486,0.181673,0.177270,0.175108,0.174783,0.176356,0.177716,0.179546,0.182576
12061,12061,0.244896,0.236380,0.226972,0.218535,0.209751,0.200906,0.193655,0.188003,0.183705,...,0.193951,0.187734,0.183582,0.180196,0.177969,0.177923,0.178587,0.180254,0.181577,0.183883
12062,12062,0.256475,0.243817,0.234267,0.221270,0.211039,0.201186,0.192209,0.185088,0.179760,...,0.192306,0.184840,0.179666,0.176916,0.177624,0.181472,0.187212,0.193739,0.201360,0.209940
12063,12063,0.248897,0.239825,0.231201,0.221517,0.212628,0.204863,0.197538,0.190833,0.185851,...,0.197718,0.190933,0.186172,0.181911,0.178432,0.177618,0.178280,0.178806,0.180184,0.182308


In [21]:
# Write the frame to a CSV file in the current working directory, omitting the index column.
result.to_csv('check_importance_values.csv', index=False)

In [22]:
#from google.colab import files

In [23]:
#files.download('check_importance_values.csv')