### 1a. Impute pickle dataframe 
- This notebook compares strategies for imputing missing values in the raw train/test pickle dataframes and exports the imputed data.

In [1]:
import lightgbm as lgb
import matplotlib.pyplot as plt
import pandas as pd
pd.options.display.float_format = "{:,.6f}".format
import gc
import numpy as np
import os
import sys
import warnings
warnings.simplefilter("ignore")
warnings.filterwarnings("ignore")
from lightgbm import LGBMClassifier, log_evaluation
from pandarallel import pandarallel
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from tqdm import tqdm
from autoimpute.imputations import SingleImputer, MultipleImputer, MiceImputer

In [2]:
from pathlib import Path
# Project root is the parent of the notebook's working directory; it is put on
# sys.path so project-local packages (e.g. `utils`, imported below) resolve.
rootpath = Path.cwd().parent
if str(rootpath) not in sys.path:
    # Guarded so that re-running this cell does not append duplicate entries.
    sys.path.append(str(rootpath))

In [3]:
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, get_cols, plot_scatterplot, plot_target_check, 
    plot_int_feature_distribution, plot_train_test_distribution, check_overlap_missing
)
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, BINARY_COLUMNS, ROUND_COLUMNS,
    MEAN_FEATURES, MIN_FEATURES, MAX_FEATURES, LAST_FEATURES, NON_FEATURE_COLUMNS
)
from utils.constants import (
    PROCESSED_DATA_PATH
)
from utils.feature_engineering_helpers import feature_gen_pipeline

In [4]:
# Project directories, given relative to the notebook's working directory.
RAW_DATA_PATH = "../raw_data"
# NOTE(review): this rebinds the PROCESSED_DATA_PATH name that the import cell
# above pulls in from utils.constants; from here on the literal below is used.
PROCESSED_DATA_PATH = "../processed_data"
SUBMISSION_DATA_PATH = "../submissions"
EVALUATION_DATA_PATH = "../evaluation_data"
MODELS_PATH = "../models"
EXP_PATH = "../experiments"

In [5]:
# Sub-directories of RAW_DATA_PATH holding the raw train/test data in parquet
# and pickle form. Only the pickle directories are read or written in this
# notebook.
RAW_TRAIN_PARQUET_PATH = os.path.join(RAW_DATA_PATH, "train_parquet")
RAW_TRAIN_PICKLE_PATH = os.path.join(RAW_DATA_PATH, "train_pickle")
RAW_TEST_PARQUET_PATH = os.path.join(RAW_DATA_PATH, "test_parquet")
RAW_TEST_PICKLE_PATH = os.path.join(RAW_DATA_PATH, "test_pickle")

In [6]:
%load_ext autoreload
%autoreload

## Read Data

In [7]:
%%time
# Load the raw training frame through the project helper read_file.
# replace_negative_one is forwarded to that helper — presumably it turns -1
# placeholders into NaN; TODO confirm against utils.extraction_helpers.
train = read_file(f"{RAW_TRAIN_PICKLE_PATH}/raw_train_data.pkl", replace_negative_one=True)
# Labels table; cells below read its "customer_ID" and "target" columns.
labels = pd.read_csv(f"{RAW_DATA_PATH}/train_labels.csv")

Shape of data: (5531451, 193)
CPU times: user 2.1 s, sys: 2.13 s, total: 4.23 s
Wall time: 5.28 s


In [8]:
# train_missing_prop_df = plot_missing_proportion_barchart(train, top_n=115, figsize=(17, 30))

#### Imputation Strategy comparison setup

In [9]:
# Identifier / bookkeeping columns: carried along in every feature subset and
# dropped (errors="ignore") from the model matrix before training.
standard_columns = ["customer_ID", "row_number", "row_number_inv", "S_2"]
# Feature list used by the single-feature experiment further down.
features = ["P_2"]

In [10]:
# LightGBM parameters shared by the imputation-strategy experiments; this dict
# is the default `lgbm_params` of train_and_evaluate_strategy and is also passed
# directly to lgb.train further down.
# "metrics" is set to "custom" because evaluation is supplied through the
# `feval` argument of lgb.train (lgb_amex_metric) rather than a built-in metric.
params = {
    "objective": "binary",
    "verbosity": -1,
    "metrics": "custom",
    "first_metric_only": True, 
    "boost_from_average": False,
    "boosting_type": "gbdt",
    "random_state": 1,
    "learning_rate": 0.0225,
    "n_estimators": 1000,
    "max_bins": 127,
    "subsample_freq": 2,
    "min_child_samples": 2000,
    "reg_alpha": 0.1,
    "reg_lambda": 15,
    "colsample_bytree": 0.25,
    "subsample": 0.65,
    "scale_pos_weight": 1.3,
    "num_leaves": 80,
    "max_depth": 7,
}

### Simple Imputation using Mean

#### Identify good (top 5) columns to use with no missing values

In [11]:
%%time
# Null count per column, then the names of the columns that have no nulls.
m = train.isna().sum()
no_missing_cols = [column for column, n_missing in m.items() if n_missing == 0]
len(no_missing_cols)

CPU times: user 1.77 s, sys: 1.06 s, total: 2.83 s
Wall time: 3.55 s


73

In [12]:
feature_imp_summary = read_file(f"{EXP_PATH}/feature_imp_summary2.csv")
# .copy() yields an independent frame, so the column assignment below does not
# write into a filtered slice of feature_imp_summary (chained assignment).
feature_imp_subset = feature_imp_summary[feature_imp_summary["base_feature"].isin(no_missing_cols)].copy()
# Row-wise mean over every column after the first one.
feature_imp_subset["avg_importance"] = feature_imp_subset.iloc[:, 1:].mean(axis=1)
feature_imp_subset = feature_imp_subset.sort_values(by="avg_importance", ascending=False).reset_index(drop=True)
# Keep the 5 highest-ranked features among the columns without missing values.
without_missing_good_columns = feature_imp_subset["base_feature"].iloc[:5].tolist()

Shape of data: (188, 25)


In [13]:
without_missing_good_columns

['B_4', 'D_39', 'S_12', 'B_5', 'D_47']

In [14]:
TOP_PERFORMANCE_FEATURES = ["P_2", "D_43", "D_48", "S_3", "R_1", "B_1", "B_4", "B_5", "B_11"]

### Test specific columns

In [15]:
# First column of the CSV, limited to its first 77 entries, as a plain list.
specific_features = read_file(f"{EXP_PATH}/percent_of_missing_from_wrong.csv").iloc[:77, 0].tolist()

Shape of data: (190, 2)


In [190]:
test_feature = specific_features[2]
test_feature

'B_13'

In [191]:
train[test_feature].describe()

count   5,481,932.000000
mean            0.099693
std             0.554849
min             0.000000
25%             0.009255
50%             0.029314
75%             0.089415
max           276.177826
Name: B_13, dtype: float64

In [192]:
%%time
train_subset = train.loc[:, standard_columns + TOP_PERFORMANCE_FEATURES + [test_feature]]

CPU times: user 108 ms, sys: 152 ms, total: 260 ms
Wall time: 339 ms


In [193]:
def interpolate_series(x, method, order):
    """Interpolate ``x``, returning it unchanged when interpolation fails.

    Args:
        x: Object exposing ``interpolate(method=..., order=...)``; in this
            notebook a pandas Series holding one customer's values.
        method: Interpolation method forwarded to ``x.interpolate``.
        order: Interpolation order forwarded to ``x.interpolate``.

    Returns:
        The result of ``x.interpolate(...)``, or ``x`` itself if that call
        raised an ``Exception``.
    """
    try:
        return x.interpolate(method=method, order=order)
    except Exception:
        # Best-effort fallback: leave the group as it is when it cannot be
        # interpolated. Only Exception is caught so that KeyboardInterrupt and
        # SystemExit still propagate and a long-running groupby can be stopped.
        return x

In [194]:
def impute_(df, col, method, order=1):
    """Return column ``col`` of ``df`` with missing values imputed by ``method``.

    The input frame is not modified; callers assign the returned Series back
    to the column themselves.

    Args:
        df: DataFrame containing ``col`` and, for methods 2-4, ``customer_ID``.
        col: Name of the column to impute.
        method: Strategy selector:
            0 - no imputation (the column is returned as is);
            1 - fill with the global column mean;
            2 - fill with the per-customer mean, then fill what is left with
                the mean of the group-filled column;
            3 - per-customer polynomial interpolation, then forward/backward
                fill within each customer;
            4 - per-customer spline interpolation, then forward/backward fill
                within each customer.
        order: Interpolation order; only used by methods 3 and 4.

    Returns:
        A pandas Series aligned with ``df.index``.

    Raises:
        ValueError: If ``method`` is not one of 0-4.
    """
    if method == 0:
        # No imputation
        return df[col]

    if method == 1:
        # Impute with global mean directly
        return df[col].fillna(df[col].mean())

    if method == 2:
        # Impute with respective group mean
        group_filled = df.groupby("customer_ID")[col].transform(lambda x: x.fillna(x.mean()))
        # Customers without any observed value are still NaN at this point;
        # they receive the mean of the group-filled column.
        return group_filled.fillna(group_filled.mean())

    interpolation_names = {3: "polynomial", 4: "spline"}
    if method in interpolation_names:
        # Impute using (certain order) polynomial / spline interpolation
        name = interpolation_names[method]
        interpolated = df.groupby("customer_ID")[col].transform(
            lambda x: interpolate_series(x, name, order)
        )
        # Whatever interpolation left missing is filled forward, then backward,
        # within the same customer.
        return interpolated.groupby(df["customer_ID"]).transform(lambda x: x.ffill().bfill())

    # Previously an unknown method fell through and returned None, which a
    # caller would then silently assign to the column.
    raise ValueError(f"Unknown imputation method: {method!r} (expected 0, 1, 2, 3 or 4)")

In [195]:
def prepare_lgb_train_test(train_agg, labels):
    """Split aggregated features and labels 80/20 and wrap them for LightGBM.

    Args:
        train_agg: Aggregated feature frame; any of ``standard_columns`` that
            are present are dropped before modelling.
        labels: Frame providing the ``target`` column.

    Returns:
        Tuple ``(train_data, valid_data, X_val, y_val, cat_columns)``: the two
        ``lgb.Dataset`` objects, the raw validation features and target, and
        the names of the category-dtype feature columns.
    """
    feature_frame = train_agg.drop(columns=standard_columns, errors="ignore")
    target = labels["target"]
    # NOTE(review): features and target are paired purely by position here —
    # assumes train_agg rows are in the same order as labels; TODO confirm.
    split = train_test_split(feature_frame, target, test_size=0.2, random_state=923)
    X_train, X_val, y_train, y_val = split

    train_data = lgb.Dataset(X_train, label=y_train, params={'verbose': -1})
    # The validation set reuses the training set as its reference.
    valid_data = lgb.Dataset(X_val, label=y_val, reference=train_data, params={'verbose': -1})

    categorical_names = list(X_train.select_dtypes("category").columns)
    return train_data, valid_data, X_val, y_val, categorical_names

In [196]:
def train_and_evaluate_strategy(train_data, valid_data, X_val, y_val, cat_columns, 
                                lgbm_params=params, lgbm_metric=lgb_amex_metric):
    """Train a LightGBM model and score it on the validation split.

    Args:
        train_data: ``lgb.Dataset`` used for training.
        valid_data: ``lgb.Dataset`` used for validation / early stopping.
        X_val: Validation features passed to ``model.predict``.
        y_val: Validation target; its ``.values`` are passed to ``amex_metric``.
        cat_columns: Names of the categorical feature columns.
        lgbm_params: LightGBM parameter dict (defaults to the notebook ``params``).
        lgbm_metric: Custom evaluation function passed as ``feval``.

    Returns:
        Whatever ``amex_metric`` returns for the raw-score validation predictions.
    """
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=UserWarning)
        model = lgb.train(
            params=lgbm_params,
            train_set=train_data,
            valid_sets=[valid_data, train_data],
            feval=lgbm_metric,
            categorical_feature=cat_columns,
            callbacks=[
                # Early stopping goes through the callback API: the
                # `early_stopping_rounds` / `verbose_eval` keywords of lgb.train
                # were removed in LightGBM 4.0. verbose=False and the absence of
                # a log_evaluation callback keep training silent.
                lgb.early_stopping(
                    stopping_rounds=100,
                    first_metric_only=lgbm_params.get("first_metric_only", False),
                    verbose=False,
                ),
            ],
        )
    y_val_pred = model.predict(X_val, raw_score=True)
    return amex_metric(y_val.values, y_val_pred)

In [197]:
%%time
for method in [3, 4]: #range(4):
    if method in [3, 4]:
        for order in [1, 2, 3, 4]:
            gc.collect()
            train_subset = train.loc[:, standard_columns + TOP_PERFORMANCE_FEATURES + [test_feature]]
            train_subset[test_feature] = impute_(train_subset, col=test_feature, method=method, order=order)
            train_agg = feature_gen_pipeline(train_subset)
            train_data, valid_data, X_val, y_val, cat_columns = prepare_lgb_train_test(train_agg, labels)
            score_metric = train_and_evaluate_strategy(train_data, valid_data, X_val, y_val, cat_columns, 
                                                       lgbm_params=params, lgbm_metric=lgb_amex_metric)
            print(f"Method: {method}, Order: {order}, Metric: {score_metric[0]:.4f}")
    else:
        gc.collect()
        train_subset = train.loc[:, standard_columns + TOP_PERFORMANCE_FEATURES + [test_feature]]
        train_subset[test_feature] = impute_(train_subset, col=test_feature, method=method)
        train_agg = feature_gen_pipeline(train_subset)
        train_data, valid_data, X_val, y_val, cat_columns = prepare_lgb_train_test(train_agg, labels)
        score_metric = train_and_evaluate_strategy(train_data, valid_data, X_val, y_val, cat_columns, 
                                                   lgbm_params=params, lgbm_metric=lgb_amex_metric)
        print(f"Method: {method}, Metric: {score_metric[0]:.4f}")

Average done
Minimum done
Maximum done
Standard Deviation done
Last entry done
First entry done
Second last entry done
Third last entry done
MA3 for Recency 1 done
MA3 for Recency 2 done
MA3 for Recency 3 done
MA3 for Recency 4 done


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:18<00:00,  1.89s/it]


Method: 3, Order: 1, Metric: 0.7551
Average done
Minimum done
Maximum done
Standard Deviation done
Last entry done
First entry done
Second last entry done
Third last entry done
MA3 for Recency 1 done
MA3 for Recency 2 done
MA3 for Recency 3 done
MA3 for Recency 4 done


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:18<00:00,  1.88s/it]


Method: 3, Order: 2, Metric: 0.7542
Average done
Minimum done
Maximum done
Standard Deviation done
Last entry done
First entry done
Second last entry done
Third last entry done
MA3 for Recency 1 done
MA3 for Recency 2 done
MA3 for Recency 3 done
MA3 for Recency 4 done


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:18<00:00,  1.89s/it]


Method: 3, Order: 3, Metric: 0.7548
Average done
Minimum done
Maximum done
Standard Deviation done
Last entry done
First entry done
Second last entry done
Third last entry done
MA3 for Recency 1 done
MA3 for Recency 2 done
MA3 for Recency 3 done
MA3 for Recency 4 done


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:18<00:00,  1.88s/it]


Method: 3, Order: 4, Metric: 0.7550
Average done
Minimum done
Maximum done
Standard Deviation done
Last entry done
First entry done
Second last entry done
Third last entry done
MA3 for Recency 1 done
MA3 for Recency 2 done
MA3 for Recency 3 done
MA3 for Recency 4 done


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:18<00:00,  1.88s/it]


Method: 4, Order: 1, Metric: 0.7550
Average done
Minimum done
Maximum done
Standard Deviation done
Last entry done
First entry done
Second last entry done
Third last entry done
MA3 for Recency 1 done
MA3 for Recency 2 done
MA3 for Recency 3 done
MA3 for Recency 4 done


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:19<00:00,  1.93s/it]


Method: 4, Order: 2, Metric: 0.7551
Average done
Minimum done
Maximum done
Standard Deviation done
Last entry done
First entry done
Second last entry done
Third last entry done
MA3 for Recency 1 done
MA3 for Recency 2 done
MA3 for Recency 3 done
MA3 for Recency 4 done


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:18<00:00,  1.89s/it]


Method: 4, Order: 3, Metric: 0.7551
Average done
Minimum done
Maximum done
Standard Deviation done
Last entry done
First entry done
Second last entry done
Third last entry done
MA3 for Recency 1 done
MA3 for Recency 2 done
MA3 for Recency 3 done
MA3 for Recency 4 done


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:19<00:00,  1.95s/it]


Method: 4, Order: 4, Metric: 0.7547
CPU times: user 1h 7min 32s, sys: 18min 48s, total: 1h 26min 20s
Wall time: 34min 16s


In [179]:
train_subset = train.loc[:, standard_columns + TOP_PERFORMANCE_FEATURES + [test_feature]]

In [180]:
cid = train_subset.loc[train_subset[test_feature].isnull()]["customer_ID"].unique()[32]

In [181]:
train_subset.loc[train_subset["customer_ID"] == cid, test_feature]

2852        NaN
2853        NaN
2854        NaN
2855        NaN
2856   0.017558
2857   0.012111
2858   0.017713
2859   0.017557
Name: D_119, dtype: float32

In [182]:
# Order-3 spline interpolation per customer. The method has to be the pandas
# method name ("spline" is what impute_ uses for strategy 4): passing the
# integer 4 makes x.interpolate raise, interpolate_series swallows the error,
# and every group comes back unchanged.
train_subset[test_feature] = train_subset.groupby("customer_ID")[test_feature].transform(lambda x: interpolate_series(x, "spline", 3))

In [184]:
train_subset.loc[train_subset["customer_ID"] == cid, test_feature]

2852        NaN
2853        NaN
2854        NaN
2855        NaN
2856   0.017558
2857   0.012111
2858   0.017713
2859   0.017557
Name: D_119, dtype: float32

In [185]:
# train_subset[test_feature] = train_subset.groupby("customer_ID")[test_feature].transform(lambda x: x.fillna(x.mean()))

In [186]:
train_subset[test_feature] = train_subset.groupby("customer_ID")[test_feature].transform(lambda x: x.ffill().bfill())

In [187]:
train_subset.loc[train_subset["customer_ID"] == cid, test_feature]

2852   0.017558
2853   0.017558
2854   0.017558
2855   0.017558
2856   0.017558
2857   0.012111
2858   0.017713
2859   0.017557
Name: D_119, dtype: float32

In [178]:
train_subset.loc[train_subset["customer_ID"] == cid, test_feature]

2852   0.016235
2853   0.016235
2854   0.016235
2855   0.016235
2856   0.017558
2857   0.012111
2858   0.017713
2859   0.017557
Name: D_119, dtype: float32

In [None]:
train_subset[test_feature] = train_subset[test_feature].fillna(train_subset[test_feature].mean())

In [58]:
%%time
m1 = impute_(train_subset, col="B_13", method=0)
m2 = impute_(train_subset, col="B_13", method=1)
m3 = impute_(train_subset, col="B_13", method=2)


KeyboardInterrupt



In [None]:
%%time
train_agg = feature_gen_pipeline(train_subset)
# for col in train_agg.columns[1:]:
#     train_agg[col] = train_agg[col].fillna(train_agg[col].mean())

In [None]:
%%time


In [None]:
# Impute by linear interpolation then impute by group mean


In [None]:
# NOTE(review): `xxx` is not defined anywhere in this notebook, so this cell
# raises NameError — presumably a deliberate barrier that stops "Run All"
# before the scratch cells below; TODO confirm or replace with an explicit stop.
xxx

In [None]:
# Impute by group mean
amex_metric(y_val.values, y_val_pred)

In [None]:
# Impute by global mean
amex_metric(y_val.values, y_val_pred)

### Imputation using other methods

In [None]:
%%time
train_subset = train.loc[:, standard_columns + features]

In [None]:
# # Forward fill then back fill
# train_subset["P_2"] = train_subset.groupby("customer_ID")["P_2"].ffill().bfill()

In [None]:
%%time


In [None]:
%%time
# Impute using interpolate method
train_subset["temp"] = train_subset.groupby("customer_ID")["P_2"].transform(lambda x: x.interpolate(method="index"))

In [None]:
# Fill each customer's gaps with that customer's mean, then fill what is still
# missing (customers without any observed value) with the global mean. The
# global mean is applied once after the transform instead of being recomputed
# inside the lambda for every customer group.
t = train_subset.groupby("customer_ID")["temp"].transform(
    lambda x: x.fillna(x.mean())
).fillna(train_subset["temp"].mean())

In [None]:
train_subset["temp"].isnull().sum()

In [None]:
%%time
train_agg = feature_gen_pipeline(train_subset)
# for col in train_agg.columns[1:]:
#     train_agg[col] = train_agg[col].fillna(train_agg[col].mean())

In [None]:
X = train_agg.drop(columns=standard_columns, errors="ignore")
y = labels["target"]
X.shape, y.shape

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=923)

In [None]:
train_data = lgb.Dataset(
    X_train,
    y_train
)

In [None]:
valid_data = lgb.Dataset(
    X_val,
    y_val,
    reference=train_data
)

In [None]:
cat_columns = X_train.select_dtypes("category").columns.tolist()

In [None]:
with warnings.catch_warnings():
    warnings.filterwarnings('ignore', category=UserWarning)
    model = lgb.train(
        params=params,
        train_set=train_data, 
        valid_sets=[valid_data, train_data], 
        feval=lgb_amex_metric, 
        categorical_feature=cat_columns,
        callbacks=[
            # Early stopping is configured as a callback next to the logging
            # callback; the `early_stopping_rounds` keyword of lgb.train was
            # removed in LightGBM 4.0.
            lgb.early_stopping(
                stopping_rounds=200,
                first_metric_only=params.get("first_metric_only", False),
            ),
            log_evaluation(100),
        ]
    )

In [None]:
%%time
y_val_pred = model.predict(X_val, raw_score=True)

In [None]:
amex_metric(y_val.values, y_val_pred)

In [None]:
amex_metric(y_val.values, y_val_pred)

### Impute using interpolate

In [None]:
# NOTE(review): train_missing_prop_df is only assigned in a commented-out cell
# near the top of this notebook, so this line raises NameError on a fresh
# kernel unless that cell is restored.
# Keeps the columns whose "missing_proportion" is at most 50 — presumably a
# percentage; TODO confirm against plot_missing_proportion_barchart.
missing_cols = set(train_missing_prop_df.loc[train_missing_prop_df["missing_proportion"] <= 50]["column"].tolist())

In [None]:
cat_cols = set(train.select_dtypes("category").columns.tolist())

In [None]:
to_impute_cols = sorted(list(missing_cols - cat_cols))

In [None]:
len(to_impute_cols)

#### Impute Train

In [None]:
%%time
t = train.loc[:, ["customer_ID"] + to_impute_cols].groupby('customer_ID').apply(lambda group: group.interpolate().ffill().bfill())

In [None]:
t.shape

In [None]:
for col in tqdm(to_impute_cols):
    # Per customer: linear interpolation, then forward/backward fill for the
    # gaps interpolation leaves. transform returns a Series aligned to train's
    # index, so the assignment does not depend on the row order (or added
    # group-key index level) that groupby.apply may produce.
    train[col] = train.groupby("customer_ID")[col].transform(
        lambda series: series.interpolate().ffill().bfill()
    )
    gc.collect()

In [None]:
train.to_pickle(f"{RAW_TRAIN_PICKLE_PATH}/train_data.pkl")

#### Impute Test

In [None]:
if "train" in locals():
    del train

In [None]:
gc.collect()

In [None]:
%%time
test = read_file(f"{RAW_TEST_PICKLE_PATH}/raw_test_data.pkl", replace_negative_one=True)

In [None]:
for col in tqdm(to_impute_cols):
    # Per customer: linear interpolation, then forward/backward fill for the
    # gaps interpolation leaves. transform returns a Series aligned to test's
    # index, so the assignment does not depend on the row order (or added
    # group-key index level) that groupby.apply may produce.
    test[col] = test.groupby("customer_ID")[col].transform(
        lambda series: series.interpolate().ffill().bfill()
    )
    gc.collect()

In [None]:
test.to_pickle(f"{RAW_TEST_PICKLE_PATH}/test_data.pkl")

In [None]:
del test

### S_23

In [None]:
plot_target_check(train, "S_23", q=100)

In [None]:
train["S_23"].describe()

In [None]:
np.percentile(train["S_23"].dropna(), 1)

In [None]:
null_indices = train.loc[train["S_23"].isnull()].index.tolist()

In [None]:
cid_list = train.loc[null_indices, "customer_ID"].values.tolist()

In [None]:
index = 14
cid = cid_list[index]

In [None]:
# train.loc[train["customer_ID"] == cid, "S_23"]

In [None]:
print(f'Target = {labels.loc[labels["customer_ID"] == cid]["target"].values[0]}')
train.loc[train["customer_ID"] == cid, "S_23"].plot()
plt.show()

In [None]:
%%time
t = train[["customer_ID", "S_23"]].groupby('customer_ID').apply(lambda group: group.interpolate().ffill().bfill())

In [None]:
t["S_23"].values

In [None]:
train.loc[train["S_23"].isnull()]

In [None]:
train.groupby('customer_ID').apply(lambda group: group["S_23"].interpolate(method='index'))

### S_26, B_41

In [None]:
check_overlap_missing(train, "S_26", "B_41")

### Impute

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
%%time
# Iterative imputer configured for NaN as the missing marker, at most 3
# imputation rounds, verbose progress output, features visited in 'roman'
# order, and a fixed random_state so repeated runs use the same seed.
imp = IterativeImputer(
    missing_values=np.nan, 
    max_iter=3, 
    verbose=2, 
    imputation_order='roman', 
    random_state=0
)

In [None]:
%%time
train_num = train.select_dtypes(np.number).iloc[:, 2:-1]

In [None]:
train_num.shape

In [None]:
# train_num

In [None]:
%%time
sample_imputed = imp.fit_transform(train_num)

In [None]:
# sample_imputed comes from imp.fit_transform(train_num), so it is labelled
# with train_num's columns and index (the previous `sample` name is not
# defined anywhere in this notebook).
# NOTE(review): assumes the imputer kept every input column (e.g. none was
# dropped for being entirely missing) — TODO confirm.
pd.DataFrame(sample_imputed, columns=train_num.columns, index=train_num.index)

In [None]:
sample_imputed.columns

### Finish Simple Imputation, export data

In [None]:
# Index-based interpolation of D_132 within each customer. transform keeps the
# original row index, so the result assigns back row-for-row; groupby.apply can
# return a result carrying the group keys in its index, which would not align.
train["D_132"] = train.groupby("customer_ID")["D_132"].transform(lambda group: group.interpolate(method='index'))
test["D_132"] = test.groupby("customer_ID")["D_132"].transform(lambda group: group.interpolate(method='index'))

In [None]:
# %%time
# train.to_pickle(f"{PROCESSED_DATA_PATH}/train_data_half_processed.pkl")
# test.to_pickle(f"{PROCESSED_DATA_PATH}/test_data_half_processed.pkl")

### Read Imputed (WIP) Data

In [None]:
train = read_file(f"{PROCESSED_DATA_PATH}/train_data_imputed.pkl")

In [None]:
test = read_file(f"{PROCESSED_DATA_PATH}/test_data_imputed.pkl")

In [None]:
test_missing_prop_df = plot_missing_proportion_barchart(test, top_n=50)

### Recursive KNN

In [None]:
# Column-correlation table for the test set, without its customer_ID column.
# NOTE(review): presumably a square matrix with rows in the same order as its
# columns; TODO confirm.
test_corr_df = read_file(f"{PROCESSED_DATA_PATH}/raw_column_correlation/test_corr_df.csv").drop(columns="customer_ID")
# Label the rows with the column names so that recursive_impute_using_knn can
# look correlations up by feature name on both axes.
test_corr_df.index = test_corr_df.columns

In [None]:
def recursive_impute_using_knn(df, corr_df, corr_thr=0.3, corr_search_step_size=0.02, 
                               predictor_size_thr=5, list_of_k=[99], max_try_threshold=6, 
                               skip_first_n=0):
    """Impute missing values column by column with KNN regression.

    Columns are processed from fewest to most missing values. For each one,
    fully observed columns whose absolute correlation with it is high enough
    are used as predictors; the correlation threshold is lowered step by step
    until enough predictors are found or the number of tries is exhausted. A
    KNN regressor is validated on a hold-out split of the observed rows and is
    only used to fill the missing rows when its best validation RMSE is below
    the column's standard deviation.

    Args:
        df: DataFrame to impute. It is modified in place and also returned.
        corr_df: Correlation table indexed by feature name on both axes.
        corr_thr: Starting absolute-correlation threshold.
        corr_search_step_size: Amount the threshold is lowered on every try.
        predictor_size_thr: Number of predictor columns required (and kept).
        list_of_k: Candidate ``n_neighbors`` values; the one with the lowest
            validation RMSE is used.
        max_try_threshold: Maximum number of threshold-lowering tries.
        skip_first_n: Number of leading columns (fewest missing first) to skip.

    Returns:
        ``df``, with the successfully imputed columns filled.
    """
    missing = df.isnull().sum()
    missing = missing[missing > 0].sort_values()
    impute_columns = missing.index.tolist()

    for impute_column in impute_columns[skip_first_n:]:
        print(f"Selecting correlated column with {impute_column}...")
        # df does not change while predictors are searched, so the fully
        # observed columns are determined once per target column (the set grows
        # as earlier columns get imputed) instead of on every search try.
        null_counts = df.isnull().sum()
        no_missing_columns = set(null_counts[null_counts == 0].index)
        abs_corr = corr_df[impute_column].abs()

        curr_corr = corr_thr
        predictor_columns = []
        max_tries = 0
        while len(predictor_columns) < predictor_size_thr and max_tries < max_try_threshold:

            if curr_corr < corr_thr:
                print(f"Re-selecting correlated column using {curr_corr}")
            # NOTE(review): the threshold is lowered before the first search,
            # so corr_thr itself is never used as a cut-off — confirm intent.
            curr_corr -= corr_search_step_size
            max_tries += 1

            # Strongest absolute correlation first; the 0.999 cap excludes the
            # column's (near-)perfect correlation with itself.
            high_corr_columns = abs_corr[
                abs_corr.between(curr_corr, 0.999)
            ].sort_values(ascending=False).index.tolist()
            # Keep the ranking while filtering (a set intersection would make
            # the truncation below arbitrary).
            predictor_columns = [
                column for column in high_corr_columns if column in no_missing_columns
            ][:predictor_size_thr]
        # Skip only when the search really failed; a success on the final
        # allowed try must not be discarded.
        if len(predictor_columns) < predictor_size_thr or not predictor_columns:
            print("Exceed max tries in searching correlated columns, skip this feature")
            continue
        train_val_knn = df.loc[~df[impute_column].isnull()]
        test_knn = df.loc[df[impute_column].isnull()]
        print(f"{predictor_columns} selected as predictors")
        if test_knn.shape[0] == 0:
            print(f"{impute_column} has no missing values, skip\n")
            continue
        train_knn, val_knn = train_test_split(train_val_knn, test_size=0.2, random_state=20)
        print(f"Train, Validation, Test size: {train_knn.shape[0], val_knn.shape[0], test_knn.shape[0]}")
        min_rmse = np.inf
        best_k = 0
        best_knn_model = None
        std = df[impute_column].std()
        print(f"{impute_column} standard deviation: {std:.4f}")
        for k in list_of_k:
            knn_model = KNeighborsRegressor(n_neighbors=k).fit(
                train_knn.loc[:, predictor_columns], 
                train_knn.loc[:, impute_column]
            )
            y_val_pred = knn_model.predict(val_knn.loc[:, predictor_columns])
            rmse = np.sqrt(mean_squared_error(val_knn.loc[:, impute_column], y_val_pred))
            print(f"K: {k}, Validation RMSE: {rmse:.5f}")
            if rmse < min_rmse:
                min_rmse = rmse
                best_knn_model = knn_model
                best_k = k
        print(f"Best K is {best_k}")
        # Gate on the best model's RMSE (not the RMSE of the last k tried), and
        # skip when no model could be selected at all.
        if best_knn_model is None or min_rmse >= std:
            print(f"Standard deviation smaller than RMSE, stop the imputation")
            continue
        df.loc[test_knn.index, impute_column] = best_knn_model.predict(test_knn.loc[:, predictor_columns])
        if df[impute_column].isnull().sum() > 0:
            print(f"Please check why column {impute_column} has yet to be imputed")
        print(f"Imputation done!\n")

    return df

In [None]:
test = recursive_impute_using_knn(test, test_corr_df, list_of_k=[99], skip_first_n=4, 
                                  predictor_size_thr=2, corr_search_step_size=0.015)

In [None]:
test_missing_prop_df = plot_missing_proportion_barchart(test, top_n=50)

In [None]:
# plot_scatterplot(train, predictor_columns[0], impute_column, "target")

In [None]:
train.loc[train["D_142"].between(d142_impute_value*0.99, d142_impute_value*1.01), "target"].mean()

In [None]:
train["D_53"].describe()

In [None]:
test["D_53"].describe()

In [None]:
# target_dist_df = single_col_target_check(train, "D_53", q=50)

In [None]:
temp_value = np.percentile(train["D_42"].dropna(), 20)

In [None]:
train.loc[train["D_42"].between(0.975*temp_value, 1.025*temp_value), "target"].mean()

In [None]:
# test_missing_prop_df = plot_missing_proportion_barchart(test, top_n=25)

In [None]:
train.shape

In [None]:
%%time
# train.to_pickle(f"{PROCESSED_DATA_PATH}/train_data_imputed.pkl")
test.to_pickle(f"{PROCESSED_DATA_PATH}/test_data_imputed.pkl")