# Jane Street Market Prediction (#2.2)
## Imputation, downsizing dataset.

Loaded by all training notebooks.<br>
Evaluated in https://www.kaggle.com/wendellavila/janestreet-preprocessing-selection/

Notebook Navigation<br>
[All](https://www.kaggle.com/wendellavila/janestreet-index/) | [#1](https://www.kaggle.com/wendellavila/janestreet-model-selection/) | [#2.1](https://www.kaggle.com/wendellavila/janestreet-preprocessing-selection) | [#2.2](https://www.kaggle.com/wendellavila/janestreet-data-preprocessing) | [#3](https://www.kaggle.com/wendellavila/janestreet-regularization-selection) | [#4.1](https://www.kaggle.com/wendellavila/janestreet-hyperparameter-tuning) | [#4.2](https://www.kaggle.com/wendellavila/janestreet-hyperparameter-evaluation) | [#5.1](https://www.kaggle.com/wendellavila/janestreet-pca) | [#5.2](https://www.kaggle.com/wendellavila/janestreet-autoencoder) | [#5.3](https://www.kaggle.com/wendellavila/janestreet-dimensionality-reduction-evaluation) | [#6](https://www.kaggle.com/wendellavila/janestreet-ensemble)

## Imports

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
#from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer#, IterativeImputer
pd.set_option('display.max_columns', 300)

## Misc

In [2]:
# Downsize column dtypes so the dataframe is smaller on disk and faster to load.
def reduce_dtypes(df):
    """Cast each column of ``df`` to the smallest dtype that covers its value range.

    * ``object`` columns become ``category``.
    * Signed / unsigned integer columns become the narrowest signed / unsigned
      integer type whose (inclusive) range contains the column's min and max.
    * Float columns become ``float16`` or ``float32`` when their range fits,
      otherwise ``float64``. Only the *range* is checked: ``float16`` keeps
      roughly three significant decimal digits, so values are rounded.
    * Every other dtype (bool, datetime, category, pandas extension dtypes) is
      left untouched, which also makes the function safe to call twice.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame whose columns should be downsized.

    Returns:
        The same DataFrame object, with converted columns.
    """
    # Candidate dtypes per numpy "kind", ordered from narrowest to widest.
    downcast_candidates = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
        'f': (np.float16, np.float32),
    }

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Range checks only make sense for plain numpy integer/float columns;
        # anything else (bool, datetime, category, ...) is skipped instead of
        # being pushed through the float branch.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind not in downcast_candidates:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in downcast_candidates[kind]:
            limits = np.finfo(candidate) if kind == 'f' else np.iinfo(candidate)
            # Inclusive bounds: a value exactly on the limit still fits.
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break
        else:
            # No candidate fits (out-of-range values, inf, or an all-NaN column
            # whose min/max are NaN). Floats fall back to float64; integers
            # keep their current dtype.
            if kind == 'f':
                df[col] = df[col].astype(np.float64)

    return df

In [3]:
def missing_info(df):
    """Print a short summary of the missing values in ``df``.

    Two lines are printed: the total number of NaN cells and the number of
    columns that contain at least one NaN. Nothing is returned.

    Args:
        df: DataFrame to inspect.
    """
    # Build the null mask once and reuse it for both statistics.
    null_mask = df.isnull()
    print("Total N° of NaN: ", null_mask.sum().sum())
    col_nan = df.columns[null_mask.any()]
    print("N° of columns with NaN: ", len(col_nan))

In [4]:
def add_missing_indicator(df):
    """Return ``df`` with one extra ``<column>_missing`` flag per column that has NaNs.

    Each flag column is float64 and holds 1.0 where the source column is NaN
    and 0.0 elsewhere. Columns without any NaN get no flag. The input frame is
    not modified; a new, wider frame is returned.
    """
    has_nan = df.isnull().any()
    nan_columns = df.columns[has_nan]

    indicators = df[nan_columns].isnull().astype('float64')
    indicators = indicators.add_suffix('_missing')

    return pd.concat([df, indicators], axis="columns")

## Visualizing data

# data = reduce_dtypes(pd.read_hdf('../input/jane-street-market-train-data-best-formats/jane_street_train.h5'))
# data = pd.read_hdf('../input/jane-street-market-train-data-best-formats/jane_street_train.h5')
# features = [c for c in data.columns if 'feature' in c]
# data

In [5]:
# missing_info(data)
# col_nan = data.columns[data.isnull().any()]
# missing = pd.DataFrame(data[col_nan].isnull().sum().sort_values(ascending=False)*100/data.shape[0],columns=['missing %']).T
# missing.style.background_gradient(cmap='Blues', axis=1)

In [6]:
# # Display the histogram 
# fig,axes = plt.subplots(nrows=45,ncols=3,figsize=(25,250))

# for i in range(2,137):
#     sns.distplot(data.iloc[:,i],ax=axes[(i-2)//3,(i-2)%3])

In [7]:
# del data

## Preprocessing Pipeline

In [8]:
def preprocessing_pipeline(imputation='none', name='none', addIndicator=False, removeExtraResp=False, remove0=False,
                           data_path='../input/jane-street-market-train-data-best-formats/jane_street_train.h5',
                           split_date=450):
    """Load the training data, split it by date, impute, downsize and pickle it.

    Steps: read the HDF file, optionally drop zero-weight rows, keep
    ``date``/``weight``/response/feature columns, optionally append
    missing-value indicator columns, add the binary ``action`` target, split
    into train (``date < split_date``) and validation (``date >= split_date``),
    impute NaNs, shrink dtypes and write ``train-{name}.pkl`` /
    ``val-{name}.pkl`` to the working directory. A missing-value summary of
    both splits is printed at the end.

    Args:
        imputation: ``'none'`` keeps NaNs, ``'mean'`` fills with the training
            column means, ``'ffil'`` (or ``'ffill'``) forward-fills inside each
            split and then fills what is left with the training column means.
        name: Suffix used in the output file names.
        addIndicator: If true, add a ``<column>_missing`` flag for every column
            that contains NaNs (computed before the split).
        removeExtraResp: If true, keep only ``resp`` and drop the other
            response columns.
        remove0: If true, drop rows whose ``weight`` is not greater than 0.
        data_path: Location of the HDF file to read.
        split_date: First ``date`` value that belongs to the validation split.

    Raises:
        ValueError: If ``imputation`` is not one of the supported options.
    """
    # Fail before the expensive load instead of silently skipping imputation.
    if imputation not in ('none', 'mean', 'ffil', 'ffill'):
        raise ValueError(
            f"Unknown imputation {imputation!r}; expected 'none', 'mean' or 'ffil'/'ffill'."
        )

    print("Loading data...")
    data = pd.read_hdf(data_path)
    print("Data loaded. Working...")
    features = [c for c in data.columns if 'feature' in c]
    resps = [c for c in data.columns if 'resp' in c]

    # Filter out rows with weight == 0.
    if remove0:
        data = data.query('weight > 0').reset_index(drop=True)

    kept_resps = ['resp'] if removeExtraResp else resps
    data = data[['date', 'weight'] + kept_resps + features]

    if addIndicator:
        data = add_missing_indicator(data)

    # assign() builds a new frame, so the target is never written into a
    # column-selection slice of the loaded data.
    data = data.assign(action=(data['resp'] > 0.000000001) * 1)

    is_train = data['date'] < split_date
    train_data = data[is_train]
    val_data = data[~is_train]
    del data  # release the full frame before imputation allocates new ones

    print("Imputting...")
    train_data, val_data = _impute_train_val(train_data, val_data, imputation)

    # Reduce dtypes of the dataframes for faster loading.
    train_data = reduce_dtypes(train_data)
    val_data = reduce_dtypes(val_data)
    train_data.to_pickle(f'train-{name}.pkl')
    val_data.to_pickle(f'val-{name}.pkl')
    print("\nTrain missing values:")
    missing_info(train_data)
    print("\nVal missing values:")
    missing_info(val_data)
    print("Finished.")


def _impute_train_val(train_data, val_data, imputation):
    """Return imputed copies of both splits, using training statistics only."""
    if imputation == 'none':
        # Still hand back independent frames so later column assignments do
        # not operate on slices of the original data.
        return train_data.copy(), val_data.copy()

    if imputation in ('ffil', 'ffill'):
        # Forward-fill inside each split; NaNs at the very start of a split
        # have nothing to carry forward and are handled by the mean fill below.
        train_data = train_data.ffill()
        val_data = val_data.ffill()

    # Means come from the training split only, so nothing leaks from validation.
    mean = train_data.mean()
    return train_data.fillna(value=mean), val_data.fillna(value=mean)
    

## Execution

In [9]:
# Mean imputation without indicator columns -> train-mean.pkl / val-mean.pkl
preprocessing_pipeline(imputation='mean', name='mean')

Loading data...
Data loaded. Working...
Imputting...

Train missing values:
Total N° of NaN:  0
N° of columns with NaN:  0

Val missing values:
Total N° of NaN:  0
N° of columns with NaN:  0
Finished.


In [10]:
# Forward-fill (then mean) imputation without indicator columns -> train-ffil.pkl / val-ffil.pkl
preprocessing_pipeline(imputation='ffil', name='ffil')

Loading data...
Data loaded. Working...
Imputting...

Train missing values:
Total N° of NaN:  0
N° of columns with NaN:  0

Val missing values:
Total N° of NaN:  0
N° of columns with NaN:  0
Finished.


In [11]:
# Mean imputation plus "<column>_missing" flags -> train-mean-indicator.pkl / val-mean-indicator.pkl
preprocessing_pipeline(imputation='mean', name='mean-indicator', addIndicator=True)

Loading data...
Data loaded. Working...
Imputting...

Train missing values:
Total N° of NaN:  0
N° of columns with NaN:  0

Val missing values:
Total N° of NaN:  0
N° of columns with NaN:  0
Finished.


In [12]:
# Forward-fill (then mean) imputation plus "<column>_missing" flags -> train-ffil-indicator.pkl / val-ffil-indicator.pkl
preprocessing_pipeline(imputation='ffil', name='ffil-indicator', addIndicator=True)

Loading data...
Data loaded. Working...
Imputting...

Train missing values:
Total N° of NaN:  0
N° of columns with NaN:  0

Val missing values:
Total N° of NaN:  0
N° of columns with NaN:  0
Finished.
