In [1]:
# (Re)load IPython's autoreload extension and set it to mode 2, so imported
# modules are reloaded before each cell executes and edits to local packages
# are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
from collections import defaultdict
from datetime import datetime
from os import path
import warnings

# Silence three whole warning categories for the rest of the session. The
# filters are installed before the library imports below so that warnings
# raised while importing them are suppressed as well.
# NOTE(review): these are blanket filters; they also hide deprecation notices
# coming from this notebook's own code -- consider narrowing them.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import matplotlib as mpl
import matplotlib.pyplot as plt
from tqdm import tqdm
from cafeen import utils

# Use matplotlib's bundled 'ggplot' style sheet for every figure in the notebook.
mpl.style.use('ggplot')

In [3]:
# Absolute path of ../data/train.csv, resolved against the kernel's working
# directory, then read into a DataFrame.
path_to_train = path.abspath(path.join('..', 'data', 'train.csv'))
train = pd.read_csv(path_to_train)

In [4]:
# Every column except the row id and the label is treated as a feature.
features = [col for col in train.columns if col not in ['id', 'target']]

# Split features by storage dtype. `np.object` was only an alias of the
# builtin `object` and has been removed from NumPy (1.24), so compare
# against `object` directly.
obj_cols = [col for col in features if train[col].dtype == object]
num_cols = [col for col in features if train[col].dtype == np.float64]

# Replace missing values with explicit placeholders: the string 'NAN' for
# object columns (it becomes one more label below) and -1 for float columns.
train[obj_cols] = train[obj_cols].fillna(value='NAN')
train[num_cols] = train[num_cols].fillna(value=-1)

# Integer-encode each object column in place, one fresh encoder per column.
for col in obj_cols:
    encoder = LabelEncoder()
    train[col] = encoder.fit_transform(train[col])

# Project helper; judging by the cell output it reports one
# "score +- spread" line per feature -- see cafeen.utils for the definition.
utils.eval_weights(train, features)

ord_3   :  8.72 +- 0.215
ord_5   :  3.26 +- 0.137
ord_2   :  2.82 +- 0.160
month   :  2.26 +- 0.020
ord_0   :  2.26 +- 0.051
ord_4   :  1.47 +- 0.062
bin_0   :  0.84 +- 0.075
bin_2   :  0.74 +- 0.080
nom_1   :  0.70 +- 0.020
ord_1   :  0.67 +- 0.034
nom_3   :  0.46 +- 0.012
day     :  0.41 +- 0.010
nom_4   :  0.20 +- 0.017
nom_2   :  0.13 +- 0.010
bin_1   :  0.03 +- 0.000
nom_8   :  0.02 +- 0.004
bin_4   :  0.01 +- 0.007
nom_0   :  0.01 +- 0.002
nom_7   :  0.00 +- 0.002
nom_9   :  0.00 +- 0.000
bin_3   :  0.00 +- 0.000
nom_6   :  0.00 +- 0.000
nom_5   : -0.00 +- 0.000


In [5]:
# Re-read both CSVs from ../data so the following cells start from the raw,
# un-encoded frames again.
path_to_train = path.abspath(path.join('..', 'data', 'train.csv'))
path_to_test = path.abspath(path.join('..', 'data', 'test.csv'))
train, test = pd.read_csv(path_to_train), pd.read_csv(path_to_test)

In [6]:
# All training columns except the row id and the label, in file order.
features = [name for name in train.columns if name not in {'id', 'target'}]

In [7]:
def impute_nans(estimator, train, test, features):
    """Fill missing values in ``features`` with per-column model predictions.

    Train and test rows are stacked into one frame. Missing entries first
    get a placeholder (``'-1'`` for object columns, ``-1`` for float64
    columns) and object columns are label-encoded. Then, for each feature
    in the order given, ``estimator`` is fitted on the rows where that
    feature was originally present, using all other ``features`` as
    predictors, and its predictions replace the originally missing entries.
    Features imputed earlier in the list are therefore seen, already
    imputed, by the models of later features. Finally object columns are
    decoded back to their original labels.

    Parameters
    ----------
    estimator : object
        Anything with scikit-learn style ``fit(X, y)`` returning the fitted
        estimator and ``predict(X)``. It is re-fitted once per feature that
        has missing values, so its fitted state after the call belongs to
        the last such feature.
    train, test : pandas.DataFrame
        Frames that both contain every column in ``features``.
    features : list of str
        Columns to impute; also the predictor set for each other column.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        The same two objects that were passed in. Their ``features``
        columns are overwritten in place with the imputed values.
    """
    n_train = len(train)

    # Stack the rows with a fresh 0..N-1 index. Train rows come first, so
    # the split point `n_train` is enough to separate them again later
    # (column selection with a list already returns a copy of the data).
    df = pd.concat([train[features], test[features]], ignore_index=True)

    # Remember where values were missing before the placeholders go in.
    nans = df.isna()

    # `np.object` has been removed from NumPy; the builtin `object` is the
    # dtype it used to alias.
    obj_cols = [col for col in features if df[col].dtype == object]
    num_cols = [col for col in features if df[col].dtype == np.float64]

    if obj_cols:
        df[obj_cols] = df[obj_cols].fillna(value='-1')
    if num_cols:
        df[num_cols] = df[num_cols].fillna(value=-1)

    # One encoder per object column, kept so the column can be decoded again.
    encoders = {}
    for col in obj_cols:
        encoders[col] = LabelEncoder()
        df[col] = encoders[col].fit_transform(df[col])

    for feature in tqdm(features, ascii=True):
        missing = nans[feature].to_numpy()
        if not missing.any():
            # Nothing to fill in this column; skip the (expensive) fit.
            continue

        predictors = [f for f in features if f != feature]
        observed = ~missing

        estimator.fit(df.loc[observed, predictors], df.loc[observed, feature])
        # Only the originally missing rows need a prediction.
        df.loc[missing, feature] = estimator.predict(df.loc[missing, predictors])

    for col in obj_cols:
        df[col] = encoders[col].inverse_transform(df[col])

    # Write back by position, one column at a time. Assigning a DataFrame
    # slice would align on index labels: the test rows are labelled
    # n_train..N-1 in `df` but 0..len(test)-1 in `test`, so an aligned
    # assignment silently turns the test features into NaN. Going column by
    # column also keeps each column's own dtype.
    for col in features:
        values = df[col].to_numpy()
        train[col] = values[:n_train]
        test[col] = values[n_train:]

    return train, test

In [8]:
# Gradient-boosted classifier with 100 boosting rounds; it is handed to
# impute_nans, which re-fits it for each imputed column.
estimator = lgb.LGBMClassifier(n_estimators=100)

In [9]:
# Columns to impute (and to use as predictors for each other):
# bin_0..bin_4, nom_0..nom_4, ord_0..ord_4, day and month.
# nom_5..nom_9 and ord_5 are deliberately not in this list.
features = (
    ['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4']
    + ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4']
    + ['ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4']
    + ['day', 'month']
)

# Replace the raw frames with their imputed versions.
train, test = impute_nans(estimator, train, test, features)

100%|##########| 17/17 [05:29<00:00, 19.37s/it]


In [12]:
# Back to the full feature set: every column except the row id and the label.
features = [col for col in train.columns if col not in ['id', 'target']]

# Split features by storage dtype. `np.object` was only an alias of the
# builtin `object` and has been removed from NumPy (1.24), so compare
# against `object` directly.
obj_cols = [col for col in features if train[col].dtype == object]
num_cols = [col for col in features if train[col].dtype == np.float64]

# Columns that were not imputed can still hold missing values; give those
# explicit placeholders ('NAN' for object columns, -1 for float columns).
train[obj_cols] = train[obj_cols].fillna(value='NAN')
train[num_cols] = train[num_cols].fillna(value=-1)

# Integer-encode each object column in place, one fresh encoder per column.
for col in obj_cols:
    encoder = LabelEncoder()
    train[col] = encoder.fit_transform(train[col])

# Project helper; judging by the cell output it reports one
# "score +- spread" line per feature -- see cafeen.utils for the definition.
utils.eval_weights(train, features)

ord_3   :  8.54 +- 0.134
ord_5   :  3.13 +- 0.044
ord_2   :  2.78 +- 0.050
month   :  2.42 +- 0.109
ord_0   :  2.13 +- 0.006
ord_4   :  1.53 +- 0.051
bin_0   :  0.82 +- 0.077
bin_2   :  0.75 +- 0.013
ord_1   :  0.64 +- 0.072
nom_1   :  0.62 +- 0.080
day     :  0.45 +- 0.030
nom_3   :  0.41 +- 0.028
nom_4   :  0.22 +- 0.041
nom_2   :  0.15 +- 0.018
nom_8   :  0.03 +- 0.009
bin_1   :  0.02 +- 0.005
bin_4   :  0.01 +- 0.009
nom_0   :  0.01 +- 0.004
nom_7   :  0.00 +- 0.001
nom_6   :  0.00 +- 0.001
nom_9   :  0.00 +- 0.000
nom_5   :  0.00 +- 0.000
bin_3   :  0.00 +- 0.000
