In [1]:
%load_ext autoreload
%autoreload 2

import os
import numpy as np
import pandas as pd
import swifter
from sklearn.model_selection import train_test_split
import utils
import catboost

In [2]:
# Directory that the raw train/test HDF files are read from (joined with the file name via os.path.join).
DATA_PATH = "./data"

In [3]:
# Sentinel that fillna() compares the MatchedHit_* columns against to detect a missing value.
NA_VALUE = -9999.0

In [4]:
def fillna(df, na_value=None):
    """
    Fill missing ``MatchedHit_{X,Y,Z}[2]`` values by interpolation, in place.

    For each axis, a row whose ``[2]`` column holds the missing-value sentinel
    while both neighbours (``[1]`` and ``[3]``) hold real values gets the mean
    of those two neighbours. Rows where either neighbour is itself missing are
    left untouched, so the sentinel is never averaged into a result.

    The original author's note states that gaps occur only in columns 2 and 3;
    the extra check on column 1 is a safeguard for data where that is not true.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``MatchedHit_<axis>[<i>]`` columns for axis in
        ``X``, ``Y``, ``Z`` and ``i`` in ``1..3``. It is modified in place.
    na_value : float, optional
        Sentinel marking a missing value. Defaults to the module-level
        ``NA_VALUE``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    if na_value is None:
        na_value = NA_VALUE

    for axis in 'XYZ':
        cols = ['MatchedHit_{}[{}]'.format(axis, i) for i in range(4)]

        prev_missing = np.isclose(df.loc[:, cols[1]], na_value)
        target_missing = np.isclose(df.loc[:, cols[2]], na_value)
        next_missing = np.isclose(df.loc[:, cols[3]], na_value)

        # Interpolate only when the target is missing and both neighbours are real.
        mask = target_missing & ~prev_missing & ~next_missing
        if mask.any():
            df.loc[mask, cols[2]] = df.loc[mask, [cols[1], cols[3]]].mean(axis=1)
    return df

In [None]:
from collections import Counter

# Build the feature file for every training part. Both parts are listed because
# the loading cell later in the notebook reads the part1 and part2 outputs.
# NOTE(review): each part runs a row-wise apply over the full frame and is slow.
for i in [1, 2]:
    # Load one raw training part and interpolate missing MatchedHit_*[2] values.
    full_train = pd.read_hdf(os.path.join(DATA_PATH, "train_part_%i.hdf" % i))
    full_train = fillna(full_train)

    # Per-row tally of the values found in FOI_hits_S: one column per distinct
    # value, with 0 where a value does not occur in a row.
    df_counts = pd.DataFrame(full_train['FOI_hits_S'].swifter.apply(Counter).tolist(), index=full_train.index)
    df_counts.columns = ["FOI_hits_N[{}]".format(label) for label in df_counts.columns]
    df_counts = df_counts.fillna(0).astype(int)

    # Row-wise features from the project helper; result_type="expand" spreads
    # the returned sequence over separate columns.
    closest_hits_features = full_train.swifter.apply(
        utils.find_closest_hit_per_station, result_type="expand", axis=1)

    # Column-wise concat keeps both sets of column labels as they are.
    train_concat = pd.concat([closest_hits_features, df_counts], axis=1)

    # `key` is passed by keyword: positional use is deprecated in recent pandas.
    train_concat.to_hdf('closest_hits_features.train.filled.m9999.part%i.v1.hdf' % i, key='key')

Pandas Apply: 100%|██████████| 2722853/2722853 [00:26<00:00, 104699.87it/s]
Pandas Apply:  87%|████████▋ | 2380525/2722853 [1:11:40<10:22, 550.01it/s]

In [2]:
# Read the two precomputed training feature files and stack them row-wise
# under a fresh 0..n-1 index, then preview the first rows.
train_concat = pd.concat(
    [
        pd.read_hdf(feature_file)
        for feature_file in (
            'closest_hits_features.train.filled.m9999.part1.v1.hdf',
            'closest_hits_features.train.filled.m9999.part2.v1.hdf',
        )
    ],
    axis=0,
    ignore_index=True,
    copy=False,
)
train_concat.head()

Unnamed: 0,FOI_hits_N,0,1,2,3,4,5,6,7,8,...,18,19,20,21,22,23,FOI_hits_N[0],FOI_hits_N[1],FOI_hits_N[2],FOI_hits_N[3]
0,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,118.0,126.0,126.278549,136.278488,146.278412,156.278351,2,1,2,1
1,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,118.0,756.0,126.278549,136.278488,146.278412,156.278351,1,1,1,1
2,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,...,59.0,63.0,63.078957,68.078926,73.078896,78.078857,2,5,1,1
3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,...,59.0,63.0,63.038589,68.078926,73.078896,78.078857,2,1,1,1
4,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,...,59.0,63.0,63.078957,68.078926,73.078896,78.078857,3,1,1,1


In [5]:
from collections import Counter

# Build the same feature set for the public test file.
# NOTE(review): the variable names are reused from the training cells, so
# running this cell overwrites any `full_train` / `train_concat` already held.
full_train = pd.read_hdf(os.path.join(DATA_PATH, "test_public.hdf"))
full_train = fillna(full_train)

# Per-row tally of the values found in FOI_hits_S: one column per distinct
# value, with 0 where a value does not occur in a row.
df_counts = pd.DataFrame(full_train['FOI_hits_S'].swifter.apply(Counter).tolist(), index=full_train.index)
df_counts.columns = ["FOI_hits_N[{}]".format(label) for label in df_counts.columns]
df_counts = df_counts.fillna(0).astype(int)

# Row-wise features from the project helper; result_type="expand" spreads
# the returned sequence over separate columns.
closest_hits_features = full_train.swifter.apply(
    utils.find_closest_hit_per_station, result_type="expand", axis=1)

# Column-wise concat keeps both sets of column labels as they are.
train_concat = pd.concat([closest_hits_features, df_counts], axis=1)

# `key` is passed by keyword: positional use is deprecated in recent pandas.
train_concat.to_hdf('closest_hits_features.test.filled.m9999.v1.hdf', key='key')

Pandas Apply: 100%|██████████| 726095/726095 [00:06<00:00, 108465.77it/s]
Pandas Apply: 100%|██████████| 726095/726095 [21:42<00:00, 557.34it/s]


In [5]:
from collections import Counter

# Build the same feature set for the private test file.
# NOTE(review): the variable names are reused from the training cells, so
# running this cell overwrites any `full_train` / `train_concat` already held.
full_train = pd.read_hdf(os.path.join(DATA_PATH, "test_private_v2_track_1.hdf"))
full_train = fillna(full_train)

# Per-row tally of the values found in FOI_hits_S: one column per distinct
# value, with 0 where a value does not occur in a row.
df_counts = pd.DataFrame(full_train['FOI_hits_S'].swifter.apply(Counter).tolist(), index=full_train.index)
df_counts.columns = ["FOI_hits_N[{}]".format(label) for label in df_counts.columns]
df_counts = df_counts.fillna(0).astype(int)

# Row-wise features from the project helper; result_type="expand" spreads
# the returned sequence over separate columns.
closest_hits_features = full_train.swifter.apply(
    utils.find_closest_hit_per_station, result_type="expand", axis=1)

# Column-wise concat keeps both sets of column labels as they are.
train_concat = pd.concat([closest_hits_features, df_counts], axis=1)

# `key` is passed by keyword: positional use is deprecated in recent pandas.
train_concat.to_hdf('closest_hits_features.test.filled.m9999.v1.private.hdf', key='key')

Pandas Apply: 100%|██████████| 1452188/1452188 [00:12<00:00, 116609.30it/s]
Pandas Apply: 100%|██████████| 1452188/1452188 [41:01<00:00, 590.00it/s]
