In [26]:
import numpy as np
import pandas as pd
from sklearn import pipeline, preprocessing
from sklearn.pipeline import Pipeline, make_pipeline

from df_transformers import *

In [27]:
# Load the cleaned train/test CSVs, using the `id` column as the row index.
train_data = pd.read_csv("data/cleaned_train_data.csv", index_col="id")
test_data = pd.read_csv("data/cleaned_test_data.csv", index_col="id")
# Number of training rows; needed to split the combined frame apart again.
train_size = len(train_data)
print(f"Shape of train data: {train_data.shape}. Shape of test data: {test_data.shape}")

Shape of train data: (600000, 24). Shape of test data: (400000, 23)


In [28]:
# Keep the target aside, then stack train (without the target column) on top
# of test so that both parts go through the same preprocessing.
y = train_data['target']
data = pd.concat([train_data.drop(columns=['target']), test_data])

### Разделяем признаки по типу

In [29]:
# Binary features: bin_0 .. bin_4
bin_features = [f'bin_{i}' for i in range(5)]
# Nominal (categorical) features: nom_0 .. nom_4
nom_features = [f'nom_{i}' for i in range(5)]
# Hash-like nominal features: nom_5 .. nom_9
hex_features = [f'nom_{i}' for i in range(5, 10)]
# Ordinal features: ord_0 .. ord_5
ord_features = [f'ord_{i}' for i in range(6)]
# Cyclical features
cyc_features = ['day', 'month']

Бинарные признаки

In [30]:
# Preview the first rows of the binary feature columns.
data[bin_features].head()

Unnamed: 0_level_0,bin_0,bin_1,bin_2,bin_3,bin_4
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,0.0,1.0
2,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0


In [31]:
# Summary of the combined train+test frame (column dtypes, non-null counts, memory).
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 0 to 999999
Data columns (total 23 columns):
bin_0    1000000 non-null float64
bin_1    1000000 non-null float64
bin_2    1000000 non-null float64
bin_3    1000000 non-null float64
bin_4    1000000 non-null float64
nom_0    1000000 non-null object
nom_1    1000000 non-null object
nom_2    1000000 non-null object
nom_3    1000000 non-null object
nom_4    1000000 non-null object
nom_5    1000000 non-null object
nom_6    1000000 non-null object
nom_7    1000000 non-null object
nom_8    1000000 non-null object
nom_9    1000000 non-null object
ord_0    1000000 non-null float64
ord_1    1000000 non-null object
ord_2    1000000 non-null object
ord_3    1000000 non-null object
ord_4    1000000 non-null object
ord_5    1000000 non-null object
day      1000000 non-null float64
month    1000000 non-null float64
dtypes: float64(8), object(15)
memory usage: 183.1+ MB


In [8]:
# Binary-feature pipeline, applied in this order:
#   1. keep only the `bin_features` columns,
#   2. call `.apply(str)` on whatever DataFrameFunctionTransformer hands to the callable,
#   3. ToDummiesTransformer (presumably dummy/one-hot encoding -- confirm in df_transformers).
bin_steps = (
    SelectColumnsTransfomer(bin_features),
    DataFrameFunctionTransformer(lambda frame: frame.apply(str)),
    ToDummiesTransformer(),
)
bin_pipeline = make_pipeline(*bin_steps)

In [9]:
# Fit the binary-feature pipeline on the combined frame and transform it in one step.
bin_df = bin_pipeline.fit_transform(data)

In [10]:
# Preview the first rows produced by the binary-feature pipeline.
bin_df.head()

Unnamed: 0_level_0,bin_0_0.0,bin_0_1.0,bin_0_NAN,bin_1_0.0,bin_1_1.0,bin_1_NAN,bin_2_0.0,bin_2_1.0,bin_2_NAN,bin_3_F,bin_3_NAN,bin_3_T,bin_4_N,bin_4_NAN,bin_4_Y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0
1,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1
2,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0
3,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0
4,1,0,0,0,0,1,1,0,0,0,0,1,1,0,0


In [14]:
# NOTE(review): this import would normally live in the top import cell.
from sklearn.preprocessing import LabelEncoder

# Label-encoded variant of the binary features.
# The frame is created on `data.index` so every column -- ndarray or Series --
# lands on the same rows as `data`. Starting from an empty DataFrame, the index
# would instead be decided by the first assigned column: an ndarray from
# `fit_transform` yields a fresh RangeIndex (dropping `id`), and later Series
# columns would then be aligned by label against it.
bin_l = pd.DataFrame(index=data.index)
label = LabelEncoder()

for col in bin_features:
    if data[col].dtype == 'object':
        # String-like column: map its distinct values to integer codes.
        # The single encoder is refit on every such column, so after the loop
        # it only holds the classes of the last encoded column.
        bin_l[col] = label.fit_transform(data[col])
    else:
        # Non-object (already numeric) column: carried over unchanged.
        bin_l[col] = data[col]

In [15]:
# Preview the first rows of the label-encoded binary features.
bin_l.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4
0,0,0,0,0,0
1,1,1,0,0,2
2,0,1,0,0,0
3,2,0,0,0,0
4,0,2,0,2,0


### Собираем пайплайн

In [5]:
# Numeric-feature pipeline (disabled; `num_features` is not defined in this notebook)
# num_pipeline = make_pipeline(
#     SelectColumnsTransfomer(num_features),
#     #preprocessing.StandardScaler(with_mean = 0)
# )


def _make_dummies_pipeline(columns):
    """Build the shared select -> str -> dummies pipeline for one feature group.

    Parameters
    ----------
    columns : list of str
        Column names passed to ``SelectColumnsTransfomer``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        ``make_pipeline`` of ``SelectColumnsTransfomer(columns)``, a
        ``DataFrameFunctionTransformer`` calling ``.apply(str)`` on its input,
        and ``ToDummiesTransformer()``, in that order.
    """
    return make_pipeline(
        SelectColumnsTransfomer(columns),
        DataFrameFunctionTransformer(lambda x: x.apply(str)),
        ToDummiesTransformer(),
    )


# Binary features
bin_pipeline = _make_dummies_pipeline(bin_features)

# Nominal (categorical) features
# TODO: find out what the pandas `category` dtype is and what `object_levels` refers to
#DataFrameFunctionTransformer(lambda x:x.astype('category', categories=object_levels)),
nom_pipeline = _make_dummies_pipeline(nom_features)

# Ordinal features
ord_pipeline = _make_dummies_pipeline(ord_features)

# Combine the per-group outputs into a single feature set.
# NOTE(review): `hex_features` (nom_5..nom_9) and `cyc_features` (day, month)
# are not part of this union -- confirm that leaving them out is intended.
preprocessing_features = DataFrameFeatureUnion([bin_pipeline, nom_pipeline, ord_pipeline])

In [1]:
# Run the combined preprocessing over the whole train+test frame.
# The cells that follow slice `prprd_data`, so it has to be defined here;
# with this line commented out they fail with NameError on a fresh kernel.
# NOTE(review): this processes every row of `data` and may be slow or memory-hungry.
prprd_data = preprocessing_features.fit_transform(data)

In [None]:
# Split the combined result back by position: the first `train_size` rows are
# taken as train, the remainder as test (assumes the preprocessing kept the
# row order of `data` -- confirm).
preprocessed_train_data = prprd_data.iloc[:train_size]
preprocessed_test_data = prprd_data.iloc[train_size:]

In [None]:
# Attach the target column to the training part.
# `assign` returns a new frame instead of writing into the `iloc` slice of
# `prprd_data`, which avoids pandas' SettingWithCopyWarning and keeps the cell
# safe to re-run. Values are aligned on the index, exactly as with `df[...] = y`.
preprocessed_train_data = preprocessed_train_data.assign(target=y)

### Обрабатываем и сохраняем данные

In [None]:
# Write both preprocessed parts to CSV, header row included.
for _frame, _csv_path in (
    (preprocessed_train_data, 'data/preprocessed_train_data.csv'),
    (preprocessed_test_data, 'data/preprocessed_test_data.csv'),
):
    _frame.to_csv(_csv_path, header=True)

In [18]:
# Selected notebooks from the previous competition
# https://www.kaggle.com/shahules/an-overview-of-encoding-techniques
# https://www.kaggle.com/kabure/eda-feat-engineering-encode-conquer
# https://www.kaggle.com/adaubas/2nd-place-solution-categorical-fe-callenge
# https://www.kaggle.com/abhishek/entity-embeddings-to-handle-categories
# https://www.kaggle.com/peterhurford/why-not-logistic-regression

# To review for the current competition
# https://www.kaggle.com/drcapa/categorical-feature-engineering-2-xgb
# https://www.kaggle.com/vikassingh1996/don-t-underestimate-the-power-of-a-logistic-reg

# https://habr.com/ru/company/ods/blog/326418/#label-encoding