In [1]:
import numpy as np
import pandas as pd
from sklearn import pipeline, preprocessing
from sklearn.pipeline import Pipeline, make_pipeline

from df_transformers import *

In [2]:
# Load the cleaned train and test tables, indexed by PassengerId.
train_data, test_data = (
    pd.read_csv(csv_path, index_col="PassengerId")
    for csv_path in ("data/cleaned_train_data.csv", "data/cleaned_test_data.csv")
)

# Number of training rows; kept so the combined frame can be split back later.
train_size = len(train_data)
print(f"Shape of train data: {train_data.shape}. Shape of test data: {test_data.shape}")

Shape of train data: (891, 9). Shape of test data: (418, 8)


In [3]:
# Keep the target aside and stack the train features (target removed)
# on top of the test rows so both sets are processed together.
y = train_data['Survived']
train_features = train_data.drop('Survived', axis=1)
data = pd.concat([train_features, test_data])

### Разделяем признаки по типу

In [4]:
# Binary features: none declared yet

# Numeric features
num_features = ['Age', 'SibSp', 'Parch', 'Fare']

# Categorical features: every train column that is neither the target nor numeric
non_target_columns = train_data.columns.drop('Survived')
cat_features = list(non_target_columns.drop(num_features))

# Text features: none declared yet

### Собираем пайплайн

In [5]:
# Numeric branch: currently only selects the numeric columns (scaling is disabled).
num_steps = [
    SelectColumnsTransfomer(num_features),
    # preprocessing.StandardScaler(with_mean = 0)
]
num_pipeline = make_pipeline(*num_steps)

# Categorical branch: select the categorical columns, cast values to str,
# then pass them to ToDummiesTransformer (presumably one-hot encoding — confirm in df_transformers).
# NOTE(review): whether the lambda receives a whole frame or a single column
# depends on DataFrameFunctionTransformer — confirm in df_transformers.
cat_steps = [
    SelectColumnsTransfomer(cat_features),
    DataFrameFunctionTransformer(lambda values: values.apply(str)),
    # TODO: figure out what the pandas `category` dtype is and what `object_levels` refers to
    # DataFrameFunctionTransformer(lambda x:x.astype('category', categories=object_levels)),
    ToDummiesTransformer(),
]
cat_pipeline = make_pipeline(*cat_steps)

# Combine the outputs of both branches
preprocessing_features = DataFrameFeatureUnion([num_pipeline, cat_pipeline])

In [6]:
# Fit the preprocessing union on the combined train+test feature frame and
# transform it in the same call.
# NOTE(review): presumably done on the combined frame so train and test end up
# with the same set of output columns — confirm.
prprd_data = preprocessing_features.fit_transform(data)

In [7]:
# Split the processed frame back by position: the first `train_size` rows go to
# train, the remainder to test.
# NOTE(review): assumes the transformers preserve the original row order — confirm.
preprocessed_train_data, preprocessed_test_data = (
    prprd_data.iloc[:train_size],
    prprd_data.iloc[train_size:],
)

In [8]:
# Re-attach the target column to the processed training rows.
# `assign` returns a new frame instead of writing into the `.iloc` slice of
# `prprd_data`; the in-place write is what raised SettingWithCopyWarning.
# Values of `y` are aligned on the index, exactly as with plain column assignment.
preprocessed_train_data = preprocessed_train_data.assign(Survived=y)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


### Обрабатываем и сохраняем данные

In [9]:
# Write both processed tables to CSV, keeping the header row.
for frame, csv_path in (
    (preprocessed_train_data, 'data/preprocessed_train_data.csv'),
    (preprocessed_test_data, 'data/preprocessed_test_data.csv'),
):
    frame.to_csv(csv_path, header=True)