# Построение Pipeline
На основе ранее созданной предобработки.

## Импорт библиотек и загрузка данных

In [1]:
#Импорты

import os
import re

from datetime import datetime

import pandas as  pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn import set_config
set_config(transform_output="pandas")

In [2]:
# Contents of the data directory:

data_path = '../../Data/Kaggle/titanic/'

data_src_path = './src_data/'

# Print every entry of the input directory together with its position.
files = os.listdir(data_path)
for i, file_name in enumerate(files):
    print(f'file {i} - {file_name}')

file 0 - test.csv
file 1 - train.csv
file 2 - gender_submission.csv


In [3]:
# Load the data

train = pd.read_csv(f'{data_path}train.csv')
test = pd.read_csv(f'{data_path}test.csv')

# Separate the target column from the training features.
y = train['Survived']
X = train.drop('Survived', axis=1)

print(train.shape, test.shape)
train.head()

(891, 12) (418, 11)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Создание функций для препроцессинга и feature engineering

In [4]:
# Transformer that handles outliers in numeric columns
# by clipping them to interquartile-range (IQR) fences.

class ClipOutliersIQR(BaseEstimator, TransformerMixin):
    """Clip columns to the IQR fences learned during ``fit``.

    For each selected column the fences are ``q1 - k * IQR`` and
    ``q3 + k * IQR``, where ``q1``/``q3`` are the 25th/75th percentiles of
    the data passed to ``fit`` and ``IQR = q3 - q1``.

    Parameters
    ----------
    columns : list of column labels or None, default=None
        Columns to clip. ``None`` selects every numeric column of the frame
        passed to ``fit``.
    k : float, default=1.5
        Multiplier applied to the IQR when computing the fences.

    Attributes
    ----------
    bounds_ : dict
        ``{column: (lower, upper)}`` learned by ``fit``.
    """

    def __init__(self, columns=None, k=1.5):
        # Only store hyper-parameters here; learned state is created in fit().
        self.columns = columns
        self.k = k

    def fit(self, X, y=None):
        """Learn the clipping fences for every selected column.

        ``y`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        X = pd.DataFrame(X)
        if self.columns is None:
            columns = list(X.select_dtypes(include="number").columns)
        else:
            columns = self.columns

        # Build a fresh dict so bounds from a previous fit never leak
        # into a refit with different columns.
        bounds = {}
        for col in columns:
            q1 = X[col].quantile(0.25)
            q3 = X[col].quantile(0.75)
            iqr = q3 - q1
            bounds[col] = (q1 - self.k * iqr, q3 + self.k * iqr)
        self.bounds_ = bounds

        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns clipped to their fences."""
        X = pd.DataFrame(X).copy()
        for col, (lower, upper) in self.bounds_.items():
            X[col] = X[col].clip(lower, upper)

        return X

In [5]:
# Transformer that maps rare category values to a single "other" label.

class RareCategoriesGrouper(BaseEstimator, TransformerMixin):
    """Replace infrequent categories with one common label.

    Parameters
    ----------
    columns : list of column labels
        Columns whose rare values are grouped.
    min_freq : float, default=0.01
        Relative frequency (fraction of non-missing rows) below which a
        category is considered rare.
    label_rare_categoty : str, default="other"
        Replacement label for rare categories. The misspelled parameter
        name is kept so existing callers continue to work.

    Attributes
    ----------
    rare_categories_ : dict
        ``{column: set(rare values)}`` learned by ``fit``.
    """

    def __init__(self, columns, min_freq=0.01, label_rare_categoty="other"):
        # Only store hyper-parameters here; learned state is created in fit().
        self.columns = columns
        self.min_freq = min_freq
        self.label_rare_categoty = label_rare_categoty

    def fit(self, X, y=None):
        """Collect, per column, the values rarer than ``min_freq``.

        ``y`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        X = pd.DataFrame(X)

        # Fresh dict on every fit so a refit never keeps stale columns.
        rare_categories = {}
        for col in self.columns:
            # value_counts skips missing values, so NaN is never marked rare.
            freq = X[col].value_counts(normalize=True)
            rare_categories[col] = set(freq[freq < self.min_freq].index)
        self.rare_categories_ = rare_categories

        return self

    def transform(self, X):
        """Return a copy of ``X`` with rare values replaced and columns cast to ``str``."""
        # Work on a copy so the caller's frame is not modified.
        X = pd.DataFrame(X).copy()
        for col, rare_values in self.rare_categories_.items():
            X[col] = X[col].where(~X[col].isin(rare_values), self.label_rare_categoty)
            # NOTE(review): str() turns missing values into the literal "nan",
            # so a downstream imputer no longer sees them as missing —
            # confirm this is intended.
            X[col] = X[col].apply(str)
        return X

In [6]:
# Feature engineering on the non-numeric columns:

class TitanicFeatures(BaseEstimator, TransformerMixin):
    """Derive engineered features from the raw Titanic columns.

    Added columns:
    - Family_size = Parch + SibSp + 1 (only when both columns are present)
    - Is_alone = 1 if Family_size == 1 else 0
    - Title: 'mr', 'miss', 'mrs' or 'other', extracted from Name
    - Ticket_len: length of the stripped Ticket string
    - Ticket_prefix: up to two leading characters of the normalized Ticket
    - Cabin_prefix: first character of Cabin, or 'empty' when missing

    Parameters
    ----------
    delite_duplicates : bool, default=True
        When true, drop the source columns Parch, SibSp, Name, Ticket and
        Cabin after the features have been derived.
    """

    def __init__(self, delite_duplicates=True):
        self.delite_duplicates = delite_duplicates

    # Name -> title
    @staticmethod
    def _extract_title(name: str) -> str:
        """Return 'mr', 'miss' or 'mrs' if that token occurs in ``name``, else 'other'."""
        if pd.isna(name):
            return 'other'
        tokens = str(name).lower().replace('.', '').replace(',', '').split()
        # The lookup order matters when several titles occur in one name.
        for title in ('mr', 'miss', 'mrs'):
            if title in tokens:
                return title
        return 'other'

    # Ticket helpers
    @staticmethod
    def _len_ticket(ticket: str) -> int:
        """Return the length of the stripped ticket string (0 when missing)."""
        if pd.isna(ticket):
            return 0
        return len(str(ticket).strip())

    @staticmethod
    def _remove_nbrs(string: str) -> str:
        """Replace every run of digits in ``string`` with '__'."""
        return re.sub(r'\d+', '__', string)

    @staticmethod
    def _ticket_prefix(ticket: str) -> str:
        """Return up to two leading characters of the normalized ticket.

        Normalization: digit runs become '__', text is lower-cased, dots are
        removed, only the first whitespace-separated token is kept and
        slashes are removed from it. Missing or blank tickets give 'empty'
        (the same marker used for a missing cabin).
        """
        if pd.isna(ticket):
            return 'empty'
        parts = TitanicFeatures._remove_nbrs(str(ticket)).lower().replace('.', '').split()
        if not parts:
            return 'empty'
        return parts[0].replace('/', '')[:2]

    # Cabin helper
    @staticmethod
    def _cabin_letter(cabin: str) -> str:
        """Return the first character of the cabin, or 'empty' when missing/blank."""
        c = str(cabin)
        if c == "nan" or c.strip() == "":
            return "empty"
        return c.strip()[0]

    def fit(self, X, y=None):
        """Nothing is learned; present for pipeline compatibility. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns added.

        The Name, Ticket and Cabin columns must be present in ``X``.
        """
        X = pd.DataFrame(X.copy())

        # Family features
        if {'Parch', 'SibSp'}.issubset(X.columns):
            X['Family_size'] = X['Parch'].fillna(0) + X['SibSp'].fillna(0) + 1
            X['Is_alone'] = (X['Family_size'] == 1).astype('int64')

        # From the name
        X['Title'] = X['Name'].apply(self._extract_title)

        # From the ticket
        X['Ticket_len'] = X['Ticket'].apply(self._len_ticket)

        X['Ticket_prefix'] = X['Ticket'].apply(self._ticket_prefix)

        # From the cabin
        X['Cabin_prefix'] = X['Cabin'].apply(self._cabin_letter)

        # Drop the source columns if requested
        if self.delite_duplicates:
            drop_cols = [c for c in ['Parch', 'SibSp', 'Name', 'Ticket', 'Cabin'] if c in X.columns]
            X = X.drop(columns=drop_cols)

        return X

## Создание Pipeline

In [7]:
# Run the feature-engineering transformer on its own on the training features;
# the resulting frame is kept in X_new.
a = TitanicFeatures()
X_new = a.fit_transform(X)

In [8]:
# Columns routed to the numeric pipelines.
num_cols = [
    "Age",
    "Fare",
    "Family_size",
    "Ticket_len",
]

# Columns routed to the categorical pipeline.
cat_cols = [
    "Pclass",
    "Sex",
    "Embarked",
    "Title",
    "Ticket_prefix",
    "Cabin_prefix",
    "Is_alone",
]

In [9]:
# Numeric columns without outlier handling: median imputation, then scaling.
numeric_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler()),
])

# Age and Fare: median imputation, IQR clipping, then scaling.
numeric_pipe_with_outliers = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('outl_fit', ClipOutliersIQR(columns=["Age", "Fare"])),
    ('scaler', StandardScaler()),
])

# Categorical columns: most-frequent imputation, then one-hot encoding.
categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('ohe', OneHotEncoder(handle_unknown="ignore", drop="first", sparse_output=False)),
])

# Route each column group to its pipeline; columns not listed are dropped.
ct = ColumnTransformer(
    [
        ("num", numeric_pipe, [col for col in num_cols if col not in ("Age", "Fare")]),
        ("num_out", numeric_pipe_with_outliers, ["Age", "Fare"]),
        ("cat", categorical_pipe, cat_cols),
    ],
    remainder="drop",
)

# Full preprocessing chain: feature engineering -> rare-category grouping -> column transforms.
pipe = Pipeline([
    ("feats", TitanicFeatures()),
    ("rare", RareCategoriesGrouper(columns=cat_cols)),
    ("ct", ct),
])

## Получение обработанных обучающих и тестовых данных и их сохранение

In [10]:
# Fit the pipeline on the training features and transform them in one pass.
X_tr = pipe.fit_transform(X, y)
# The test set is only transformed, using the state fitted on the training data.
X_test = pipe.transform(test)

# Print both shapes so the column counts can be compared.
print('X_tr.shape = ', X_tr.shape)
print('X_test.shape = ', X_test.shape)

X_tr.shape =  (891, 29)
X_test.shape =  (418, 29)




In [11]:
# Timestamp (minute resolution) used as a file-name prefix for this run's outputs.
cur_date_time = datetime.now().strftime("%Y_%m_%d_%H_%M")

# Create the output directory first: to_pickle does not create missing folders.
os.makedirs(data_src_path, exist_ok=True)

# Save the processed training features, the training target and the processed test features.
X_tr.to_pickle(f'{data_src_path}{cur_date_time}_Xtr')
y.to_pickle(f'{data_src_path}{cur_date_time}_y_tr')
X_test.to_pickle(f'{data_src_path}{cur_date_time}_X_test')