In [171]:
import numpy as np
import pandas as pd

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import roc_auc_score

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import FeatureUnion

from sklearn.model_selection import train_test_split

import dill

In [172]:
# Load the raw training data; the relative path is resolved against the
# kernel's current working directory
df = pd.read_csv('train_airplane.csv')

In [173]:
# Build our simple pipeline; first we need a class that selects the required column
class FeatureSelector(BaseEstimator, TransformerMixin):
    """
    Transformer that extracts ``X[column]`` from its input.

    With a data frame and a single column label the result is a
    one-dimensional Series. In this notebook it is the first step of every
    categorical branch.
    """

    def __init__(self, column):
        # Label of the column to extract at transform time
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return the configured column of ``X``; ``y`` is accepted but unused."""
        selected = X[self.column]
        return selected

    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer that extracts one column while keeping the result two-dimensional.

    Indexing with a one-element list (``X[[key]]``) yields a single-column
    data frame rather than a Series. In this notebook it is the first step of
    every numeric branch.
    """

    def __init__(self, key):
        # Label of the column to extract at transform time
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to the single configured column."""
        wanted = [self.key]
        return X[wanted]

    
class OHEEncoder(BaseEstimator, TransformerMixin):
    """
    One-hot encoder whose output column layout is fixed at fit time.

    ``fit`` records the dummy-column names that ``pd.get_dummies`` produces
    for the training data. ``transform`` then always returns exactly those
    columns, in that order: categories absent from the new data become
    all-zero columns, and categories unseen during ``fit`` are dropped.

    Parameters
    ----------
    key : str
        Prefix passed to ``pd.get_dummies`` for the generated column names.
    """

    def __init__(self, key):
        self.key = key
        # Dummy-column names learned in fit(); empty until fit() is called
        self.columns = []

    def fit(self, X, y=None):
        """Remember the dummy columns produced for the training data."""
        self.columns = list(pd.get_dummies(X, prefix=self.key).columns)
        return self

    def transform(self, X):
        """Encode ``X`` and align the result to the columns seen in ``fit``."""
        dummies = pd.get_dummies(X, prefix=self.key)
        # A single reindex adds the missing columns (filled with 0), drops the
        # unseen ones and restores the fitted order, instead of inserting
        # columns one by one into the frame.
        return dummies.reindex(columns=self.columns, fill_value=0)

In [174]:
# Columns routed through the one-hot-encoding branch of the pipeline
categorical_columns = [
    'Type of Travel',
    'Gender',
    'Class',
    'Customer Type',
]

# Columns routed through the numeric (select + impute) branch of the pipeline
numerical_columns = [
    'Age',
    'Flight Distance',
    'Inflight wifi service',
    'Departure/Arrival time convenient',
    'Ease of Online booking',
    'Gate location',
    'Food and drink',
    'Online boarding',
    'Seat comfort',
    'Inflight entertainment',
    'On-board service',
    'Leg room service',
    'Baggage handling',
    'Checkin service',
    'Inflight service',
    'Cleanliness',
    'Departure Delay in Minutes',
    'Arrival Delay in Minutes',
]

# Every model input, categorical columns first and numeric columns after
features = [*categorical_columns, *numerical_columns]

In [175]:
# Persist the ordered feature names, one name per line
with open('features.txt', 'w') as names_file:
    names_file.write("".join(f"{name}\n" for name in features))

In [176]:
# Encode the target as 1 (satisfied) / 0 (neutral or dissatisfied).
# The result is assigned back to the frame instead of calling
# replace(inplace=True) on `df.satisfaction`: an in-place method on a Series
# obtained from the frame is chained assignment, which pandas warns about and
# which no longer updates `df` under Copy-on-Write.
# infer_objects() keeps the column integer-typed on pandas versions where
# replace() does not downcast an object column automatically.
df['satisfaction'] = (
    df['satisfaction']
    .replace({'satisfied': 1, 'neutral or dissatisfied':  0})
    .infer_objects()
)

In [177]:
# Hold out a test set with a fixed seed (default split proportion).
# NOTE(review): positional slicing assumes the target is the last column of
# the CSV and that every other column belongs in X — confirm against the data.
inputs = df.iloc[:, :-1]
target = df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(inputs, target, random_state=123)

# Save the test parts first, then the train parts, all without the index column
for csv_name, part in (
    ("X_test.csv", X_test),
    ("y_test.csv", y_test),
    ("X_train.csv", X_train),
    ("y_train.csv", y_train),
):
    part.to_csv(csv_name, index=False)

In [178]:
# (name, pipeline) pairs consumed by FeatureUnion: one branch per input column
final_transformers = []

# Categorical branches: pick the column, one-hot encode it, then impute.
# NOTE(review): get_dummies output should not contain NaN, so the imputer in
# this branch looks like a no-op — confirm before removing it.
final_transformers.extend(
    (
        column,
        Pipeline(steps=[
            ('selector', FeatureSelector(column=column)),
            ('ohe', OHEEncoder(key=column)),
            ('nan_imputer', SimpleImputer(strategy='most_frequent')),
        ]),
    )
    for column in categorical_columns
)

# Numeric branches: pick the column and impute with SimpleImputer's default
# strategy. Scaling (StandardScaler / MinMaxScaler) is deliberately left out.
final_transformers.extend(
    (
        column,
        Pipeline(steps=[
            ('selector', NumberSelector(key=column)),
            ('nan_imputer', SimpleImputer()),
        ]),
    )
    for column in numerical_columns
)


In [179]:
# Display the assembled (name, pipeline) pairs to sanity-check the branches
final_transformers

[('Type of Travel',
  Pipeline(steps=[('selector', FeatureSelector(column='Type of Travel')),
                  ('ohe', OHEEncoder(key='Type of Travel')),
                  ('nan_imputer', SimpleImputer(strategy='most_frequent'))])),
 ('Gender',
  Pipeline(steps=[('selector', FeatureSelector(column='Gender')),
                  ('ohe', OHEEncoder(key='Gender')),
                  ('nan_imputer', SimpleImputer(strategy='most_frequent'))])),
 ('Class',
  Pipeline(steps=[('selector', FeatureSelector(column='Class')),
                  ('ohe', OHEEncoder(key='Class')),
                  ('nan_imputer', SimpleImputer(strategy='most_frequent'))])),
 ('Customer Type',
  Pipeline(steps=[('selector', FeatureSelector(column='Customer Type')),
                  ('ohe', OHEEncoder(key='Customer Type')),
                  ('nan_imputer', SimpleImputer(strategy='most_frequent'))])),
 ('Age',
  Pipeline(steps=[('selector', NumberSelector(key='Age')),
                  ('nan_imputer', SimpleImputer())

In [180]:
# Concatenate the outputs of all per-column branches into one feature matrix
feats = FeatureUnion(transformer_list=final_transformers)

# Wrap the union in a pipeline only to display its structure; the result is
# not stored
Pipeline(steps=[('feats', feats)])

Pipeline(steps=[('feats',
                 FeatureUnion(transformer_list=[('Type of Travel',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Type '
                                                                                         'of '
                                                                                         'Travel')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Type '
                                                                                 'of '
                                                                                 'Travel')),
                                                                 ('nan_imputer',
                                                                  SimpleImputer(strategy='most_frequent'))])),
  

In [181]:
# Full model: the feature union followed by a gradient-boosting classifier.
# random_state is pinned (same seed as the train/test split) so that refitting
# after a kernel restart reproduces the same model.
gboost = Pipeline([
    ('features', feats),
    ('classifier', GradientBoostingClassifier(
        n_estimators=150,
        learning_rate=0.5,
        max_depth=5,
        random_state=123,
    )),
])

In [182]:
# Fit the feature branches and then the classifier on the training split
gboost.fit(X_train, y_train)


Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Type of Travel',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Type '
                                                                                         'of '
                                                                                         'Travel')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Type '
                                                                                 'of '
                                                                                 'Travel')),
                                                                 ('nan_imputer',
                                                                  SimpleImputer(strategy='most_frequent'))])),

In [183]:
# Serialize the fitted pipeline to disk.
# NOTE(review): dill is presumably used instead of pickle because the custom
# transformer classes are defined in this notebook — confirm with the consumer
# of the file. Only load this file from trusted sources.
with open("gb_pipeline.dill", "wb") as model_file:
    dill.dump(gboost, model_file)