In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import roc_auc_score

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import FeatureUnion

from sklearn.model_selection import train_test_split, RandomizedSearchCV

import dill

In [2]:
# Load the raw dataset into `df`; the path is relative to the notebook's
# working directory.
df = pd.read_csv('train_airplane.csv')

In [3]:

class FeatureSelector(BaseEstimator, TransformerMixin):
    '''
    Transform class for picking columns out of the input

    column - column label (or list of labels) used to index X

    Return: X[column]
    '''

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        # Nothing is learned from the data; the selection depends only on `column`.
        return self

    def transform(self, X, y=None):
        selected = X[self.column]
        return selected


class FeatureSum(BaseEstimator, TransformerMixin):
    '''
    Transform class for sum of several columns

    column - list of columns to sum
    name - label given to the produced column (default 'sum')

    Return: DF with one column - sum of given columns

    The output label is fixed per instance, so every call to transform
    (e.g. while fitting and later while predicting) yields a column with the
    same name.
    '''
    # Retained only so existing references to FeatureSum.counter keep working;
    # transform no longer increments it, because a call-count-dependent label
    # made the output differ between otherwise identical calls.
    counter = 0

    def __init__(self, column, name='sum'):
        self.column = column
        self.name = name

    def fit(self, X, y=None):
        # Stateless transformer: there is nothing to learn from the data.
        return self

    def transform(self, X, y=None):
        # Row-wise total over the selected columns; missing values are skipped.
        row_totals = X[self.column].sum(axis=1, skipna=True)
        # Keep the result two-dimensional, with the same index as X.
        return row_totals.to_frame(name=self.name)

In [4]:
# Columns that are imputed with the most frequent value and then ordinal-encoded.
categorical_columns = [
    'Type of Travel',
    'Gender',
    'Class',
    'Customer Type',
]

# Columns that are imputed with the most frequent value and passed through
# as-is; their row-wise sum is also added as an extra feature.
categorical_columns2 = [
    'Inflight wifi service',
    'Departure/Arrival time convenient',
    'Ease of Online booking',
    'Gate location',
    'Food and drink',
    'Online boarding',
    'Seat comfort',
    'Inflight entertainment',
    'On-board service',
    'Leg room service',
    'Baggage handling',
    'Checkin service',
    'Inflight service',
    'Cleanliness',
]

# Columns that are imputed with the median.
numerical_columns = [
    'Age',
    'Departure Delay in Minutes',
    'Arrival Delay in Minutes',
    'Flight Distance',
]

# Full ordered list of input columns used by the model.
features = [*categorical_columns, *numerical_columns, *categorical_columns2]

In [5]:
# Persist the ordered feature names, one per line.
with open('features.txt', 'w') as out:
    out.write("".join(f"{feature}\n" for feature in features))

In [6]:
# Encode the target: 'satisfied' -> 1, 'neutral or dissatisfied' -> 0.
# The result is assigned back to the column rather than calling an inplace
# method on `df.satisfaction`: that form is chained assignment, which pandas
# warns about and which leaves `df` unmodified when Copy-on-Write is active.
# `replace` leaves already-encoded values untouched, so re-running the cell is safe.
df['satisfaction'] = df['satisfaction'].replace({'satisfied': 1, 'neutral or dissatisfied':  0})

In [7]:
# Hold out a test partition. The last column of `df` is used as the target and
# every other column as input (presumably the last column is `satisfaction` —
# TODO confirm against the CSV layout).
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :-1],
    df.iloc[:, -1],
    random_state=123,
)

# Save both partitions without the index: test files first, then train files.
for frame, path in (
    (X_test, "X_test.csv"),
    (y_test, "y_test.csv"),
    (X_train, "X_train.csv"),
    (y_train, "y_train.csv"),
):
    frame.to_csv(path, index=None)

In [8]:
# Branch 1: first categorical group -> fill gaps with the mode -> ordinal codes.
pipe1 = Pipeline(steps=[
    ('column_selector', FeatureSelector(column=categorical_columns)),
    ('cat_nan_imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder()),
])

# Branch 2: second categorical group -> fill gaps with the mode (no encoding step).
pipe2 = Pipeline(steps=[
    ('column_selector', FeatureSelector(column=categorical_columns2)),
    ('cat_nan_imputer', SimpleImputer(strategy='most_frequent')),
])

# Branch 3: numerical columns -> fill gaps with the median.
pipe3 = Pipeline(steps=[
    ('column_selector', FeatureSelector(column=numerical_columns)),
    ('num_nan_imputer', SimpleImputer(strategy='median')),
])

# Branch 4: a single extra feature, the row-wise sum of the second categorical group.
pipe4 = Pipeline(steps=[
    ('sum_of_cat', FeatureSum(column=categorical_columns2)),
])

# Concatenate the outputs of all four branches side by side.
feats = FeatureUnion(transformer_list=[
    ('categorical', pipe1),
    ('categorical2', pipe2),
    ('numbers', pipe3),
    ('cat2_sum', pipe4),
])

In [9]:
# Final model: feature union followed by a gradient-boosting classifier.
# NOTE(review): the hyper-parameter values look like the result of a search
# (RandomizedSearchCV is imported above) — confirm their origin.
# random_state is fixed so that refitting the notebook yields the same model;
# it uses the same seed as the train/test split.
gboost = Pipeline([
    ('features', feats),
    ('classifier', GradientBoostingClassifier(
        n_estimators=466,
        learning_rate=0.15214285714285716,
        max_depth=7,
        random_state=123,
    )),
])

In [10]:
# Fit the whole pipeline (feature union + classifier) on the training split.
# As the last expression of the cell, the returned Pipeline is also displayed.
gboost.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('categorical',
                                                 Pipeline(steps=[('column_selector',
                                                                  FeatureSelector(column=['Type '
                                                                                          'of '
                                                                                          'Travel',
                                                                                          'Gender',
                                                                                          'Class',
                                                                                          'Customer '
                                                                                          'Type'])),
                                                                 ('cat_nan_imputer',
                                         

In [11]:
# Serialize the fitted pipeline to disk with dill (presumably chosen because
# the custom transformers are defined inside this notebook — confirm).
with open("gb_pipeline.dill", "wb") as model_file:
    dill.dump(gboost, model_file)