In [62]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn import ensemble
from sklearn import model_selection

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
%matplotlib inline

In [63]:
train_data_org = pd.read_csv('../data/train.csv')
test_data_org = pd.read_csv('../data/test.csv')
test_data_org['Survived'] = 0
combined_train_test = train_data_org.append(test_data_org)

In [64]:
#Embarked
if combined_train_test['Embarked'].isnull().sum() != 0:
        combined_train_test['Embarked'].fillna(combined_train_test['Embarked'].mode().iloc[0], inplace=True)

emb_dummies_df = pd.get_dummies(combined_train_test['Embarked'],prefix=combined_train_test[['Embarked']].columns[0])
combined_train_test = pd.concat([combined_train_test, emb_dummies_df], axis=1)

In [65]:
#Sex
sex_dummies_df = pd.get_dummies(combined_train_test['Sex'], prefix=combined_train_test[['Sex']].columns[0])
combined_train_test = pd.concat([combined_train_test, sex_dummies_df], axis=1)

In [66]:
#Title
combined_train_test['Title'] = combined_train_test['Name'].str.extract('.+,(.+)').str.extract('^(.+?)\.').str.strip()
title_Dict = {}
title_Dict.update(dict.fromkeys(['Capt', 'Col', 'Major', 'Dr', 'Rev'], 'Officer'))
title_Dict.update(dict.fromkeys(['Jonkheer', 'Don', 'Sir', 'the Countess', 'Dona', 'Lady'], 'Royalty'))
title_Dict.update(dict.fromkeys(['Mme', 'Ms', 'Mrs'], 'Mrs'))
title_Dict.update(dict.fromkeys(['Mlle', 'Miss'], 'Miss'))
title_Dict.update(dict.fromkeys(['Mr'], 'Mr'))
title_Dict.update(dict.fromkeys(['Master'], 'Master'))

combined_train_test['Title'] = combined_train_test['Title'].map(title_Dict)
title_dummies_df = pd.get_dummies(combined_train_test['Title'], prefix=combined_train_test[['Title']].columns[0])
combined_train_test = pd.concat([combined_train_test, title_dummies_df], axis=1)

  


In [67]:
# Fare
if combined_train_test['Fare'].isnull().sum() != 0:
    combined_train_test['Fare'] = combined_train_test[['Fare']].fillna(combined_train_test.groupby('Pclass').transform('mean'))

combined_train_test['Group_Ticket'] = combined_train_test['Fare'].groupby(by=combined_train_test['Ticket']).transform('count')
combined_train_test['Fare'] = combined_train_test['Fare'] / combined_train_test['Group_Ticket']
combined_train_test.drop(['Group_Ticket'], axis=1, inplace=True)

def fare_category(fare):
        if fare <= 4:
            return 0
        elif fare <= 10:
            return 1
        elif fare <= 30:
            return 2
        elif fare <= 45:
            return 3
        else:
            return 4
combined_train_test['Fare_Category'] = combined_train_test['Fare'].map(fare_category)

fare_cat_dummies_df = pd.get_dummies(combined_train_test['Fare_Category'],prefix=combined_train_test[['Fare_Category']].columns[0])
combined_train_test = pd.concat([combined_train_test, fare_cat_dummies_df], axis=1)

In [68]:
# Pclass

# 建立Fare Category


def fare_category(fare):
    if fare <= 4:
        return 'Very_Low_Fare'
    elif fare <= 10:
        return 'Low_Fare'
    elif fare <= 30:
        return 'Med_Fare'
    elif fare <= 45:
        return 'High_Fare'
    else:
        return 'Very_High_Fare'

    # 建立PClass Fare Category

def pclass_fare_category(df, Pclass_1_mean_fare, Pclass_2_mean_fare, Pclass_3_mean_fare):
    if df['Pclass'] == 1:
        if df['Fare'] <= Pclass_1_mean_fare:
            return 'Pclass_1_Low_Fare'
        else:
            return 'Pclass_1_High_Fare'
    elif df['Pclass'] == 2:
        if df['Fare'] <= Pclass_2_mean_fare:
            return 'Pclass_2_Low_Fare'
        else:
            return 'Pclass_2_High_Fare'
    elif df['Pclass'] == 3:
        if df['Fare'] <= Pclass_3_mean_fare:
            return 'Pclass_3_Low_Fare'
        else:
            return 'Pclass_3_High_Fare'


Pclass_1_mean_fare = combined_train_test['Fare'].groupby(by=combined_train_test['Pclass']).mean().get([1]).values[0]
Pclass_2_mean_fare = combined_train_test['Fare'].groupby(by=combined_train_test['Pclass']).mean().get([2]).values[0]
Pclass_3_mean_fare = combined_train_test['Fare'].groupby(by=combined_train_test['Pclass']).mean().get([3]).values[0]
# 建立Pclass_Fare Category
combined_train_test['Pclass_Fare_Category'] = combined_train_test.apply(pclass_fare_category, args=(Pclass_1_mean_fare, Pclass_2_mean_fare, Pclass_3_mean_fare), axis=1)
p_fare = LabelEncoder()
p_fare.fit(np.array(['Pclass_1_Low_Fare', 'Pclass_1_High_Fare', 'Pclass_2_Low_Fare',
                     'Pclass_2_High_Fare', 'Pclass_3_Low_Fare', 'Pclass_3_High_Fare']))  # 给每一项添加标签
combined_train_test['Pclass_Fare_Category'] = p_fare.transform(combined_train_test['Pclass_Fare_Category'])  # 转换成数值

In [69]:
#Parch and SibSp

def family_size_category(family_size):
    if family_size <= 1:
        return 'Single'
    elif family_size <= 3:
        return 'Small_Family'
    else:
        return 'Large_Family'

combined_train_test['Family_Size'] = combined_train_test['Parch'] + combined_train_test['SibSp'] + 1
combined_train_test['Family_Size_Category'] = combined_train_test['Family_Size'].map(family_size_category)
le_family = LabelEncoder()
le_family.fit(np.array(['Single', 'Small_Family', 'Large_Family']))
combined_train_test['Family_Size_Category'] = le_family.transform(combined_train_test['Family_Size_Category'])
fam_size_cat_dummies_df = pd.get_dummies(combined_train_test['Family_Size_Category'],
                                         prefix=combined_train_test[['Family_Size_Category']].columns[0])
combined_train_test = pd.concat([combined_train_test, fam_size_cat_dummies_df], axis=1)

In [70]:
# Age
missing_age_df = pd.DataFrame(combined_train_test[['Age', 'Parch', 'Sex', 'SibSp', 'Family_Size', 'Family_Size_Category',
                             'Title', 'Fare', 'Fare_Category', 'Pclass', 'Embarked']])
missing_age_df = pd.get_dummies(missing_age_df,columns=['Title', 'Family_Size_Category', 'Fare_Category', 'Sex', 'Pclass' ,'Embarked'])
missing_age_train = missing_age_df[missing_age_df['Age'].notnull()]
missing_age_test = missing_age_df[missing_age_df['Age'].isnull()]

In [71]:
# 弃掉不需要的列
def drop_col_not_req(df, cols):
    df.drop(cols, axis=1, inplace=True)

def fill_missing_age(missing_age_train, missing_age_test):
        missing_age_X_train = missing_age_train.drop(['Age'], axis=1)
        missing_age_Y_train = missing_age_train['Age']
        missing_age_X_test = missing_age_test.drop(['Age'], axis=1)
        #模型1
        gbm_reg = ensemble.GradientBoostingRegressor(random_state=42)
        gbm_reg_param_grid = {'n_estimators': [2000], 'max_depth': [3],'learning_rate': [0.01], 'max_features': [3]}
        gbm_reg_grid = model_selection.GridSearchCV(gbm_reg, gbm_reg_param_grid, cv=10, n_jobs=25, verbose=1,  scoring='neg_mean_squared_error')
        gbm_reg_grid.fit(missing_age_X_train, missing_age_Y_train)
        print('Age feature Best GB Params:' + str(gbm_reg_grid.best_params_))
        print('Age feature Best GB Score:' + str(gbm_reg_grid.best_score_))
        print('GB Train Error for "Age" Feature Regressor:'+ str(gbm_reg_grid.score(missing_age_X_train, missing_age_Y_train)))
        missing_age_test['Age_GB'] = gbm_reg_grid.predict(missing_age_X_test)
        print(missing_age_test['Age_GB'][:4])
        #模型2
        lrf_reg = LinearRegression()
        lrf_reg_param_grid = {'fit_intercept': [True], 'normalize': [True]}
        lrf_reg_grid = model_selection.GridSearchCV(lrf_reg, lrf_reg_param_grid, cv=10, n_jobs=25, verbose=1, scoring='neg_mean_squared_error')
        lrf_reg_grid.fit(missing_age_X_train, missing_age_Y_train)
        print('Age feature Best LR Params:' + str(lrf_reg_grid.best_params_))
        print('Age feature Best LR Score:' + str(lrf_reg_grid.best_score_))
        print('LR Train Error for "Age" Feature Regressor' + str(lrf_reg_grid.score(missing_age_X_train, missing_age_Y_train)))
        missing_age_test['Age_LRF'] = lrf_reg_grid.predict(missing_age_X_test)
        print(missing_age_test['Age_LRF'][:4])
        #将两个模型预测后的均值作为最终预测结果
        print('shape1',missing_age_test['Age'].shape,missing_age_test[['Age_GB','Age_LRF']].mode(axis=1).shape)
        #missing_age_test['Age'] = missing_age_test[['Age_GB','Age_LRF']].mode(axis=1)
        missing_age_test['Age'] = np.mean([missing_age_test['Age_GB'],missing_age_test['Age_LRF']])
        print(missing_age_test['Age'][:4])
        drop_col_not_req(missing_age_test, ['Age_GB', 'Age_LRF'])

        return missing_age_test
    
combined_train_test.loc[(combined_train_test.Age.isnull()), 'Age'] = fill_missing_age(missing_age_train,missing_age_test)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=25)]: Done   5 out of  10 | elapsed:    2.4s remaining:    2.4s
[Parallel(n_jobs=25)]: Done  10 out of  10 | elapsed:    2.4s finished


Age feature Best GB Params:{'learning_rate': 0.01, 'max_depth': 3, 'max_features': 3, 'n_estimators': 2000}
Age feature Best GB Score:-112.901918595
GB Train Error for "Age" Feature Regressor:-91.5736510487
5     33.625384
17    33.216675
19    33.120653
26    26.608875
Name: Age_GB, dtype: float64
Fitting 10 folds for each of 1 candidates, totalling 10 fits


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
[Parallel(n_jobs=25)]: Done   5 out of  10 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=25)]: Done  10 out of  10 | elapsed:    0.1s finished
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Age feature Best LR Params:{'fit_intercept': True, 'normalize': True}
Age feature Best LR Score:-119.179217703
LR Train Error for "Age" Feature Regressor-115.203587721
5     34.31250
17    32.75000
19    30.34375
26    26.65625
Name: Age_LRF, dtype: float64
shape1 (263,) (263, 2)
5     29.45237
17    29.45237
19    29.45237
26    29.45237
Name: Age, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [72]:
#Cabin
combined_train_test['Cabin_Letter'] = combined_train_test['Cabin'].apply(lambda x:str(x)[0] if pd.notnull(x) else x)
combined_train_test = pd.get_dummies(combined_train_test,columns=['Cabin','Cabin_Letter'])

In [73]:
train_data = combined_train_test[:891]
test_data = combined_train_test[891:]
titanic_train_data_X = train_data.drop(['Survived'],axis=1)
titanic_train_data_Y = train_data['Survived']
titanic_test_data_X = test_data.drop(['Survived'],axis=1)

In [74]:
titanic_train_data_X

Unnamed: 0,Age,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Ticket,...,Cabin_G6,Cabin_T,Cabin_Letter_A,Cabin_Letter_B,Cabin_Letter_C,Cabin_Letter_D,Cabin_Letter_E,Cabin_Letter_F,Cabin_Letter_G,Cabin_Letter_T
0,22.00000,S,7.250000,"Braund, Mr. Owen Harris",0,1,3,male,1,A/5 21171,...,0,0,0,0,0,0,0,0,0,0
1,38.00000,C,35.641650,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,PC 17599,...,0,0,0,0,1,0,0,0,0,0
2,26.00000,S,7.925000,"Heikkinen, Miss. Laina",0,3,3,female,0,STON/O2. 3101282,...,0,0,0,0,0,0,0,0,0,0
3,35.00000,S,26.550000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,113803,...,0,0,0,0,1,0,0,0,0,0
4,35.00000,S,8.050000,"Allen, Mr. William Henry",0,5,3,male,0,373450,...,0,0,0,0,0,0,0,0,0,0
5,29.45237,Q,8.458300,"Moran, Mr. James",0,6,3,male,0,330877,...,0,0,0,0,0,0,0,0,0,0
6,54.00000,S,25.931250,"McCarthy, Mr. Timothy J",0,7,1,male,0,17463,...,0,0,0,0,0,0,1,0,0,0
7,2.00000,S,4.215000,"Palsson, Master. Gosta Leonard",1,8,3,male,3,349909,...,0,0,0,0,0,0,0,0,0,0
8,27.00000,S,3.711100,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,3,female,0,347742,...,0,0,0,0,0,0,0,0,0,0
9,14.00000,C,15.035400,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,2,female,1,237736,...,0,0,0,0,0,0,0,0,0,0


In [75]:
titanic_test_data_X

Unnamed: 0,Age,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Ticket,...,Cabin_G6,Cabin_T,Cabin_Letter_A,Cabin_Letter_B,Cabin_Letter_C,Cabin_Letter_D,Cabin_Letter_E,Cabin_Letter_F,Cabin_Letter_G,Cabin_Letter_T
0,34.50000,Q,7.829200,"Kelly, Mr. James",0,892,3,male,0,330911,...,0,0,0,0,0,0,0,0,0,0
1,47.00000,S,7.000000,"Wilkes, Mrs. James (Ellen Needs)",0,893,3,female,1,363272,...,0,0,0,0,0,0,0,0,0,0
2,62.00000,Q,9.687500,"Myles, Mr. Thomas Francis",0,894,2,male,0,240276,...,0,0,0,0,0,0,0,0,0,0
3,27.00000,S,8.662500,"Wirz, Mr. Albert",0,895,3,male,0,315154,...,0,0,0,0,0,0,0,0,0,0
4,22.00000,S,6.143750,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,896,3,female,1,3101298,...,0,0,0,0,0,0,0,0,0,0
5,14.00000,S,9.225000,"Svensson, Mr. Johan Cervin",0,897,3,male,0,7538,...,0,0,0,0,0,0,0,0,0,0
6,30.00000,Q,7.629200,"Connolly, Miss. Kate",0,898,3,female,0,330972,...,0,0,0,0,0,0,0,0,0,0
7,26.00000,S,9.666667,"Caldwell, Mr. Albert Francis",1,899,2,male,1,248738,...,0,0,0,0,0,0,0,0,0,0
8,18.00000,C,7.229200,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",0,900,3,female,0,2657,...,0,0,0,0,0,0,0,0,0,0
9,21.00000,S,8.050000,"Davies, Mr. John Samuel",0,901,3,male,2,A/4 48871,...,0,0,0,0,0,0,0,0,0,0


In [76]:
   def get_top_n_features(titanic_train_data_X, titanic_train_data_Y, top_n_features):
        # 随机森林
        rf_est = RandomForestClassifier(random_state=42)
        rf_param_grid = {'n_estimators': [500], 'min_samples_split': [2, 3], 'max_depth': [20]}
        rf_grid = model_selection.GridSearchCV(rf_est, rf_param_grid, n_jobs=25, cv=10, verbose=1)
        rf_grid.fit(titanic_train_data_X,titanic_train_data_Y)
        #将feature按Importance排序
        feature_imp_sorted_rf = pd.DataFrame({'feature': list(titanic_train_data_X), 'importance': rf_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
        features_top_n_rf = feature_imp_sorted_rf.head(top_n_features)['feature']
        print('Sample 25 Features from RF Classifier')
        print(str(features_top_n_rf[:25]))

        # AdaBoost
        ada_est = ensemble.AdaBoostClassifier(random_state=42)
        ada_param_grid = {'n_estimators': [500], 'learning_rate': [0.5, 0.6]}
        ada_grid = model_selection.GridSearchCV(ada_est, ada_param_grid, n_jobs=25, cv=10, verbose=1)
        ada_grid.fit(titanic_train_data_X, titanic_train_data_Y)
        #排序
        feature_imp_sorted_ada = pd.DataFrame({'feature': list(titanic_train_data_X),'importance': ada_grid.best_estimator_.feature_importances_}).sort_values( 'importance', ascending=False)
        features_top_n_ada = feature_imp_sorted_ada.head(top_n_features)['feature']

        # ExtraTree
        et_est = ensemble.ExtraTreesClassifier(random_state=42)
        et_param_grid = {'n_estimators': [500], 'min_samples_split': [3, 4], 'max_depth': [15]}
        et_grid = model_selection.GridSearchCV(et_est, et_param_grid, n_jobs=25, cv=10, verbose=1)
        et_grid.fit(titanic_train_data_X, titanic_train_data_Y)
        #排序
        feature_imp_sorted_et = pd.DataFrame({'feature': list(titanic_train_data_X), 'importance': et_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
        features_top_n_et = feature_imp_sorted_et.head(top_n_features)['feature']
        print('Sample 25 Features from ET Classifier:')
        print(str(features_top_n_et[:25]))

        # 将三个模型挑选出来的前features_top_n_et合并
        features_top_n = pd.concat([features_top_n_rf, features_top_n_ada, features_top_n_et], ignore_index=True).drop_duplicates()

        return features_top_n

In [77]:
feature_to_pick = 250
feature_top_n = get_top_n_features(titanic_train_data_X,titanic_train_data_Y,feature_to_pick)
titanic_train_data_X = titanic_train_data_X[feature_top_n]
del titanic_train_data_X['Ticket_Number']#后来发现删除Ticket_Number后效果更好了
titanic_test_data_X = titanic_test_data_X[feature_top_n]
del titanic_test_data_X['Ticket_Number']

Fitting 10 folds for each of 2 candidates, totalling 20 fits


JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
/anaconda3/lib/python3.6/runpy.py in _run_module_as_main(mod_name='ipykernel_launcher', alter_argv=1)
    188         sys.exit(msg)
    189     main_globals = sys.modules["__main__"].__dict__
    190     if alter_argv:
    191         sys.argv[0] = mod_spec.origin
    192     return _run_code(code, main_globals, None,
--> 193                      "__main__", mod_spec)
        mod_spec = ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py')
    194 
    195 def run_module(mod_name, init_globals=None,
    196                run_name=None, alter_sys=False):
    197     """Execute a module's code without importing it

...........................................................................
/anaconda3/lib/python3.6/runpy.py in _run_code(code=<code object <module> at 0x10482f780, file "/ana...3.6/site-packages/ipykernel_launcher.py", line 5>, run_globals={'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': '/anaconda3/lib/python3.6/site-packages/__pycache__/ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': '/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from '/anaconda3/lib/python3.6/site-packages/ipykernel/kernelapp.py'>, ...}, init_globals=None, mod_name='__main__', mod_spec=ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), pkg_name='', script_name=None)
     80                        __cached__ = cached,
     81                        __doc__ = None,
     82                        __loader__ = loader,
     83                        __package__ = pkg_name,
     84                        __spec__ = mod_spec)
---> 85     exec(code, run_globals)
        code = <code object <module> at 0x10482f780, file "/ana...3.6/site-packages/ipykernel_launcher.py", line 5>
        run_globals = {'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': '/anaconda3/lib/python3.6/site-packages/__pycache__/ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': '/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from '/anaconda3/lib/python3.6/site-packages/ipykernel/kernelapp.py'>, ...}
     86     return run_globals
     87 
     88 def _run_module_code(code, init_globals=None,
     89                     mod_name=None, mod_spec=None,

...........................................................................
/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py in <module>()
     11     # This is added back by InteractiveShellApp.init_path()
     12     if sys.path[0] == '':
     13         del sys.path[0]
     14 
     15     from ipykernel import kernelapp as app
---> 16     app.launch_new_instance()

...........................................................................
/anaconda3/lib/python3.6/site-packages/traitlets/config/application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    659 
    660         If a global instance already exists, this reinitializes and starts it
    661         """
    662         app = cls.instance(**kwargs)
    663         app.initialize(argv)
--> 664         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    665 
    666 #-----------------------------------------------------------------------------
    667 # utility functions, for convenience
    668 #-----------------------------------------------------------------------------

...........................................................................
/anaconda3/lib/python3.6/site-packages/ipykernel/kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    558         if self.poller is not None:
    559             self.poller.start()
    560         self.kernel.start()
    561         self.io_loop = ioloop.IOLoop.current()
    562         try:
--> 563             self.io_loop.start()
        self.io_loop.start = <bound method BaseAsyncIOLoop.start of <tornado.platform.asyncio.AsyncIOMainLoop object>>
    564         except KeyboardInterrupt:
    565             pass
    566 
    567 

...........................................................................
/anaconda3/lib/python3.6/site-packages/tornado/platform/asyncio.py in start(self=<tornado.platform.asyncio.AsyncIOMainLoop object>)
    143         except (RuntimeError, AssertionError):
    144             old_loop = None  # type: ignore
    145         try:
    146             self._setup_logging()
    147             asyncio.set_event_loop(self.asyncio_loop)
--> 148             self.asyncio_loop.run_forever()
        self.asyncio_loop.run_forever = <bound method BaseEventLoop.run_forever of <_Uni...EventLoop running=True closed=False debug=False>>
    149         finally:
    150             asyncio.set_event_loop(old_loop)
    151 
    152     def stop(self) -> None:

...........................................................................
/anaconda3/lib/python3.6/asyncio/base_events.py in run_forever(self=<_UnixSelectorEventLoop running=True closed=False debug=False>)
    433             sys.set_asyncgen_hooks(firstiter=self._asyncgen_firstiter_hook,
    434                                    finalizer=self._asyncgen_finalizer_hook)
    435         try:
    436             events._set_running_loop(self)
    437             while True:
--> 438                 self._run_once()
        self._run_once = <bound method BaseEventLoop._run_once of <_UnixS...EventLoop running=True closed=False debug=False>>
    439                 if self._stopping:
    440                     break
    441         finally:
    442             self._stopping = False

...........................................................................
/anaconda3/lib/python3.6/asyncio/base_events.py in _run_once(self=<_UnixSelectorEventLoop running=True closed=False debug=False>)
   1446                         logger.warning('Executing %s took %.3f seconds',
   1447                                        _format_handle(handle), dt)
   1448                 finally:
   1449                     self._current_handle = None
   1450             else:
-> 1451                 handle._run()
        handle._run = <bound method Handle._run of <Handle IOLoop.add_...b/python3.6/site-packages/tornado/ioloop.py:690>>
   1452         handle = None  # Needed to break cycles when an exception occurs.
   1453 
   1454     def _set_coroutine_wrapper(self, enabled):
   1455         try:

...........................................................................
/anaconda3/lib/python3.6/asyncio/events.py in _run(self=<Handle IOLoop.add_future.<locals>.<lambda>(<Fut...ib/python3.6/site-packages/tornado/ioloop.py:690>)
    140             self._callback = None
    141             self._args = None
    142 
    143     def _run(self):
    144         try:
--> 145             self._callback(*self._args)
        self._callback = <function IOLoop.add_future.<locals>.<lambda>>
        self._args = (<Future finished result=None>,)
    146         except Exception as exc:
    147             cb = _format_callback_source(self._callback, self._args)
    148             msg = 'Exception in callback {}'.format(cb)
    149             context = {

...........................................................................
/anaconda3/lib/python3.6/site-packages/tornado/ioloop.py in <lambda>(f=<Future finished result=None>)
    685             #
    686             # Wrap the callback in self._run_callback so we control
    687             # the error logging (i.e. it goes to tornado.log.app_log
    688             # instead of asyncio's log).
    689             future.add_done_callback(
--> 690                 lambda f: self._run_callback(functools.partial(callback, future))
        f = <Future finished result=None>
    691             )
    692         else:
    693             assert is_future(future)
    694             # For concurrent futures, we use self.add_callback, so

...........................................................................
/anaconda3/lib/python3.6/site-packages/tornado/ioloop.py in _run_callback(self=<tornado.platform.asyncio.AsyncIOMainLoop object>, callback=functools.partial(<function Runner.handle_yield.... at 0x1a1bc8e400>, <Future finished result=None>))
    738         .. versionchanged:: 6.0
    739 
    740            CancelledErrors are no longer logged.
    741         """
    742         try:
--> 743             ret = callback()
        ret = undefined
        callback = functools.partial(<function Runner.handle_yield.... at 0x1a1bc8e400>, <Future finished result=None>)
    744             if ret is not None:
    745                 from tornado import gen
    746 
    747                 # Functions that return Futures typically swallow all

...........................................................................
/anaconda3/lib/python3.6/site-packages/tornado/gen.py in inner(f=None)
    782         elif not self.future.done():
    783 
    784             def inner(f: Any) -> None:
    785                 # Break a reference cycle to speed GC.
    786                 f = None  # noqa: F841
--> 787                 self.run()
    788 
    789             self.io_loop.add_future(self.future, inner)
    790             return False
    791         return True

...........................................................................
/anaconda3/lib/python3.6/site-packages/tornado/gen.py in run(self=<tornado.gen.Runner object>)
    743                         finally:
    744                             # Break up a reference to itself
    745                             # for faster GC on CPython.
    746                             exc_info = None
    747                     else:
--> 748                         yielded = self.gen.send(value)
        yielded = None
        self.gen.send = <built-in method send of generator object>
        value = None
    749 
    750                 except (StopIteration, Return) as e:
    751                     self.finished = True
    752                     self.future = _null_future

...........................................................................
/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py in dispatch_queue(self=<ipykernel.ipkernel.IPythonKernel object>)
    373         """
    374 
    375         while True:
    376             # receive the next message and handle it
    377             try:
--> 378                 yield self.process_one()
        self.process_one = <bound method Kernel.process_one of <ipykernel.ipkernel.IPythonKernel object>>
    379             except Exception:
    380                 self.log.exception("Error in message handler")
    381 
    382     _message_counter = Any(

...........................................................................
/anaconda3/lib/python3.6/site-packages/tornado/gen.py in wrapper(*args=(<ipykernel.ipkernel.IPythonKernel object>,), **kwargs={})
    220                     # Runner). This keeps the coroutine's Runner alive.
    221                     # We do this by exploiting the public API
    222                     # add_done_callback() instead of putting a private
    223                     # attribute on the Future.
    224                     # (Github issues #1769, #2229).
--> 225                     runner = Runner(result, future, yielded)
        result = <generator object process_one>
        future = <Future pending>
        yielded = <Future finished result=(10, 159, <bound method....b407ea8>, <zmq.sugar.fr... 0x1a1b407f60>, ...]))>
    226                     future.add_done_callback(lambda _: runner)
    227                 yielded = None
    228                 try:
    229                     return future

...........................................................................
/anaconda3/lib/python3.6/site-packages/tornado/gen.py in __init__(self=<tornado.gen.Runner object>, gen=None, result_future=None, first_yielded=None)
    709         self.running = False
    710         self.finished = False
    711         self.io_loop = IOLoop.current()
    712         if self.handle_yield(first_yielded):
    713             gen = result_future = first_yielded = None  # type: ignore
--> 714             self.run()
        self.run = <bound method Runner.run of <tornado.gen.Runner object>>
    715 
    716     def run(self) -> None:
    717         """Starts or resumes the generator, running until it reaches a
    718         yield point that is not ready.

...........................................................................
/anaconda3/lib/python3.6/site-packages/tornado/gen.py in run(self=<tornado.gen.Runner object>)
    743                         finally:
    744                             # Break up a reference to itself
    745                             # for faster GC on CPython.
    746                             exc_info = None
    747                     else:
--> 748                         yielded = self.gen.send(value)
        yielded = undefined
        self.gen.send = <built-in method send of generator object>
        value = (10, 159, <bound method Kernel.dispatch_shell of <ipykernel.ipkernel.IPythonKernel object>>, (<zmq.eventloop.zmqstream.ZMQStream object>, [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]))
    749 
    750                 except (StopIteration, Return) as e:
    751                     self.finished = True
    752                     self.future = _null_future

...........................................................................
/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py in process_one(self=<ipykernel.ipkernel.IPythonKernel object>, wait=True)
    360         else:
    361             try:
    362                 priority, t, dispatch, args = self.msg_queue.get_nowait()
    363             except QueueEmpty:
    364                 return None
--> 365         yield gen.maybe_future(dispatch(*args))
        dispatch = <bound method Kernel.dispatch_shell of <ipykernel.ipkernel.IPythonKernel object>>
        args = (<zmq.eventloop.zmqstream.ZMQStream object>, [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    366 
    367     @gen.coroutine
    368     def dispatch_queue(self):
    369         """Coroutine to preserve order of message handling

...........................................................................
/anaconda3/lib/python3.6/site-packages/tornado/gen.py in wrapper(*args=(<ipykernel.ipkernel.IPythonKernel object>, <zmq.eventloop.zmqstream.ZMQStream object>, [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]), **kwargs={})
    204                 # avoid the cost of creating a Runner when the coroutine
    205                 # never actually yields, which in turn allows us to
    206                 # use "optional" coroutines in critical path code without
    207                 # performance penalty for the synchronous case.
    208                 try:
--> 209                     yielded = next(result)
        yielded = undefined
        result = <generator object dispatch_shell>
    210                 except (StopIteration, Return) as e:
    211                     future_set_result_unless_cancelled(
    212                         future, _value_from_stopiteration(e)
    213                     )

...........................................................................
/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': "feature_to_pick = 250\nfeature_top_n = get_top_n_...e_top_n]\ndel titanic_test_data_X['Ticket_Number']", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2019, 11, 25, 14, 47, 40, 371731, tzinfo=datetime.timezone.utc), 'msg_id': 'fa4c51d26ba24b3885d92fb982986bbe', 'msg_type': 'execute_request', 'session': 'aff3e4b1ceca416590e5fdf2779b58fd', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': 'fa4c51d26ba24b3885d92fb982986bbe', 'msg_type': 'execute_request', 'parent_header': {}})
    267             try:
    268                 self.pre_handler_hook()
    269             except Exception:
    270                 self.log.debug("Unable to signal in pre_handler_hook:", exc_info=True)
    271             try:
--> 272                 yield gen.maybe_future(handler(stream, idents, msg))
        handler = <bound method Kernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = [b'aff3e4b1ceca416590e5fdf2779b58fd']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': "feature_to_pick = 250\nfeature_top_n = get_top_n_...e_top_n]\ndel titanic_test_data_X['Ticket_Number']", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2019, 11, 25, 14, 47, 40, 371731, tzinfo=datetime.timezone.utc), 'msg_id': 'fa4c51d26ba24b3885d92fb982986bbe', 'msg_type': 'execute_request', 'session': 'aff3e4b1ceca416590e5fdf2779b58fd', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': 'fa4c51d26ba24b3885d92fb982986bbe', 'msg_type': 'execute_request', 'parent_header': {}}
    273             except Exception:
    274                 self.log.error("Exception in message handler:", exc_info=True)
    275             finally:
    276                 try:

...........................................................................
/anaconda3/lib/python3.6/site-packages/tornado/gen.py in wrapper(*args=(<ipykernel.ipkernel.IPythonKernel object>, <zmq.eventloop.zmqstream.ZMQStream object>, [b'aff3e4b1ceca416590e5fdf2779b58fd'], {'buffers': [], 'content': {'allow_stdin': True, 'code': "feature_to_pick = 250\nfeature_top_n = get_top_n_...e_top_n]\ndel titanic_test_data_X['Ticket_Number']", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2019, 11, 25, 14, 47, 40, 371731, tzinfo=datetime.timezone.utc), 'msg_id': 'fa4c51d26ba24b3885d92fb982986bbe', 'msg_type': 'execute_request', 'session': 'aff3e4b1ceca416590e5fdf2779b58fd', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': 'fa4c51d26ba24b3885d92fb982986bbe', 'msg_type': 'execute_request', 'parent_header': {}}), **kwargs={})
    204                 # avoid the cost of creating a Runner when the coroutine
    205                 # never actually yields, which in turn allows us to
    206                 # use "optional" coroutines in critical path code without
    207                 # performance penalty for the synchronous case.
    208                 try:
--> 209                     yielded = next(result)
        yielded = undefined
        result = <generator object execute_request>
    210                 except (StopIteration, Return) as e:
    211                     future_set_result_unless_cancelled(
    212                         future, _value_from_stopiteration(e)
    213                     )

...........................................................................
/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=[b'aff3e4b1ceca416590e5fdf2779b58fd'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': "feature_to_pick = 250\nfeature_top_n = get_top_n_...e_top_n]\ndel titanic_test_data_X['Ticket_Number']", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2019, 11, 25, 14, 47, 40, 371731, tzinfo=datetime.timezone.utc), 'msg_id': 'fa4c51d26ba24b3885d92fb982986bbe', 'msg_type': 'execute_request', 'session': 'aff3e4b1ceca416590e5fdf2779b58fd', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': 'fa4c51d26ba24b3885d92fb982986bbe', 'msg_type': 'execute_request', 'parent_header': {}})
    537             self._publish_execute_input(code, parent, self.execution_count)
    538 
    539         reply_content = yield gen.maybe_future(
    540             self.do_execute(
    541                 code, silent, store_history,
--> 542                 user_expressions, allow_stdin,
        user_expressions = {}
        allow_stdin = True
    543             )
    544         )
    545 
    546         # Flush output before sending the reply.

...........................................................................
/anaconda3/lib/python3.6/site-packages/tornado/gen.py in wrapper(*args=(<ipykernel.ipkernel.IPythonKernel object>, "feature_to_pick = 250\nfeature_top_n = get_top_n_...e_top_n]\ndel titanic_test_data_X['Ticket_Number']", False, True, {}, True), **kwargs={})
    204                 # avoid the cost of creating a Runner when the coroutine
    205                 # never actually yields, which in turn allows us to
    206                 # use "optional" coroutines in critical path code without
    207                 # performance penalty for the synchronous case.
    208                 try:
--> 209                     yielded = next(result)
        yielded = undefined
        result = <generator object do_execute>
    210                 except (StopIteration, Return) as e:
    211                     future_set_result_unless_cancelled(
    212                         future, _value_from_stopiteration(e)
    213                     )

...........................................................................
/anaconda3/lib/python3.6/site-packages/ipykernel/ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code="feature_to_pick = 250\nfeature_top_n = get_top_n_...e_top_n]\ndel titanic_test_data_X['Ticket_Number']", silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    289                     res = yield coro_future
    290             else:
    291                 # runner isn't already running,
    292                 # make synchronous call,
    293                 # letting shell dispatch to loop runners
--> 294                 res = shell.run_cell(code, store_history=store_history, silent=silent)
        res = undefined
        code = "feature_to_pick = 250\nfeature_top_n = get_top_n_...e_top_n]\ndel titanic_test_data_X['Ticket_Number']"
        store_history = True
        silent = False
    295         finally:
    296             self._restore_input()
    297 
    298         if res.error_before_exec is not None:

...........................................................................
/anaconda3/lib/python3.6/site-packages/ipykernel/zmqshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, *args=("feature_to_pick = 250\nfeature_top_n = get_top_n_...e_top_n]\ndel titanic_test_data_X['Ticket_Number']",), **kwargs={'silent': False, 'store_history': True})
    531             )
    532         self.payload_manager.write_payload(payload)
    533 
    534     def run_cell(self, *args, **kwargs):
    535         self._last_traceback = None
--> 536         return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
        self.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        args = ("feature_to_pick = 250\nfeature_top_n = get_top_n_...e_top_n]\ndel titanic_test_data_X['Ticket_Number']",)
        kwargs = {'silent': False, 'store_history': True}
    537 
    538     def _showtraceback(self, etype, evalue, stb):
    539         # try to preserve ordering of tracebacks and print statements
    540         sys.stdout.flush()

...........................................................................
/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell="feature_to_pick = 250\nfeature_top_n = get_top_n_...e_top_n]\ndel titanic_test_data_X['Ticket_Number']", store_history=True, silent=False, shell_futures=True)
   2850         result : :class:`ExecutionResult`
   2851         """
   2852         result = None
   2853         try:
   2854             result = self._run_cell(
-> 2855                 raw_cell, store_history, silent, shell_futures)
        raw_cell = "feature_to_pick = 250\nfeature_top_n = get_top_n_...e_top_n]\ndel titanic_test_data_X['Ticket_Number']"
        store_history = True
        silent = False
        shell_futures = True
   2856         finally:
   2857             self.events.trigger('post_execute')
   2858             if not silent:
   2859                 self.events.trigger('post_run_cell', result)

...........................................................................
/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in _run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell="feature_to_pick = 250\nfeature_top_n = get_top_n_...e_top_n]\ndel titanic_test_data_X['Ticket_Number']", store_history=True, silent=False, shell_futures=True)
   2876             runner = self.loop_runner
   2877         else:
   2878             runner = _pseudo_sync_runner
   2879 
   2880         try:
-> 2881             return runner(coro)
        runner = <function _pseudo_sync_runner>
        coro = <coroutine object InteractiveShell.run_cell_async>
   2882         except BaseException as e:
   2883             info = ExecutionInfo(raw_cell, store_history, silent, shell_futures)
   2884             result = ExecutionResult(info)
   2885             result.error_in_exec = e

...........................................................................
/anaconda3/lib/python3.6/site-packages/IPython/core/async_helpers.py in _pseudo_sync_runner(coro=<coroutine object InteractiveShell.run_cell_async>)
     63 
     64     Credit to Nathaniel Smith
     65 
     66     """
     67     try:
---> 68         coro.send(None)
        coro.send = <built-in method send of coroutine object>
     69     except StopIteration as exc:
     70         return exc.value
     71     else:
     72         # TODO: do not raise but return an execution result with the right info.

...........................................................................
/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_cell_async(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell="feature_to_pick = 250\nfeature_top_n = get_top_n_...e_top_n]\ndel titanic_test_data_X['Ticket_Number']", store_history=True, silent=False, shell_futures=True)
   3053                 interactivity = "none" if silent else self.ast_node_interactivity
   3054                 if _run_async:
   3055                     interactivity = 'async'
   3056 
   3057                 has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
-> 3058                        interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler object>
   3059 
   3060                 self.last_execution_succeeded = not has_raised
   3061                 self.last_execution_result = result
   3062 

...........................................................................
/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Delete object>, <_ast.Assign object>, <_ast.Delete object>], cell_name='<ipython-input-77-00dc425ce574>', interactivity='none', compiler=<IPython.core.compilerop.CachingCompiler object>, result=<ExecutionResult object at 1a1b173cf8, execution...rue silent=False shell_futures=True> result=None>)
   3244                     elif mode == 'single':
   3245                         mod = ast.Interactive([node])
   3246                     with compiler.extra_flags(getattr(ast, 'PyCF_ALLOW_TOP_LEVEL_AWAIT', 0x0) if self.autoawait else 0x0):
   3247                         code = compiler(mod, cell_name, mode)
   3248                         asy = compare(code)
-> 3249                     if (await self.run_code(code, result,  async_=asy)):
        self.run_code = <bound method InteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x1a1be9a540, file "<ipython-input-77-00dc425ce574>", line 2>
        result = <ExecutionResult object at 1a1b173cf8, execution...rue silent=False shell_futures=True> result=None>
        asy = False
   3250                         return True
   3251 
   3252             # Flush softspace
   3253             if softspace(sys.stdout, 0):

...........................................................................
/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x1a1be9a540, file "<ipython-input-77-00dc425ce574>", line 2>, result=<ExecutionResult object at 1a1b173cf8, execution...rue silent=False shell_futures=True> result=None>, async_=False)
   3321                     code = compile('last_expr', 'fake', "single")
   3322                     exec(code, {'last_expr': last_expr})
   3323                 elif async_ :
   3324                     await eval(code_obj, self.user_global_ns, self.user_ns)
   3325                 else:
-> 3326                     exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x1a1be9a540, file "<ipython-input-77-00dc425ce574>", line 2>
        self.user_global_ns = {'DecisionTreeClassifier': <class 'sklearn.tree.tree.DecisionTreeClassifier'>, 'ExtraTreesClassifier': <class 'sklearn.ensemble.forest.ExtraTreesClassifier'>, 'In': ['', "import pandas as pd\nimport numpy as np\nget_ipython().run_line_magic('matplotlib', 'inline')", "train_data_org = pd.read_csv('../data/train.csv'...train_test = train_data_org.append(test_data_org)", "#Embarked\nif combined_train_test['Embarked'].isn...at([combined_train_test, emb_dummies_df], axis=1)", '#Sex\nsex_dummies_df = pd.get_dummies(combined_tr...at([combined_train_test, sex_dummies_df], axis=1)', "#Title\ncombined_train_test['Title'] = combined_t...([combined_train_test, title_dummies_df], axis=1)", "# Fare\nif combined_train_test['Fare'].isnull().s...ombined_train_test, fare_cat_dummies_df], axis=1)", "#Pclass\nPclass_1_mean_fare = combined_train_test...ombined_train_test['Pclass_Fare_Category'])#转换成数值", "import pandas as pd\nimport numpy as np\nget_ipython().run_line_magic('matplotlib', 'inline')", "train_data_org = pd.read_csv('../data/train.csv'...train_test = train_data_org.append(test_data_org)", "#Embarked\nif combined_train_test['Embarked'].isn...at([combined_train_test, emb_dummies_df], axis=1)", '#Sex\nsex_dummies_df = pd.get_dummies(combined_tr...at([combined_train_test, sex_dummies_df], axis=1)', "#Title\ncombined_train_test['Title'] = combined_t...([combined_train_test, title_dummies_df], axis=1)", "# Fare\nif combined_train_test['Fare'].isnull().s...ombined_train_test, fare_cat_dummies_df], axis=1)", "#Pclass\n\n # 建立Fare Category\n    def fare_categor...ombined_train_test['Pclass_Fare_Category'])#转换成数值", "# Pclass\n\n# 建立Fare Category\n\n\ndef fare_category(...ined_train_test['Pclass_Fare_Category'])  # 转换成数值", "import pandas as pd\nimport numpy as np\nfrom skle..._ipython().run_line_magic('matplotlib', 'inline')", "train_data_org = pd.read_csv('../data/train.csv'...train_test = train_data_org.append(test_data_org)", "#Embarked\nif combined_train_test['Embarked'].isn...at([combined_train_test, emb_dummies_df], axis=1)", '#Sex\nsex_dummies_df = pd.get_dummies(combined_tr...at([combined_train_test, sex_dummies_df], axis=1)', ...], 'LabelEncoder': <class 'sklearn.preprocessing.label.LabelEncoder'>, 'LinearRegression': <class 'sklearn.linear_model.base.LinearRegression'>, 'LogisticRegression': <class 'sklearn.linear_model.logistic.LogisticRegression'>, 'OneHotEncoder': <class 'sklearn.preprocessing.data.OneHotEncoder'>, 'Out': {41:           Age Embarked       Fare  \
0    22.000...    0               0  

[891 rows x 228 columns], 42:           Age Embarked       Fare  \
0    34.500...    0               0  

[418 rows x 228 columns], 58:           Age Embarked       Fare  \
0    22.000...    0               0  

[891 rows x 228 columns], 59:           Age Embarked       Fare  \
0    34.500...    0               0  

[418 rows x 228 columns], 74:           Age Embarked       Fare  \
0    22.000...    0               0  

[891 rows x 228 columns], 75:           Age Embarked       Fare  \
0    34.500...    0               0  

[418 rows x 228 columns]}, 'Pclass_1_mean_fare': 33.910500309597509, 'Pclass_2_mean_fare': 11.411010108303246, ...}
        self.user_ns = {'DecisionTreeClassifier': <class 'sklearn.tree.tree.DecisionTreeClassifier'>, 'ExtraTreesClassifier': <class 'sklearn.ensemble.forest.ExtraTreesClassifier'>, 'In': ['', "import pandas as pd\nimport numpy as np\nget_ipython().run_line_magic('matplotlib', 'inline')", "train_data_org = pd.read_csv('../data/train.csv'...train_test = train_data_org.append(test_data_org)", "#Embarked\nif combined_train_test['Embarked'].isn...at([combined_train_test, emb_dummies_df], axis=1)", '#Sex\nsex_dummies_df = pd.get_dummies(combined_tr...at([combined_train_test, sex_dummies_df], axis=1)', "#Title\ncombined_train_test['Title'] = combined_t...([combined_train_test, title_dummies_df], axis=1)", "# Fare\nif combined_train_test['Fare'].isnull().s...ombined_train_test, fare_cat_dummies_df], axis=1)", "#Pclass\nPclass_1_mean_fare = combined_train_test...ombined_train_test['Pclass_Fare_Category'])#转换成数值", "import pandas as pd\nimport numpy as np\nget_ipython().run_line_magic('matplotlib', 'inline')", "train_data_org = pd.read_csv('../data/train.csv'...train_test = train_data_org.append(test_data_org)", "#Embarked\nif combined_train_test['Embarked'].isn...at([combined_train_test, emb_dummies_df], axis=1)", '#Sex\nsex_dummies_df = pd.get_dummies(combined_tr...at([combined_train_test, sex_dummies_df], axis=1)', "#Title\ncombined_train_test['Title'] = combined_t...([combined_train_test, title_dummies_df], axis=1)", "# Fare\nif combined_train_test['Fare'].isnull().s...ombined_train_test, fare_cat_dummies_df], axis=1)", "#Pclass\n\n # 建立Fare Category\n    def fare_categor...ombined_train_test['Pclass_Fare_Category'])#转换成数值", "# Pclass\n\n# 建立Fare Category\n\n\ndef fare_category(...ined_train_test['Pclass_Fare_Category'])  # 转换成数值", "import pandas as pd\nimport numpy as np\nfrom skle..._ipython().run_line_magic('matplotlib', 'inline')", "train_data_org = pd.read_csv('../data/train.csv'...train_test = train_data_org.append(test_data_org)", "#Embarked\nif combined_train_test['Embarked'].isn...at([combined_train_test, emb_dummies_df], axis=1)", '#Sex\nsex_dummies_df = pd.get_dummies(combined_tr...at([combined_train_test, sex_dummies_df], axis=1)', ...], 'LabelEncoder': <class 'sklearn.preprocessing.label.LabelEncoder'>, 'LinearRegression': <class 'sklearn.linear_model.base.LinearRegression'>, 'LogisticRegression': <class 'sklearn.linear_model.logistic.LogisticRegression'>, 'OneHotEncoder': <class 'sklearn.preprocessing.data.OneHotEncoder'>, 'Out': {41:           Age Embarked       Fare  \
0    22.000...    0               0  

[891 rows x 228 columns], 42:           Age Embarked       Fare  \
0    34.500...    0               0  

[418 rows x 228 columns], 58:           Age Embarked       Fare  \
0    22.000...    0               0  

[891 rows x 228 columns], 59:           Age Embarked       Fare  \
0    34.500...    0               0  

[418 rows x 228 columns], 74:           Age Embarked       Fare  \
0    22.000...    0               0  

[891 rows x 228 columns], 75:           Age Embarked       Fare  \
0    34.500...    0               0  

[418 rows x 228 columns]}, 'Pclass_1_mean_fare': 33.910500309597509, 'Pclass_2_mean_fare': 11.411010108303246, ...}
   3327             finally:
   3328                 # Reset our crash handler in place
   3329                 sys.excepthook = old_excepthook
   3330         except SystemExit as e:

...........................................................................
/Users/wangfeng/ML/ml_in_action/kaggle_prc/Titanic/model/<ipython-input-77-00dc425ce574> in <module>()
      1 feature_to_pick = 250
----> 2 feature_top_n = get_top_n_features(titanic_train_data_X,titanic_train_data_Y,feature_to_pick)
      3 titanic_train_data_X = titanic_train_data_X[feature_top_n]
      4 del titanic_train_data_X['Ticket_Number']#后来发现删除Ticket_Number后效果更好了
      5 titanic_test_data_X = titanic_test_data_X[feature_top_n]
      6 del titanic_test_data_X['Ticket_Number']

...........................................................................
/Users/wangfeng/ML/ml_in_action/kaggle_prc/Titanic/model/<ipython-input-76-bae21dc0ca63> in get_top_n_features(titanic_train_data_X=          Age Embarked       Fare  \
0    22.000...    0               0  

[891 rows x 228 columns], titanic_train_data_Y=0      0
1      1
2      1
3      1
4      0
5  ...90    0
Name: Survived, Length: 891, dtype: int64, top_n_features=250)
      1 def get_top_n_features(titanic_train_data_X, titanic_train_data_Y, top_n_features):
      2      # 随机森林
      3      rf_est = RandomForestClassifier(random_state=42)
      4      rf_param_grid = {'n_estimators': [500], 'min_samples_split': [2, 3], 'max_depth': [20]}
      5      rf_grid = model_selection.GridSearchCV(rf_est, rf_param_grid, n_jobs=25, cv=10, verbose=1)
----> 6      rf_grid.fit(titanic_train_data_X,titanic_train_data_Y)
      7      #将feature按Importance排序
      8      feature_imp_sorted_rf = pd.DataFrame({'feature': list(titanic_train_data_X), 'importance': rf_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
      9      features_top_n_rf = feature_imp_sorted_rf.head(top_n_features)['feature']
     10      print('Sample 25 Features from RF Classifier')

...........................................................................
/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self=GridSearchCV(cv=10, error_score='raise',
       ...ain_score='warn',
       scoring=None, verbose=1), X=          Age Embarked       Fare  \
0    22.000...    0               0  

[891 rows x 228 columns], y=0      0
1      1
2      1
3      1
4      0
5  ...90    0
Name: Survived, Length: 891, dtype: int64, groups=None, **fit_params={})
    634                                   return_train_score=self.return_train_score,
    635                                   return_n_test_samples=True,
    636                                   return_times=True, return_parameters=False,
    637                                   error_score=self.error_score)
    638           for parameters, (train, test) in product(candidate_params,
--> 639                                                    cv.split(X, y, groups)))
        cv.split = <bound method StratifiedKFold.split of Stratifie...d(n_splits=10, random_state=None, shuffle=False)>
        X =           Age Embarked       Fare  \
0    22.000...    0               0  

[891 rows x 228 columns]
        y = 0      0
1      1
2      1
3      1
4      0
5  ...90    0
Name: Survived, Length: 891, dtype: int64
        groups = None
    640 
    641         # if one choose to see train score, "out" will contain train score info
    642         if self.return_train_score:
    643             (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,

...........................................................................
/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=25), iterable=<generator object BaseSearchCV.fit.<locals>.<genexpr>>)
    784             if pre_dispatch == "all" or n_jobs == 1:
    785                 # The iterable was consumed all at once by the above for loop.
    786                 # No need to wait for async callbacks to trigger to
    787                 # consumption.
    788                 self._iterating = False
--> 789             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=25)>
    790             # Make sure that we get a last message telling us we are done
    791             elapsed_time = time.time() - self._start_time
    792             self._print('Done %3i out of %3i | elapsed: %s finished',
    793                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError                                         Mon Nov 25 23:47:40 2019
PID: 8419                               Python 3.6.9: /anaconda3/bin/python
...........................................................................
/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        self.items = [(<function _fit_and_score>, (RandomForestClassifier(bootstrap=True, class_wei...se, random_state=42, verbose=0, warm_start=False),           Age Embarked       Fare  \
0    22.000...    0               0  

[891 rows x 228 columns], 0      0
1      1
2      1
3      1
4      0
5  ...90    0
Name: Survived, Length: 891, dtype: int64, {'score': <function _passthrough_scorer>}, array([ 82,  84,  85,  88,  94,  95,  96,  97,  ...,
       883, 884, 885, 886, 887, 888, 889, 890]), array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 1..., 80, 81, 83, 86, 87,
       89, 90, 91, 92, 93]), 1, {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 500}), {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': 'warn'})]
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0=<list_iterator object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _fit_and_score>
        args = (RandomForestClassifier(bootstrap=True, class_wei...se, random_state=42, verbose=0, warm_start=False),           Age Embarked       Fare  \
0    22.000...    0               0  

[891 rows x 228 columns], 0      0
1      1
2      1
3      1
4      0
5  ...90    0
Name: Survived, Length: 891, dtype: int64, {'score': <function _passthrough_scorer>}, array([ 82,  84,  85,  88,  94,  95,  96,  97,  ...,
       883, 884, 885, 886, 887, 888, 889, 890]), array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 1..., 80, 81, 83, 86, 87,
       89, 90, 91, 92, 93]), 1, {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 500})
        kwargs = {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': 'warn'}
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator=RandomForestClassifier(bootstrap=True, class_wei...se, random_state=42, verbose=0, warm_start=False), X=          Age Embarked       Fare  \
0    22.000...    0               0  

[891 rows x 228 columns], y=0      0
1      1
2      1
3      1
4      0
5  ...90    0
Name: Survived, Length: 891, dtype: int64, scorer={'score': <function _passthrough_scorer>}, train=array([ 82,  84,  85,  88,  94,  95,  96,  97,  ...,
       883, 884, 885, 886, 887, 888, 889, 890]), test=array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 1..., 80, 81, 83, 86, 87,
       89, 90, 91, 92, 93]), verbose=1, parameters={'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 500}, fit_params={}, return_train_score='warn', return_parameters=False, return_n_test_samples=True, return_times=True, error_score='raise')
    453 
    454     try:
    455         if y_train is None:
    456             estimator.fit(X_train, **fit_params)
    457         else:
--> 458             estimator.fit(X_train, y_train, **fit_params)
        estimator.fit = <bound method BaseForest.fit of RandomForestClas...e, random_state=42, verbose=0, warm_start=False)>
        X_train =           Age Embarked       Fare  \
82   29.452...    0               0  

[801 rows x 228 columns]
        y_train = 82     1
84     1
85     1
88     1
94     0
95 ...90    0
Name: Survived, Length: 801, dtype: int64
        fit_params = {}
    459 
    460     except Exception as e:
    461         # Note fit time as time until error
    462         fit_time = time.time() - start_time

...........................................................................
/anaconda3/lib/python3.6/site-packages/sklearn/ensemble/forest.py in fit(self=RandomForestClassifier(bootstrap=True, class_wei...se, random_state=42, verbose=0, warm_start=False), X=          Age Embarked       Fare  \
82   29.452...    0               0  

[801 rows x 228 columns], y=82     1
84     1
85     1
88     1
94     0
95 ...90    0
Name: Survived, Length: 801, dtype: int64, sample_weight=None)
    242         -------
    243         self : object
    244             Returns self.
    245         """
    246         # Validate or convert input data
--> 247         X = check_array(X, accept_sparse="csc", dtype=DTYPE)
        X =           Age Embarked       Fare  \
82   29.452...    0               0  

[801 rows x 228 columns]
    248         y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None)
    249         if sample_weight is not None:
    250             sample_weight = check_array(sample_weight, ensure_2d=False)
    251         if issparse(X):

...........................................................................
/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array=          Age Embarked       Fare  \
82   29.452...    0               0  

[801 rows x 228 columns], accept_sparse='csc', dtype=<class 'numpy.float32'>, order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, ensure_min_samples=1, ensure_min_features=1, warn_on_dtype=False, estimator=None)
    428 
    429     if sp.issparse(array):
    430         array = _ensure_sparse_format(array, accept_sparse, dtype, copy,
    431                                       force_all_finite)
    432     else:
--> 433         array = np.array(array, dtype=dtype, order=order, copy=copy)
        array =           Age Embarked       Fare  \
82   29.452...    0               0  

[801 rows x 228 columns]
        dtype = <class 'numpy.float32'>
        order = None
        copy = False
    434 
    435         if ensure_2d:
    436             if array.ndim == 1:
    437                 raise ValueError(

ValueError: could not convert string to float: 'Mr'
___________________________________________________________________________

In [None]:
rf_est = ensemble.RandomForestClassifier(n_estimators = 750, criterion = 'gini', max_features = 'sqrt',
                                             max_depth = 3, min_samples_split = 4, min_samples_leaf = 2,
                                             n_jobs = 50, random_state = 42, verbose = 1)
gbm_est = ensemble.GradientBoostingClassifier(n_estimators=900, learning_rate=0.0008, loss='exponential',
                                                  min_samples_split=3, min_samples_leaf=2, max_features='sqrt',
                                                  max_depth=3, random_state=42, verbose=1)
et_est = ensemble.ExtraTreesClassifier(n_estimators=750, max_features='sqrt', max_depth=35, n_jobs=50,
                                           criterion='entropy', random_state=42, verbose=1)
voting_est = ensemble.VotingClassifier(estimators = [('rf', rf_est),('gbm', gbm_est),('et', et_est)],
                                       voting = 'soft', weights = [3,5,2],
                                       n_jobs = 50)
voting_est.fit(titanic_train_data_X,titanic_train_data_Y)

In [None]:
titanic_test_data_X['Survived'] = voting_est.predict(titanic_test_data_X)
submission = pd.DataFrame({'PassengerId':test_data_org.loc[:,'PassengerId'],
                               'Survived':titanic_test_data_X.loc[:,'Survived']})
submission.to_csv('../data/result.csv',index=False,sep=',')