In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('./tt/train.csv')

In [3]:
data = data[['Survived', 'Pclass','Sex', 'Age', 'SibSp',
       'Parch', 'Fare', 'Embarked']]

In [4]:
data['Age'] = data['Age'].fillna(data['Age'].mean())

In [5]:
data.fillna(0, inplace=True)

In [6]:
data['Sex'] =[1 if x=='male' else 0 for x in data.Sex]

In [7]:
data['p1'] = np.array(data['Pclass'] == 1).astype(np.int32)
data['p2'] = np.array(data['Pclass'] == 2).astype(np.int32)
data['p3'] = np.array(data['Pclass'] == 3).astype(np.int32)

In [8]:
del data['Pclass']

In [9]:
data['e1'] = np.array(data['Embarked'] == 'S').astype(np.int32)
data['e2'] = np.array(data['Embarked'] == 'C').astype(np.int32)
data['e3'] = np.array(data['Embarked'] == 'Q').astype(np.int32)

In [10]:
del data['Embarked']

In [11]:
data_train = data[['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'p1', 'p2', 'p3',
       'e1', 'e2', 'e3']].values

In [12]:
data_target = data['Survived'].values.reshape(len(data),1)

In [13]:
np.shape(data_train),np.shape(data_target)

((891, 11), (891, 1))

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
x_train, x_test, y_train, y_test = train_test_split(data_train, data_target, test_size = 0.2)

In [14]:
from sklearn.ensemble import RandomForestClassifier

In [15]:
model = RandomForestClassifier(n_estimators=100, n_jobs=4)

In [18]:
model.fit(x_train, y_train.ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=4,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [19]:
model.score(x_test,y_test)

0.7988826815642458

In [20]:
model.feature_importances_

array([0.2451983 , 0.27472705, 0.04810057, 0.04127795, 0.25706224,
       0.02817151, 0.01808067, 0.0486292 , 0.01571343, 0.01351074,
       0.00952834])

In [21]:
for futh, imp in zip(['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'p1', 'p2', 'p3','e1', 'e2', 'e3'], model.feature_importances_):
    print(futh, ':', imp)

Sex : 0.24519830223547842
Age : 0.27472705128342034
SibSp : 0.04810057269657244
Parch : 0.041277947706635336
Fare : 0.25706223691371166
p1 : 0.028171510582244062
p2 : 0.018080668372255995
p3 : 0.04862920399336567
e1 : 0.01571343220298058
e2 : 0.013510736829247656
e3 : 0.009528337184087975


In [26]:
model.predict(x_test[0].reshape(1, -1))

array([1])

In [27]:
model.estimators_

[DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False,
             random_state=512576206, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False,
             random_state=2143031552, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_wei

In [28]:
from sklearn.model_selection import GridSearchCV

In [29]:
n_estimators = range(80, 130)

In [30]:
param_grid = {'n_estimators':n_estimators}

In [31]:
model = GridSearchCV(RandomForestClassifier(), param_grid, cv = 5)

In [32]:
model.fit(data_train, data_target.ravel())

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': range(80, 130)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [33]:
model.best_params_

{'n_estimators': 95}

In [34]:
model.best_score_

0.8260381593714927