In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.metrics import average_precision_score, auc, roc_curve, precision_recall_curve, confusion_matrix
from sklearn.model_selection import cross_val_score
from tqdm import *
from ggplot import *
import random

In [None]:
# Load the training and test CSV files from the working directory
train, test = [pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')]
# Keep a copy of the test set as loaded, before any later cell modifies `test`
test_dumm = test.copy()

In [None]:
# Get a sense of the data.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print would add a stray "None" line.
train.info()
# print train.describe()
# print train.describe(include=['O'])

# Data Cleaning

In [None]:
# The test set has a missing Fare value - show the affected row(s)
fare_missing = test['Fare'].isnull()
print(test[fare_missing])

# Fill each missing fare with the mean Fare of the passengers that share the
# same port of embarkation and class. (The previous version averaged the Age
# column by mistake.) The group mean ignores the missing values themselves.
fare_group_means = test.groupby(['Embarked', 'Pclass'])['Fare'].transform('mean')
test.loc[fare_missing, 'Fare'] = fare_group_means[fare_missing]

# Show the imputed value(s)
print(test.loc[fare_missing, 'Fare'])

# Explorations

In [None]:
# Histograms (100 bins) of the non-null Age and Fare values, drawn in the
# first two panels of a 2x2 grid
for panel_number, column_name in enumerate(['Age', 'Fare'], 1):
    plt.subplot(2, 2, panel_number)
    plt.xlabel(column_name)
    plt.ylabel('Freq')
    plt.title('Distribution of ' + column_name)
    plt.hist(train[column_name].dropna().values, bins=100)

plt.show()

In [None]:
# Correlation heatmap of the numeric columns, annotated with each coefficient
import seaborn as sns
numeric_columns = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
Var_Corr = train[numeric_columns].corr()
axis_labels = Var_Corr.columns
sns.heatmap(Var_Corr, annot=True, xticklabels=axis_labels, yticklabels=axis_labels)

### Explore relationships with age to impute the missing values

In [None]:
# Age density for each passenger class, then the mean age per class
for pclass_value, line_colour in zip([1, 2, 3], ['r', 'b', 'g']):
    ages_in_group = train.loc[train['Pclass'] == pclass_value, 'Age']
    sns.kdeplot(ages_in_group, shade=True, color=line_colour)
print(train.groupby('Pclass')['Age'].mean())

In [None]:
# Age density for SibSp values 0, 1 and 2, then how often each SibSp value occurs
for sibsp_value, line_colour in zip([0, 1, 2], ['black', 'blue', 'red']):
    ages_in_group = train.loc[train['SibSp'] == sibsp_value, 'Age']
    sns.kdeplot(ages_in_group, shade=True, color=line_colour)
print(train['SibSp'].value_counts())

In [None]:
# Age density for Parch values 0, 1 and 2
for parch_value, line_colour in zip([0, 1, 2], ['black', 'blue', 'red']):
    ages_in_group = train.loc[train['Parch'] == parch_value, 'Age']
    sns.kdeplot(ages_in_group, shade=True, color=line_colour)

# Frequency of each Parch value, shown as the cell output
train['Parch'].value_counts()

In [None]:
# Look at combinations: age density per class, restricted to the rows where
# both Parch and SibSp are 0
no_parch_no_sibsp = (train['Parch'] == 0) & (train['SibSp'] == 0)
for pclass_value, line_colour in zip([1, 2, 3], ['blue', 'yellow', 'pink']):
    ages_in_group = train.loc[no_parch_no_sibsp & (train['Pclass'] == pclass_value), 'Age']
    sns.kdeplot(ages_in_group, shade=True, color=line_colour)

In [None]:
# Age histogram for every Pclass (rows) x Sex (columns) facet
g = sns.FacetGrid(train, row='Pclass', col='Sex')
g.map(plt.hist, "Age")
# Median age of each (Pclass, Sex) segment; reused later to impute missing ages
ages_by_segment = train.groupby(['Pclass', 'Sex'])['Age']
age_pclass_medians = ages_by_segment.median()
print(age_pclass_medians)

-  Age looks like it can be segmented by the combination of Pclass and Sex

# Feature Engineering

### Feature Engineer Names

```
# Use this code to check if all titles are there in the list
def check_title(name):
    """Return True if `name` contains at least one of the known title tokens, else False."""
    known_titles = ['Mr.', 'Master.', 'Miss.', 'Mrs.', 'Rev.', 'Don.', 'Dona.', 'Dr.', 'Mme.', 'Ms.', 'Major.', 'Lady.', 'Sir.', 'Mlle.', 'Jonkheer.', 'Col.', 'Countess.', 'Capt.']
    # any() stops at the first matching token, like the explicit loop-and-break
    return any(token in name for token in known_titles)

for i in range(len(test['Name'])):
    if check_title(test['Name'][i]) == False:
        print test['Name'][i]
```

In [None]:
# Code to get title column
def get_title(name):
    """Return the first known title token contained in `name`.

    Titles are tried in the order listed below and the first one that occurs
    as a substring of `name` is returned.

    Returns None when no known title occurs in `name`. (Previously the last
    title in the list, 'Capt.', was returned in that case, silently
    mislabelling the passenger.)
    """
    known_titles = ('Mr.', 'Master.', 'Miss.', 'Mrs.', 'Rev.', 'Don.', 'Dona.', 'Dr.', 'Mme.', 'Ms.', 'Major.', 'Lady.', 'Sir.', 'Mlle.', 'Jonkheer.', 'Col.', 'Countess.', 'Capt.')
    for title in known_titles:
        if title in name:
            return title
    return None
# for i in range(len(test['Name'])):
#     print check_title(test['Name'][i])
# Derive the `title` column of both data sets from the passenger names
train['title'] = train['Name'].map(get_title)
test['title'] = test['Name'].map(get_title)

In [None]:
# Explore the titles: mean of Survived and number of rows for each title,
# side by side (both columns keep the name 'Survived')
survived_by_title = train.groupby('title')['Survived']
pd.concat([survived_by_title.mean(), survived_by_title.count()], axis=1)

In [None]:
# Remap the title relationships: fold the rarer titles into broader groups.
# Titles that are not keys of this mapping are left unchanged.
title_map = {
    'Capt.': 'Other',
    'Col.': 'Other',
    'Countess.': 'Royal',
    'Don.': 'Royal',
    'Dr.': 'Other',
    'Jonkheer.': 'Royal',
    'Lady.': 'Royal',
    'Major.': 'Other',
    'Mlle.': 'Miss.',
    # 'Mme.' (Madame) is the French form of address for a married woman, so it
    # belongs with 'Mrs.' (it was previously mapped to 'Mr.')
    'Mme.': 'Mrs.',
    'Ms.': 'Miss.',
    'Rev.': 'Rev',
    'Sir.': 'Royal',
    'Dona.': 'Royal'
}
train['title'] = train['title'].replace(title_map)
test['title'] = test['title'].replace(title_map)

### Impute Age

In [None]:
# Age histograms of the test set, one facet per title
g = sns.FacetGrid(test, col='title')
g.map(plt.hist, 'Age')

# Training rows with the 'Master.' title whose Age is missing (cell output)
master_without_age = (train['title'] == 'Master.') & train['Age'].isnull()
train[master_without_age]

# Imputing the mean 'Master.' age for these rows was expected to help, but it
# reduced accuracy, so the imputation below is left disabled
# train.loc[(train['title'] == 'Master.')& (train['Age'].isnull() == True),'Age'] = train[(train['title'] == 'Master.')]['Age'].mean()
# test.loc[(test['title'] == 'Master.')& (test['Age'].isnull() == True),'Age'] = train[(train['title'] == 'Master.')]['Age'].mean()

In [None]:
### Assign null ages to median values from segments of Pclass and sex

def calAge(pclass, sex):
    """Return the median age stored in `age_pclass_medians` for the (pclass, sex) segment."""
    return age_pclass_medians.loc[(pclass, sex)]


def _fill_missing_age(df):
    """Return a copy of `df` whose missing 'Age' values are replaced by segment medians.

    Only rows with a null Age are looked up through `calAge`, and a frame
    without any missing ages is returned with its ages untouched. 'Age' is
    re-attached as the last column, which is where the earlier
    drop-and-concat implementation left it, so the column order seen by the
    later cells does not change.
    """
    age_missing = df['Age'].isnull()
    filled_age = df['Age']
    if age_missing.any():
        segment_medians = df.loc[age_missing, ['Pclass', 'Sex']].apply(lambda row: calAge(*row), axis=1)
        # fillna aligns on the index, so each missing row receives its own segment median
        filled_age = filled_age.fillna(segment_medians)
    result = df.drop(['Age'], axis=1)
    result['Age'] = filled_age
    return result


# Both sets use the medians held in `age_pclass_medians`
train = _fill_missing_age(train)
test = _fill_missing_age(test)

### Is_alone feature?

In [None]:
# Commenting out because this was not very helpful ~ gets accounted for in the num_dependents feature
# train['isAlone'] = train[['SibSp', 'Parch']].apply(lambda x: 1 if ((x['SibSp'] == 0) and (x['Parch'] == 0)) else 0, axis=1)
# test['isAlone'] = test[['SibSp', 'Parch']].apply(lambda x: 1 if ((x['SibSp'] == 0) and (x['Parch'] == 0)) else 0, axis=1)

## num_dependents feature?

In [None]:
# aka Family Size: the sum of the SibSp and Parch columns
for frame in (train, test):
    frame['num_dependents'] = frame['SibSp'] + frame['Parch']

In [None]:
# Check survival rates acc to num_dependents:
# one histogram of Survived per distinct num_dependents value
g = sns.FacetGrid(train, col='num_dependents')
g.map(plt.hist, 'Survived')

-  Looks definitely promising

## premium_over_class_avg?

In [None]:
# Flag passengers whose fare is above the mean fare of their class.
# The class means come from the training set and are applied to both sets.
meanFares = train.groupby('Pclass')['Fare'].mean()

for frame in (train, test):
    class_mean_fare = frame['Pclass'].map(meanFares)
    frame['premium_over_mean_ticket'] = (frame['Fare'] > class_mean_fare).astype('int64')

## Continuous to Categorical

In [None]:
# num_dependents
g = sns.FacetGrid(train, col='num_dependents')
g.map(plt.hist, 'Survived')

# Bucket the numeric family size into named categories.
# Note: this rebinds the name `title_map`, which an earlier cell used for the
# title mapping.
title_map = {0: 'alone', 1: 'couple', 2: 'small'}
title_map.update(dict.fromkeys([3, 4], 'mid'))
title_map.update(dict.fromkeys(range(5, 11), 'large'))
for frame in (train, test):
    frame['num_dependents'] = frame['num_dependents'].replace(title_map)


-  This helped increase the accuracy

In [None]:
# fare
# Histogram of Fare (200 bins), one facet per value of Survived
g = sns.FacetGrid(train, col='Survived')
plott = g.map(plt.hist, 'Fare', bins=200)
axs = plott.axes
# Restrict the x-axis of the first facet to fares between 0 and 100
axs[0,0].set_xlim(0,100)
# Mean fare for each value of Survived, shown as the cell output
train.groupby('Survived')['Fare'].mean()

In [None]:
# Two-level fare category: 'low' for fares under 20, 'high' for everything else
for frame in (train, test):
    frame['fare_categ'] = np.where(frame['Fare'] < 20, 'low', 'high')

## Downsampling?

```
random.seed(101)
# Class distribution?
train['Survived'].value_counts()

# Try Downsampling the data 
def downSample(df, y):
    """Return `df` with its most frequent class in column `y` down-sampled.

    The rows of the most frequent class are sampled without replacement
    (fixed random_state of 123) down to the number of rows of the least
    frequent class, and are returned stacked on top of the rows of the least
    frequent class.
    """
    class_counts = df[y].value_counts()
    dfMajority = df[df[y] == class_counts.idxmax()]
    dfMinority = df[df[y] == class_counts.idxmin()]

    dfMajorityDownSampled = resample(dfMajority,
                                     replace=False,
                                     n_samples=dfMinority.shape[0],
                                     random_state=123)
    return pd.concat([dfMajorityDownSampled, dfMinority])

downSample(train, 'Survived')['Survived'].value_counts()
```

# Random Forest Fit

In [None]:
def sanitizeRF(df):
    """Return a model-ready copy of `df`; `df` itself is not modified.

    Drops the 'PassengerId', 'Name', 'Ticket' and 'Cabin' columns, casts
    'Pclass' to object so that it is treated as categorical, and replaces
    every object-typed column by indicator columns (no level is dropped).
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
    features = df.drop(unused_columns, axis=1)

    # As an object column, Pclass is picked up by the dummification below
    features['Pclass'] = features['Pclass'].astype('object')

    categorical_columns = features.dtypes[features.dtypes == 'object'].index
    return pd.get_dummies(features, columns=categorical_columns)

train = sanitizeRF(train)
test = sanitizeRF(test)
# get_dummies only creates indicator columns for the categories present in the
# frame it is given, so the two frames can end up with different columns.
# Give `test` exactly the training feature columns, in the same order: a
# column missing from test is added as all zeros and a column unknown to the
# training set is dropped. This is a no-op when the columns already match.
test = test.reindex(columns=train.columns.drop('Survived'), fill_value=0)

In [None]:
# Prepare data for model fit: `y` is the Survived column, `X` is every other column
y = train['Survived']
# Extra feature columns to leave out of the model (none at the moment)
dropCols = []
X = train.drop(['Survived'] + dropCols, axis=1)
print(X.columns)


In [None]:
# random.seed(101)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25) # 75% training and 25% test
# Create a random forest classifier and report its mean accuracy over
# 10-fold cross-validation on the full training data
clf=RandomForestClassifier(n_estimators=100, max_depth=5, max_features=3, random_state=101)
cv_acc = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
cv_acc.mean()

## Random Forest Grid Search for Hyperparameter Tuning

In [None]:
# Randomized search over the random forest hyperparameters
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 2000, num = 20)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree (None leaves the depth unlimited)
max_depth = [int(x) for x in np.linspace(3, 10, num = 7)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 20, 50]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 10, 20]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Base model to tune. It is seeded like the other forests in this notebook so
# that a candidate's score does not change from one run to the next.
rf = RandomForestClassifier(random_state=101)
# Random search of parameters: 200 sampled combinations, each scored with
# 5-fold cross validation, using all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 200, cv = 5, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X, y)

In [None]:
rf_random.best_params_

In [None]:
# Check accuracy from the new parameters:
# mean accuracy over 10-fold cross-validation on the full training data
clf=RandomForestClassifier(n_estimators=1371, max_depth=7, max_features=4, bootstrap=True,min_samples_leaf=1, min_samples_split=40, random_state=101)
cv_acc = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
cv_acc.mean()

In [None]:
# Narrow down search by trying all combinations in a particular region
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search
param_grid = {
    'bootstrap': [True],
    'max_depth': [ 5,7],
    'max_features': [2, 4,6],
    'min_samples_leaf': [1],
    'min_samples_split': [40, 50,70],
    'n_estimators': [500,1371,1500]
}
# Base model. It is seeded like the other forests in this notebook so that a
# candidate's score does not change from one run to the next.
rf = RandomForestClassifier(random_state=101)
# Exhaustive search over the grid, scored with 3-fold cross validation on all cores
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                          cv = 3, n_jobs = -1, verbose = 2)
grid_search.fit(X, y)
grid_search.best_params_

### Plot probability threshold curve

In [None]:
import operator
# Choosing the threshold for max accuracy
def plot_prob_thresh(pred_probs, y_test):
    """Find the probability threshold that maximises accuracy and plot the curve.

    Thresholds 0.01, 0.02, ..., 1.00 are tried. For each one, `pred_probs` is
    turned into 0/1 predictions (1 where the probability is strictly greater
    than the threshold) and scored against `y_test` with accuracy.

    Parameters:
        pred_probs: array of predicted probabilities; it must support
            elementwise `>` and `.astype`, e.g. a numpy array.
        y_test: true labels, aligned with `pred_probs`.

    Prints the best threshold, its accuracy and the accuracy-vs-threshold plot.

    Returns:
        The threshold with the highest accuracy (the lowest such threshold
        when several tie).
    """
    xs = [step / float(100) + .01 for step in range(100)]
    ys = [metrics.accuracy_score(y_test, (pred_probs > thresh).astype('int')) for thresh in xs]

    # Locate the best threshold once; argmax returns the first maximum
    best_position = int(np.argmax(ys))
    main_thresh = xs[best_position]
    print(main_thresh)
    print(ys[best_position])

    threshdf = pd.DataFrame({'x': xs, 'y': ys}, columns=['x', 'y'])
    print(ggplot(aes(x='x', y='y'), threshdf) + geom_line() + geom_point())
    return main_thresh

# plot_prob_thresh(y_pred[:,1], y_test)

### Manual Cross Validation

```
cv_dict={}
xs = []
ys = []
ysd = []
for i in tqdm(range(15)):
    param = (i+1)
    clf=RandomForestClassifier(n_estimators=45, max_depth=6, max_features=param)
#     clf=RandomForestClassifier(n_estimators=60, max_depth=4, max_features=3)
#     scores = cross_val_score(clf, X, y, cv=10, scoring='roc_auc')
    scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
    cv_dict[param] = scores.mean()
    xs.append(param)
    ys.append(cv_dict[param])
    ysd.append(scores.std())
# print cv_dict
cv = pd.concat([pd.Series(xs),pd.Series(ys), pd.Series(['auc']*len(xs))], axis=1) # Axis =1 for concatenation along the columns. For row append, axis = 0 (adding rows: 0, adding cols = 1)
cv.columns=['x','y','met']
# ggplot(aes(x='x', y='y'), cv) + geom_line() + geom_point()

# print cv_dict
cv1 = pd.concat([pd.Series(xs),pd.Series(ysd), pd.Series(['sd']*len(xs))], axis=1) # Axis =1 for concatenation along the columns. For row append, axis = 0 (adding rows: 0, adding cols = 1)
cv1.columns=['x','y', 'met']

plt.subplot(2,1,1)
plt.plot(cv['x'], cv['y'])
plt.subplot(2,1,2)
plt.plot(cv1['x'], cv1['y'])
```

## Prediction on the main test set

In [None]:
# Refit the tuned forest on the full training data
clf = RandomForestClassifier(n_estimators=1371, max_depth=7, max_features=4,
                             bootstrap=True, min_samples_leaf=1,
                             min_samples_split=40, random_state=101)
clf.fit(X, y)
# Probability-threshold variant, kept for reference:
# y_pred_main=clf.predict_proba(test)
# main_preds = (y_pred_main[:,1] > main_thresh).astype('int')
# main_preds = (y_pred_main[:,1] > .57).astype('int')
# Class predictions for the main test set
main_preds = y_pred_main = clf.predict(test)
# Pair the predictions with the passenger ids of the test set as loaded, then save
predDF = test_dumm[['PassengerId']].assign(Survived=main_preds)
predDF.to_csv('preds_Oct21_NN_embeddings.csv', index=False)

In [None]:
# Feature importances of the fitted forest, most important first
importance_table = pd.DataFrame({'importance': clf.feature_importances_}, index=X.columns)
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances

In [None]:
# # Manual inspection
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25) # 70% training and 30% test
# clf=RandomForestClassifier(n_estimators=1371, max_depth=7, max_features=4, bootstrap=True,min_samples_leaf=1, min_samples_split=40, random_state=101)
# # cv_acc = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
# # clf.fit(X,y)
# # cv_acc.mean()
# clf.fit(X_train,y_train)
# # Get prob scores for the main test set
# # y_pred_main=clf.predict_proba(test)
# y_pred_main=clf.predict(X_test)
# test_manual = X_test.copy()
# test_manual['preds'] = y_pred_main
# test_manual['labels'] = y_test

# XGBoost

In [None]:
import xgboost as xgb

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # 80% training and 20% test
# Fit a gradient-boosted classifier on the training part of the split
gbm = xgb.XGBClassifier(max_depth=4, n_estimators=300, learning_rate=0.05).fit(X_train, y_train)
# predictions = gbm.predict(test_X)
# Predicted class probabilities for the held-out part of the split
y_pred=gbm.predict_proba(X_test)

# Neural Net

In [None]:
import numpy
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.utils import np_utils
from sklearn.preprocessing import StandardScaler
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold

In [None]:
# Do data scaling.
# The scaler is fitted on the training features only and the same fitted
# statistics are then applied to the main test set. (Calling fit_transform on
# the test set would re-fit the scaler and put the two sets on different scales.)
scaler = StandardScaler()
X_Scaled = scaler.fit_transform(X)
X_mainTest = scaler.transform(test)
X_train, X_test, y_train, y_test = train_test_split(X_Scaled, y, test_size=0.2) # 80% training and 20% test

In [None]:
def baseline_model():
    """Build and compile a network with one hidden layer and a sigmoid output.

    The hidden layer is a ReLU layer with as many units as the global feature
    frame `X` has columns; the output layer is a single sigmoid unit. The
    model is compiled with binary cross-entropy loss, the Adam optimizer and
    accuracy as its metric.
    """
    n_features = X.shape[1]
    hidden_layer = Dense(n_features, input_dim=n_features, kernel_initializer='normal', activation='relu')
    # A second hidden layer of the same shape was tried and is left disabled:
    # Dense(X.shape[1], input_dim=X.shape[1], kernel_initializer='normal', activation='relu')
    output_layer = Dense(1, kernel_initializer='normal', activation='sigmoid')

    model = Sequential([hidden_layer, output_layer])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
# Build the model
model = baseline_model()

# Fit the model
# The model is trained for 500 epochs with batches of 3 rows. The held-out split (X_test, y_test) is passed as validation data, so the validation metrics are reported while the model trains. A verbose value of 2 reduces the output to one line per training epoch.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=500, batch_size=3, verbose=2)

# Evaluate the model on the held-out split; scores[1] is the accuracy metric,
# so the printed error is 100% minus the accuracy
scores = model.evaluate(X_test, y_test, verbose=0)
print("Baseline Error: %.2f%%" % (100-scores[1]*100))

In [None]:
y_test_preds = model.predict(X_test)

In [None]:
# Model outputs for the scaled main test set
preds = model.predict(X_mainTest)
# Convert to 0/1 labels: 1 where the output is above the 0.47 cut-off
int_preds = (preds > .47).astype('int')

## Create AutoEncoder representations from the titanic dataset

In [None]:
import tensorflow as tf
import math
print("Tensorflow version " + tf.__version__)
tf.set_random_seed(0)

In [None]:
xdata = X.copy()

In [None]:
# Do data scaling.
# The scaler is fitted on the training features only and the same fitted
# statistics are then applied to the main test set. (Calling fit_transform on
# the test set would re-fit the scaler and put the two sets on different scales.)
# Both matrices are cast to float32 to match the float32 graph built below.
scaler = StandardScaler()
X_Scaled = scaler.fit_transform(X).astype('float32')
X_mainTest = scaler.transform(test).astype('float32')

In [None]:
# iiiiiiiii
# *********
#   *****
# *********
# iiiiiiiii

In [None]:
# Width of the bottleneck layer, i.e. the number of encoded features per row
num_encoded_units = 3

# X_ receives the network input and Y_ the reconstruction target
X_ = tf.placeholder(tf.float32, [None, X_Scaled.shape[1]])
Y_ = tf.placeholder(tf.float32, [None, X_Scaled.shape[1]])

# 1st hidden layer (same width as the input)
W1 = tf.Variable(tf.truncated_normal([X_Scaled.shape[1], X_Scaled.shape[1]], stddev=0.1))
B1 = tf.Variable(tf.ones([X_Scaled.shape[1]])/10)

# 2nd hidden layer: the bottleneck that produces the encoding
W2 = tf.Variable(tf.truncated_normal([X_Scaled.shape[1], num_encoded_units], stddev=0.1))
B2 = tf.Variable(tf.ones([num_encoded_units])/10)

# Output layer: expands the encoding back to the input width
W3 = tf.Variable(tf.truncated_normal([num_encoded_units, X_Scaled.shape[1]], stddev=0.1))
B3 = tf.Variable(tf.ones([X_Scaled.shape[1]])/10)

Y1 = tf.nn.relu(tf.matmul(X_, W1) + B1)
Y2 = tf.nn.relu(tf.matmul(Y1, W2) + B2)
# NOTE(review): the ReLU on the output cannot produce negative values, while
# standardised inputs contain them - consider a linear output layer.
Y3 = tf.nn.relu(tf.matmul(Y2, W3) + B3)

# Reconstruction loss: mean squared difference between the network output and
# the target. (The loss used to be the mean of tf.square(Y3) alone, which
# ignored Y_ and only pushed the outputs towards zero.)
loss = tf.reduce_mean(tf.square(Y3 - Y_))
train_step=tf.train.AdamOptimizer(.01).minimize(loss)

In [None]:
# Create the variable-initialisation op, open a session and run the op in it
init=tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

In [None]:
a = 0
# The same full-batch feed is used at every step: the scaled training matrix
# is both the input and the target
train_data = {X_: X_Scaled, Y_: X_Scaled}
for i in range(1000):
    # One optimisation step, then evaluate and print the loss after that step
    sess.run(train_step, feed_dict=train_data)
    l = sess.run([loss], feed_dict=train_data)
    print(l)

In [None]:
# Get values of the hidden representation for the training matrix.
# The layer tensors Y1 and Y2 already defined in the graph are evaluated
# directly instead of re-creating the layer formulas as new ops, so the
# encoding always follows the graph definition and re-running this cell does
# not add ops to the graph.
y1, y2 = sess.run([Y1, Y2], feed_dict={X_: X_Scaled})

In [None]:
# Hidden representation of the main test matrix, obtained by evaluating the
# layer tensors Y1 and Y2 already defined in the graph (no new ops are created)
y1_maintest, y2_maintest = sess.run([Y1, Y2], feed_dict={X_: X_mainTest})
# Store each encoded unit as its own column of the test frame
for unit_position, column_name in enumerate(['encoded1', 'encoded2', 'encoded3']):
    test[column_name] = y2_maintest[:, unit_position]

In [None]:
# Append the three encoded units to the training features, one column each
encoded_columns = ['encoded1', 'encoded2', 'encoded3']
for unit_position, column_name in enumerate(encoded_columns):
    X[column_name] = y2[:, unit_position]
# Frame holding only the encoded columns
Xn = X[encoded_columns]

In [None]:
# Add the new embeddings to the existing data set to check if the accuracy improves:
# X now also holds the encoded columns; report the mean 10-fold CV accuracy
clf=RandomForestClassifier(n_estimators=1000, max_depth=4, max_features=2, bootstrap=True,min_samples_leaf=1, min_samples_split=40, random_state=101)
cv_acc = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
cv_acc.mean()