## Import Packages and Prepare Data 

In [1]:
import tensorflow.python.util.deprecation as deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False

In [2]:
# Import packages
import numpy as np
import pandas as pd 

In [3]:
# Import data
df = pd.read_csv('train.csv')

# Delete columns not being used 
cols = ['Name','Ticket','Cabin']
df = df.drop(cols, axis = 1)
print(df.head())

   PassengerId  Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0            1         0       3    male  22.0      1      0   7.2500        S
1            2         1       1  female  38.0      1      0  71.2833        C
2            3         1       3  female  26.0      0      0   7.9250        S
3            4         1       1  female  35.0      1      0  53.1000        S
4            5         0       3    male  35.0      0      0   8.0500        S


In [4]:
# Set index to 'PassengerId'
df = df.set_index('PassengerId')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 8 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(2)
memory usage: 62.6+ KB
None


In [5]:
# Create categorical variables
numerics = ['Age','SibSp','Parch','Fare']
categories = ['Pclass','Sex','Embarked']
df[categories] = df[categories].astype('category')

# Drop null values
df = df.dropna()
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 1 to 891
Data columns (total 8 columns):
Survived    712 non-null int64
Pclass      712 non-null category
Sex         712 non-null category
Age         712 non-null float64
SibSp       712 non-null int64
Parch       712 non-null int64
Fare        712 non-null float64
Embarked    712 non-null category
dtypes: category(3), float64(2), int64(3)
memory usage: 35.8 KB
None


In [6]:
# Create dummy variables
dummy_df = pd.get_dummies(df, columns=['Pclass','Sex', 'Embarked'])
print(dummy_df.head())

             Survived   Age  SibSp  Parch     Fare  Pclass_1  Pclass_2  \
PassengerId                                                              
1                   0  22.0      1      0   7.2500         0         0   
2                   1  38.0      1      0  71.2833         1         0   
3                   1  26.0      0      0   7.9250         0         0   
4                   1  35.0      1      0  53.1000         1         0   
5                   0  35.0      0      0   8.0500         0         0   

             Pclass_3  Sex_female  Sex_male  Embarked_C  Embarked_Q  \
PassengerId                                                           
1                   1           0         1           0           0   
2                   0           1         0           1           0   
3                   1           1         0           0           0   
4                   0           1         0           0           0   
5                   1           0         1           0

In [7]:
# Create Predictor and Response variables

labels = dummy_df['Survived'].values
print(labels.shape)

features = dummy_df.loc[:, dummy_df.columns != 'Survived'].values
print(features.shape)


(712,)
(712, 12)


## Fit Two DecisionTreeClassifier Models. One Scaled and One Not Scaled

In [8]:
# Import ML packages 
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [9]:
# Split training and test variables
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state = 77)

In [10]:
# Create Pipeline for scaled model
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

steps = [('scaler' , StandardScaler()),
         ('dt', DecisionTreeClassifier())]

Pipeline = Pipeline(steps)

In [11]:
#Create parameter dictionaries

print(sorted(Pipeline.get_params().keys()))

# Parameters for unscaled model
param_dist_dt = {"max_depth": np.arange(1,12),
                 "min_samples_split" : np.linspace(0.1, 1.0, 100, endpoint=True),
              "max_features": list(range(1,X_train.shape[1])),
              "min_samples_leaf": np.linspace(0.1, 0.5, 100, endpoint=True),
              "criterion": ["gini", "entropy"]}

# Parameters for scaled model
param_dist_dt_cv = {"dt__max_depth": np.arange(1,12),
              "dt__min_samples_split" : np.linspace(0.1, 1.0, 100, endpoint=True),
              "dt__max_features": list(range(1,X_train.shape[1])),
              "dt__min_samples_leaf": np.linspace(0.1, 0.5, 100, endpoint=True),
              "dt__criterion": ["gini", "entropy"]}

['dt', 'dt__class_weight', 'dt__criterion', 'dt__max_depth', 'dt__max_features', 'dt__max_leaf_nodes', 'dt__min_impurity_decrease', 'dt__min_impurity_split', 'dt__min_samples_leaf', 'dt__min_samples_split', 'dt__min_weight_fraction_leaf', 'dt__presort', 'dt__random_state', 'dt__splitter', 'memory', 'scaler', 'scaler__copy', 'scaler__with_mean', 'scaler__with_std', 'steps', 'verbose']


## Find the Best Parameters

In [12]:
# Unscaled model RandomizedSearchCV
dt = DecisionTreeClassifier()
dt_cv = RandomizedSearchCV(dt, param_dist_dt, cv = 10)
dt_cv.fit(X_train,y_train)
print("Tuned Decision Tree Parameters: {}".format(dt_cv.best_params_))
print("Best score is {}".format(dt_cv.best_score_))

Tuned Decision Tree Parameters: {'min_samples_split': 0.2181818181818182, 'min_samples_leaf': 0.19292929292929295, 'max_features': 8, 'max_depth': 9, 'criterion': 'gini'}
Best score is 0.7811244979919679




In [13]:
# Unscaled predictions
y_pred_cv = dt_cv.predict(X_test)
print("Accuracy: " , accuracy_score(y_test, y_pred_cv))

Accuracy:  0.7757009345794392


In [14]:
# Scaled Model RandomizedSearchCV
dt_cv2 = RandomizedSearchCV(Pipeline, param_dist_dt_cv, cv = 10)
dt_cv2.fit(X_train,y_train)
print("Tuned Decision Tree Parameters: {}".format(dt_cv2.best_params_))
print("Best score is {}".format(dt_cv2.best_score_))



Tuned Decision Tree Parameters: {'dt__min_samples_split': 0.5909090909090909, 'dt__min_samples_leaf': 0.3343434343434344, 'dt__max_features': 8, 'dt__max_depth': 5, 'dt__criterion': 'entropy'}
Best score is 0.7811244979919679


## Print Validation Accuracy Score

In [15]:
# Scaled predictions
y_pred_cv2 = dt_cv2.predict(X_test)
print("Accuracy: " , accuracy_score(y_test, y_pred_cv2))

Accuracy:  0.7757009345794392


## Make Predictions on the Holdout Dataset and Convert them to a CSV

In [16]:
holdout = pd.read_csv('test.csv')
holdout = holdout.drop(cols, axis = 1)
print(holdout.head())

   PassengerId  Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0          892       3    male  34.5      0      0   7.8292        Q
1          893       3  female  47.0      1      0   7.0000        S
2          894       2    male  62.0      0      0   9.6875        Q
3          895       3    male  27.0      0      0   8.6625        S
4          896       3  female  22.0      1      1  12.2875        S


In [17]:
# Set index to 'PassengerId'
holdout = holdout.set_index('PassengerId')
print(holdout.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 7 columns):
Pclass      418 non-null int64
Sex         418 non-null object
Age         332 non-null float64
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        417 non-null float64
Embarked    418 non-null object
dtypes: float64(2), int64(3), object(2)
memory usage: 26.1+ KB
None


In [18]:
holdout[categories] = holdout[categories].astype('category')
print(holdout.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 7 columns):
Pclass      418 non-null category
Sex         418 non-null category
Age         332 non-null float64
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        417 non-null float64
Embarked    418 non-null category
dtypes: category(3), float64(2), int64(2)
memory usage: 17.8 KB
None


In [19]:
# Create dummy variables
dummy_df_holdout = pd.get_dummies(holdout, columns=['Pclass','Sex', 'Embarked'])
print(dummy_df_holdout.head())

              Age  SibSp  Parch     Fare  Pclass_1  Pclass_2  Pclass_3  \
PassengerId                                                              
892          34.5      0      0   7.8292         0         0         1   
893          47.0      1      0   7.0000         0         0         1   
894          62.0      0      0   9.6875         0         1         0   
895          27.0      0      0   8.6625         0         0         1   
896          22.0      1      1  12.2875         0         0         1   

             Sex_female  Sex_male  Embarked_C  Embarked_Q  Embarked_S  
PassengerId                                                            
892                   0         1           0           1           0  
893                   1         0           0           0           1  
894                   0         1           0           1           0  
895                   0         1           0           0           1  
896                   1         0           0    

In [20]:
dummy_df_holdout = dummy_df_holdout.fillna(dummy_df_holdout.mean())
print(dummy_df_holdout.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 12 columns):
Age           418 non-null float64
SibSp         418 non-null int64
Parch         418 non-null int64
Fare          418 non-null float64
Pclass_1      418 non-null uint8
Pclass_2      418 non-null uint8
Pclass_3      418 non-null uint8
Sex_female    418 non-null uint8
Sex_male      418 non-null uint8
Embarked_C    418 non-null uint8
Embarked_Q    418 non-null uint8
Embarked_S    418 non-null uint8
dtypes: float64(2), int64(2), uint8(8)
memory usage: 19.6 KB
None


In [21]:
holdout_pred = dt_cv2.predict(dummy_df_holdout)
print(holdout_pred)

[0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1
 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 1 1 0 1 0
 1 0 0 1 0 1 0 0 0 0 0 0 1 1 1 0 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 1 1 0 1 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 1 0 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 1 0 1 0 0 0 0 1 1 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 1 0 1 1 0
 0 1 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 1 0 1 0 0 0]


In [22]:
Id = np.arange(892, 1310)
y_pred_df = pd.DataFrame(holdout_pred, Id)
y_pred_df.index.name = 'PassengerId'
y_pred_df.columns = ["Survived"]
print(y_pred_df)

             Survived
PassengerId          
892                 0
893                 1
894                 0
895                 0
896                 1
897                 0
898                 1
899                 0
900                 1
901                 0
902                 0
903                 0
904                 1
905                 0
906                 1
907                 1
908                 0
909                 0
910                 1
911                 1
912                 0
913                 0
914                 1
915                 0
916                 1
917                 0
918                 1
919                 0
920                 0
921                 0
...               ...
1280                0
1281                0
1282                0
1283                1
1284                0
1285                0
1286                0
1287                1
1288                0
1289                1
1290                0
1291                0
1292      

In [23]:
y_pred_df.to_csv('submission.csv')