# Decision Tr

In [127]:
import pandas as pd
import numpy as np

df = pd.read_csv('titanic_data.csv')

df.drop(columns=['Ticket', 'Name', 'Cabin', 'PassengerId'])

# Removing NaN values
df = df.fillna(0)

# Replaces: Male = 0 and Female = 1
gender = {"male": 0, "female": 1}
df = df.replace({"Sex":gender})

# Replaces: Embarked C = 0, Q = 1, S = 2
gate = {"C":0, "Q":1, "S":2}
df = df.replace({"Embarked":gate})

# Extracting the significant features of the dataframe
X = np.array(df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']])
y = np.array(df[['Survived']])
y = np.squeeze(y)

# Calculating the correlation of the features and the output
Pclass_corr = np.corrcoef(X[:,0],y)
Sex_corr = np.corrcoef(X[:,1],y)
Age_corr = np.corrcoef(X[:,2],y)
SibSp_corr = np.corrcoef(X[:,3],y)
Parch_corr = np.corrcoef(X[:,4],y)
Fare_corr = np.corrcoef(X[:,5],y)
Embarked_corr = np.corrcoef(X[:,6],y)

print("Correlations coeficient between features and the output:\n")
print("Pclass: {}".format(Pclass_corr[0,1]))
print("Sex {}".format(Sex_corr[0,1]))
print("Age {}".format(Age_corr[0,1]))
print("SipSp {}".format(SibSp_corr[0,1]))
print("Parch {}".format(Parch_corr[0,1]))
print("Fare {}".format(Fare_corr[0,1]))
print("Embarked {}".format(Embarked_corr[0,1]))

Correlations coeficient between features and the output:

Pclass: -0.3384810359610147
Sex 0.5433513806577551
Age 0.010539215871285656
SipSp -0.03532249888573558
Parch 0.08162940708348365
Fare 0.2573065223849623
Embarked -0.1741992460782486


In [128]:
# Eliminating all features of X but Sex, Age, Parch, and Fare.
# X = X[:,[0,1,2,4,5]]

In [129]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=20, test_size=0.2 )

In [130]:
from sklearn.model_selection import ShuffleSplit
cv_sets = ShuffleSplit(n_splits = 10, test_size = 0.2, random_state = 20)

In [131]:
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

scorer = make_scorer(accuracy_score)

In [132]:
parameters = {'max_depth':list(range(1,8)),
              'min_samples_split':[5, 10, 15, 20, 25, 30],
              'min_samples_leaf':[3, 4, 5, 6, 7, 8, 9, 10]}

In [133]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()

In [134]:
from sklearn.model_selection import GridSearchCV

grid_obj = GridSearchCV(clf, parameters, scoring = scorer, cv = cv_sets)

grid_fit = grid_obj.fit(X_train,y_train)
best_clf = grid_fit.best_estimator_
print(best_clf)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=20,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


In [135]:
y_pred_train = best_clf.predict(X_train)
y_pred_test = best_clf.predict(X_test)

In [137]:
from sklearn.metrics import accuracy_score

acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)

print("Training score: {}".format(acc_train))
print("Testing score: {}".format(acc_test))

Training score: 0.8497191011235955
Testing score: 0.8379888268156425
