In [9]:
import csv
import numpy as np
with open('titanic.txt') as csvfile:
    titanic_reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    
    # Header contains feature names
    row = next(titanic_reader)
    feature_names = np.array(row)
    
    # load dataset, and target classes
    titanic_X, titanic_y = [], []
    for row in titanic_reader:
        titanic_X.append(row)
        titanic_y.append(row[2])
        # The target value is "survived"
        
    titanic_X = np.array(titanic_X)
    titanic_y = np.array(titanic_y)

In [13]:
print(feature_names)

['row.names' 'pclass' 'survived' 'name' 'age' 'embarked' 'home.dest' 'room'
 'ticket' 'boat' 'sex']


In [17]:
titanic_X[0]

array(['1', '1st', '1', 'Allen, Miss Elisabeth Walton', '29.0000',
       'Southampton', 'St Louis, MO', 'B-5', '24160 L221', '2', 'female'], 
      dtype='<U62')

In [18]:
# we keep class, age and sex
titanic_X = titanic_X[:, [1, 4, 10]]
feature_names = feature_names[[1, 4, 10]]

In [20]:
print(feature_names)

['pclass' 'age' 'sex']


In [21]:
print(titanic_X[12], titanic_y[12])

['1st' 'NA' 'female'] 1


In [23]:
titanic_X.shape

(1313, 3)

In [24]:
# We have missing values for age
# Assign the mean value
ages = titanic_X[:, 1]
mean_age = np.mean(titanic_X[ages != 'NA', 1].astype(np.float))
titanic_X[titanic_X[:, 1] == 'NA', 1] = mean_age

In [25]:
# Encode sex
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
label_encoder = enc.fit(titanic_X[:, 2])
print("Categorical classes:", label_encoder.classes_)

Categorical classes: ['female' 'male']


In [26]:
integer_classes = label_encoder.transform(label_encoder.classes_)
print("Integer classes:", integer_classes)

Integer classes: [0 1]


In [28]:
t = label_encoder.transform(titanic_X[:, 2])
titanic_X[:, 2] = t

In [29]:
titanic_X[:, 2]

array(['0', '0', '1', ..., '1', '0', '1'], 
      dtype='<U62')

In [30]:
print(feature_names)
print(titanic_X[12], titanic_y[12])

['pclass' 'age' 'sex']
['1st' '31.19418104265403' '0'] 1


In [32]:
from sklearn.preprocessing import OneHotEncoder

enc = LabelEncoder()
label_encoder = enc.fit(titanic_X[:, 0])
print("Categorical classes:", label_encoder.classes_)

Categorical classes: ['1st' '2nd' '3rd']


In [35]:
integer_classes = label_encoder.transform(label_encoder.classes_).reshape(3, 1)
print("Integer classes:", integer_classes)

Integer classes: [[0]
 [1]
 [2]]


In [36]:
enc = OneHotEncoder()
one_hot_encoder = enc.fit(integer_classes)
# First, convert classes to 0-(N-1) integers using label_encoder
num_of_rows = titanic_X.shape[0]
t = label_encoder.transform(titanic_X[:, 0]).reshape(num_of_rows, 1)
# Second, create a sparse matrix with three columns,each one indicating if the instance belongs to the class
new_features = one_hot_encoder.transform(t)
# Add the new features to titanic_X
titanic_X = np.concatenate([titanic_X, new_features.toarray()], axis=1)
# Eliminate converted columns
titanic_X = np.delete(titanic_X, [0], 1)
# Update feature names
feature_names = ['age', 'sex', 'first_class', 'second_class', 'third_class']
# Convert to numerical values
titanic_X = titanic_X.astype(float)
titanic_y = titanic_y.astype(float)

In [37]:
print(feature_names)

['age', 'sex', 'first_class', 'second_class', 'third_class']


In [38]:
print(titanic_X[0], titanic_y[0])

[ 29.   0.   1.   0.   0.] 1.0


In [46]:
titanic_X[0]

array([ 29.,   0.,   1.,   0.,   0.])

In [47]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(titanic_X, titanic_y, test_size=0.25, random_state=33)

In [48]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5)
clf = clf.fit(X_train, y_train)

In [56]:
import pydot, io
dot_data = io.StringIO()
tree.export_graphviz(clf, out_file=dot_data, feature_names=['age', 'sex', '1st_class', '2nd_class', '3rd_class'])
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_png('titanic.png')
from IPython.core.display import Image
Image(filename='titanic.png')

AttributeError: 'list' object has no attribute 'write_png'

In [57]:
import graphviz
dot_data = tree.export_graphviz(clf, out_file=dot_data, feature_names=['age', 'sex', '1st_class', '2nd_class', '3rd_class'])
graph = graphviz.Source(dot_data)
graph

ExecutableNotFound: failed to execute ['dot', '-Tsvg'], make sure the Graphviz executables are on your systems' PATH

<graphviz.files.Source at 0x144672bd940>