In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import preprocessing
from sklearn.preprocessing import Imputer, OneHotEncoder, LabelEncoder

from imblearn.over_sampling import SMOTE
from sklearn.cross_validation import train_test_split

from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree
from IPython.display import SVG
from graphviz import Source
from IPython.display import display                               
from ipywidgets import interactive



In [2]:
from sklearn.metrics import roc_auc_score, roc_curve, auc

In [3]:
# Load the Titanic CSV from the project-relative data folder.
df = pd.read_csv(filepath_or_buffer='./data/titanic.csv')

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Median imputer working along axis 0 (per column); it is fitted on the Age
# column in the cleaning cell below.
imp = Imputer(axis=0, strategy='median')

In [6]:
# Drop columns that are not used by any later cell.
# errors='ignore' keeps this cell re-runnable: a second run no longer raises
# KeyError because the columns have already been removed.
df = df.drop(['Cabin', 'Name', 'PassengerId', 'Ticket'], axis=1, errors='ignore')

# Replace missing ages using the median imputer. fit_transform returns an
# (n, 1) array, so flatten it back to one value per row before assigning.
df['Age'] = imp.fit_transform(df['Age'].values.reshape(-1, 1)).ravel()

# Fill missing Embarked values with the most frequent value. Assigning the
# result back (rather than calling fillna(..., inplace=True) on the selected
# column) avoids chained assignment, which can silently leave df unchanged.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

In [7]:
# Sum of the SibSp and Parch counts plus one (presumably the passenger
# themselves - confirm against the data dictionary).
df['GroupSize'] = df['SibSp'].add(df['Parch']).add(1)

In [8]:
# Bucket GroupSize into named bands. pd.cut uses right-closed intervals by
# default, so these edges give (0, 1], (1, 3], (3, 5], (5, 8], (8, 12].
labels = ['Single', 'Couple', 'Small Group', 'Medium Group', 'Large Group']
bins = [0, 1, 3, 5, 8, 12]
df['GroupType'] = pd.cut(x=df['GroupSize'].astype(int), bins=bins, labels=labels)

In [9]:
# Bucket Age into named bands using right-closed intervals:
# (0, 5], (5, 10], (10, 15], (15, 30], (30, 55], (55, 80].
labels = ['Baby', 'Child', 'Teen', 'Young Adult', 'Adult', 'Senior']
bins = [0, 5, 10, 15, 30, 55, 80]
df['AgeBin'] = pd.cut(x=df['Age'].astype(float), bins=bins, labels=labels)

In [10]:
# Split Fare into five quantile-based buckets labelled 1 through 5.
df['FareBin'] = pd.qcut(x=df['Fare'], q=5, labels=list(range(1, 6)))

In [11]:
# Preview the frame again now that the engineered columns have been added.
df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,GroupSize,GroupType,AgeBin,FareBin
0,0,3,male,22.0,1,0,7.25,S,2,Couple,Young Adult,1
1,1,1,female,38.0,1,0,71.2833,C,2,Couple,Adult,5
2,1,3,female,26.0,0,0,7.925,S,1,Single,Young Adult,2
3,1,1,female,35.0,1,0,53.1,S,2,Couple,Adult,5
4,0,3,male,35.0,0,0,8.05,S,1,Single,Adult,2


In [12]:
# Keep the target plus the selected feature columns, then one-hot encode Sex.
# drop_first=True drops the first level so the indicators are not redundant.
test = pd.get_dummies(
    df[['Survived', 'Pclass', 'Sex', 'GroupSize', 'Fare', 'FareBin', 'SibSp', 'Age']],
    columns=['Sex'],
    drop_first=True,
)

In [13]:
# feature matrix
X = test.drop(['Survived'], axis=1)
# target vector
y = test.Survived
# class labels
labels = X.columns

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [15]:
# Seed the sampler so any synthetic samples it generates are reproducible,
# consistent with the seeded instance used for the actual resampling below.
sm = SMOTE(random_state=42)

In [16]:
# Oversample the training split only; the test split is left untouched.
sm = SMOTE(random_state=42)
# imbalanced-learn renamed fit_sample to fit_resample and later removed the
# old name, so use whichever method this installation provides.
_resample = getattr(sm, 'fit_resample', None) or sm.fit_sample
X_train, y_train = _resample(X_train, y_train)

In [17]:
# feature matrix
X = test.drop(['Survived'], axis=1)
# target vector
y = test.Survived
# class labels
labels = X.columns

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def display_roc_auc(fpr, tpr, roc_auc):
    """Draw a ROC curve with its AUC shown in the legend.

    Parameters
    ----------
    fpr : sequence of numbers
        False positive rates, plotted on the x axis.
    tpr : sequence of numbers
        True positive rates, plotted on the y axis.
    roc_auc : number
        Area under the curve; rendered with two decimals in the legend.

    Returns
    -------
    None. The figure is rendered via ``plt.show()``.
    """
    # Explicit figure/axes instead of the pyplot state machine, using the
    # notebook-level `plt` import rather than re-importing inside the function.
    fig, ax = plt.subplots()
    ax.set_title('Receiver Operating Characteristic')
    ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
    ax.plot([0, 1], [0, 1], 'r--')
    ax.legend(loc='lower right')
    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])
    ax.set_ylabel('True Positive Rate')
    ax.set_xlabel('False Positive Rate')
    plt.show()

def plot_tree(crit, split, depth, min_split, min_leaf=0.2, min_decrease=0.001):
    """Fit a decision tree, then display its ROC curve and its diagram.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test``, ``y_test``
    and ``labels`` variables.

    Parameters
    ----------
    crit
        Passed to the classifier as ``criterion``.
    split
        Passed to the classifier as ``splitter``.
    depth
        Passed as ``max_depth`` after conversion with ``int()``, so that
        float-valued selections such as ``3.0`` are accepted. ``None`` is
        passed through unchanged.
    min_split
        Passed as ``min_samples_split``.
    min_leaf
        Passed as ``min_samples_leaf``.
    min_decrease
        Passed as ``min_impurity_decrease``.

    Returns
    -------
    The fitted ``DecisionTreeClassifier``.
    """
    estimator = DecisionTreeClassifier(
        random_state=0,
        criterion=crit,
        splitter=split,
        max_depth=None if depth is None else int(depth),
        min_samples_split=min_split,
        min_samples_leaf=min_leaf,
        min_impurity_decrease=min_decrease,
    )
    estimator.fit(X_train, y_train)

    dot_source = tree.export_graphviz(
        estimator,
        out_file=None,  # return the DOT text instead of writing a file
        feature_names=labels,
        # Name the classes from the fitted estimator instead of a hard-coded
        # list, so the count always matches the classes actually present.
        class_names=[str(cls) for cls in estimator.classes_],
        filled=True,
    )
    graph = Source(dot_source)

    # Score the held-out rows with the probability of the second class
    # (column 1 of predict_proba) and build the ROC curve from those scores.
    probs = estimator.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, probs)
    roc_auc = auc(fpr, tpr)

    display_roc_auc(fpr, tpr, roc_auc)
    display(SVG(graph.pipe(format='svg')))
    return estimator

# Wire one selection widget per plot_tree argument; changing any of them
# re-runs plot_tree with the selected values.
inter = interactive(
    plot_tree,
    crit=["gini", "entropy"],
    split=["best"],
    # Whole-number depths 1..35 as Python ints. np.linspace(1, 35, 35) yields
    # floats (1.0, 2.0, ...), while max_depth is documented as an integer.
    depth=list(range(1, 36)),
    min_split=np.linspace(.01, .5, 100),
    min_leaf=np.linspace(.01, .5, 100),
    min_decrease=np.linspace(.001, .01, 100),
)
display(inter)

interactive(children=(Dropdown(description='crit', options=('gini', 'entropy'), value='gini'), Dropdown(descri…