In [209]:
import numpy as np
#import matplotlib.pyplot as plt
import pandas as pd
#%matplotlib inline

import scipy as sp
#from scipy.special import expit

from sklearn import tree


from ipywidgets import interact
from IPython.display import display

from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import FixedTicker
from bokeh.charts import Bar
output_notebook()

In [210]:
def costFunction(theta,*args):
    """Return the mean logistic-regression cross-entropy cost J(theta).

    Parameters
    ----------
    theta : 1-D array of length n
        Model coefficients, one per column of X.
    *args : (X, y)
        X is the (m, n) design matrix and y the length-m vector of 0/1
        labels.  Each may be a pandas object or anything ``np.asarray``
        accepts.

    Returns
    -------
    float
        J = -(1/m) * sum(y*log(h) + (1 - y)*log(1 - h)),
        where h = sigmoid(X . theta).
    """
    X, y = args
    # np.asarray replaces DataFrame.as_matrix(), which newer pandas no longer has.
    features = np.asarray(X, dtype=float)
    labels = np.asarray(y, dtype=float)

    z = np.dot(features, theta)

    # log(h) = -log(1 + exp(-z)) and log(1 - h) = -log(1 + exp(z)).
    # Evaluating both through logaddexp avoids the 0 * log(0) = nan that the
    # textbook formula produces once the sigmoid saturates to exactly 0 or 1.
    per_sample_cost = (labels*np.logaddexp(0.0, -z)
                       + (1.0 - labels)*np.logaddexp(0.0, z))
    return per_sample_cost.mean()

def gradient(theta,*args):
    """Return the gradient of the logistic-regression cost with respect to theta.

    Parameters
    ----------
    theta : 1-D array of length n
        Model coefficients, one per column of X.
    *args : (X, y)
        X is the (m, n) design matrix and y the length-m vector of 0/1
        labels.  Each may be a pandas object or anything ``np.asarray``
        accepts.

    Returns
    -------
    numpy.ndarray of length n
        (1/m) * X^T . (sigmoid(X . theta) - y)
    """
    X, y = args
    # np.asarray replaces DataFrame.as_matrix(), which newer pandas no longer has.
    features = np.asarray(X, dtype=float)
    labels = np.asarray(y, dtype=float)

    residual = sp.special.expit(np.dot(features, theta)) - labels
    return np.dot(features.T, residual) / len(labels)
    

In [211]:
def predict(theta,X):
    """Classify each row of X as 1.0 or 0.0 with a logistic model.

    Parameters
    ----------
    theta : 1-D array of length n
        Model coefficients, one per column of X.
    X : (m, n) pandas DataFrame or array-like
        Feature rows to classify.

    Returns
    -------
    numpy.ndarray of float, length m
        1.0 where sigmoid(X . theta) >= 0.5, otherwise 0.0.
    """
    # np.asarray replaces DataFrame.as_matrix(), which newer pandas no longer has.
    features = np.asarray(X, dtype=float)
    probabilities = sp.special.expit(np.dot(features, theta))
    return (probabilities >= 0.5).astype(float)

In [212]:
# Load the pre-cleaned Titanic training and cross-validation splits.
# NOTE: DATA_DIR is machine-specific; point it at your own copy of the CSVs.
DATA_DIR = "/Users/miceli/Data/Titanic"

# 'Unnamed: 0' is dropped right after reading; it is presumably the row index
# saved by an earlier to_csv call (TODO confirm) and is not a model feature.
trainClean = pd.read_csv(DATA_DIR + "/CleanedTItanicDivided_TRAIN.csv").drop(['Unnamed: 0'], axis=1)
cvClean = pd.read_csv(DATA_DIR + "/CleanedTItanicDivided_CV.csv").drop(['Unnamed: 0'], axis=1)

# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(trainClean.columns)

Index([u'Survived', u'Pclass', u'Age', u'SibSp', u'Parch', u'Fare',
       u'NameLength', u'NameCapitals', u'female', u'HasAge',
       u'LengthTicketNumber', u'AlphaTicketNumber', u'HasCabinLetter',
       u'CabinLetter', u'EmbarkedNum'],
      dtype='object')


In [213]:
# Store the features in X: every column except the 'Survived' label.
# axis is passed by keyword because newer pandas rejects it positionally.
X = trainClean.drop('Survived', axis=1)
X_cv = cvClean.drop('Survived', axis=1)

# Sample and feature counts of the training set (before the intercept column is added).
m_numSamples, n_numFeatures = X.shape

# Store the label in y.
y = trainClean['Survived']
y_cv = cvClean['Survived']

In [214]:
# Add a constant column of ones to each design matrix so theta carries an intercept term.
# Plain arrays sized to each frame are assigned positionally. Wrapping the
# training-sized vector in a pd.Series would align on index labels and leave
# NaN in X_cv whenever its index is longer than, or different from, 0..m-1.
intercept = np.ones(m_numSamples)
X = X.assign(intercept=intercept)
X_cv = X_cv.assign(intercept=np.ones(len(X_cv)))

# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(X.columns)

Index([u'Pclass', u'Age', u'SibSp', u'Parch', u'Fare', u'NameLength',
       u'NameCapitals', u'female', u'HasAge', u'LengthTicketNumber',
       u'AlphaTicketNumber', u'HasCabinLetter', u'CabinLetter', u'EmbarkedNum',
       u'intercept'],
      dtype='object')


In [215]:
# Starting point for the optimiser: all-zero coefficients,
# one per original feature plus one for the intercept column.
num_parameters = n_numFeatures + 1
initial_theta = np.zeros(num_parameters)

In [219]:
# Fit theta by nonlinear conjugate gradient, using the analytic gradient.
# `epsilon` is intentionally not passed: fmin_cg only uses it as the
# finite-difference step when fprime has to be approximated numerically,
# so it has no effect when `gradient` is supplied.
# With full_output=True the call returns the optimum, the cost there, the
# function/gradient evaluation counts, and a warning flag.
xopt, fopt, func_calls, grad_calls, warnflag = sp.optimize.fmin_cg(
    f=costFunction,
    x0=initial_theta,
    fprime=gradient,
    args=(X, y),
    maxiter=2000,
    full_output=True,
)

Optimization terminated successfully.
         Current function value: 0.414672
         Iterations: 1256
         Function evaluations: 2389
         Gradient evaluations: 2389


In [220]:
# Report the fitted coefficients and the final cost, then keep the
# coefficients under the name used by the prediction cells.
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print("theta = " + str(xopt))
print("cost = " + str(fopt))
theta = xopt

theta = [ -9.04617142e-01  -4.78329933e-02  -4.55075914e-01  -1.34158595e-01
   1.85067325e-03   3.22370377e-02   4.67371768e-02   2.67146498e+00
   1.62429675e+00   3.03455504e-02  -3.83201728e-01  -4.43798942e-01
   2.22838283e-01  -3.51023485e-01   3.70466367e-01]
cost = 0.41467204674


In [231]:
# Confusion-matrix counts on the cross-validation set.
prediction = predict(theta,X_cv)

# Boolean masks: True where the passenger survived / was predicted to survive.
actual_survived = np.asarray(y_cv) == 1
predicted_survived = np.asarray(prediction) == 1

# int(...) keeps the counts as plain Python integers.
survived_predSurvived = int(np.sum(actual_survived & predicted_survived))
survived_predDied = int(np.sum(actual_survived & ~predicted_survived))
died_predSurvived = int(np.sum(~actual_survived & predicted_survived))
died_predDied = int(np.sum(~actual_survived & ~predicted_survived))

# Parenthesised single-argument print behaves the same on Python 2 and 3.
print("survived_predSurvived = " + str(survived_predSurvived))
print("survived_predDied = " + str(survived_predDied))
print("died_predSurvived = " + str(died_predSurvived))
print("died_predDied = " + str(died_predDied))


survived_predSurvived = 90
survived_predDied = 31
died_predSurvived = 29
died_predDied = 147


In [233]:
# Per-class hit rates (not overall accuracy):
#   accuracySurvival = fraction of actual survivors that were predicted to survive
#   accuracyDied     = fraction of actual deaths that were predicted to die
# float(...) forces true division under Python 2.
accuracySurvival = float(survived_predSurvived)/(survived_predSurvived + survived_predDied)
accuracyDied = float(died_predDied)/(died_predDied + died_predSurvived)

# Parenthesised single-argument print behaves the same on Python 2 and 3.
print("accuracySurvival = " + str(accuracySurvival))
print("accuracyDied = " + str(accuracyDied))

accuracySurvival = 0.743801652893
accuracyDied = 0.835227272727
