In [1]:
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split

from sklearn.learning_curve import learning_curve

from bokeh.models import Range1d
from bokeh.charts import HeatMap
from bokeh.palettes import RdYlBu9 as palette
from bokeh.plotting import figure, show, output_file


In [2]:
# Direct Bokeh output to the file "hw05.html" (page title "hw05");
# the show() calls in the cells below rely on this configuration.
output_file("hw05.html", title = "hw05")


In [3]:
# Read the data into a pandas data frame.
# The Bare_Nuclei field uses "?" as a missing-value placeholder; parse it as
# NaN at read time (replacing the deprecated DataFrame.convert_objects) so the
# column is numeric, then drop the incomplete rows with dropna.
cf = pd.read_csv("cancer_uci.csv", header = 0, na_values = "?").dropna()


In [4]:
# helper function to plot roc
def plot_roc_curve(target_test, target_predicted_proba, kernel):
    """Plot a ROC curve with Bokeh and show it.

    Parameters
    ----------
    target_test : array-like
        True binary labels of the evaluated samples.
    target_predicted_proba : 2-D array
        Per-class scores; only column 1 is used as the positive-class score
        (the callers in this notebook pass the result of ``predict_proba``).
    kernel : str
        Label interpolated into the plot title. Despite the name it is only
        a caption, so any model description can be passed.

    Returns
    -------
    None
        The figure is rendered through ``show``.
    """
    # The thresholds returned by roc_curve are not needed for the plot.
    fpr, tpr, _ = roc_curve(target_test, target_predicted_proba[:, 1])
    roc_auc = auc(fpr, tpr)

    p = figure(title = "ROC (%s)" % (kernel))

    # Plot ROC curve, with the area under the curve in the legend
    p.line(x = fpr, y = tpr, legend = "ROC curve (area = %0.3f)" % roc_auc)

    # Both rates are fractions, so pin the axes to the unit square
    p.x_range = Range1d(0, 1)
    p.y_range = Range1d(0, 1)
    p.xaxis.axis_label = "False Positive Rate or (1 - Specificity)"
    p.yaxis.axis_label = "True Positive Rate or (Sensitivity)"
    p.legend.orientation = "bottom_right"
    show(p)

In [5]:
# Utility function for SVM
def svm(kernel, balance, random_state = None, show_heatmap = False,
        show_roc = False, show_learning_curve = True):
    """Train and evaluate an SVM classifier on the module-level ``cf`` frame.

    Fits ``SVC(kernel = kernel, probability = True, C = 1)`` on a random 3/5
    of the rows, prints the classification report, confusion matrix,
    precision and recall for the remaining rows, then prints (and optionally
    plots) the mean learning-curve scores from 5-fold cross validation.

    Parameters
    ----------
    kernel : str
        Kernel name handed to ``SVC`` (this notebook uses "linear" and "rbf").
    balance : bool
        If true, randomly down-sample the Benign rows to the number of
        Malignant rows so that both classes are equally represented.
    random_state : int or None, optional
        Seed for the down-sampling and the train/test shuffle. ``None``
        (the default) draws from NumPy's global random state, exactly as
        the function did before this parameter existed.
    show_heatmap : bool, optional
        Render the confusion matrix as a Bokeh ``HeatMap``. Off by default.
    show_roc : bool, optional
        Plot the ROC curve of the held-out rows. Off by default.
    show_learning_curve : bool, optional
        Plot the learning curve. On by default.

    Returns
    -------
    None
        All results are printed or shown.
    """
    print("For %s kernel" % (kernel))

    if balance:
        malignant = cf[cf['Class'] == "Malignant"]
        benign = cf[cf['Class'] == "Benign"]

        # Malignant is the smaller class here, so sample Benign down to the
        # same number of rows to get a 50/50 class ratio.
        benign_sample = benign.sample(len(malignant), random_state = random_state)

        # create a balanced data frame using sampled data
        new_cf = pd.concat([benign_sample, malignant])
    else:
        # No balancing
        new_cf = cf

    # Encode the class label as 0 / 1
    y = new_cf['Class'].map({'Benign' : 0, 'Malignant' : 1})

    # Features are every column except the first two and the last one
    # (presumably identifier columns and the Class label -- verify against the CSV).
    feature_names = new_cf.columns[2:-1]
    X = new_cf[feature_names]

    # Shuffle the row positions and split them 3/5 train, 2/5 test.
    # `//` keeps the integer division explicit on both Python 2 and 3.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    index = rng.permutation(len(X))
    n_train = len(X) * 3 // 5
    train = index[ : n_train]
    test = index[n_train : ]

    model = SVC(kernel = kernel, probability = True, C = 1).fit(X.iloc[train], y.iloc[train])
    results = model.predict(X.iloc[test])

    print("Classification report")
    print(classification_report(y.iloc[test], results))

    # labels = [0, 1] keeps the matrix 2x2 even if one class is missing
    # from the test rows, so the indexing below cannot go out of bounds.
    cm = confusion_matrix(y.iloc[test], results, labels = [0, 1])
    print("Confusion matrix")
    print(cm)

    # sklearn indexes the matrix as cm[actual, predicted]
    tn, fp = cm[0, 0], cm[0, 1]
    fn, tp = cm[1, 0], cm[1, 1]

    if show_heatmap:
        xyvalues = {}
        xyvalues['Predicted False'] = {'Actual False' : tn, 'Actual True' : fn}
        xyvalues['Predicted True'] = {'Actual False' : fp, 'Actual True' : tp}
        hm = HeatMap(xyvalues, title='Confusion Matrix', palette = palette)
        show(hm)

    # The matrix is printed a second time on purpose, to keep the printed
    # output identical to the cells already recorded in this notebook.
    print("%s \n" % (cm,))
    print("Precision: %s" % (float(tp) / (fp + tp),))
    print("Recall:    %s \n" % (float(tp) / (fn + tp),))

    if show_roc:
        target_predicted_proba = model.predict_proba(X.iloc[test])
        plot_roc_curve(y.iloc[test], target_predicted_proba,
                "Balance: " + str(balance) + ", SVM: " + kernel)

    # sklearn learning curve function to generate scores. Rows are passed in
    # shuffled order because the balanced frame is a concat of all Benign rows
    # followed by all Malignant rows.
    train_sizes, train_scores, test_scores = learning_curve(model, X.iloc[index], y.iloc[index], cv = 5)
    mean_train_scores = train_scores.mean(axis = 1)
    mean_test_scores = test_scores.mean(axis = 1)

    print("Training Score: %s" % (mean_train_scores,))
    print('\n')
    print("Test Score: %s" % (mean_test_scores,))

    if show_learning_curve:
        # Create our base figure
        p = figure(title = 'Learning Curve', y_range = (0, 1))

        # Create our Training score line
        p.line(x = train_sizes, y = mean_train_scores, color = 'red',
                legend = "Training Scores")

        # Create our Testing score line
        p.line(x = train_sizes, y = mean_test_scores, color = 'blue',
                legend = "Test Scores")

        # Move our legend around
        p.legend.orientation = "bottom_right"

        # Render the plot
        show(p)
    

In [6]:
def dtree(show_heatmap = False, show_roc = True, return_errors = False):
    """Train and evaluate a decision tree on the module-level ``cf`` frame.

    First fits a tree on every row and prints a crosstab of its predictions
    against the same rows, then fits a second tree on a ``train_test_split``
    training portion and reports crosstab, classification report, confusion
    matrix, precision and recall on the held-out portion.

    Parameters
    ----------
    show_heatmap : bool, optional
        Render the confusion matrix as a Bokeh ``HeatMap``. Off by default.
    show_roc : bool, optional
        Plot the ROC curve of the held-out rows. On by default.
    return_errors : bool, optional
        If true, return a data frame of the misclassified held-out rows with
        their features plus "Class (actual)" and "Class (predicted)" columns.
        Off by default, so existing calls still return ``None``.

    Returns
    -------
    pandas.DataFrame or None
    """
    # Encode the class label as 0 / 1
    target = cf['Class'].map({'Benign' : 0, 'Malignant' : 1})

    # Features are every column except the first two and the last one
    # (presumably identifier columns and the Class label -- verify against the CSV).
    feature_names = cf.columns[2:-1]
    features = cf[feature_names]

    # Fit on all rows and score on the very same rows
    model = DecisionTreeClassifier(random_state = 0)
    model.fit(features, target)
    print(pd.crosstab(target, model.predict(features)))

    # Fit again on a training split and evaluate on held-out rows
    X_train, X_test, y_train, y_test = train_test_split(features, target, random_state = 0)
    model = DecisionTreeClassifier(random_state = 0)
    model.fit(X_train, y_train)

    results = model.predict(X_test)
    print(pd.crosstab(y_test, results))

    print("Classification report")
    print(classification_report(y_test, results))

    # labels = [0, 1] keeps the matrix 2x2 even if one class is missing
    # from the test rows, so the indexing below cannot go out of bounds.
    cm = confusion_matrix(y_test, results, labels = [0, 1])
    print("Confusion matrix")
    print(cm)

    # sklearn indexes the matrix as cm[actual, predicted]
    tn, fp = cm[0, 0], cm[0, 1]
    fn, tp = cm[1, 0], cm[1, 1]

    if show_heatmap:
        xyvalues = {}
        xyvalues['Predicted False'] = {'Actual False' : tn, 'Actual True' : fn}
        xyvalues['Predicted True'] = {'Actual False' : fp, 'Actual True' : tp}
        hm = HeatMap(xyvalues, title='Confusion Matrix', palette = palette)
        show(hm)

    # The matrix is printed a second time on purpose, to keep the printed
    # output identical to the cell already recorded in this notebook.
    print("%s \n" % (cm,))
    print("Precision: %s" % (float(tp) / (fp + tp),))
    print("Recall:    %s \n" % (float(tp) / (fn + tp),))

    if show_roc:
        target_predicted_proba = model.predict_proba(X_test)
        plot_roc_curve(y_test, target_predicted_proba, "Dtree")

    if return_errors:
        # Put features, actual and predicted class side by side. The label
        # columns are assigned as plain arrays so they line up with X_test
        # by position; joining separately built frames would align on
        # unrelated index values and fill the columns with NaN.
        ncf = pd.DataFrame(X_test, columns = feature_names).copy()
        ncf['Class (actual)'] = np.asarray(y_test)
        ncf['Class (predicted)'] = results

        cf_errors = ncf[ncf['Class (actual)'] != ncf['Class (predicted)']]
        return cf_errors

In [7]:
# Linear-kernel SVM with balance = True: Benign rows are down-sampled to the Malignant count
svm("linear", True)

For linear kernel
Classification report
             precision    recall  f1-score   support

          0       0.97      1.00      0.98        86
          1       1.00      0.97      0.99       106

avg / total       0.98      0.98      0.98       192

Confusion matrix
[[ 86   0]
 [  3 103]]
[[ 86   0]
 [  3 103]] 

Precision: 1.0
Recall:    0.971698113208 

Training Score: [ 1.          0.99677419  0.97809524  0.97567568  0.97905759]


Test Score: [ 0.92907801  0.94787234  0.95828901  0.97083333  0.96870567]


In [8]:
# Linear-kernel SVM with balance = False: all rows are used as they are
svm("linear", False)

For linear kernel
Classification report
             precision    recall  f1-score   support

          0       0.98      0.96      0.97       184
          1       0.93      0.97      0.95        90

avg / total       0.96      0.96      0.96       274

Confusion matrix
[[177   7]
 [  3  87]]
[[177   7]
 [  3  87]] 

Precision: 0.925531914894
Recall:    0.966666666667 

Training Score: [ 0.9962963   0.98305085  0.97666667  0.97399527  0.97289377]


Test Score: [ 0.95461476  0.96341714  0.9561395   0.96193566  0.96487699]


In [9]:
# RBF-kernel SVM with balance = True: Benign rows are down-sampled to the Malignant count
svm("rbf", True)

For rbf kernel
Classification report
             precision    recall  f1-score   support

          0       1.00      0.91      0.95        98
          1       0.91      1.00      0.95        94

avg / total       0.96      0.95      0.95       192

Confusion matrix
[[89  9]
 [ 0 94]]
[[89  9]
 [ 0 94]] 

Precision: 0.912621359223
Recall:    1.0 

Training Score: [ 1.          1.          0.99714286  0.99594595  0.99528796]


Test Score: [ 0.94964539  0.95172872  0.94964539  0.95177305  0.95598404]


In [10]:
# RBF-kernel SVM with balance = False: all rows are used as they are
svm("rbf", False)

For rbf kernel
Classification report
             precision    recall  f1-score   support

          0       1.00      0.97      0.98       185
          1       0.94      1.00      0.97        89

avg / total       0.98      0.98      0.98       274

Confusion matrix
[[179   6]
 [  0  89]]
[[179   6]
 [  0  89]] 

Precision: 0.936842105263
Recall:    1.0 

Training Score: [ 1.          1.          0.99733333  0.99716312  0.9970696 ]


Test Score: [ 0.93561503  0.95171668  0.95759935  0.95908083  0.95908083]


In [11]:
# Decision tree: the first crosstab scores the tree on the rows it was fitted on,
# the rest of the output evaluates a second tree on a held-out split
dtree()

col_0    0    1
Class          
0      444    0
1        0  239
col_0    0   1
Class         
0      105   2
1        7  57
Classification report
             precision    recall  f1-score   support

          0       0.94      0.98      0.96       107
          1       0.97      0.89      0.93        64

avg / total       0.95      0.95      0.95       171

Confusion matrix
[[105   2]
 [  7  57]]
[[105   2]
 [  7  57]] 

Precision: 0.966101694915
Recall:    0.890625 

