In [2]:
# Load in our libraries
import pandas as pd
import numpy as np
import re
import sklearn

import warnings
warnings.filterwarnings('ignore')

# Going to use these 5 base models for the stacking
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.cross_validation import KFold

In [5]:
# Load the dataset from the CSV file into a DataFrame
cancer_data = pd.read_csv(filepath_or_buffer="new_data.csv")
print("Data read successfully!")

Data read successfully!


In [7]:
# Summarise the dataset: row count, feature count, and how the rows split
# across the two values of the 'diagnosis' column.
n_entries, n_columns = cancer_data.shape

# Every column except one (the target) is counted as a feature
n_features = n_columns - 1

# Rows with diagnosis == 1 are counted as malignant, diagnosis == 0 as benign
diagnosis_col = cancer_data['diagnosis']
n_malignant = len(cancer_data[diagnosis_col == 1])
n_benign = len(cancer_data[diagnosis_col == 0])

# Report the counts
print("Total number of entries: {}".format(n_entries))
print("Number of features: {}".format(n_features))
print("Number of malignant entries: {}".format(n_malignant))
print("Number of benign entries: {}".format(n_benign))

Total number of entries: 569
Number of features: 30
Number of malignant entries: 212
Number of benign entries: 357


In [9]:
# The last column is the target; every column before it is a feature
all_columns = cancer_data.columns
target_col = all_columns[-1]
feature_cols = list(all_columns[:-1])

# Show which columns ended up on each side
print("Feature columns:\n{}".format(feature_cols))
print("\nTarget column: {}".format(target_col))

# Feature matrix (X_all) and target vector (y_all)
y_all = cancer_data[target_col]
X_all = cancer_data[feature_cols]

# Preview the first five rows of the feature matrix
print("\nFeature values:")
print(X_all.head())

Feature columns:
['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst']

Target column: diagnosis

Feature values:
   radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  \
0     0.521037      0.022658        0.545989   0.363733         0.593753   
1     0.643144      0.272574        0.615783   0.501591         0.289880   
2     0.601496      0.390260        0.595743   0.449417         0.514309   
3     0.210090      0.360839        0.233501   0.102906         0.811321   
4     0.629893

In [22]:
from time import time
from sklearn.metrics import f1_score
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import make_scorer

# Number of rows used for training
num_train = 479

# The test set is whatever remains, so the two sizes always add up to the
# number of rows in X_all
num_test = X_all.shape[0] - num_train

# Shuffle and split the dataset. The test size is taken from num_test rather
# than a second hard-coded literal, so changing num_train above is enough to
# change the split. random_state is fixed to make the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=num_test, random_state=42)

# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 479 samples.
Testing set has 90 samples.


In [58]:
def train_classifier(clf, X_train, y_train):
    """Fit ``clf`` on the training data and print how long the fit took.

    Parameters:
        clf: any object with a ``fit(X, y)`` method.
        X_train: training features, passed to ``fit`` unchanged.
        y_train: training targets, passed to ``fit`` unchanged.

    Returns:
        None. The elapsed time is printed, not returned.
    """
    started_at = time()
    clf.fit(X_train, y_train)
    elapsed = time() - started_at

    print('Trained model in {:.4f} seconds'.format(elapsed))

    
def predict_labels(clf, features, target):
    """Predict with an already-fitted classifier and score the result with F1.

    Parameters:
        clf: any object with a ``predict(features)`` method.
        features: inputs passed to ``predict`` unchanged.
        target: true labels; must expose ``.values``.

    Returns:
        The F1 score of the predictions against ``target.values``, with
        label 1 treated as the positive class. The prediction time is
        printed as a side effect.
    """
    started_at = time()
    predictions = clf.predict(features)
    elapsed = time() - started_at

    print("Made predictions in {:.4f} seconds.".format(elapsed))
    return f1_score(target.values, predictions, pos_label=1)


def train_predict(clf, X_train, y_train, X_test, y_test):
    """Train a classifier and print its F1 score on the training and test sets.

    Parameters:
        clf: object with ``fit`` and ``predict`` methods. It may be a wrapper
            that keeps the underlying estimator in a ``clf`` attribute.
        X_train, y_train: data used for fitting and for the training score.
        X_test, y_test: held-out data used for the test score.

    Returns:
        None. All results are printed.
    """
    # A wrapper's own class name is the same for every model, which makes the
    # log lines indistinguishable. Report the wrapped estimator's class when
    # there is one, and the object's own class otherwise.
    estimator = getattr(clf, 'clf', clf)
    print("Training a {} using a training set size of {}. . .".format(
        estimator.__class__.__name__, len(X_train)))

    # Train the classifier
    train_classifier(clf, X_train, y_train)

    # Score on the training data first, then on the held-out data
    train_score = predict_labels(clf, X_train, y_train)
    print("F1 score for training set: {:.4f}.".format(train_score))
    test_score = predict_labels(clf, X_test, y_test)
    print("F1 score for test set: {:.4f}.\n\n".format(test_score))

In [59]:
# Seed shared by the models below, for reproducibility
SEED = 0
# Number of folds for out-of-fold prediction
NFOLDS = 5
# Fold splitter sized to the number of training rows
kf = KFold(n=num_train, n_folds=NFOLDS, random_state=SEED)

# Wrapper that builds a scikit-learn style classifier from a class and a
# parameter dict, and exposes a uniform train/predict/fit interface
class SklearnHelper(object):
    """Construct and hold a classifier instance in ``self.clf``.

    Parameters:
        clf: classifier class (not an instance); called as ``clf(**params)``.
        seed: value passed to the classifier as ``random_state``.
        params: optional dict of keyword arguments for the classifier.
            It is copied, so the caller's dict is left unchanged.
    """

    def __init__(self, clf, seed=0, params=None):
        # Work on a copy: writing random_state into the caller's dict would
        # leak into every later use of that dict. Also accept params=None,
        # which is the declared default.
        settings = dict(params) if params else {}
        settings['random_state'] = seed
        self.clf = clf(**settings)

    def train(self, x_train, y_train):
        """Fit the wrapped classifier; returns None."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped classifier's predictions for ``x``."""
        return self.clf.predict(x)

    def fit(self, x, y):
        """Fit the wrapped classifier and return whatever its ``fit`` returns."""
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        """Fit the wrapped classifier on ``(x, y)`` and print its ``feature_importances_``."""
        print(self.clf.fit(x, y).feature_importances_)
        
# Keyword arguments for each base classifier

# Random Forest
# ('max_features': 0.2 is a disabled alternative to 'sqrt')
rf_params = dict(
    n_jobs=-1,
    n_estimators=500,
    warm_start=True,
    max_depth=6,
    min_samples_leaf=2,
    max_features='sqrt',
    verbose=0
)

# Extra Trees
# ('max_features': 0.5 is disabled)
et_params = dict(
    n_jobs=-1,
    n_estimators=500,
    max_depth=8,
    min_samples_leaf=2,
    verbose=0
)

# AdaBoost
ada_params = dict(
    n_estimators=500,
    learning_rate=0.75
)

# Gradient Boosting
# ('max_features': 0.2 is disabled)
gb_params = dict(
    n_estimators=500,
    max_depth=5,
    min_samples_leaf=2,
    verbose=0
)

# Support Vector Classifier
svc_params = dict(
    kernel='linear',
    C=0.025
)

In [60]:
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Wrap each base classifier class with the shared seed and its parameters
rf = SklearnHelper(RandomForestClassifier, SEED, rf_params)
et = SklearnHelper(ExtraTreesClassifier, SEED, et_params)
ada = SklearnHelper(AdaBoostClassifier, SEED, ada_params)
gb = SklearnHelper(GradientBoostingClassifier, SEED, gb_params)
svc = SklearnHelper(SVC, SEED, svc_params)

# Train and score four of the models on the same split
# (svc is constructed above but not evaluated here)
for base_model in (rf, et, ada, gb):
    train_predict(base_model, X_train, y_train, X_test, y_test)


Training a SklearnHelper using a training set size of 479. . .
Trained model in 1.5072 seconds
Made predictions in 0.3310 seconds.
F1 score for training set: 0.9915.
Made predictions in 0.2020 seconds.
F1 score for test set: 0.9394.


Training a SklearnHelper using a training set size of 479. . .
Trained model in 1.4521 seconds
Made predictions in 0.3465 seconds.
F1 score for training set: 0.9886.
Made predictions in 0.2047 seconds.
F1 score for test set: 0.9394.


Training a SklearnHelper using a training set size of 479. . .
Trained model in 1.8923 seconds
Made predictions in 0.0733 seconds.
F1 score for training set: 1.0000.
Made predictions in 0.0415 seconds.
F1 score for test set: 0.9552.


Training a SklearnHelper using a training set size of 479. . .
Trained model in 0.3265 seconds
Made predictions in 0.0021 seconds.
F1 score for training set: 1.0000.
Made predictions in 0.0006 seconds.
F1 score for test set: 0.9394.




In [70]:
# Fit each of the four wrapped models on the training data, in the order
# rf, et, ada, gb, and keep the feature_importances_ of each fitted model
rf_feature, et_feature, ada_feature, gb_feature = [
    base_model.fit(X_train, y_train).feature_importances_
    for base_model in (rf, et, ada, gb)
]

In [81]:
# Stack the four per-model importance vectors as rows, then average down
# each column to get one mean importance per feature
a = np.array([rf_feature, et_feature, ada_feature, gb_feature])
print(a)
avg_array = a.mean(axis=0)
print("\n\nAVERAGE\n\n")
print(avg_array)


[[ 0.03627394  0.01456477  0.04996821  0.04098817  0.00548078  0.01384331
   0.05571153  0.11333541  0.00259449  0.0038415   0.01633188  0.00390766
   0.01152841  0.03085779  0.00382046  0.00366677  0.00546514  0.00432612
   0.00328915  0.00445075  0.11737634  0.01931315  0.1226758   0.11380129
   0.00928052  0.01397712  0.03967421  0.11967989  0.01365017  0.00632528]
 [ 0.06002688  0.01837291  0.0692622   0.05826016  0.0075222   0.02312954
   0.0580876   0.08951156  0.00625699  0.00514714  0.0139623   0.00307909
   0.01468696  0.02544779  0.00261136  0.00540793  0.0056671   0.00826967
   0.00240206  0.00362764  0.09070681  0.02178458  0.09151255  0.07963792
   0.01618595  0.02747416  0.04093342  0.12885889  0.01529869  0.00686796]
 [ 0.          0.082       0.004       0.03        0.046       0.066       0.03
   0.05        0.034       0.01        0.03        0.01        0.008       0.042
   0.028       0.076       0.004       0.026       0.008       0.036       0.01
   0.064       0.

In [98]:
avg_list = list(avg_array)

# Pair each average importance with its feature name and sort the pairs,
# then flip the order so the largest importance comes first.
ranked_pairs = sorted(zip(avg_list, feature_cols))
ranked_pairs.reverse()

# Unzip into two parallel lists: list1 holds the importances and list2 the
# matching feature names, both in descending order of importance
list1, list2 = map(list, zip(*ranked_pairs))


30


In [101]:
#copying the two columns into an excel sheet

import openpyxl
from openpyxl import Workbook
from openpyxl.styles import Color, PatternFill, Font, Border
from openpyxl.styles import colors
from openpyxl.cell import Cell
import xlrd
from openpyxl import load_workbook
from openpyxl import worksheet

# Write the ranked features to a two-column sheet: column 1 holds the feature
# name (list2) and column 2 the matching average importance (list1), one
# feature per row.
wb_write = Workbook()
# The ``active`` property replaces the deprecated get_active_sheet()
ws_write = wb_write.active

# Worksheet rows and columns are 1-indexed, hence start=1
for row_idx, (feature_name, importance) in enumerate(zip(list2, list1), start=1):
    ws_write.cell(row=row_idx, column=1).value = feature_name
    ws_write.cell(row=row_idx, column=2).value = importance

wb_write.save('pred_power.xlsx')