In [71]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
import itertools

In [72]:
class NBDecisionTreeClassifier(DecisionTreeClassifier):
    """Decision tree whose leaves each hold a Gaussian Naive Bayes model.

    The tree itself is grown by ``DecisionTreeClassifier``. After that, one
    ``GaussianNB`` is fitted per leaf on the training rows that ``apply``
    routes to that leaf, and all predictions come from the model of the leaf
    a sample falls into.

    Attributes
    ----------
    leaves_model : dict
        Maps a leaf index (as returned by ``apply``) to the ``GaussianNB``
        fitted on the training rows of that leaf.
    """

    def fit(self, X, y, sample_weight=None, check_input=True, X_idx_sorted=None):
        """Grow the tree, then fit one ``GaussianNB`` per leaf.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features. DataFrames and plain arrays are both accepted.
        y : array-like of shape (n_samples,) or (n_samples, 1)
            Training labels.
        sample_weight : array-like or None
            Forwarded to the tree fit only; the leaf models are fitted
            unweighted.
        check_input : bool
            Forwarded to the tree fit.
        X_idx_sorted : array-like or None
            Forwarded to the tree fit only when it is not None, so the call
            also works with tree implementations whose ``fit`` does not take
            this argument.

        Returns
        -------
        self
        """
        # Work on positional arrays: label/row pairing must not depend on a
        # pandas index (a Series `y` next to an ndarray `X` would otherwise be
        # re-aligned by index instead of by position).
        features = np.asarray(X)
        labels = np.asarray(y)
        if labels.ndim == 2 and labels.shape[1] == 1:
            labels = labels.ravel()

        # Require at least as many samples per leaf as there are features.
        # NOTE: this overwrites the hyper-parameter on the instance.
        self.min_samples_leaf = max(self.min_samples_leaf, features.shape[1])

        parent = super(NBDecisionTreeClassifier, self)
        if X_idx_sorted is None:
            parent.fit(X, y, sample_weight=sample_weight, check_input=check_input)
        else:
            parent.fit(X, y, sample_weight=sample_weight, check_input=check_input,
                       X_idx_sorted=X_idx_sorted)

        leaf_ids = self.apply(X)
        self.leaves_model = dict()
        for leaf in np.unique(leaf_ids):
            in_leaf = leaf_ids == leaf
            model = GaussianNB()
            # partial_fit (rather than fit) lets every leaf model be told the
            # full class list, so each one emits one column per tree class
            # even when some classes never reach that leaf.
            model.partial_fit(features[in_leaf], labels[in_leaf], classes=self.classes_)
            # Plain int keys keep the lookup independent of the numpy/pandas
            # scalar type used for the leaf index.
            self.leaves_model[int(leaf)] = model

        return self

    def predict(self, X, check_input=True):
        """Return, for each row of ``X``, the class with the highest leaf-model probability.

        ``check_input`` is accepted for signature compatibility and is not used.
        """
        return self.classes_[np.argmax(self.predict_proba(X), axis=1)]

    def predict_proba(self, X, check_input=True):
        """Return class probabilities as ``exp`` of ``predict_log_proba(X)``.

        ``check_input`` is accepted for signature compatibility and is not used.
        """
        return np.exp(self.predict_log_proba(X))

    def predict_log_proba(self, X):
        """Return class log-probabilities from the Naive Bayes model of each sample's leaf.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_classes_)
            Row ``i`` is the output of the ``GaussianNB`` stored for the leaf
            that ``apply`` assigns to sample ``i``.
        """
        # Positional array: indexing a DataFrame with `X[i]` would select a
        # column, not row i.
        features = np.asarray(X)
        leaf_ids = self.apply(X)
        log_proba = np.empty((features.shape[0], self.n_classes_))
        # One batched call per leaf instead of one call per sample.
        for leaf in np.unique(leaf_ids):
            in_leaf = leaf_ids == leaf
            log_proba[in_leaf] = self.leaves_model[int(leaf)].predict_log_proba(features[in_leaf])
        return log_proba

Methods

apply(X[, check_input])	Returns the index of the leaf that each sample is predicted as.
decision_path(X[, check_input])	Return the decision path in the tree.
fit(X, y[, sample_weight, check_input, …])	Build a decision tree classifier from the training set (X, y).
get_params([deep])	Get parameters for this estimator.
predict(X[, check_input])	Predict class or regression value for X.
predict_log_proba(X)	Predict class log-probabilities of the input samples X.
predict_proba(X[, check_input])	Predict class probabilities of the input samples X.
score(X, y[, sample_weight])	Returns the mean accuracy on the given test data and labels.
set_params(**params)	Set the parameters of this estimator.

In [73]:
def rfc_nb(df,data_set_name,test_size,random_state,param_list_extraction):
    """Train a random forest of ``NBDecisionTreeClassifier`` trees and log its test metrics.

    The last column of ``df`` is used as the label and all preceding columns
    as features. The data is split once with ``train_test_split``; the
    forest is fitted on the training part and the predictions for the test
    part are handed to ``print_result``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set; features in every column but the last, label in the last.
    data_set_name : str
        Name written to the results record.
    test_size, random_state
        Passed to ``train_test_split``; ``random_state`` also seeds the forest.
    param_list_extraction : sequence
        ``(n_estimators, min_samples_leaf, min_samples_split)``.
    """
    n_estimators = param_list_extraction[0]
    min_samples_leaf = param_list_extraction[1]
    min_samples_split = param_list_extraction[2]

    x, y = df[df.columns[:-1]], df[df.columns[-1]]
    # Split exactly once (the split used to be computed twice with identical arguments).
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = test_size, random_state = random_state)

    rf_nb = RandomForestClassifier(n_estimators = n_estimators, random_state = random_state,min_samples_leaf = min_samples_leaf,min_samples_split = min_samples_split)
    # RandomForestClassifier takes no constructor argument for the tree type,
    # so the prototype tree is swapped on the instance before fitting.
    # Instances that expose `estimator` get the tree there; otherwise the
    # older `base_estimator` attribute is used.
    # NOTE(review): confirm the forest really builds NB trees on the installed
    # scikit-learn version.
    tree_attr = 'estimator' if hasattr(rf_nb, 'estimator') else 'base_estimator'
    setattr(rf_nb, tree_attr, NBDecisionTreeClassifier())

    rf_nb.fit(x_train, y_train)
    res = rf_nb.predict(x_test)
    print_result('Random forest classifier with Naive Bayes',data_set_name,test_size,random_state,n_estimators,min_samples_leaf,min_samples_split,y_test,res)

In [74]:
def rfc(df,data_set_name,test_size,random_state,param_list_extraction):
    """Train a plain ``RandomForestClassifier`` and log its test metrics.

    The last column of ``df`` is used as the label and all preceding columns
    as features. The data is split once with ``train_test_split``; the
    forest is fitted on the training part and the predictions for the test
    part are handed to ``print_result``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set; features in every column but the last, label in the last.
    data_set_name : str
        Name written to the results record.
    test_size, random_state
        Passed to ``train_test_split``; ``random_state`` also seeds the forest.
    param_list_extraction : sequence
        ``(n_estimators, min_samples_leaf, min_samples_split)``.
    """
    n_estimators = param_list_extraction[0]
    min_samples_leaf = param_list_extraction[1]
    min_samples_split = param_list_extraction[2]

    x, y = df[df.columns[:-1]], df[df.columns[-1]]
    # Split exactly once (the split used to be computed twice with identical arguments).
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = test_size, random_state = random_state)

    rf = RandomForestClassifier(n_estimators = n_estimators, random_state = random_state,min_samples_leaf = min_samples_leaf,min_samples_split = min_samples_split)
    rf.fit(x_train, y_train)
    res = rf.predict(x_test)
    print_result('Random forest classifier',data_set_name,test_size,random_state,n_estimators,min_samples_leaf,min_samples_split,y_test,res)

In [75]:
def print_result(classifier_name,data_set_name,test_size,random_state,n_estimators,min_samples_leaf,min_samples_split,y_test,res):
    """Write one evaluation record to the module-level ``results_file``.

    The record consists of, in order: a data set / classifier header, the
    hyper-parameters, the confusion matrix, the classification report, the
    accuracy score, and a separator line. ``test_size`` is accepted but is
    not written to the record.

    Parameters
    ----------
    classifier_name, data_set_name : str
        Labels for the header line.
    random_state, n_estimators, min_samples_leaf, min_samples_split
        Hyper-parameters echoed into the record.
    y_test : array-like
        True labels of the test split.
    res : array-like
        Predicted labels for the test split.
    """
    emit = results_file.write

    # Header and hyper-parameters.
    header_line = 'Data set = {}|\t Classifier = {} \n'.format(data_set_name,classifier_name)
    emit(header_line)
    params_line = 'hyper params:\n |random state = {}|\t |number of estimators = {}|\t |min_samples_leaf = {}|\t |min_samples_split = {}|\n'.format(random_state,n_estimators,min_samples_leaf,min_samples_split)
    emit(params_line)

    # Each metric is computed right before it is written, so an error in a
    # later metric still leaves the earlier parts of the record in the file.
    matrix = confusion_matrix(y_test,res)
    emit('confusion matrix: \n {confusion_matrix} \n\n'.format(confusion_matrix = matrix))

    report = classification_report(y_test,res)
    emit('classification report: \n {classification_report} \n'.format(classification_report = report))

    accuracy = accuracy_score(y_test, res, normalize=True, sample_weight=None)
    emit('accuracy score: \n {accuracy_score} \n'.format(accuracy_score = accuracy))

    # Record separator.
    emit('_ '*50)
    emit('\n')

In [77]:
# Data sets to evaluate; each name is read from '<name>.csv'.
files  = ['iris','Immunotherapy']

# Hyper-parameter grid: every combination of the three lists is evaluated.
num_of_trees = [10,50,100]
minimum_samples_in_leaf = [1,5,10]
minimum_leafs_split = [2,10,20]
param_list = [num_of_trees,minimum_samples_in_leaf,minimum_leafs_split]
param_list_extraction = list(itertools.product(*param_list))

# print_result writes to the module-level name `results_file`, so the handle
# must be bound under exactly that name. The `with` block guarantees the file
# is flushed and closed even if one of the runs raises. After this cell the
# handle is closed: re-open it before calling rfc / rfc_nb again.
with open('results.txt','w') as results_file:
    for file_name in files:
        df = pd.read_csv('%s.csv'%file_name)
        for list_params in param_list_extraction:
            # Same split (test_size=0.3, seed 1337) for both classifiers so
            # their records are directly comparable.
            rfc(df,file_name,0.3 ,1337,list_params)
            rfc_nb(df,file_name,0.3 ,1337,list_params)
        # Separator between data sets.
        results_file.write('_'*50)

  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])


  jointi = np.log(self.class_prior_[i])
  'precision', 'predicted', average, warn_for)
  jointi = np.log(self.class_prior_[i])
  'precision', 'predicted', average, warn_for)
  jointi = np.log(self.class_prior_[i])
  'precision', 'predicted', average, warn_for)
  jointi = np.log(self.class_prior_[i])
  'precision', 'predicted', average, warn_for)
  jointi = np.log(self.class_prior_[i])
  'precision', 'predicted', average, warn_for)
  jointi = np.log(self.class_prior_[i])
  'precision', 'predicted', average, warn_for)
  jointi = np.log(self.class_prior_[i])
  'precision', 'predicted', average, warn_for)
  jointi = np.log(self.class_prior_[i])
  'precision', 'predicted', average, warn_for)
  jointi = np.log(self.class_prior_[i])
  'precision', 'predicted', average, warn_for)
  jointi = np.log(self.class_prior_[i])
  'precision', 'predicted', average, warn_for)
  jointi = np.log(self.class_prior_[i])
  'precision', 'predicted', average, warn_for)
  jointi = np.log(self.class_prior_[i])
  '