In [1]:
import sklearn
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import cross_val_score

In [43]:
class RandomForest():
    """A class to choose and train a random forest model"""
    """This implements the sklearn Random Forest Model"""
    
    def __init__(self, data_dir='./processed_data/', regressor=False, n_jobs=1):
        """Initializes the random forest class"""
        
        # Create an initialization function that will allow for
        # a saved model to be loaded
        
        self.n_jobs = n_jobs
        self.regressor = regressor
        self.data_dir = data_dir
        self.regressor = regressor
        
        if self.regressor:
            self.model = RandomForestRegressor()
        else:
            self.model = RandomForestClassifier()
        
        # *******************************************
        # USE THESE IN FUNCTIONS TO KEEP TRACK OF WHAT 
        # HAS ALREADY BEEN DONE TO DATA, MODELS, ETC...
        # keep track of which functions have been called
        self.read_data_called = False
        self.select_features_called = False
        self.tune_params_called = False
        self.train_model_called = False
        # ********************************************
        
    def read_data(self):
        """Read in the data from the specified directory"""
        
        self.read_data_called = True
        
        self.cv_X_df = pd.read_csv(self.data_dir+'cv_X.csv', header=0, index_col=0)
        self.cv_X = self.cv_X_df.values
        self.cv_y_df = pd.read_csv(self.data_dir+'cv_y.csv', header=0, index_col=0)
        self.cv_y = self.cv_y_df.values
        self.cv_ids_df = pd.read_csv(self.data_dir+'cv_ids.csv', header=0, index_col=0)
        self.train_X_df = pd.read_csv(self.data_dir+'train_X.csv', header=0, index_col=0)
        self.train_y_df = pd.read_csv(self.data_dir+'train_y.csv', header=0, index_col=0)
        self.train_ids_df = pd.read_csv(self.data_dir+'train_ids.csv', header=0, index_col=0)
        self.test_X_df = pd.read_csv(self.data_dir+'test_X.csv', header=0, index_col=0)
        self.test_ids_df = pd.read_csv(self.data_dir+'test_ids.csv', header=0, index_col=0)
        
    def get_feature_importances(self):
        """Trains a basic random forest to get a list of feature importances"""
        
        if not(self.read_data_called):
            raise AssertionError("No data yet!")
        
        importance_tree = RandomForestRegressor(n_jobs=1,
                                random_state=1,
                                n_estimators=100,
                                max_features='sqrt',
                                max_depth=10)
        
        # train a basic model so that we can access feature importances
        cv_X = self.cv_X_df.values
        cv_y = np.ravel(self.cv_y_df.values)
        importance_tree.fit(cv_X, cv_y)
        self.feature_importances = importance_tree.feature_importances_
        
        # sort the features by importances, most important first
        self.sorted_features = self.feature_importances.argsort()[::-1]
        
        return self.sorted_features
            
    def select_features(self):
        """Select features from the dataset by using feature importances
        to add features one by one for selection"""
        
        if not(self.read_data_called):
            raise AssertionError("No data yet!")
            
        self.select_features_called = True
            
        # TODO:
        #   - VERIFY THAT THIS IS WORKING
        #   - THINK OF OTHER TYPES OF FEATURE SELECTION
        #     BACKWARD FEATURE SELECTION?
        
        if self.regressor:
            model = RandomForestRegressor(oob_score=True,
                                        n_jobs=self.n_jobs,
                                        random_state=1,
                                        n_estimators=1000)
        else:
            model = RandomForestClassifier(oob_score=True,
                                         n_jobs=self.n_jobs,
                                         random_state=1,
                                         n_estimators=1000)
            
        # keep track of the scores from cross validation
        all_scores = []
        
        # which features to add, in order
        self.sorted_features = self.get_feature_importances()
        
        cv_X = self.cv_X_df.values
        cv_y = np.ravel(self.cv_y_df.values)
        cv_X_selected = cv_X[:, self.sorted_features[:1]]
        for feature in self.sorted_features[1:]:
            
            # format data so that it can be added easily
            added_feat = np.transpose(cv_X[:, feature])
            added_feat.shape += (1, )
            
            # append data to the next column
            cv_X_selected = np.append(cv_X_selected, added_feat, axis=1)
            
            # train a model with the augmented data
            model.fit(cv_X_selected, cv_y)
            score = model.oob_score_
            #scores = cross_val_score(feature_select_model,
            #                       cv_X_selected, cv_y, cv=2)
            #score = scores.mean()
            
            # have an evaluation metric to measure performance of added features
            all_scores.append(score)
        
        # **************************************************************
        # IS THIS THE ONLY CONSIDERATION FOR SELECTING FEATURES??
        # WHAT IF THE SCORE IS VERY HIGH WITH FEW FEATURES OUT OF MANY??
        # WHAT IF THE LARGEST SCORE IS AFTER MANY FEATURES ARE ADDED
        # WITH MINIMAL IMPROVEMENT TO SCORE??
        max_score_index = np.argmax(all_scores)
        # *************************************************************
        
        self.selected_feature_indices = self.sorted_features[:(max_score_index+1)]
        self.selected_features = cv_X_selected[:, :max_score_index+1]
        # **************************************************************
        
        return self.selected_feature_indices
        
    def tune_params(self):
        """Function to handle tuning hyperparameters of the model"""
        
        
        if not(self.read_data_called):
            raise AssertionError("No data yet!")
        
        self.tune_params_called = True
        
        self.tune_max_features = True
        self.tune_n_estimators = True
        self.tune_min_samples_leaf = True
        
        # TODO:
        #   - VERIFY THAT THIS IS WORKING
        #   - FIND GOOD VALUES OF PARAMETERS TO TEST
        
        # parameters to tune for RandomForest
        
        # the maximum number of features to be considered for a tree.
        max_features_pos = [None, 'sqrt', 0.2, 0.33, 0.5]
        
        # number of estimator trees to be built.
        # generally the more the better, but takes more
        # cpu time
        n_estimators_pos = [50, 100, 500, 1000]
        
        # minimum number of samples in a leaf
        min_samples_leaf_pos = [1, 10, 20, 50]
        
        # stores the parameters along with scores from training
        parameters = []
        scores = []
        
        cv_X = self.cv_X_df.values
        cv_y = np.ravel(self.cv_y_df.values)
        
        self.max_features = max_features_pos[int(len(max_features_pos)/2)]
        self.n_estimators = n_estimators_pos[int(len(n_estimators_pos)/2)]
        self.min_samples_leaf = min_samples_leaf_pos[int(len(min_samples_leaf_pos)/2)]
        
        if self.regressor:
            model = RandomForestRegressor(oob_score=True,
                                         n_jobs=self.n_jobs,
                                         random_state=1,
                                         max_features=self.max_features,
                                         n_estimators=self.n_estimators,
                                         min_samples_leaf=self.min_samples_leaf)
        else:
            model = RandomForestClassifier(oob_score=True,
                                          n_jobs=self.n_jobs,
                                          random_state=1,
                                          max_features=self.max_features,
                                          n_estimators=self.n_estimators,
                                          min_samples_leaf=self.min_samples_leaf)
        
        # perform feature-space searching
        if self.tune_max_features:
            scores_max_features = []
            for max_features in max_features_pos:
                model.set_params(max_features=max_features)
                print(model.get_params)
                model.fit(cv_X, cv_y)
                scores_max_features.append(model.oob_score_)
                print(model.oob_score_)
            
            best_param_index = np.argmax(scores_max_features)
            print(best_param_index)
            self.max_features = max_features_pos[best_param_index]
            model.set_params(max_features=self.max_features)
                
        if self.tune_n_estimators:
            scores_n_estimators = []
            for n_estimators in n_estimators_pos:
                model.set_params(n_estimators=n_estimators)
                print(model.get_params)
                model.fit(cv_X, cv_y)
                scores_n_estimators.append(model.oob_score_)
                print(model.oob_score_)
            
            best_param_index = np.argmax(scores_n_estimators)
            print(best_param_index)
            self.n_estimators = n_estimators_pos[best_param_index]
            model.set_params(n_estimators=self.n_estimators)
                    
        if self.tune_min_samples_leaf:
            scores_min_samples_leaf = []
            for min_samples_leaf in min_samples_leaf_pos:
                model.set_params(min_samples_leaf=min_samples_leaf)
                print(model.get_params)
                model.fit(cv_X, cv_y)
                scores_min_samples_leaf.append(model.oob_score_)
                print(model.oob_score_)
            
            best_param_index = np.argmax(scores_min_samples_leaf)
            print(best_param_index)
            self.min_samples_leaf = min_samples_leaf_pos[best_param_index]
            model.set_params(min_samples_leaf=self.min_samples_leaf)
        
        self.model = model
        print(self.model.get_params())
        
    def train_model(self):
        """Trains the model on all training data
        and returns a model to be used for prediction"""
        
        if not(self.read_data_called):
            raise AssertionError("No data yet!")
            
        self.train_model_called = True
            
        # determine which model to use
        if self.tune_params_called:
            self.model = self.model
        else:
            if self.regressor:
                model = RandomForestRegressor(oob_score=True,
                                             n_jobs=self.n_jobs,
                                             random_state=1)
            else:
                model = RandomForestClassifier(oob_score=True,
                                              n_jobs=self.n_jobs,
                                              random_state=1)
        
        if self.select_features_called:
            train_X = self.selected_features
        else:
            train_X = self.train_X_df.values
            
        train_y = np.ravel(self.train_y_df.values)
        
        self.model.fit(train_X, train_y)
        print(self.model.oob_score_)
        
    def predict_output(self):
        """Make predictions on test data."""
        
        # transform the features to match the selected features from training
        test_X_all = self.test_X_df.values
        test_X = test_X_all[:, self.selected_feature_indices[:1]]
        for feature in self.selected_feature_indices[1:]:
            
            # format data so that it can be added easily
            added_feat = np.transpose(test_X_all[:, feature])
            added_feat.shape += (1, )
            
            # append data to the next column
            test_X = np.append(test_X, added_feat, axis=1)
        
        pred = self.model.predict(test_X)
        return pred
        
    def get_model(self):
        """Return the model used for prediction"""
        return self.model
        

In [44]:
model = RandomForest()

model.read_data()

imp = model.get_feature_importances()
print(imp)

features = model.select_features()
print(features)

model.tune_params()

model.train_model()

pred = model.predict_output()
print(pred)

train_model = model.get_model()

[1 5 2 0 3 4 6]
[1 5 2]
<bound method BaseEstimator.get_params of RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=20,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=True, random_state=1,
            verbose=0, warm_start=False)>
0.802469135802
<bound method BaseEstimator.get_params of RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=20,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=True, random_state=1,
            verbose=0, warm_start=False)>
0.801346801347
<bound method BaseEstimator.get_params of RandomForestClassifier(bootstrap=True, class_weight=None, 