In [1]:
import os

import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [2]:
class RandomForest():
    """Choose and train a random forest model.

    Thin wrapper around scikit-learn's ``RandomForestClassifier`` /
    ``RandomForestRegressor``. It reads pre-split CSV data from a directory,
    ranks features by importance and fits / predicts with the chosen
    estimator.

    Attributes set by ``__init__``:
        n_jobs: value forwarded as ``n_jobs`` to every forest built here.
        regressor: True selects the regressor, False the classifier.
        data_dir: directory holding the CSV files read by ``read_data``.
        model: the underlying scikit-learn estimator.
        params: hyperparameter dict (placeholder, never populated yet).
        features: chosen feature list (placeholder, never populated yet).
    """

    def __init__(self, data_dir='./processed_data/', regressor=False, n_jobs=1):
        """Store the configuration and build the (unfitted) estimator.

        Args:
            data_dir: directory containing the ``cv_*``, ``train_*`` and
                ``test_*`` CSV files. A trailing separator is optional.
            regressor: build a ``RandomForestRegressor`` when True,
                otherwise a ``RandomForestClassifier``.
            n_jobs: passed to the scikit-learn estimators as ``n_jobs``.
        """
        # TODO: allow a previously saved model to be loaded here.

        self.n_jobs = n_jobs
        self.regressor = regressor
        self.data_dir = data_dir

        # Forward n_jobs so the constructor argument actually takes effect.
        estimator_cls = RandomForestRegressor if regressor else RandomForestClassifier
        self.model = estimator_cls(n_jobs=n_jobs)

        self.params = {}
        self.features = []

    def _read_csv(self, stem):
        """Read ``<data_dir>/<stem>.csv``; first row is the header, first column the index."""
        # os.path.join works whether or not data_dir ends with a separator.
        csv_path = os.path.join(self.data_dir, stem + '.csv')
        return pd.read_csv(csv_path, header=0, index_col=0)

    def read_data(self):
        """Read the cross-validation, training and test CSVs from ``data_dir``.

        Populates ``cv_X_df``, ``cv_y_df``, ``cv_ids_df``, ``train_X_df``,
        ``train_y_df``, ``train_ids_df``, ``test_X_df`` and ``test_ids_df``
        as DataFrames, plus ``cv_X`` / ``cv_y`` as the ndarray values of the
        corresponding CV frames.
        """
        self.cv_X_df = self._read_csv('cv_X')
        self.cv_X = self.cv_X_df.values
        self.cv_y_df = self._read_csv('cv_y')
        self.cv_y = self.cv_y_df.values
        self.cv_ids_df = self._read_csv('cv_ids')
        self.train_X_df = self._read_csv('train_X')
        self.train_y_df = self._read_csv('train_y')
        self.train_ids_df = self._read_csv('train_ids')
        self.test_X_df = self._read_csv('test_X')
        self.test_ids_df = self._read_csv('test_ids')

    def get_feature_importances(self):
        """Fit a basic forest on the CV data and rank features by importance.

        A fixed-seed ``RandomForestRegressor`` is always used here,
        independent of ``self.regressor``.

        Side effects:
            Sets ``self.feature_importances`` (raw importances) and
            ``self.sorted_features`` (the ranking).

        Returns:
            ndarray of column indices into ``cv_X_df``, most important first.
        """
        importance_tree = RandomForestRegressor(n_jobs=self.n_jobs,
                                                random_state=1,
                                                n_estimators=100,
                                                max_features='sqrt',
                                                max_depth=10)

        # train a basic model so that we can access feature importances
        cv_X = self.cv_X_df.values
        cv_y = np.ravel(self.cv_y_df.values)  # flatten the target values to 1-D
        importance_tree.fit(cv_X, cv_y)
        self.feature_importances = importance_tree.feature_importances_

        # sort the features by importances, most important first
        self.sorted_features = self.feature_importances.argsort()[::-1]

        return self.sorted_features

    def select_features(self):
        """Forward feature selection driven by feature importances (scaffold).

        Features are added one at a time in order of decreasing importance.
        Training and scoring each candidate subset is not implemented yet,
        so nothing is actually selected.

        Side effects:
            Refreshes ``self.sorted_features`` / ``self.feature_importances``
            and sets ``self.selected_features``.

        Returns:
            ``self.features`` (an empty list until selection is implemented).
        """
        # TODO:
        #   - THINK OF OTHER TYPES OF FEATURE SELECTION
        #     BACKWARD FEATURE SELECTION?

        # which features to add, in order
        self.sorted_features = self.get_feature_importances()

        cv_X = self.cv_X_df.values
        for n_selected in range(1, len(self.sorted_features) + 1):
            # Candidate design matrix: the n_selected most important columns.
            # Slicing the ranking keeps each feature exactly once and avoids
            # re-copying the matrix with np.append on every pass.
            cv_X_selected = cv_X[:, self.sorted_features[:n_selected]]

            # TODO: train a model with the augmented data (cv_X_selected)

            # TODO: have an evaluation metric to measure performance of
            #       added features

        self.selected_features = self.features

        return self.selected_features

    def tune_params(self):
        """Tune the model's hyperparameters (placeholder).

        No search is performed yet; ``self.params`` is left unchanged.
        """
        # TODO:
        #   - PERFORM HYPERPARAMETER SEARCH HERE

    def train_model(self):
        """Fit ``self.model`` on all training data.

        Returns:
            The fitted estimator (``self.model``).
        """
        train_X = self.train_X_df.values
        train_y = np.ravel(self.train_y_df.values)  # flatten the target values to 1-D
        self.model.fit(train_X, train_y)
        return self.model

    def predict_output(self):
        """Return ``self.model``'s predictions for the test feature matrix."""
        return self.model.predict(self.test_X_df.values)

    def get_model(self):
        """Return the underlying scikit-learn estimator used for prediction."""
        # TODO:
        #    - WHAT IS BEST FORMAT TO RETURN THIS??
        #      JUST RETURN THE SKLEARN MODEL OR RETURN A COPY OF ITSELF?

        return self.model


In [3]:
# End-to-end run of the RandomForest wrapper with its default arguments
# (classifier, data read from './processed_data/').
model = RandomForest()

# Load the cv / train / test CSV files into DataFrames on the instance.
model.read_data()

# Column indices of the CV features, most important first.
imp = model.get_feature_importances()
print(imp)

# Placeholder step: feature selection is not implemented yet.
model.select_features()

# Placeholder step: no hyperparameter search is performed yet.
model.tune_params()

# Fit the estimator on the full training split.
model.train_model()

# Predictions for the test split.
pred = model.predict_output()
print(pred)

# The underlying scikit-learn estimator held by the wrapper.
train_model = model.get_model()

[1 0 2]
[1 0 1 1]
