In [1]:
import sklearn
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVR, SVC
from sklearn.model_selection import cross_val_score

# scoring functions for feature selection
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectPercentile

from sklearn.metrics import make_scorer

import csv
import os

In [9]:
class SVM():
    """A class to choose and train a support vector machine model"""
    """This implements the sklearn Random Forest Model"""
    
    def __init__(self, data_dir='./processed_data/', regressor=False, n_jobs=1,
                random_state=1, kernel='rbf', max_iter=1000,
                opt_func=None, inv_opt_func=None, scorer=None):
        """Initializes the random forest class"""
        
        # TODO:
        #    - Create an initialization function that will allow for
        #      a saved model to be loaded
        
        self.n_jobs = n_jobs
        self.regressor = regressor
        self.data_dir = data_dir
        self.regressor = regressor
        self.random_state = random_state
        self.kernel = kernel
        self.max_iter = max_iter
        self.opt_func = opt_func
        self.inv_opt_func = inv_opt_func
        self.scorer = scorer
        
        # keep track of which functions have been called
        self.read_data_called = False
        self.select_features_called = False
        self.tune_params_called = False
        self.train_model_called = False
        
    def read_data(self):
        """Read in the data from the specified directory"""
        
        self.read_data_called = True
        
        self.cv_X_df = pd.read_csv(self.data_dir+'cv_X.csv', header=0)
        self.cv_y_df = pd.read_csv(self.data_dir+'cv_y.csv', header=0)
        self.cv_ids_df = pd.read_csv(self.data_dir+'cv_ids.csv', header=0)
        self.train_X_df = pd.read_csv(self.data_dir+'train_X.csv', header=0)
        self.train_y_df = pd.read_csv(self.data_dir+'train_y.csv', header=0)
        self.train_ids_df = pd.read_csv(self.data_dir+'train_ids.csv', header=0)
        self.test_X_df = pd.read_csv(self.data_dir+'test_X.csv', header=0)
        self.test_ids_df = pd.read_csv(self.data_dir+'test_ids.csv', header=0)
        
    '''
    def get_feature_importances(self):
        """Trains a basic random forest to get a list of feature importances"""
        
        if not(self.read_data_called):
            raise AssertionError("No data yet!")
        
        importance_tree = RandomForestRegressor(n_jobs=self.n_jobs,
                                random_state=self.random_state,
                                n_estimators=self.n_estimators)
        
        # train a basic model so that we can access feature importances
        cv_X = self.cv_X_df.values
        cv_y = np.ravel(self.cv_y_df.values)
        importance_tree.fit(cv_X, cv_y)
        self.feature_importances = importance_tree.feature_importances_
        
        if not(self.opt_func == None):
            cv_y = self.opt_func(cv_y)
        
        # sort the features by importances, most important first
        self.sorted_features = self.feature_importances.argsort()[::-1]
        
        return self.sorted_features
    '''
    '''
    def select_features(self):
        """Select features from the dataset by using feature importances
        to add features one by one for selection"""
        
        if not(self.read_data_called):
            raise AssertionError("No data yet!")
            
        self.select_features_called = True
            
        # TODO:
        #   - VERIFY THAT THIS IS WORKING
        #   - THINK OF OTHER TYPES OF FEATURE SELECTION
        #     BACKWARD FEATURE SELECTION?
        #   - IS THIS THE ONLY CONSIDERATION FOR SELECTING FEATURES??
        #     WHAT IF THE SCORE IS VERY HIGH WITH FEW FEATURES OUT OF MANY??
        #     WHAT IF THE LARGEST SCORE IS AFTER MANY FEATURES ARE ADDED
        #     WITH MINIMAL IMPROVEMENT TO SCORE??
        
        if self.regressor:
            model = SVR(kernel=self.kernel, max_iter=self.max_iter)
        else:
            model = SVC(kernel=self.kernel, max_iter=self.max_iter)
            
        # keep track of the scores from cross validation
        all_scores = []
        max_score = -10000
        early_stopping_rounds = 10
        early_stopping_count = 0
        
        # which features to add, in order
        self.sorted_features = self.get_feature_importances()
        
        cv_X = self.cv_X_df.values
        cv_y = np.ravel(self.cv_y_df.values)
        cv_X_selected = np.array([])
        cv_X_selected.shape = (cv_X.shape[0], 0)
                
        if not(self.opt_func == None):
            cv_y = self.opt_func(cv_y)
            
        for feature in self.sorted_features:
            
            # format data so that it can be added easily
            added_feat = np.transpose(cv_X[:, feature])
            added_feat.shape += (1, )
            
            # append data to the next column
            cv_X_selected = np.append(cv_X_selected, added_feat, axis=1)
            
            # train a model with the augmented data
            #model.fit(cv_X_selected, cv_y)
            #score = model.oob_score_
            scores = cross_val_score(model, cv_X_selected, cv_y, cv=3, scoring='neg_mean_squared_error')
            score = scores.mean()
            print(feature)
            print(score)
            
            # have an evaluation metric to measure performance of added features
            all_scores.append(score)
            
            if score > max_score:
                max_score = score
                early_stopping_count = 0
            else:
                early_stopping_count += 1
                
            if early_stopping_count > early_stopping_rounds:
                break
        
        max_score_index = np.argmax(all_scores)
        print(all_scores)
        print(max_score_index)
        
        # select the features
        self.selected_feature_indices = self.sorted_features[:(max_score_index+1)]
        self.selected_features = cv_X_selected[:, :(max_score_index+1)]
        
        return self.selected_feature_indices
    '''
    def select_features_2(self, score_func_name='f_regression', percentage=100):
        """Function to select features based on sklearn scoring"""
        
        self.select_features_called = True
        
        cv_X = self.cv_X_df.values
        cv_y = np.ravel(self.cv_y_df.values)
        
        if score_func_name == 'chi2':
            score_func = chi2
        elif score_func_name == 'f_regression':
            score_func = f_regression
        elif score_func_name == 'f_classif':
            score_func = f_classif
               
        if not(self.opt_func == None):
            cv_y = self.opt_func(cv_y)
        
        self.feature_selector = SelectPercentile(score_func, percentile=percentage)
        self.selected_features = self.feature_selector.fit_transform(cv_X, cv_y)
        
        return self.selected_features
        
    def tune_params(self):
        """Function to handle tuning hyperparameters of the model"""
        
        # TODO:
        #   - FIND GOOD VALUES OF PARAMETERS TO TEST
        
        if not(self.read_data_called):
            raise AssertionError("No data yet!")
        
        self.tune_params_called = True
        
        self.tune_C = True
        self.tune_epsilon = True
        self.tune_gamma = True
        
        # Parameters to tune for RandomForest
        # the maximum number of features to be considered for a tree.
        C_pos = [0.001, 0.01, 0.1, 1, 10, 100]
        
        epsilon_pos = [0, 0.001, 0.01, 0.1, 1, 10, 100]
        
        gamma_pos = [0.001, 0.01, 0.1, 1, 10, 100]
        
        # only take the selected features if we have performed training already
        if self.select_features_called:
            cv_X = self.selected_features
        else:
            cv_X = self.cv_X_df.values
        
        cv_y = np.ravel(self.cv_y_df.values)
        
        if not(self.opt_func == None):
            cv_y = self.opt_func(cv_y)
        
        # some default values for training
        self.C = 1
        self.epsilon = 0.1
        self.gamma = 'auto'
        
        cv_splits = 5
        
        if self.regressor:
            model = SVR(kernel=self.kernel, max_iter=self.max_iter, C=self.C,
                       epsilon=self.epsilon, gamma=self.gamma)
        else:
            model = SVC(kernel=self.kernel, max_iter=self.max_iter, C=self.C,
                       gamma=self.gamma)
            self.tune_epsilon = False
                    
        # perform parameter-space searching
        if self.tune_C:
            scores = []
            for C in C_pos:
                model.set_params(C=C)
                print(model.get_params)
                score = cross_val_score(model, cv_X, cv_y, cv=cv_splits,
                                     scoring=make_scorer(self.scorer))
                score = score.mean()
                scores.append(score)
                print(score)
            
            best_param_index = np.argmax(scores)
            print(best_param_index)
            self.C = C_pos[best_param_index]
            model.set_params(C=self.C)
            
        if self.tune_epsilon:
            scores = []
            for epsilon in epsilon_pos:
                model.set_params(epsilon=epsilon)
                print(model.get_params)
                score = cross_val_score(model, cv_X, cv_y, cv=cv_splits,
                                     scoring=make_scorer(self.scorer))
                score = score.mean()
                scores.append(score)
                print(score)
            
            best_param_index = np.argmax(scores)
            print(best_param_index)
            self.epsilon = epsilon_pos[best_param_index]
            model.set_params(epsilon=self.epsilon)
            
        if self.tune_gamma:
            scores = []
            for gamma in gamma_pos:
                model.set_params(gamma=gamma)
                print(model.get_params)
                score = cross_val_score(model, cv_X, cv_y, cv=cv_splits,
                                     scoring=make_scorer(self.scorer))
                score = score.mean()
                scores.append(score)
                print(score)
            
            best_param_index = np.argmax(scores)
            print(best_param_index)
            self.gamma = gamma_pos[best_param_index]
            model.set_params(gamma=self.gamma)            
            
            
        self.model = model
        print(self.model.get_params())
        
    def train_model(self, max_iter=-1):
        """Trains the model on all training data
        and returns a model to be used for prediction"""
        
        if not(self.read_data_called):
            raise AssertionError("No data yet!")
            
        self.train_model_called = True
            
        # determine which model to use
        if self.tune_params_called:
            self.model = self.model
        else:
            if self.regressor:
                model = SVR(kernel=self.kernel, max_iter=self.max_iter)
            else:
                model = SVC(kernel=self.kernel, max_iter=self.max_iter)
        
        if self.select_features_called:
            train_X = self.selected_features
        else:
            train_X = self.train_X_df.values
            
        train_y = np.ravel(self.train_y_df.values)
                        
        if not(self.opt_func == None):
            train_y = self.opt_func(train_y)
        
        self.model.fit(train_X, train_y)
        
    def predict_output(self):
        """Make predictions on test data."""
        
        if not(self.train_model_called):
            raise AssertionError("Train the model first!")
        
        # transform the features to match the selected features from training
        #test_X_all = self.test_X_df.values
        #test_X = np.array([])
        #test_X.shape = (test_X_all.shape[0], 0)
        #for feature in self.selected_feature_indices:
            #
            # format data so that it can be added easily`
            #added_feat = np.transpose(test_X_all[:, feature])
            #added_feat.shape += (1, )
            #
            # append data to the next column
            #test_X = np.append(test_X, added_feat, axis=1)
            
        
        test_X = self.test_X_df.values
        test_X = self.feature_selector.transform(test_X)
        
        self.pred = self.model.predict(test_X)
                        
        if not(self.inv_opt_func == None):
            self.pred = self.inv_opt_func(self.pred)
            
        return self.pred
    
    def write_output(self, output_file='output.csv', header=['Id', 'Result'],
                    out_dir='./output/'):
        # create output dir if it does not exist
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
            
        # write output to a file
        prediction_file = open(out_dir+output_file, 'w')
        open_file_object = csv.writer(prediction_file, lineterminator='\n')
        open_file_object.writerow(header)
        data = zip(np.ravel(self.test_ids_df.values), self.pred)
        open_file_object.writerows(data)
        prediction_file.close()
        
    def get_model(self):
        """Return the model used for prediction"""
        return self.model
        

In [10]:
def neg_rmse(y, y_pred):
    return -1*np.sqrt(np.mean((y_pred-y)**2))

def accuracy(y, y_pred):
    correct = np.sum([1 if y[x] == y_pred[x] else 0 for x in y])
    return float(correct) / len(y)

def log_e(y):
    return np.log(y)

def log_10(y):
    return np.log10(y)

def exp_e(y):
    return np.exp(y)

def exp_10(y):
    return np.power(10, y)

In [12]:
model = SVM(n_jobs=2, regressor=False, kernel='rbf', max_iter=10000,
                    opt_func=None, inv_opt_func=None, scorer=accuracy)
    
model.read_data()

#imp = model.get_feature_importances()
#print(imp)

#features = model.select_features()
features = model.select_features_2(score_func_name='f_regression', percentage=100)
#print(features)

model.tune_params()

model.train_model(max_iter=10000)

pred = model.predict_output()
print(pred)

model.write_output(output_file='titanic_output_SVM.csv', header=['PassengerId', 'Survived'])

train_model = model.get_model()

  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


<bound method BaseEstimator.get_params of SVC(C=0.001, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=10000, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)>
0.322905027933
<bound method BaseEstimator.get_params of SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=10000, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)>
0.322905027933
<bound method BaseEstimator.get_params of SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=10000, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)>
0.876836158192
<bound method BaseEstimator.get_params of SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gam