In [54]:
# Importing useful libraries
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, \
                    LeaveOneOut, KFold, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix, roc_curve, roc_auc_score
import sklearn.utils

from imblearn.over_sampling import SMOTE

import xgboost
import lightgbm

import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'xgboost'

In [None]:
# Load the dataset from a CSV file located in the notebook's working directory
df = pd.read_csv('./Data_for_UCI_named.csv')
df.head()  # preview the first rows

In [None]:
# Check the dimensions of the dataset as (rows, columns)
df.shape

In [None]:
# Count the missing values in each column
df.isnull().sum()

In [None]:
# Drop the 'stab' column as instructed.
# `df` is reassigned from itself, so on a second run of this cell the column is
# already gone; errors='ignore' keeps the cell re-runnable instead of raising KeyError.
df = df.drop(columns='stab', errors='ignore')
df.head()

In [None]:
# Separate the label from the features without mutating `df`.
# (df.pop removes the column in place, so re-running the cell would raise KeyError.)
y = df['stabf']  # the 'stabf' column is the label
X = df.drop(columns='stabf')  # every remaining column is a feature
X.head()

In [None]:
# Split the data 80:20 into training and testing sets; random_state=1 makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
scaler = StandardScaler() # Creates a StandardScaler object
scaled_X_train = scaler.fit_transform(X_train) # Fits the scaler on the training set and transforms it
scaled_X_test = scaler.transform(X_test) # Transforms the testing set with the scaler fitted on the training set

#### Let's create a function that returns the score of a given metric on a test set. The metric can be any of accuracy_score, precision_score, recall_score, f1_score and confusion_matrix.

In [None]:
# Lookup table mapping each metric's name to the scikit-learn function that computes it.
# The insertion order is the order in which the scores are later printed.
metrics = {
    'accuracy_score': accuracy_score,
    'precision_score': precision_score,
    'recall_score': recall_score,
    'f1_score': f1_score,
    'confusion_matrix': confusion_matrix,
}

In [None]:
# Defining the function
def get_metric_score(metric, ytrue, ypred, neg_pos_label):
    ''' This function returns the specified metric score. It only works with classifier metrics.
        
        Args:   metric (string): the evaluating metric; must be a key of the module-level `metrics` dictionary, i.e. any
                                 of accuracy_score, precision_score, recall_score, f1_score or confusion_matrix.
                ytrue (array): the true labels
                ypred (array): the predicted labels
                neg_pos_label (list): a list of the classes you want as the negative and positive label in order 
                                      of [negative_label, positive_label]. It sets the row/column order of the confusion
                                      matrix, and its second element is used as pos_label for precision, recall and f1.
                
        Return: returns the metric score (the confusion matrix itself when metric is 'confusion_matrix')
        
        Raises: KeyError if metric is not a key of `metrics`
    '''
    
    if metric == 'accuracy_score':
        return accuracy_score(ytrue, ypred)
    
    elif metric == 'confusion_matrix':
        # `labels` has to be passed by keyword: it is keyword-only in current scikit-learn releases, so passing it
        # positionally raises a TypeError there.
        return confusion_matrix(ytrue, ypred, labels=neg_pos_label)
    
    else:
        # precision, recall and f1_score take the same arguments, so they share one call through the lookup table
        return metrics[metric](ytrue, ypred, pos_label=neg_pos_label[1])

#### Let's create a function that fits a classifier on a training set and prints out the accuracy_score, precision_score, recall_score, f1_score and confusion matrix of the testing set.

In [None]:
# Defining the function
def fit_and_score(classifier, xtrain, ytrain, xtest, ytest, neg_pos_label):
    ''' Fits a classifier on the training set, predicts on the testing set and prints every score listed in the
    module-level `metrics` dictionary (accuracy_score, precision_score, recall_score, f1_score and confusion matrix).
    Nothing is returned; the classifier is fitted in place.
    
    Args: classifier (classifier object): the classifier to fit; must provide fit and predict
          xtrain (ndarray): the training features
          ytrain (array): the training labels
          xtest (ndarray): the testing features
          ytest (array): the testing labels
          neg_pos_label (list): the classes to treat as negative and positive, in the order
                                [negative_label, positive_label]
    '''
    classifier.fit(xtrain, ytrain)
    predictions = classifier.predict(xtest)

    def _score(name):
        # scores the test-set predictions with the named metric
        return get_metric_score(name, ytest, predictions, neg_pos_label)

    for name in metrics:
        if name != 'confusion_matrix':
            # scalar scores fit on a single line
            print('{} is {}'.format(name, _score(name)))
            continue

        # the confusion matrix gets a blank line and its own heading so that it stays readable
        print()
        print('confusion_matrix is:')
        print(_score(name))

In [None]:
# Passed as neg_pos_label to fit_and_score, in the order [negative_label, positive_label]:
# 'unstable' is the negative class and 'stable' is the positive class.
label_list = ['unstable', 'stable']

## Let's evaluate our model on different classifiers

#### Training and testing on RandomForestClassifier

In [None]:
# Random forest with library-default hyperparameters; random_state=1 is fixed for reproducibility
random_forest = RandomForestClassifier(random_state=1)
fit_and_score(random_forest, scaled_X_train, y_train, scaled_X_test, y_test, label_list)

In [None]:
# Extra trees with library-default hyperparameters; random_state=1 is fixed for reproducibility
extra_trees = ExtraTreesClassifier(random_state=1)
fit_and_score(extra_trees, scaled_X_train, y_train, scaled_X_test, y_test, label_list)

In [None]:
# XGBoost classifier with default hyperparameters; requires the xgboost package to be installed
# NOTE(review): y_train holds string labels; recent xgboost releases may require integer-encoded classes - confirm
xgb = xgboost.XGBClassifier(random_state=1)
fit_and_score(xgb, scaled_X_train, y_train, scaled_X_test, y_test, label_list)

In [None]:
# LightGBM classifier with default hyperparameters; requires the lightgbm package to be installed
lgbm = lightgbm.LGBMClassifier(random_state=1)
fit_and_score(lgbm, scaled_X_train, y_train, scaled_X_test, y_test, label_list)

In [None]:
# Search space of the hyperparameters sampled by the randomized search
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
# 'auto' is deliberately left out: for tree-ensemble classifiers it was only an alias of 'sqrt'
# (a duplicate candidate), and scikit-learn deprecated it in 1.1 and removed it in 1.3, where
# every candidate using it fails to fit.
max_features = ['sqrt', 'log2', None]

# Dictionary mapping each estimator parameter name to its candidate values
hyperparameter_grid = {'n_estimators': n_estimators, 'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split, 'max_features': max_features}

In [None]:
extra_trees2 = ExtraTreesClassifier(random_state=1) # base estimator whose hyperparameters are searched

# Randomized search over hyperparameter_grid using 3 parallel jobs.
# cv, n_iter and scoring are not passed here, so the library defaults apply.
tuned_extra_trees = RandomizedSearchCV(extra_trees2, hyperparameter_grid, random_state=1, verbose=1, n_jobs=3)

In [None]:
# Runs the randomized search on the training set, then prints the testing-set scores of the search object's predictions
fit_and_score(tuned_extra_trees, scaled_X_train, y_train, scaled_X_test, y_test, label_list)

In [None]:
random_forest2 = RandomForestClassifier(random_state=1) # creates a random forest classifier object
random_forest2.fit(scaled_X_train, y_train) # fits the model on the scaled training set
ypred_random_forest = random_forest2.predict(scaled_X_test) # predicts on the scaled testing set
accuracy_score(y_test, ypred_random_forest) # outputs the testing-set accuracy

In [None]:
xgb2 = xgboost.XGBClassifier(random_state=1) # creates an xgboost classifier object
xgb2.fit(scaled_X_train, y_train) # fits the model on the scaled training set
ypred_xgb = xgb2.predict(scaled_X_test) # predicts on the scaled testing set
accuracy_score(y_test, ypred_xgb) # outputs the testing-set accuracy

In [None]:
light_gbm2 = lightgbm.LGBMClassifier(random_state=1) # creates a light gbm classifier object
light_gbm2.fit(scaled_X_train, y_train) # fits the model on the scaled training set
ypred_light_gbm2 = light_gbm2.predict(scaled_X_test) # predicts on the scaled testing set
accuracy_score(y_test, ypred_light_gbm2) # outputs the testing-set accuracy

In [None]:
# Search space of the hyperparameters sampled by the randomized search
n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

# 'auto' is deliberately left out: for tree-ensemble classifiers it was only an alias of 'sqrt'
# (a duplicate candidate), and scikit-learn deprecated it in 1.1 and removed it in 1.3, where
# every candidate using it fails to fit.
max_features = ['sqrt', 'log2', None]

# Dictionary mapping each estimator parameter name to its candidate values
hyperparameter_grid = {'n_estimators': n_estimators, 'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split, 'max_features': max_features}

In [None]:
# Base estimator whose hyperparameters will be searched
extra_trees2 = ExtraTreesClassifier(random_state=1)

# Randomized search: 10 sampled candidates, 5-fold cross-validation, scored on accuracy,
# run on all available cores, with a fixed seed for reproducible sampling
tuned_extra_trees2 = RandomizedSearchCV(
    estimator=extra_trees2,
    param_distributions=hyperparameter_grid,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=1,
)

In [None]:
# Run the randomized search on the scaled training set
tuned_extra_trees2.fit(scaled_X_train, y_train)

In [None]:
# Display the hyperparameter combination selected by the search
tuned_extra_trees2.best_params_

In [None]:
# Train an untuned extra trees classifier to serve as the baseline for the tuned model
extra_trees3 = ExtraTreesClassifier(random_state=1) # creates an extra trees classifier object with default hyperparameters
extra_trees3.fit(scaled_X_train, y_train) # fits it on the scaled training set

In [None]:
ypred_ordinary = extra_trees3.predict(scaled_X_test) # baseline predictions on the scaled testing set

# Report the testing-set accuracy of the untuned model
print('accuracy of ordinary extra trees is {}'.format(accuracy_score(y_test, ypred_ordinary)))

In [None]:
# Testing-set accuracy of the tuned extra trees, for comparison with the untuned baseline
# Predictions come from best_estimator_, the estimator exposed by the fitted search
ypred_tuned = tuned_extra_trees2.best_estimator_.predict(scaled_X_test)

print('accuracy of tuned extra trees is {}'.format(accuracy_score(y_test, ypred_tuned)))

In [None]:
# Importance scores of the tuned model's best estimator
# presumably one score per feature column, in the column order of X - confirm
features_importance = tuned_extra_trees2.best_estimator_.feature_importances_

In [None]:
# NOTE: despite their names, these two variables hold importance *values*, not feature names
most_important_feature = features_importance.max() # highest importance value
least_important_feature = features_importance.min() # lowest importance value

cols = X.columns # column labels of the feature frame, used to map an importance value back to a feature name

In [None]:
# Select the column(s) whose importance equals the maximum and report the first match
print('most important feature is {}'.format(cols[features_importance == most_important_feature][0]))

In [None]:
# Select the column(s) whose importance equals the minimum and report the first match
print('least important feature is {}'.format(cols[features_importance == least_important_feature][0]))