# Random Forest

Classification of imbalanced data using a random forest.
Comparison of the accuracy score obtained with random downsampling and with SMOTE upsampling.

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)  # show every column when a frame is displayed
import warnings
warnings.filterwarnings('ignore')  # NOTE: silences ALL warnings for the rest of the notebook
## multiple outputs: display every bare expression in a cell, not just the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# read the data
path="../../Data Processing/Data/" # path to file relative to working directory
# All three CSVs are read from the same data directory; the numerical file
# previously ignored `path` and was looked up in the working directory instead.
numerical = pd.read_csv(path+'numerical.csv')
categorical = pd.read_csv(path+'categorical.csv')
targets = pd.read_csv(path+'target.csv')
# Column-wise join of the three frames (rows are matched on their index).
data = pd.concat([numerical, categorical, targets], axis = 1)

In [None]:
# Notebook-wide constants.
RAND_STATE = 42 # seed passed as random_state to the split, the undersampler and the forest (reproducibility)
TT_RATIO = 0.25 # passed as test_size to train_test_split, i.e. the share of rows held out for testing

In [None]:
# Count the rows per class to see how imbalanced the label is.
data['TARGET_B'].value_counts() # distribution of target "B" (binary label)

In [None]:
# X,y: separate the binary classification label from the features
y = data['TARGET_B']
X = data.drop(['TARGET_B'], axis=1)
# split the data by type
# `np.object` was only an alias for the builtin `object`; it is deprecated
# since NumPy 1.20 and removed in 1.24, so the builtin is used directly.
numericalX = X.select_dtypes(include=np.number)
categoricalX = X.select_dtypes(include=object)

<b>Since SMOTE works on numerical data only, we one-hot encode the categorical variables.</b>

In [None]:
# onehot encoding
from sklearn.preprocessing import OneHotEncoder
# drop='if_binary' drops one column only for features that have exactly two
# categories; features with more categories keep one column per category.
encoder = OneHotEncoder(drop='if_binary').fit(categoricalX)
# Densify the sparse encoder output. The encoded columns are integer-labelled
# (0..k-1); the row index is copied from the input so that the column-wise
# concat below stays row-aligned with numericalX.
encoded_categorical = pd.DataFrame(
    encoder.transform(categoricalX).toarray(),
    index=categoricalX.index,
)
X = pd.concat([numericalX, encoded_categorical], axis = 1) # rejoin
X.head(3)


In [None]:
# check for missing values: number of rows with at least one NaN (True)
# versus fully populated rows (False)
X.isna().any(axis=1).value_counts()

In [None]:
# Discard every row that has at least one missing feature value,
# removing the same row labels from the features and from the label.
has_na = X.isna().any(axis=1)
na_idx = X.loc[has_na].index
X = pd.DataFrame(X).drop(index=na_idx)
y = pd.DataFrame(y).drop(index=na_idx)

In [None]:
# Hold out a test set (seeded, so the shuffle is reproducible).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TT_RATIO,
    random_state=RAND_STATE,
)

X_train, X_test = pd.DataFrame(X_train), pd.DataFrame(X_test)

# TARGET_D is kept aside as a regression target for both partitions ...
y_train_regression = X_train.loc[:, 'TARGET_D']
y_test_regression = X_test.loc[:, 'TARGET_D']

# ... and then removed from the feature matrices.
X_train = X_train.drop(columns=['TARGET_D'])
X_test = X_test.drop(columns=['TARGET_D'])

In [None]:
# Cast every column label to str. The one-hot columns are integer-labelled while
# the numerical columns have string names; presumably this is done so the
# estimators see uniformly typed feature names (confirm against the installed
# scikit-learn / imbalanced-learn versions).
X_train.columns = X_train.columns.astype(str)

In [None]:
# Same cast for the test features so their column labels match the training frame.
X_test.columns = X_test.columns.astype(str)

In [None]:
def down_samp_rand(Xin, yin, ratio=1):
    """Downsample the majority class using random sampling.

    Parameters
    ----------
    Xin
        Feature matrix to resample.
    yin
        Class labels matching the rows of ``Xin``.
    ratio
        Ratio of the minority class to the downsampled majority class;
        forwarded to ``RandomUnderSampler`` as ``sampling_strategy``.
        The default of 1 yields equally sized classes.

    Returns
    -------
    tuple
        ``(X_rus, y_rus)``: the resampled features and labels.

    Notes
    -----
    The sampler is seeded with the notebook-level ``RAND_STATE`` constant,
    which must be defined before this function is called.
    """
    # Local import, as in the original cell; it must come after the docstring,
    # otherwise the string above is a no-op expression and __doc__ stays None.
    from imblearn.under_sampling import RandomUnderSampler

    rus = RandomUnderSampler(sampling_strategy=ratio, random_state=RAND_STATE)
    X_rus, y_rus = rus.fit_resample(Xin, yin)
    return X_rus, y_rus

In [None]:
# Balance the training partition only (default ratio=1); X_test / y_test are not resampled.
# The names are overwritten, so re-running this cell resamples the already-downsampled data.
X_train, y_train = down_samp_rand(X_train,y_train)

In [None]:
# check that we have downsampled: with ratio=1 both classes should now show the same count
y_train.value_counts()

#### train, fit, and evaluate model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameters of the forest, kept in one dict so they are easy to tune.
rfc_ops = {"max_depth":6,
           "min_samples_leaf":20, # minimum number of samples required in each leaf
           "n_estimators":100,
           "bootstrap":True,
           "oob_score":True,      # also estimate accuracy on the out-of-bag samples
           "random_state":RAND_STATE}

clf = RandomForestClassifier(**rfc_ops)

clf.fit(X_train, y_train)
print("train prediction accuracy score: %.2f" %(clf.score(X_train, y_train)))
print("test prediction accuracy score: %.2f"  %(clf.score(X_test, y_test)))
# oob_score=True was requested above; report it rather than computing it for nothing.
print("out-of-bag accuracy score: %.2f" %(clf.oob_score_))

In [None]:
from sklearn.metrics import accuracy_score
# Test-set accuracy of the model trained on the downsampled data,
# stored for the comparison table at the end of the notebook.
y_pred_ds = clf.predict(X_test)
score_ds = accuracy_score(y_test, y_pred_ds)

Reference: [Details](https://scikit-learn.org/stable/modules/model_evaluation.html#accuracy-score) about how the accuracy is computed

#### cross validation

In [None]:
from sklearn.model_selection import cross_val_score
# Number of cross-validation folds used on the (downsampled) training data.
folds = 5
cross_val_scores = cross_val_score(
    estimator=clf,
    X=X_train,
    y=y_train,
    cv=folds,
)


In [None]:
# One score per fold; the array itself is shown as the cell output.
print(f"cv scores over {folds:d} iterations: \n")
cross_val_scores

In [None]:
# Spread of the per-fold scores (population standard deviation).
cv_std = np.std(cross_val_scores)
print(f"the std. dev. in the cv scores is {cv_std:.4f}")

## Random forest model using SMOTE upsampling

In [None]:
# test-train refresh: redo the split with the same seed so the SMOTE
# experiment starts from the original (not downsampled) training partition

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TT_RATIO, random_state=RAND_STATE)

X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

y_train_regression = X_train['TARGET_D']
y_test_regression = X_test['TARGET_D']

# Now we can remove the column target d from the set of features
X_train = X_train.drop(['TARGET_D'], axis=1)
X_test = X_test.drop(['TARGET_D'],   axis=1)

# The fresh split is taken from X again, so the column labels are once more a
# mix of str (numerical features) and int (one-hot features). Cast them to str,
# as was done for the first split, before the frames reach SMOTE / the forest.
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)


In [None]:
# Drop the rows with missing values from the refreshed training partition
# (features and labels alike); the shuffling is the same as before.
has_na_train = X_train.isna().any(axis=1)
na_idx = X_train.loc[has_na_train].index
X_train = pd.DataFrame(X_train).drop(index=na_idx)
y_train = pd.DataFrame(y_train).drop(index=na_idx)

In [None]:
from imblearn.over_sampling import SMOTE
# Seed SMOTE like every other stochastic step in the notebook; without a
# random_state the synthetic samples (and the resulting scores) change on every run.
smote = SMOTE(random_state=RAND_STATE)
# Upsample the minority class of the training partition only.
X_train, y_train = smote.fit_resample(X_train, y_train)
# Class counts after resampling.
y_train.value_counts()

In [55]:
# fit the model and evaluate using the upsampled data from SMOTE
# (the same `clf` object and hyper-parameters are reused; this refit replaces the downsampled fit)
clf.fit(X_train, y_train)

In [56]:
# Accuracy of the SMOTE-trained model on the untouched test partition.
print(f"test prediction accuracy score: {clf.score(X_test, y_test):.2f}")


test prediction accuracy score: 0.88


In [None]:
# Test-set accuracy of the model trained on the SMOTE-upsampled data.
y_pred_us = clf.predict(X_test)
score_us = accuracy_score(y_test, y_pred_us)

In [None]:
# Side-by-side comparison of the two resampling strategies.
pd.DataFrame(
    data={"score": [score_ds, score_us]},
    index=["downsampled", "upsampled"],
)

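Note the improvement in accuracy using SMOTE over downsampling. Keep in mind that the test set is still imbalanced, so accuracy alone can be misleading; per-class precision and recall should be compared as well.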

In [None]:
# Per-class probabilities predicted for every test row by the most recently fitted (SMOTE-trained) model.
clf.predict_proba(X_test)

In [None]:
# Hard class labels predicted for every test row by the same model.
clf.predict(X_test)