# Random Forest

In [25]:
import numpy as np
import pandas as pd
import sklearn
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, RandomForestRegressor
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit, train_test_split, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import roc_auc_score

## Data Preprocessing

### Data Loading

In [None]:
# Column names come from 'kddcup.names': the first line is skipped and every
# remaining line contributes the text in front of its first ':'.
with open('kddcup.names') as names_file:
    next(names_file)
    columns = [entry.split(':')[0] for entry in names_file]
columns.append('label')


def _read_labelled(path):
    """Read a header-less CSV, name its columns and drop '.' from the labels."""
    frame = pd.read_csv(path, header=None)
    frame.columns = columns
    frame['label'] = frame['label'].apply(lambda raw: raw.replace('.', ''))
    return frame


train_data = _read_labelled('kddcup.data')
test_data = _read_labelled('corrected')

# Separate the target column from the feature columns of the training data.
y_data = train_data['label']
X_data = train_data.drop(columns=['label'])

### Split categorical attributes and continuous attributes

In [4]:
# Read the declared type of every feature from 'kddcup.names'. The first line
# is skipped; each remaining line is expected to look like "<name>: <type>."
feature_type = []
with open('kddcup.names') as f:
    next(f)
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # failing on the missing ':' separator.
            continue
        declared = line.partition(':')[2]
        # Remove surrounding whitespace and the terminating '.' explicitly.
        # Slicing off a fixed two characters would mangle the type whenever
        # a line lacks the final newline or carries trailing spaces.
        feature_type.append(declared.strip().rstrip('.'))
feature_type.append('label')

# Split features by type
data_index = X_data.index.values
# Positional indices of the columns declared as each type; the trailing
# 'label' entry matches neither type, so it never selects a column.
idx_symbolic = [index for index, ft_type in enumerate(feature_type) if ft_type == "symbolic"]
idx_continuous = [index for index, ft_type in enumerate(feature_type) if ft_type == "continuous"]
X_symbolic = X_data.iloc[:, idx_symbolic]
X_continuous = X_data.iloc[:, idx_continuous]

### Remove outliers based on continuous attributes

In [None]:
# Remove outliers: a row is dropped when any continuous attribute exceeds that
# attribute's mean by more than three standard deviations. Only the upper
# bound is applied; values far below the mean are kept.
# mean()/std() give the same per-column values as the 'mean' and 'std' rows of
# describe(), without also computing the quantiles (twice) over the full frame.
mean = X_continuous.mean()
std = X_continuous.std()

num_attributes = len(X_continuous.columns)

# Per-column thresholds, aligned with the columns of X_continuous.
upper_bound = mean + 3 * std

# True for every row that has at least one value above its column's threshold.
outlier_mask = X_continuous.gt(upper_bound, axis='columns').any(axis=1)
outlier_index = X_continuous.index[outlier_mask]

# Kept as a set of row labels because later cells drop the same rows from
# the label series.
filtered_idx = set(outlier_index)

X_symbolic = X_symbolic.drop(index=outlier_index)
X_continuous = X_continuous.drop(index=outlier_index)

### Feature standardisation

In [21]:
# Encode labels for the rows that survived outlier removal.
y_data_clean = y_data.drop(index = filtered_idx, axis=0)
encoder = LabelBinarizer()
y_data_clean = encoder.fit_transform(y_data_clean)

# NOTE: only the continuous features are used from here on; the symbolic
# features in X_symbolic are not combined into the model input.

# Split training data into training set and validation set *before* scaling,
# so the scaler statistics are learned from the training rows only and no
# information from the validation rows leaks into preprocessing.
X_train, X_val, y_train, y_val = train_test_split(X_continuous, y_data_clean, test_size=0.1, random_state=0)

# Standardise continuous features using training-set mean and variance.
sscaler = StandardScaler().fit(X_train)
X_train = sscaler.transform(X_train)
X_val = sscaler.transform(X_val)

# Keep X_continuous available in scaled form (as a NumPy array), transformed
# with the same training-set statistics.
X_continuous = sscaler.transform(X_continuous)

## Random Forest Regression

In [None]:
# Randomised hyper-parameter search for a random forest regressor.
n_estimators = [int(x) for x in np.linspace(200, 2000, num = 10)]
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# scikit-learn 1.0 renamed the regression criteria ('mse' -> 'squared_error',
# 'mae' -> 'absolute_error') and later releases reject the old spellings, so
# pick the names that the installed version accepts.
if int(sklearn.__version__.split('.')[0]) >= 1:
    criterion = ['squared_error', 'absolute_error']
else:
    criterion = ['mse', 'mae']

random_grid = {'n_estimators': n_estimators,
               'criterion': criterion,
               # For regressors 'auto' meant "use all features"; that alias was
               # removed from scikit-learn. The fraction 1.0 selects the same
               # number of features and is accepted by old and new releases.
               'max_features': [1.0, 'sqrt'],
               'max_depth': max_depth,
               'bootstrap': [True, False]}

rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf,
                               param_distributions = random_grid,
                               n_iter = 100,
                               verbose=2,
                               cv = None,
                               random_state=42,
                               n_jobs = -1)

rf_random.fit(X_train, y_train)

# Random search result. The parameters are a dict, so NumPy stores them as a
# pickled object array; reading the file back needs
# np.load(..., allow_pickle=True).
np.save('best_rf_random.npy',rf_random.best_params_)

In [23]:
# Baseline: a forest with default hyper-parameters, trained on the training
# split. fit() returns the estimator itself, so the call can be chained.
rf = RandomForestRegressor().fit(X_train, y_train)

# Score of the fitted regressor on the held-out validation split
# (shown as the cell output).
rf.score(X_val, y_val)



0.9999181046365658