In [None]:

# Random Forest Algorithm on Sonar Dataset
from random import seed
from random import randrange
from csv import reade
from math import sqrt
 
# Load a CSV file
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset
 
# Convert string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())
 
# Convert string column to intege
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup
 
# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for i in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split
 
# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0
 
# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores
 
# Split a dataset based on an attribute and an attribute value
def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right
 
# Calculate the Gini index for a split dataset
def gini_index(groups, classes):
    # count all samples at split point
    n_instances = float(sum([len(group) for group in groups]))
    # sum weighted Gini index for each group
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # avoid divide by zero
        if size == 0:
            continue
        score = 0.0
        # score the group based on the score for each class
        for class_val in classes:
            p = [row[-1] for row in group].count(class_val) / size
            score += p * p
        # weight the group score by its relative size
        gini += (1.0 - score) * (size / n_instances)
    return gini
 
# Select the best split point for a dataset
def get_split(dataset, n_features):
    class_values = list(set(row[-1] for row in dataset))
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    features = list()
    while len(features) < n_features:
        index = randrange(len(dataset[0])-1)
        if index not in features:
            features.append(index)
    for index in features:
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            gini = gini_index(groups, class_values)
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, row[index], gini, groups
    return {'index':b_index, 'value':b_value, 'groups':b_groups}
 
# Create a terminal node value
def to_terminal(group):
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)
 
# Create child splits for a node or make terminal
def split(node, max_depth, min_size, n_features, depth):
    left, right = node['groups']
    del(node['groups'])
    # check for a no split
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return
    # check for max depth
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    # process left child
    if len(left) <= min_size:
        node['left'] = to_terminal(left)
    else:
        node['left'] = get_split(left, n_features)
        split(node['left'], max_depth, min_size, n_features, depth+1)
    # process right child
    if len(right) <= min_size:
        node['right'] = to_terminal(right)
    else:
        node['right'] = get_split(right, n_features)
        split(node['right'], max_depth, min_size, n_features, depth+1)
 
# Build a decision tree
def build_tree(train, max_depth, min_size, n_features):
    root = get_split(train, n_features)
    split(root, max_depth, min_size, n_features, 1)
    return root
 
# Make a prediction with a decision tree
def predict(node, row):
    if row[node['index']] < node['value']:
        if isinstance(node['left'], dict):
            return predict(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return predict(node['right'], row)
        else:
            return node['right']
 
# Create a random subsample from the dataset with replacement
def subsample(dataset, ratio):
    sample = list()
    n_sample = round(len(dataset) * ratio)
    while len(sample) < n_sample:
        index = randrange(len(dataset))
        sample.append(dataset[index])
    return sample
 
# Make a prediction with a list of bagged trees
def bagging_predict(trees, row):
    predictions = [predict(tree, row) for tree in trees]
    return max(set(predictions), key=predictions.count)
 
# Random Forest Algorithm
def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):
    trees = list()
    for i in range(n_trees):
        sample = subsample(train, sample_size)
        tree = build_tree(sample, max_depth, min_size, n_features)
        trees.append(tree)
    predictions = [bagging_predict(trees, row) for row in test]
    return(predictions)
 
# Test the random forest algorithm
seed(2)
# load and prepare data
filename = 'sonar.all-data.csv'
dataset = load_csv(filename)
# convert string attributes to integers
for i in range(0, len(dataset[0])-1):
    str_column_to_float(dataset, i)
# convert class column to integers
str_column_to_int(dataset, len(dataset[0])-1)
# evaluate algorithm
n_folds = 5
max_depth = 10
min_size = 1
sample_size = 1.0
n_features = int(sqrt(len(dataset[0])-1))
for n_trees in [1, 5, 10]:
    scores = evaluate_algorithm(dataset, random_forest, n_folds, max_depth, min_size, sample_size, n_trees, n_features)
    print('Trees: %d' % n_trees)
    print('Scores: %s' % scores)
    print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

In [8]:
import numpy as np 
from sklearn.datasets import load_iris
import pandas as pd 
from sklearn.tree import DecisionTreeClassifier
from urllib.request import urlopen
from sklearn.model_selection import train_test_split

In [2]:
def load_data():
    data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data'
    rows = []
    for line in urlopen(data_url).readlines():
        line = line.strip().split(',')
        features = [float(i) for i in line[:-1]]
        label = [1 if line[-1] == 'M' else 0]
        rows.append(features + label)
    return rows

In [9]:
f = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data', header=None, prefix='X')
f

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X51,X52,X53,X54,X55,X56,X57,X58,X59,X60
0,0.0200,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.0180,0.0084,0.0090,0.0032,R
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.0140,0.0049,0.0052,0.0044,R
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.2280,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.0180,0.0244,0.0316,0.0164,0.0095,0.0078,R
3,0.0100,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.0150,0.0085,0.0073,0.0050,0.0044,0.0040,0.0117,R
4,0.0762,0.0666,0.0481,0.0394,0.0590,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.0110,0.0015,0.0072,0.0048,0.0107,0.0094,R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,0.0187,0.0346,0.0168,0.0177,0.0393,0.1630,0.2028,0.1694,0.2328,0.2684,...,0.0116,0.0098,0.0199,0.0033,0.0101,0.0065,0.0115,0.0193,0.0157,M
204,0.0323,0.0101,0.0298,0.0564,0.0760,0.0958,0.0990,0.1018,0.1030,0.2154,...,0.0061,0.0093,0.0135,0.0063,0.0063,0.0034,0.0032,0.0062,0.0067,M
205,0.0522,0.0437,0.0180,0.0292,0.0351,0.1171,0.1257,0.1178,0.1258,0.2529,...,0.0160,0.0029,0.0051,0.0062,0.0089,0.0140,0.0138,0.0077,0.0031,M
206,0.0303,0.0353,0.0490,0.0608,0.0167,0.1354,0.1465,0.1123,0.1945,0.2354,...,0.0086,0.0046,0.0126,0.0036,0.0035,0.0034,0.0079,0.0036,0.0048,M


In [73]:
tot_x_data = f.iloc[:, 0:58]
tot_y_data = f.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(tot_x_data, tot_y_data, test_size = 0.2, random_state = 44)

In [29]:
np.ravel(np.random.randint(10,size=(4, 10))).shape

(40,)

In [36]:
bigx, bigy = augment(X_train, y_train, 5)

In [35]:
def augment(x, y, times):
    if x.shape[0]!=y.shape[0]:
        print("length should be same")
        return 
    idx = np.ravel(np.random.randint(x.shape[0],size=(times, x.shape[0])))
    return x.iloc[idx, :].reset_index(drop=True), y.iloc[idx].reset_index(drop=True)

In [93]:
class CustomDeicisionTree:
    def __init__(self,  max_depth, min_samples_leaf, min_samples_split, columns_idx):
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.columns_idx = columns_idx
        self.model = DecisionTreeClassifier(max_depth = self.max_depth, min_samples_leaf = self.min_samples_leaf, min_samples_split = self.min_samples_split)
    
    def fit(self, x_data, y_data):
        self.model.fit(x_data, y_data)
        
    def predict(self, test_data):
        filtered_data = test_data.iloc[:, self.columns_idx]
        return self.model.predict(filtered_data)

In [147]:
class RandomForest:
    def __init__(self, max_depth, min_sample_leaf, min_samples_split):
        self.max_depth = max_depth
        self.min_samples_leaf = min_sample_leaf
        self.min_samples_split = min_samples_split
        self.trees = []
        
    def fit(self, trainX, trainY, n_features, n_trees):
        for i in range(n_trees):
            bootstrapped_idx = np.random.randint(trainX.shape[0], size = trainX.shape[0])
            column_idx =  np.random.choice(trainX.shape[1], n_features)
            x_sample = trainX.iloc[bootstrapped_idx, column_idx]
            y_sample = trainY[bootstrapped_idx]
            tree = CustomDeicisionTree(max_depth = self.max_depth, min_samples_leaf = self.min_samples_leaf, min_samples_split = self.min_samples_split, columns_idx= column_idx)
            tree.fit(x_sample, y_sample)
            self.trees.append(tree)
        
    
    def bagging_predict(self, row):
        predictions = np.array([tree.predict(row) for tree in self.trees])
        return [getResult(row)for row in np.transpose(predictions)]
        

In [164]:
customed_rf = RandomForest(max_depth=5, min_sample_leaf=4, min_samples_split=4)

In [165]:
customed_rf.fit(bigx, bigy, 10, 300)

In [166]:
predict_test = customed_rf.bagging_predict(X_test)

In [163]:
from sklearn.metrics import classification_report, confusion_matrix

confusion_matrix(y_test, predict_test)

array([[19,  0],
       [ 8, 15]])

In [167]:
from sklearn.metrics import classification_report, confusion_matrix

confusion_matrix(y_test, predict_test)

array([[19,  0],
       [ 8, 15]])

In [177]:
rf_1000 = RandomForest(max_depth=7, min_sample_leaf=1, min_samples_split=2)
rf_1000.fit(bigx, bigy, 10, 300)
predict_test_1000 = rf_1000.bagging_predict(X_test)
confusion_matrix(y_test, predict_test_1000)

array([[19,  0],
       [ 8, 15]])

### sklearn 에서 제공하는 decision tree 로 적합시킨 경우 

In [157]:
dt_clf = DecisionTreeClassifier(max_depth = 7, min_samples_leaf=2, min_samples_split=2)
dt_clf.fit(bigx, bigy)

DecisionTreeClassifier(max_depth=7, min_samples_leaf=2)

In [158]:
dt_prediction = dt_clf.predict(X_test)

In [159]:
confusion_matrix(y_test, dt_prediction)

array([[15,  4],
       [14,  9]])

### sklearn 에서 제공하는 randomForest 로 적합한 경우 

In [176]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=300, max_features = 10, max_depth = 7, random_state=123456)
rf.fit(bigx, bigy)
predicted = rf.predict(X_test)
confusion_matrix(y_test, predicted)

array([[19,  0],
       [ 7, 16]])