## How to run 
Change the dataset file path passed to `load_data` in the `__main__` block of the script you want to run, then run it.

---


P.S. Training takes a long time on the CIFAR dataset.

## Random Forest

In [0]:
import pandas as pd
import numpy as np
import random
import math
import collections
from sklearn import metrics


def load_data(filepath):
    '''
    Load a ``.npz`` archive and return flattened train/test tables.

    The archive is read through the keys ``Xtr`` / ``Str`` (training samples
    and their labels) and ``Xts`` / ``Yts`` (test samples and their labels).
    Every sample is flattened to one row and its label is appended as the
    last column.

    :param filepath: the path with filename
    :return: Dataframe, Train data and test data
    '''
    # np.load returns an NpzFile that keeps the archive open; the context
    # manager closes it once the arrays have been read into memory.
    with np.load(filepath) as data:
        Xts = data['Xts']
        Yts = data['Yts']
        Xtr = data['Xtr']
        Ytr = data['Str']

    # One row per sample; equivalent to calling flatten() on every sample.
    train_x = Xtr.reshape(Xtr.shape[0], -1)
    test_x = Xts.reshape(Xts.shape[0], -1)

    # Labels become the last column of each table.
    train_data = np.hstack((train_x, Ytr.reshape(-1, 1)))
    test_data = np.hstack((test_x, Yts.reshape(-1, 1)))

    train_df = pd.DataFrame(train_data)
    test_df = pd.DataFrame(test_data)

    return train_df, test_df



class Tree(object):
    '''
    One node of a binary decision tree.

    A node is a leaf when ``leaf_value`` is set; otherwise it routes a sample
    to ``left`` when the sample's ``splitFeas`` entry is <= ``splitVal`` and
    to ``right`` otherwise.
    '''
    def __init__(self):
        # Children and leaf label stay None until the tree builder fills them.
        self.left = None
        self.right = None
        self.leaf_value = None
        # Feature key and threshold used to route samples at this node.
        self.splitFeas = None
        self.splitVal = None

    def predicValue(self, dataset):
        '''
        Return the label of the leaf that ``dataset`` is routed to.

        :param dataset: one sample, indexable by the feature keys stored in
            the nodes
        :return: the ``leaf_value`` of the leaf that is reached
        '''
        # Walk down from this node until a leaf is found.
        node = self
        while node.leaf_value is None:
            if dataset[node.splitFeas] <= node.splitVal:
                node = node.left
            else:
                node = node.right
        return node.leaf_value

class RandomForestClassifier(object):
    '''
    Random forest classifier built from Gini-impurity decision trees.

    Every tree is grown on a random subset of the rows (drawn without
    replacement) and a random subset of the feature columns. Prediction is a
    majority vote over all trees.

    :param trees: number of trees in the forest
    :param max_depth: maximum depth of a tree (the root is at depth 0)
    :param min_samples_split: a node holding this many rows or fewer becomes
        a leaf
    :param min_samples_leaf: a split is rejected when either side would hold
        this many rows or fewer
    :param min_split_gain: a split is rejected unless its Gini gain is
        strictly larger than this value
    :param subsample: fraction of the rows used to grow each tree
    :param random_state: seed that makes ``fit`` reproducible; ``None`` draws
        a fresh seed
    '''
    def __init__(self, trees=10, max_depth=5, min_samples_split=2, min_samples_leaf=1,
                 min_split_gain=0.0, subsample=1.0, random_state=None):
        self.n_tree = trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_split_gain = min_split_gain
        # Number of columns sampled per tree; computed in fit().
        self.colsample_bytree = None
        self.subsample = subsample
        self.random_state = random_state
        # Maps the tree index (0 .. n_tree - 1) to the root node of that tree.
        self.trees = dict()

    def fit(self, dataset, targets):
        '''
        Grow ``self.n_tree`` trees and store them in ``self.trees``.

        :param dataset: DataFrame of features, one row per sample
        :param targets: Series of labels, in the same row order as dataset
        :raises ValueError: if dataset has no rows or no columns, or if
            ``subsample`` asks for more rows than dataset holds
        '''
        targets = targets.to_frame(name=targets.name)
        columns = dataset.columns.tolist()
        n_rows = len(dataset)
        if n_rows == 0 or not columns:
            raise ValueError('dataset must contain at least one row and one column')

        # A private generator honours every seed (including 0), makes
        # random_state actually influence the sampled rows/columns, and leaves
        # the global `random` module state untouched.
        rng = random.Random(self.random_state)

        # We randomly chose log2(features) of features, but never fewer than one
        self.colsample_bytree = max(1, int(math.log2(len(columns))))
        rows_per_tree = max(1, int(self.subsample * n_rows))

        for stage in range(self.n_tree):
            print("iter: "+str(stage+1))

            # Randomly chose specific percentage of samples (row positions)
            subset_index = rng.sample(range(n_rows), rows_per_tree)

            # Randomly chose the features
            subcol_index = rng.sample(columns, self.colsample_bytree)

            # Rows are picked by position so that any index on dataset/targets
            # works; columns are narrowed first to keep the row copy small.
            tempDataset = dataset.loc[:, subcol_index].iloc[subset_index].reset_index(drop=True)
            tempTarget = targets.iloc[subset_index].reset_index(drop=True)

            self.trees[stage] = self.tree_fit(tempDataset, tempTarget, depth=0)

    # Create decision tree by recursion
    def tree_fit(self, dataset, targets, depth):
        '''
        Recursively grow a (sub)tree and return its root node.

        :param dataset: DataFrame of the features reaching this node
        :param targets: DataFrame whose last column holds the labels of the
            same rows
        :param depth: depth of the node being built (root = 0)
        :return: a ``Tree`` node, either a leaf or an inner node with children
        '''
        labels = targets[targets.columns[-1]]

        # Stop condition1: when leaf only have one category or the number smaller that the threshold
        # Stop condition2: when the depth reach the threshold
        if len(labels.unique()) <= 1 \
                or len(dataset) <= self.min_samples_split \
                or depth >= self.max_depth:
            return self._make_leaf(labels)

        splitFea, splitValue, splitGain = self.chooseFeas(dataset, targets)

        # Stop condition3a: no usable split, or the best split does not
        # improve purity by more than min_split_gain
        if splitFea is None or splitGain <= self.min_split_gain:
            return self._make_leaf(labels)

        l_dataset, r_dataset, l_targets, r_targets = \
            self.split_dataset(dataset, targets, splitFea, splitValue)

        # Stop condition3b: sample's number is smaller than the threshold
        if len(l_dataset) <= self.min_samples_leaf or \
                len(r_dataset) <= self.min_samples_leaf:
            return self._make_leaf(labels)

        tree = Tree()
        tree.splitFeas = splitFea
        tree.splitVal = splitValue
        tree.left = self.tree_fit(l_dataset, l_targets, depth + 1)
        tree.right = self.tree_fit(r_dataset, r_targets, depth + 1)
        return tree

    def _make_leaf(self, labels):
        '''Return a leaf node predicting the most frequent value in labels.'''
        tree = Tree()
        tree.leaf_value = self.calLeafValue(labels)
        return tree

    # Choose the best way to split the data accoding the features, threshold and gains
    def chooseFeas(self, dataset, targets):
        '''
        Search every column and every observed value for the best threshold.

        :param dataset: DataFrame of candidate feature columns
        :param targets: DataFrame whose last column holds the labels
        :return: ``(feature, threshold, gain)`` where gain is the Gini
            impurity of the node minus the weighted Gini impurity of the two
            sides (larger is better); ``(None, None, 0.0)`` when dataset has
            no columns
        '''
        labels = targets[targets.columns[-1]]
        best_gini = float('inf')
        splitFeature = None
        splitVal = None

        for fea in dataset.columns:
            column = dataset[fea]
            values = sorted(column.unique().tolist())

            # Calulate the weighted impurity of each split and keep the lowest one
            for value in values:
                splitGini = self.giniIndex(labels[column <= value], labels[column > value])

                if splitGini < best_gini:
                    splitFeature = fea
                    splitVal = value
                    best_gini = splitGini

        if splitFeature is None:
            return None, None, 0.0

        # Impurity of the unsplit node: everything on one side, nothing on the other.
        parent_gini = self.giniIndex(labels, labels.iloc[0:0])
        return splitFeature, splitVal, parent_gini - best_gini

    # Set leaf value with the most frequent feas
    @staticmethod
    def calLeafValue(targets):
        '''
        Return the most frequent label; ties go to the largest label.

        :param targets: iterable of labels (must not be empty)
        '''
        counts = collections.Counter(targets)
        label = max(zip(counts.values(), counts.keys()))
        return label[1]

    # Use gini index to evaluate the split
    @staticmethod
    def giniIndex(l_targets, r_targets):
        '''
        Return the size-weighted Gini impurity of a two-way split.

        :param l_targets: labels on the left side
        :param r_targets: labels on the right side
        :return: value in [0, 1); 0 means both sides are pure
        '''
        gain = 0
        length = len(l_targets) + len(r_targets)
        for targets in [l_targets, r_targets]:
            gini = 1
            len_targets = len(targets)
            count = collections.Counter(targets)
            for index in count:
                prob = count[index] * 1.0 / len_targets
                gini -= prob ** 2
            # An empty side has weight 0 and so contributes nothing.
            gain += len_targets * 1.0 / (length) * gini
        return gain

    # Split the dataset by threshold
    @staticmethod
    def split_dataset(dataset, targets, s_feas, s_value):
        '''
        Partition rows on ``dataset[s_feas] <= s_value``.

        :return: ``(l_dataset, r_dataset, l_targets, r_targets)``; the left
            parts hold the rows with feature value <= s_value
        '''
        goes_left = dataset[s_feas] <= s_value
        goes_right = dataset[s_feas] > s_value
        return dataset[goes_left], dataset[goes_right], targets[goes_left], targets[goes_right]

    # Predict the label
    def predict(self, dataset):
        '''
        Predict one label per row by majority vote over all fitted trees.

        :param dataset: DataFrame with the feature columns used in fit
        :return: numpy array of predicted labels, one per row
        '''
        Result = []
        for index, row in dataset.iterrows():
            # Summary the prediction and choose the most frequency one
            labelCounts = collections.Counter(
                tree.predicValue(row) for tree in self.trees.values())
            predLabel = max(zip(labelCounts.values(), labelCounts.keys()))[1]
            Result.append(predLabel)
        return np.array(Result)


def standard(x_train, x_test):
    '''
    Standardize the train and test data with the training statistics.

    The mean and standard deviation are taken over all elements of
    ``x_train`` (no axis is given), so one global mean/std pair is applied
    to both inputs.

    :param x_train: training features
    :param x_test: test features
    :return: (standardized x_train, standardized x_test)
    '''
    mu = np.mean(x_train, keepdims=True)
    sigma = np.std(x_train, keepdims=True)

    def _scale(values):
        # Same affine map for both inputs: centre, then divide by the spread.
        return (values - mu) / sigma

    return _scale(x_train), _scale(x_test)



if __name__ == '__main__':
    # Random-forest experiment: load the data, train on 80% of the training
    # table, and report train / validation / test accuracy.

    # Change the path of file
    df, test_df = load_data('/content/drive/My Drive/CIFAR.npz')
    train_count = int(0.8 * len(df))
    train_acc = []
    val_acc = []
    test_acc = []
    for _ in range(1):
        # shuffle the dataset and select 80% data
        df = df.sample(frac=1).reset_index(drop=True)
        test_df = test_df.sample(frac=1).reset_index(drop=True)

        clf = RandomForestClassifier(trees=1,
                                     max_depth=5,
                                     min_samples_split=3,
                                     min_samples_leaf=3,
                                     subsample=0.2,
                                     random_state=233)

        # The last column holds the label, everything before it the features.
        train_y = df.iloc[:train_count, -1]
        val_y = df.iloc[train_count:, -1]
        test_y = test_df.iloc[:, -1]

        # Standardize with statistics of the training rows only, and actually
        # feed the standardized features to the model (previously they were
        # computed and then discarded).
        raw_train_x = df.iloc[:train_count, :-1].values
        train_x, val_x = standard(raw_train_x, df.iloc[train_count:, :-1].values)
        _, test_x = standard(raw_train_x, test_df.iloc[:, :-1].values)
        train_x = pd.DataFrame(train_x)
        val_x = pd.DataFrame(val_x)
        test_x = pd.DataFrame(test_x)

        clf.fit(train_x, train_y)

        train_acc.append(metrics.accuracy_score(train_y, clf.predict(train_x)))
        val_acc.append(metrics.accuracy_score(val_y, clf.predict(val_x)))
        test_acc.append(metrics.accuracy_score(test_y, clf.predict(test_x)))

    print(train_acc,np.mean(train_acc))
    print(val_acc, np.mean(val_acc))
    print(test_acc, np.mean(test_acc))




## SVM baseline

In [0]:
from sklearn import svm
import numpy as np
import pandas as pd

def _flatten_samples(images, combine_channels):
    '''Return one flat row per sample, optionally summing the first three channels.'''
    if combine_channels:
        # Small integer types (e.g. uint8) would wrap around when three
        # channels are added, so integer inputs are accumulated in int64.
        wide = np.int64 if np.issubdtype(images.dtype, np.integer) else None
        combined = np.add(images[:, :, :, 0], images[:, :, :, 1], dtype=wide)
        images = combined + images[:, :, :, 2]
    return images.reshape(images.shape[0], -1)


def load_data(filepath,dataset = 'MINIST'):
    '''
    Load a ``.npz`` archive and return flattened train/test tables.

    The archive is read through the keys ``Xtr`` / ``Str`` (training samples
    and their labels) and ``Xts`` / ``Yts`` (test samples and their labels).
    The label is appended as the last column of each table. The shapes of
    the two flattened feature matrices are printed.

    :param filepath: the path with filename
    :param dataset: when equal to ``'CIFAR'`` the channels 0, 1 and 2 of the
        last axis are summed before flattening; any other value flattens each
        sample as it is
    :return: Dataframe, Train data and test data
    '''
    # np.load returns an NpzFile that keeps the archive open; the context
    # manager closes it once the arrays have been read into memory.
    with np.load(filepath) as data:
        Xts = data['Xts']
        Yts = data['Yts']
        Xtr = data['Xtr']
        Ytr = data['Str']

    combine_channels = dataset == 'CIFAR'
    train_x = _flatten_samples(Xtr, combine_channels)
    test_x = _flatten_samples(Xts, combine_channels)
    print(train_x.shape, test_x.shape)

    # Labels become the last column of each table.
    train_data = np.hstack((train_x, Ytr.reshape(-1, 1)))
    test_data = np.hstack((test_x, Yts.reshape(-1, 1)))
    train_df = pd.DataFrame(train_data)
    test_df = pd.DataFrame(test_data)

    return train_df, test_df


def standard(x_train, x_test):
    '''
    Standardize the train and test dataset with the training statistics.

    Mean and standard deviation are computed over every element of
    ``x_train`` (no axis is given) and the same pair is used to rescale
    both inputs.

    :param x_train: training features
    :param x_test: test features
    :return: (standardized x_train, standardized x_test)
    '''
    spread = np.std(x_train, keepdims=True)
    center = np.mean(x_train, keepdims=True)
    # Apply the identical transform to both splits.
    scaled_train, scaled_test = [(part - center) / spread for part in (x_train, x_test)]
    return scaled_train, scaled_test

if __name__ == "__main__":
    # SVM baseline: train on 80% of the training table and report the
    # accuracy on the training, validation and test rows.

    # Change the path of file.
    # NOTE(review): load_data only sums the colour channels when called with
    # dataset='CIFAR'; with the default each sample is flattened unchanged.
    df, test_df = load_data('/content/drive/My Drive/CIFAR.npz')

    # shuffle the dataset and select 80% of data
    df = df.sample(frac=1).reset_index(drop=True)
    train_count = int(0.8 * len(df))

    clf = svm.SVC(C = 20, kernel='poly')

    # The last column holds the label, everything before it the features.
    raw_train_x = df.iloc[:train_count, :-1].values
    train_y = df.iloc[:train_count, -1].values
    # Validation uses the rows that were NOT used for training.
    raw_val_x = df.iloc[train_count:, :-1].values
    val_y = df.iloc[train_count:, -1].values
    raw_test_x = test_df.iloc[:, :-1].values
    test_y = test_df.iloc[:, -1].values

    # Every split is scaled with the statistics of the training rows so that
    # the model is scored on data in the same scale it was fitted on.
    train_x, val_x = standard(raw_train_x, raw_val_x)
    _, test_x = standard(raw_train_x, raw_test_x)

    clf.fit(train_x, train_y)
    print(len(train_y))
    print(clf.score(train_x, train_y),clf.score(val_x,val_y),clf.score(test_x,test_y))