In [1]:
import pandas as pd
import numpy as np

# AdaBoost-SAMME

In [None]:
import numpy as np
import pandas as pd
import random
import math
from sklearn import tree

class BoostedDT:
    '''
    Multi-class boosted ensemble of depth-limited decision trees (SAMME-style AdaBoost).

    Class Fields
    clfs : List object containing individual DecisionTree classifiers, in order of creation during boosting
    betas : List of beta values (vote weights), in order of creation during boosting
    labels, classes : sorted unique class labels seen by fit
    K : number of distinct class labels seen by fit
    '''

    def __init__(self, numBoostingIters=100, maxTreeDepth=3):
        '''
        Constructor

        Arguments:
            numBoostingIters is the number of boosting rounds (one tree per round)
            maxTreeDepth is the maximum depth of every individual tree
        '''
        # Populated by fit(); kept in sync with each other (one beta per tree).
        self.clfs = None
        self.betas = None

        # Both names are kept because external code may read either one.
        self.numBoost = numBoostingIters
        self.numBoostingIters = numBoostingIters
        self.maxDepth = maxTreeDepth

        self.labels = None
        self.K = 0
        self.classes = []

    def fit(self, X, y, random_state=None):
        '''
        Trains the model.
        Every individual decision tree is initialized with the provided random_state value.

        Arguments:
            X is an n-by-d Pandas Data Frame
            y is an n-by-1 Pandas Data Frame (a Series, array or list of n labels also works)
            random_state is an optional integer value
        '''
        n = X.shape[0]
        # Flatten so an n-by-1 frame, a Series and a plain array are all handled alike.
        y_train = np.asarray(y).ravel()

        self.labels = np.unique(y_train)
        self.classes = self.labels.copy()
        self.K = len(self.labels)
        self.clfs = []
        self.betas = []

        weights = np.ones(n) / n

        if self.K < 2:
            # log(K - 1) is undefined for a single class and would turn the
            # weights into NaN; one tree already predicts the only label.
            only_tree = tree.DecisionTreeClassifier(max_depth=self.maxDepth,
                                                    random_state=random_state)
            only_tree.fit(X, y_train, sample_weight=weights)
            self.clfs.append(only_tree)
            self.betas.append(1.0)
            return

        log_k_minus_1 = np.log(self.K - 1)  # loop-invariant multi-class correction

        for _ in range(self.numBoost):
            # `tree` is the module imported alongside this class, so the class does
            # not depend on a name imported by a later cell. The seed is handed to
            # the tree directly instead of reseeding the global RNGs.
            model = tree.DecisionTreeClassifier(max_depth=self.maxDepth,
                                                random_state=random_state)
            model.fit(X, y_train, sample_weight=weights)
            self.clfs.append(model)

            missed = model.predict(X) != y_train
            err = weights[missed].sum()  # weighted training error of this round
            if err == 0:
                err = 1e-16  # avoid division by zero for a perfect tree

            beta = 0.5 * (np.log((1 - err) / err) + log_k_minus_1)
            self.betas.append(beta)

            # Up-weight the misclassified samples, down-weight the rest, renormalize.
            weights = weights * np.where(missed, np.exp(beta), np.exp(-beta))
            weights = weights / weights.sum()

    def predict(self, X):
        '''
        Use the model to predict values for each instance in X
        Arguments:
            X is an n-by-d Pandas Data Frame
        Returns:
            an n-by-1 Pandas Data Frame of the predictions (same dtype as the training labels)
        Raises:
            ValueError if the model has not been fitted
        '''
        if self.clfs is None or self.betas is None or self.labels is None:
            raise ValueError("BoostedDT must be fitted before calling predict")

        labels = np.asarray(self.labels)
        n = X.shape[0]
        rows = np.arange(n)
        score = np.zeros((n, len(labels)))

        # Iterate over the trees that actually exist rather than the configured count.
        for model, beta in zip(self.clfs, self.betas):
            votes = np.asarray(model.predict(X)).ravel()
            # labels come from np.unique and are therefore sorted, so a binary
            # search maps every predicted label to its score column.
            cols = np.searchsorted(labels, votes)
            score[rows, cols] += beta

        # Indexing the label array keeps the label dtype (ints stay ints, strings work).
        pred = labels[np.argmax(score, axis=1)]
        return pd.DataFrame(pred)

# Test BoostedDT

In [None]:
import numpy as np
from sklearn import datasets
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def test_boostedDT():
  '''
  Compare a plain decision tree, BoostedDT and sklearn's AdaBoostClassifier on iris.

  The data is split 50/50 into training and test halves (split seeded with
  random_state=42). Each model is fitted on the training half and its accuracy
  on the held-out test half is printed. Returns nothing.
  '''

  # load the data set
  sklearn_dataset = datasets.load_iris()
  # sklearn_dataset = datasets.load_breast_cancer()
  # convert to pandas df
  df = pd.DataFrame(sklearn_dataset.data,columns=sklearn_dataset.feature_names)
  df['CLASS'] = pd.Series(sklearn_dataset.target)

  # split randomly into training/testing
  train, test = train_test_split(df, test_size=0.5, random_state=42)
  # Split into X,y matrices
  X_train = train.drop(['CLASS'], axis=1)
  y_train = train['CLASS']
  X_test = test.drop(['CLASS'], axis=1)
  y_test = test['CLASS']

  # train the decision tree
  modelDT = DecisionTreeClassifier()
  modelDT.fit(X_train, y_train)

  # train the boosted DT
  modelBoostedDT = BoostedDT(numBoostingIters=100, maxTreeDepth=3)
  modelBoostedDT.fit(X_train, y_train)

  # train sklearn's implementation of Adaboost
  modelSKBoostedDT = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=100)
  modelSKBoostedDT.fit(X_train, y_train)

  # output predictions on the test data
  ypred_DT = modelDT.predict(X_test)
  ypred_BoostedDT = modelBoostedDT.predict(X_test)
  ypred_SKBoostedDT = modelSKBoostedDT.predict(X_test)

  # compute the accuracy of each model on the held-out test data
  accuracy_DT = accuracy_score(y_test, ypred_DT)
  accuracy_BoostedDT = accuracy_score(y_test, ypred_BoostedDT)
  accuracy_SKBoostedDT = accuracy_score(y_test, ypred_SKBoostedDT)

  print("Decision Tree Accuracy = "+str(accuracy_DT))
  print("My Boosted Decision Tree Accuracy = "+str(accuracy_BoostedDT))
  print("Sklearn's Boosted Decision Tree Accuracy = "+str(accuracy_SKBoostedDT))
  print()
  print("Note that due to randomization, your boostedDT might not always have the ")
  print("exact same accuracy as Sklearn's boostedDT.  But, on repeated runs, they ")
  print("should be roughly equivalent and should usually exceed the standard DT.")

# Repeat the comparison ten times. The train/test split inside test_boostedDT is
# seeded, so any variation between runs comes from model training.
for i in range(10):
    test_boostedDT()


Decision Tree Accuracy = 0.9333333333333333
My Boosted Decision Tree Accuracy = 0.9733333333333334
Sklearn's Boosted Decision Tree Accuracy = 0.9733333333333334

Note that due to randomization, your boostedDT might not always have the 
exact same accuracy as Sklearn's boostedDT.  But, on repeated runs, they 
should be roughly equivalent and should usually exceed the standard DT.
Decision Tree Accuracy = 0.9733333333333334
My Boosted Decision Tree Accuracy = 0.96
Sklearn's Boosted Decision Tree Accuracy = 0.9733333333333334

Note that due to randomization, your boostedDT might not always have the 
exact same accuracy as Sklearn's boostedDT.  But, on repeated runs, they 
should be roughly equivalent and should usually exceed the standard DT.
Decision Tree Accuracy = 0.9333333333333333
My Boosted Decision Tree Accuracy = 0.9733333333333334
Sklearn's Boosted Decision Tree Accuracy = 0.9466666666666667

Note that due to randomization, your boostedDT might not always have the 
exact same acc

In [None]:
# Load the data set from processedData.csv (path is relative to the working directory).
dataset = pd.read_csv('processedData.csv')
# Bare expression so the frame is shown as the cell's output.
dataset

Unnamed: 0,id,Size of chocolate pool,Height of pipe,longitude,Lattitude,Cocoa farm,Region code,District code,Location,Chocolate consumers in town,...,Type of pump_8,Type of pump_9,Type of pump_10,Type of pump_11,Type of pump_12,Type of pump_13,Type of pump_14,Type of pump_15,Type of pump_16,Type of pump_17
0,0,-0.215232,0.383148,0.127745,0.200042,5,339,4,107,-0.094853,...,0,0,0,0,0,0,0,0,0,0
1,1,-0.215232,1.656841,0.168190,-1.253220,7,336,5,15,-0.542884,...,0,0,0,0,1,0,0,0,0,0
2,2,-0.215232,0.383148,0.201245,-0.194442,6,326,5,23,-0.094853,...,0,0,0,0,0,0,0,0,0,0
3,3,-0.223547,1.000638,0.226629,0.855958,2,328,6,63,-0.531396,...,0,0,0,0,0,0,0,0,0,0
4,4,-0.215232,0.383148,0.202470,-0.133295,6,326,5,23,-0.094853,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47431,74240,-0.215232,0.117957,0.223194,0.824255,2,327,8,41,0.215322,...,0,0,0,0,0,0,0,0,0,0
47432,74242,-0.215232,0.383148,0.142478,-1.098675,7,337,3,79,-0.094853,...,0,0,0,0,0,0,0,0,0,0
47433,74243,-0.215232,0.127635,0.148343,1.453259,1,345,7,114,-0.370564,...,0,0,0,0,0,0,1,0,0,0
47434,74246,-0.190289,0.592204,0.189330,-0.680715,7,336,2,53,-0.586538,...,0,0,0,0,0,0,0,0,0,0


In [None]:
  # Drop the identifier column so it is not used as a feature.
  # NOTE(review): the displayed frame also has an 'Unnamed: 0' column, presumably a
  # row index saved with the CSV -- confirm whether it should be dropped as well.
  df = dataset.drop(['id'], axis=1)

  # split randomly into training/testing
  train, test = train_test_split(df, test_size=0.2, random_state=42)

  # Split into X,y matrices
  X_train = train.drop(['label'], axis=1)
  y_train = train['label']
  X_test = test.drop(['label'], axis=1)
  y_test = test['label']

  # train the boosted DT; seed the trees like the split so a re-run reproduces the numbers
  modelBoostedDT = BoostedDT(numBoostingIters=300, maxTreeDepth=12)
  modelBoostedDT.fit(X_train, y_train, random_state=42)

  # output predictions on the test data and on the training data
  ypred_BoostedDT = modelBoostedDT.predict(X_test)
  ytrainpred_BoostedDT = modelBoostedDT.predict(X_train)

  # accuracy on the held-out test split (estimate of generalization accuracy)
  accuracy_BoostedDT = accuracy_score(y_test, ypred_BoostedDT)
  print("My Boosted Decision Tree Estimated Accuracy = "+str(accuracy_BoostedDT))

  # accuracy on the training split, kept under its own name so the test
  # accuracy above is not overwritten
  train_accuracy_BoostedDT = accuracy_score(y_train, ytrainpred_BoostedDT)
  print("My Boosted Decision Tree Training Accuracy = "+str(train_accuracy_BoostedDT))

My Boosted Decision Tree Estimated Accuracy = 0.7993254637436762
My Boosted Decision Tree Training Accuracy = 0.9951512596184252


In [None]:
  from sklearn.svm import SVC

  # Same seeded 80/20 split of df into training and testing rows.
  train, test = train_test_split(df, random_state=42, test_size=0.2)

  # Separate the feature columns from the 'label' target for each split.
  X_train, y_train = train.drop(columns=['label']), train['label']
  X_test, y_test = test.drop(columns=['label']), test['label']

  # Fit the SVM classifier on the training split.
  clf = SVC(gamma='auto')
  clf.fit(X=X_train, y=y_train)

  # Predict on the test split first, then on the training split.
  ypred_SVM = clf.predict(X_test)
  ytrainpred_SVM = clf.predict(X_train)

  # Report held-out accuracy, then training accuracy (the variable is reused).
  accuracy_SVM = accuracy_score(y_true=y_test, y_pred=ypred_SVM)
  print("My SVM Estimated Accuracy = "+str(accuracy_SVM))

  accuracy_SVM = accuracy_score(y_true=y_train, y_pred=ytrainpred_SVM)
  print("My SVM Training Accuracy = "+str(accuracy_SVM))
    

My SVM Estimated Accuracy = 0.7484190556492412
My SVM Training Accuracy = 0.7682881838305049
