In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Shared preprocessing objects. LE is used further down to integer-encode the
# 'label' and 'address' columns; SS is created here but is not referenced by
# any of the visible cells.
LE, SS = LabelEncoder(), StandardScaler()
# Placeholder binding only: `df` is rebound to a DataFrame by the loading cell.
df = ""

In [4]:
def initialiseData(csvPath='BitcoinHeistData.csv', seed=2):
    """Load the dataset, drop incomplete rows and shuffle it reproducibly.

    Parameters
    ----------
    csvPath : str or path-like, default 'BitcoinHeistData.csv'
        CSV file to read. The default keeps the original zero-argument call
        working.
    seed : int, default 2
        ``random_state`` passed to ``DataFrame.sample`` so the shuffle is
        repeatable.

    Returns
    -------
    pandas.DataFrame
        All rows without missing values, in shuffled order, with a fresh
        0..n-1 index.
    """
    # read_csv already returns a DataFrame, so no extra pd.DataFrame() wrap.
    rawData = pd.read_csv(csvPath)
    completeRows = rawData.dropna()
    # frac=1 draws every row once, i.e. a full shuffle.
    return completeRows.sample(frac=1, random_state=seed).reset_index(drop=True)

In [6]:
# Load, clean and shuffle the dataset; this rebinds the module-level `df`.
df = initialiseData()

In [7]:
def runDecisionTree(trainX, trainY, testX, testY):
    """Print the accuracy of decision trees over a grid of depths and criteria.

    For each split criterion ("gini" first, then "entropy") and each maximum
    depth in (4, 8, 10, 15, 20), a DecisionTreeClassifier with random_state=2
    is fitted on (trainX, trainY) and scored with accuracy_score on
    (testX, testY). One line is printed per fitted tree; nothing is returned.
    """
    depthGrid = (4, 8, 10, 15, 20)
    # (criterion, message suffix) pairs, in the order they are reported.
    criteria = (("gini", " for gini"), ("entropy", " for entropy"))

    for splitCriterion, suffix in criteria:
        for depth in depthGrid:
            tree = DecisionTreeClassifier(criterion=splitCriterion, max_depth=depth, random_state=2)
            tree.fit(trainX, trainY)
            predicted = tree.predict(testX)
            print("Accuracy for max height of tree = ", depth, " is ", accuracy_score(testY, predicted), suffix)

In [17]:
def modifyData(xTrainTemp, yTrainTemp, trainTemp, i, frac=0.5):
    """Draw a reproducible random subset of a labelled training frame.

    Parameters
    ----------
    xTrainTemp, yTrainTemp
        Accepted only for call compatibility; their values are not read.
    trainTemp : pandas.DataFrame
        Features plus a 'label' column. It is not modified.
    i : int
        Seed for the shuffle (``random_state`` of ``DataFrame.sample``).
    frac : float, default 0.5
        Share of ``trainTemp``'s rows to keep after shuffling.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The feature columns and the 'label' column of the first
        ``int(len(trainTemp) * frac)`` shuffled rows.
    """
    shuffled = trainTemp.sample(frac=1, random_state=i).reset_index(drop=True)
    # Size the subset from the frame actually being sampled. The previous
    # version used the length of the global `df` (the whole dataset), so the
    # subset was larger than half of the training data and the function
    # silently depended on notebook state.
    keep = int(len(shuffled) * frac)
    subset = shuffled.iloc[:keep]
    return subset.drop(['label'], axis=1), subset['label']

In [16]:
def runEnsembling(trainX,trainY,testX,testY):
    """Print the majority-vote accuracy of 100 depth-3 decision trees.

    Each tree (criterion="entropy", splitter="random", random_state equal to
    its index 0..99) is fitted on the subset returned by modifyData for that
    index and predicts testX. For every test row the most frequent predicted
    label (np.bincount(...).argmax(), so predictions must be non-negative
    integers) is compared with testY, and the share of matches is printed.
    Nothing is returned.
    """
    xSubset = trainX.copy()
    ySubset = trainY.copy()
    # Features and labels side by side, so modifyData can resample them together.
    combined = pd.concat([xSubset, ySubset], axis=1)

    votes = []
    for seed in range(100):
        xSubset, ySubset = modifyData(xSubset, ySubset, combined, seed)
        tree = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=seed, splitter="random")
        tree.fit(xSubset, ySubset)
        votes.append(np.array(tree.predict(testX)))

    # Transpose so that each row holds all trees' votes for one test sample.
    votesPerSample = np.array(votes).T
    correct = sum(
        1
        for row in range(votesPerSample.shape[0])
        if np.bincount(votesPerSample[row]).argmax() == testY.iloc[row]
    )
    print("Accuracy for ensemble is ", correct/len(testY))

In [10]:
def runBoostingAdaBoost(trainX,trainY,testX,testY):
    """Print AdaBoost accuracy for several ensemble sizes.

    For each n_estimators in (4, 8, 10, 15, 20) an AdaBoostClassifier
    (random_state=2) over a depth-3 entropy DecisionTreeClassifier
    (random_state=2) is fitted on (trainX, trainY) and scored with
    accuracy_score on (testX, testY). One line is printed per ensemble size;
    nothing is returned.
    """
    for nEstimators in (4, 8, 10, 15, 20):
        weakLearner = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=2)
        # scikit-learn renamed `base_estimator` to `estimator` (1.2) and later
        # removed the old keyword (1.4). Try the current name first and fall
        # back to the old one so the cell runs on either side of the rename.
        try:
            booster = AdaBoostClassifier(estimator=weakLearner, n_estimators=nEstimators, random_state=2)
        except TypeError:
            booster = AdaBoostClassifier(base_estimator=weakLearner, n_estimators=nEstimators, random_state=2)
        booster.fit(trainX, trainY)
        predicted = booster.predict(testX)
        print("Accuracy for number of predictors = ", nEstimators, " is ", accuracy_score(testY, predicted), " for AdaBoost")

In [11]:
def runBoostingRF(trainX,trainY,testX,testY):
    """Print random-forest accuracy for several forest sizes.

    For each n_estimators in (4, 8, 10, 15, 20) a RandomForestClassifier
    (criterion="entropy", max_depth=15, random_state=2) is fitted on
    (trainX, trainY) and scored with accuracy_score on (testX, testY).
    One line is printed per forest size; nothing is returned.
    """
    for treeCount in (4, 8, 10, 15, 20):
        forest = RandomForestClassifier(random_state=2, n_estimators=treeCount, max_depth=15, criterion="entropy")
        forest.fit(trainX, trainY)
        predicted = forest.predict(testX)
        print("Accuracy for number of predictors = ", treeCount, " is ", accuracy_score(testY, predicted), " for random forest")

In [12]:
# Integer-encode the target and the address column. The same LE instance is
# refit for each column, so afterwards it only holds the 'address' mapping.
df['label'] = LE.fit_transform(df['label'])
df['address'] = LE.fit_transform(df['address'])

# Features are every column except the target.
df1 = df.drop(['label'], axis=1)
labelTemp = df['label']

# Positional 70 % / 15 % / 15 % split into train, validation and test.
trainEnd = int(len(df1) * 0.7)
valEnd = int(len(df1) * 0.85)

trainX, trainY = df1.iloc[:trainEnd], labelTemp.iloc[:trainEnd]
valX, valY = df1.iloc[trainEnd:valEnd], labelTemp.iloc[trainEnd:valEnd]
testX, testY = df1.iloc[valEnd:], labelTemp.iloc[valEnd:]


In [13]:
# Sweep tree depth and split criterion, scoring each tree on the test split.
# NOTE(review): valX/valY are created earlier but never used in the visible
# cells, so this sweep is effectively tuned on the test set; confirm intended.
runDecisionTree(trainX, trainY, testX, testY)

Accuracy for max height of tree =  4  is  0.9858378761385584  for gini
Accuracy for max height of tree =  8  is  0.9863567273516874  for gini
Accuracy for max height of tree =  10  is  0.9870492908652473  for gini
Accuracy for max height of tree =  15  is  0.9883612758711329  for gini
Accuracy for max height of tree =  20  is  0.988009279893944  for gini
Accuracy for max height of tree =  4  is  0.9857921623752871  for entropy
Accuracy for max height of tree =  8  is  0.9862287288145278  for entropy
Accuracy for max height of tree =  10  is  0.9873761442726369  for entropy
Accuracy for max height of tree =  15  is  0.9888961269014068  for entropy
Accuracy for max height of tree =  20  is  0.9880024228294534  for entropy


In [18]:
# Majority vote over 100 depth-3 trees, scored on the test split.
runEnsembling (trainX, trainY, testX, testY)

Accuracy for ensemble is  0.9857921623752871


In [20]:
# AdaBoost over depth-3 trees for several ensemble sizes, scored on the test split.
# NOTE(review): the saved output below also lists "random forest" lines, which
# runBoostingAdaBoost never prints. The output looks stale (from an earlier
# version of this cell); re-run the notebook top to bottom to refresh it.
runBoostingAdaBoost(trainX, trainY, testX, testY)

Accuracy for number of predictors =  4  is  0.9632804196523468  for AdaBoost
Accuracy for number of predictors =  8  is  0.7129358521616895  for AdaBoost
Accuracy for number of predictors =  10  is  0.684828744814345  for AdaBoost
Accuracy for number of predictors =  15  is  0.5207620484337322  for AdaBoost
Accuracy for number of predictors =  20  is  0.5590861818722072  for AdaBoost
Accuracy for number of predictors =  4  is  0.9877761397012605  for random forest
Accuracy for number of predictors =  8  is  0.9876915692392086  for random forest
Accuracy for number of predictors =  10  is  0.9876092844653204  for random forest
Accuracy for number of predictors =  15  is  0.987522428315105  for random forest
Accuracy for number of predictors =  20  is  0.9875932846481754  for random forest


In [None]:
# Random forest for several forest sizes, scored on the test split.
# NOTE(review): this cell has no execution count or saved output of its own.
runBoostingRF(trainX, trainY, testX, testY)