In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Shared encoder instance; a later cell calls LE.fit_transform on the
# 'label' and 'address' columns.
LE = LabelEncoder()
# NOTE(review): SS is created here but not referenced by any other visible cell.
SS = StandardScaler()
# Placeholder so the global name exists; it is rebound to the real DataFrame
# when the `df = initialiseData()` cell runs.
df = ""

In [3]:
def initialiseData(path='BitcoinHeistData.csv', seed=2):
    """Load the dataset from CSV, drop incomplete rows, and shuffle it.

    Parameters
    ----------
    path : str or path-like, default 'BitcoinHeistData.csv'
        Location of the CSV file to read.
    seed : int, default 2
        Seed passed to ``DataFrame.sample`` so the shuffle is reproducible.

    Returns
    -------
    pandas.DataFrame
        Rows containing any missing value removed, remaining rows in shuffled
        order, with a fresh 0..n-1 index.
    """
    # read_csv already returns a DataFrame, so no extra wrapping is needed.
    data = pd.read_csv(path)
    data = data.dropna()
    # frac=1 samples every row exactly once, i.e. a full shuffle.
    return data.sample(frac=1, random_state=seed).reset_index(drop=True)

In [4]:
# Load, clean and shuffle the dataset; later cells read this global `df`.
df = initialiseData()

In [5]:
def runDecisionTree(trainX, trainY, testX, testY,valX,valY):
    """Fit depth-limited decision trees and print test/validation accuracy.

    For each split criterion ("gini" first, then "entropy") and each maximum
    depth in 4, 8, 10, 15, 20, a DecisionTreeClassifier with random_state=2 is
    fitted on (trainX, trainY). Its accuracy on (testX, testY) and on
    (valX, valY) is printed. Nothing is returned.
    """
    depths = (4, 8, 10, 15, 20)
    # (criterion, suffix for the test line, suffix for the validation line)
    runs = (
        ("gini", " for gini", " for gini on validation set"),
        ("entropy", " for entropy", " for entropy on validation set"),
    )
    for criterion, testSuffix, valSuffix in runs:
        for depth in depths:
            tree = DecisionTreeClassifier(criterion=criterion, max_depth=depth, random_state=2)
            tree.fit(trainX, trainY)
            testPred = tree.predict(testX)
            valPred = tree.predict(valX)
            print("Accuracy for max height of tree = ", depth, " is ", accuracy_score(testY, testPred), testSuffix)
            print("Accuracy for max height of tree = ", depth, " is ", accuracy_score(valY, valPred), valSuffix)

In [6]:
def modifyData(xTrainTemp, yTrainTemp, trainTemp, i, frac=0.5):
    """Draw a shuffled subset of the training frame for one ensemble member.

    Parameters
    ----------
    xTrainTemp, yTrainTemp
        Ignored; accepted only so existing call sites keep working.
    trainTemp : pandas.DataFrame
        Feature columns plus a 'label' column.
    i : int
        Seed for the shuffle, so different values of ``i`` give different
        subsets while each one stays reproducible.
    frac : float, default 0.5
        Fraction of ``trainTemp`` rows to keep.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The feature columns and the 'label' column of the kept rows, sharing
        the same fresh 0..k-1 index.
    """
    shuffled = trainTemp.sample(frac=1, random_state=i).reset_index(drop=True)
    # Size the subset from the frame that was passed in rather than from a
    # notebook-global DataFrame, so the function does not depend on hidden state.
    nRows = int(len(shuffled) * frac)
    subset = shuffled.iloc[:nRows]
    return subset.drop(['label'], axis=1), subset['label']

In [7]:
def runEnsembling(trainX,trainY,testX,testY,valX,valY):
    """Train 100 shallow randomized trees and score their majority vote.

    Each of the 100 members is a DecisionTreeClassifier (entropy criterion,
    max_depth=3, splitter="random", random_state equal to its index) fitted on
    the subset that ``modifyData`` returns for that index. For every test and
    validation sample the most frequent predicted class across members is
    compared with the true label, and the resulting accuracies are printed.
    Nothing is returned.
    """
    def countMajorityHits(memberPredictions, truth):
        # Stack to (n_members, n_samples), then transpose so each row holds
        # all members' votes for one sample.
        votes = np.array([np.array(p) for p in memberPredictions]).T
        # bincount(...).argmax() picks the most frequent class id per sample.
        return sum(
            1
            for row in range(votes.shape[0])
            if np.bincount(votes[row]).argmax() == truth.iloc[row]
        )

    xSubset = trainX.copy()
    ySubset = trainY.copy()
    # Features and label side by side so both are shuffled together.
    pool = pd.concat([xSubset, ySubset], axis=1)

    testVotes = []
    valVotes = []
    for seed in range(100):
        xSubset, ySubset = modifyData(xSubset, ySubset, pool, seed)
        member = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=seed, splitter="random")
        member.fit(xSubset, ySubset)
        testVotes.append(member.predict(testX))
        valVotes.append(member.predict(valX))

    correctTest = countMajorityHits(testVotes, testY)
    correctVal = countMajorityHits(valVotes, valY)
    print("Accuracy for ensemble is ", correctTest/len(testY))
    print("Accuracy for ensemble on validation set is ", correctVal/len(valY))

In [8]:
def runBoostingAdaBoost(trainX,trainY,testX,testY,valX,valY):
    """Fit AdaBoost ensembles of depth-15 trees and print their accuracy.

    For each ensemble size in 4, 8, 10, 15, 20 an AdaBoostClassifier
    (random_state=2) is built over a DecisionTreeClassifier with the entropy
    criterion, max_depth=15 and random_state=2. It is fitted on
    (trainX, trainY), and its accuracy on (testX, testY) and on (valX, valY)
    is printed. Nothing is returned.
    """
    for nEstimators in (4, 8, 10, 15, 20):
        baseTree = DecisionTreeClassifier(criterion="entropy", max_depth=15, random_state=2)
        # The base learner is passed positionally: its keyword was renamed from
        # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
        # was removed in 1.4, but it is the first positional argument in both.
        booster = AdaBoostClassifier(baseTree, n_estimators=nEstimators, random_state=2)
        booster.fit(trainX, trainY)
        yPred = booster.predict(testX)
        yValPred = booster.predict(valX)
        print("Accuracy for number of predictors = ", nEstimators, " is ", accuracy_score(testY, yPred), " for AdaBoost")
        print("Accuracy for number of predictors = ", nEstimators, " is ", accuracy_score(valY, yValPred), " for AdaBoost on validation set")

In [9]:
def runBoostingRF(trainX,trainY,testX,testY,valX,valY):
    """Fit random forests of several sizes and print their accuracy.

    For each forest size in 4, 8, 10, 15, 20 a RandomForestClassifier with the
    entropy criterion, max_depth=15 and random_state=2 is fitted on
    (trainX, trainY). Its accuracy on (testX, testY) and on (valX, valY) is
    printed. Nothing is returned.

    Despite the "Boosting" in the name, the model used here is
    RandomForestClassifier.
    """
    for treeCount in (4, 8, 10, 15, 20):
        forest = RandomForestClassifier(
            random_state=2,
            n_estimators=treeCount,
            max_depth=15,
            criterion="entropy",
        )
        forest.fit(trainX, trainY)
        testPred = forest.predict(testX)
        valPred = forest.predict(valX)
        print("Accuracy for number of predictors = ", treeCount, " is ", accuracy_score(testY, testPred), " for random forest")
        print("Accuracy for number of predictors = ", treeCount, " is ", accuracy_score(valY, valPred), " for random forest on validation set")

In [10]:
# Encode the class labels as integers. LE keeps this label mapping, so
# predictions can later be decoded with LE.inverse_transform.
df['label'] = LE.fit_transform(df['label'])
# Addresses get their own encoder: refitting LE here would overwrite the
# label classes learned on the line above.
addressEncoder = LabelEncoder()
df['address'] = addressEncoder.fit_transform(df['address'])

df1 = df.drop(['label'], axis=1)
labelTemp = df['label']

# 70% train / 15% validation / 15% test, taken in row order. The rows were
# already shuffled by initialiseData(). df1 and labelTemp have the same length,
# so one pair of boundaries serves both.
trainEnd = int(len(df1) * 0.7)
valEnd = int(len(df1) * 0.85)

trainX = df1.iloc[:trainEnd]
trainY = labelTemp.iloc[:trainEnd]

valX = df1.iloc[trainEnd:valEnd]
valY = labelTemp.iloc[trainEnd:valEnd]

testX = df1.iloc[valEnd:]
testY = labelTemp.iloc[valEnd:]


In [11]:
# Single decision trees: gini and entropy criteria at several maximum depths.
runDecisionTree(trainX, trainY, testX, testY,valX,valY)

Accuracy for max height of tree =  4  is  0.9858378761385584  for gini
Accuracy for max height of tree =  4  is  0.9859087324716289  for gini on validation set
Accuracy for max height of tree =  8  is  0.9863567273516874  for gini
Accuracy for max height of tree =  8  is  0.9864161552439401  for gini on validation set
Accuracy for max height of tree =  10  is  0.9870492908652473  for gini
Accuracy for max height of tree =  10  is  0.9871864321550611  for gini on validation set
Accuracy for max height of tree =  15  is  0.9883612758711329  for gini
Accuracy for max height of tree =  15  is  0.9884321322042033  for gini on validation set
Accuracy for max height of tree =  20  is  0.988009279893944  for gini
Accuracy for max height of tree =  20  is  0.9878287105290225  for gini on validation set
Accuracy for max height of tree =  4  is  0.9857921623752871  for entropy
Accuracy for max height of tree =  4  is  0.985840161826722  for entropy on validation set
Accuracy for max height of tre

In [12]:
# Majority vote over 100 shallow randomized trees.
runEnsembling (trainX, trainY, testX, testY,valX=valX,valY=valY)

Accuracy for ensemble is  0.9857921623752871
Accuracy for ensemble on validation set is  0.985840161826722


In [None]:
# AdaBoost over depth-15 trees for several ensemble sizes.
runBoostingAdaBoost(trainX, trainY, testX, testY,valX=valX,valY=valY)

In [None]:
# Random forests (max depth 15) for several forest sizes.
runBoostingRF(trainX, trainY, testX, testY,valX=valX,valY=valY)