In [219]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import random
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix,f1_score,accuracy_score,precision_score,recall_score
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score 
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from statistics import mean
from math import *

In [227]:
# DATASET

# Read the CSV and discard the `address` column in a single chained expression.
data = pd.read_csv('BitcoinHeistData.csv').drop(columns=['address'])
data

Unnamed: 0,year,day,length,weight,count,looped,neighbors,income,label
0,2017,11,18,0.008333,1,0,2,1.000500e+08,princetonCerber
1,2016,132,44,0.000244,1,0,1,1.000000e+08,princetonLocky
2,2016,246,0,1.000000,1,0,2,2.000000e+08,princetonCerber
3,2016,322,72,0.003906,1,0,2,7.120000e+07,princetonCerber
4,2016,238,144,0.072848,456,0,1,2.000000e+08,princetonLocky
...,...,...,...,...,...,...,...,...,...
2916692,2018,330,0,0.111111,1,0,1,1.255809e+09,white
2916693,2018,330,0,1.000000,1,0,1,4.409699e+07,white
2916694,2018,330,2,12.000000,6,6,35,2.398267e+09,white
2916695,2018,330,0,0.500000,1,0,1,1.780427e+08,white


In [233]:
# ENCODING

# Fit the encoder on the string labels and keep the resulting integer codes
# in a separate column (spelled `lable`) next to the original `label` column.
labelencoder_Y = LabelEncoder()
label_codes = labelencoder_Y.fit_transform(data['label'])
data['lable'] = label_codes
data

Unnamed: 0,year,day,length,weight,count,looped,neighbors,income,label,lable
0,2017,11,18,0.008333,1,0,2,1.000500e+08,princetonCerber,26
1,2016,132,44,0.000244,1,0,1,1.000000e+08,princetonLocky,27
2,2016,246,0,1.000000,1,0,2,2.000000e+08,princetonCerber,26
3,2016,322,72,0.003906,1,0,2,7.120000e+07,princetonCerber,26
4,2016,238,144,0.072848,456,0,1,2.000000e+08,princetonLocky,27
...,...,...,...,...,...,...,...,...,...,...
2916692,2018,330,0,0.111111,1,0,1,1.255809e+09,white,28
2916693,2018,330,0,1.000000,1,0,1,4.409699e+07,white,28
2916694,2018,330,2,12.000000,6,6,35,2.398267e+09,white,28
2916695,2018,330,0,0.500000,1,0,1,1.780427e+08,white,28


In [261]:
# Shuffle all rows (frac=1 keeps every row). The fixed random_state makes the
# row order reproducible, so anything that later slices the frame by position
# sees the same rows on every Restart & Run All.
data = data.sample(frac=1, random_state=42)
data

Unnamed: 0,year,day,length,weight,count,looped,neighbors,income,label,lable
1461610,2014,335,144,1.113239e-02,2479,2450,2,5.291255e+07,white,28
201490,2011,170,0,1.000000e+00,1,0,2,1.500000e+08,white,28
1661496,2015,170,4,2.000000e-01,1,0,2,1.000000e+08,white,28
529270,2012,133,144,1.563806e-01,509,509,12,7.604119e+08,white,28
792454,2013,31,12,6.250000e-02,1,0,2,1.220656e+09,white,28
...,...,...,...,...,...,...,...,...,...,...
300073,2011,269,80,3.725290e-09,1,0,1,7.600000e+07,white,28
1886377,2016,30,2,1.428571e-01,1,0,2,9.905000e+08,white,28
1111815,2013,351,6,5.000000e-01,1,0,2,8.401000e+07,white,28
175565,2011,144,80,2.493140e-07,24,0,5,4.010000e+08,white,28


In [185]:
# EDA REPORT

# Build the profiling report for the whole frame, then render it inline.
report_title = "Data Analysis Report"
profile = ProfileReport(data, title=report_title)
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [262]:
# TRAIN TEST VALIDATION SPLIT 

# Feature matrix: remove the raw target `label` AND its integer encoding
# `lable` (added by the encoding cell). Leaving `lable` in X would hand the
# model the answer as a feature (target leakage).
# errors='ignore' keeps this cell working when the encoding cell was not run.
X = data.drop(columns=['label', 'lable'], errors='ignore')
# Target kept as a single-column DataFrame, i.e. shape (n_rows, 1).
Y = data[['label']]
def train_val_test_split(data, labels, train=0.7, val=0.15, test=0.15):
    """Cut rows into contiguous train / validation / test parts, in that order.

    No shuffling is done here; rows are taken by position, so shuffle
    beforehand if a random split is wanted.

    Parameters
    ----------
    data, labels : pandas DataFrame/Series or any sliceable sequence (e.g. a
        NumPy array) with the same number of rows.
    train, val, test : non-negative relative weights of the three parts. They
        are normalised by their sum, so (0.7, 0.15, 0.15) and (70, 15, 15)
        describe the same split.

    Returns
    -------
    (train_data, train_labels, val_data, val_labels, test_data, test_labels)

    Raises
    ------
    ValueError
        If `data` and `labels` differ in length, or the weights are negative
        or sum to zero.
    """
    n_rows = len(data)
    print("Length: "+str(n_rows))

    if len(labels) != n_rows:
        raise ValueError("data and labels must have the same number of rows")
    total = train + val + test
    if min(train, val, test) < 0 or total <= 0:
        raise ValueError("train, val and test must be non-negative and sum to a positive value")

    # Slice bounds must be integers. Train and validation sizes are floored
    # and the test part receives whatever rows remain, so no row is lost.
    train_end = int(n_rows * train / total)
    val_end = train_end + int(n_rows * val / total)

    def _rows(table, start, stop):
        # pandas objects need .iloc for positional row slicing; plain
        # sequences and NumPy arrays are sliced directly.
        indexer = getattr(table, "iloc", table)
        return indexer[start:stop]

    train_data = _rows(data, None, train_end)
    train_labels = _rows(labels, None, train_end)

    val_data = _rows(data, train_end, val_end)
    val_labels = _rows(labels, train_end, val_end)

    test_data = _rows(data, val_end, None)
    test_labels = _rows(labels, val_end, None)

    return train_data, train_labels, val_data, val_labels, test_data, test_labels

In [263]:
# SPLIT DATA SHAPES
# NOTE(review): no visible cell assigns X_train / X_val / X_test / Y_train /
# Y_val / Y_test; presumably they come from a train_val_test_split(X, Y, ...)
# call that is not shown. Confirm, otherwise this cell fails on a fresh kernel.
x_shapes = [frame.shape for frame in (X_train, X_val, X_test)]
print(*x_shapes)
y_shapes = [frame.shape for frame in (Y_train, Y_val, Y_test)]
print(*y_shapes)

(2041687, 8) (437504, 8) (437506, 8)
(2041687, 1) (437504, 1) (437506, 1)


In [265]:
# GINI INDEX

depth = [4, 8, 10, 15, 20]
trainingAcc, testingAcc = [], []

for i in depth:
    # One tree per depth limit; fit() returns the fitted estimator itself.
    gini_tree = DecisionTreeClassifier(max_depth=i, criterion='gini').fit(X_train, Y_train)

    trainAcc = gini_tree.score(X_train, Y_train)
    testAcc = gini_tree.score(X_test, Y_test)

    # Both accuracies are stored as percentages for later use.
    trainingAcc.append(trainAcc * 100)
    testingAcc.append(testAcc * 100)

    # Only the training accuracy is printed; the test accuracy is just collected.
    print("Accuracy For Depth: ", i)
    print("Accuracy: ", trainAcc)
    print()

Accuracy For Depth:  4
Accuracy:  0.9840328120813817

Accuracy For Depth:  8
Accuracy:  0.9854742671134215

Accuracy For Depth:  10
Accuracy:  0.9861198117047324

Accuracy For Depth:  15
Accuracy:  0.9886569292942552

Accuracy For Depth:  20
Accuracy:  0.99223387326265



In [255]:
# ENTROPY

depth = [4, 8, 10, 15, 20]
trainingAcc = list()
testingAcc = list()

for i in depth:
    # Same depth sweep as a Gini tree would get, but split on information gain.
    entropy_tree = DecisionTreeClassifier(max_depth=i, criterion='entropy')
    entropy_tree.fit(X_train, Y_train)

    # Train score is evaluated first, then the test score.
    trainAcc, testAcc = (
        entropy_tree.score(X_train, Y_train),
        entropy_tree.score(X_test, Y_test),
    )

    # Only the training accuracy is reported; both are stored as percentages.
    print("Accuracy For Depth: ", i)
    print("Accuracy: ", trainAcc)
    print()

    trainingAcc += [trainAcc * 100]
    testingAcc += [testAcc * 100]

Accuracy For Depth:  4
Accuracy:  0.9840847299316693

Accuracy For Depth:  8
Accuracy:  0.9854556550538843

Accuracy For Depth:  10
Accuracy:  0.9860247922428854

Accuracy For Depth:  15
Accuracy:  0.9889380693514725

Accuracy For Depth:  20
Accuracy:  0.9926261958860492



In [None]:
# BAGGING: DRAW A RANDOM 50% OF THE TRAINING SET AND BUILD 100 DECISION TREES (MAX DEPTH 3), COMBINED BY MAJORITY VOTE

def max_vote(predictions):
    """Combine several prediction vectors by per-sample majority vote.

    Parameters
    ----------
    predictions : sequence of equally long, indexable prediction vectors
        (one per model); the elements must be hashable.

    Returns
    -------
    list
        One winning label per sample position. When several labels tie, the
        one that appears first in the order of `predictions` wins.
        An empty `predictions` yields an empty list.
    """
    from collections import Counter

    # Nothing to vote on: avoid indexing predictions[0] on empty input.
    if len(predictions) == 0:
        return []

    final_prediction = []
    # The first vector defines how many samples are voted on.
    for j in range(len(predictions[0])):
        votes = Counter(model_output[j] for model_output in predictions)
        # most_common keeps first-encountered order among equal counts.
        final_prediction.append(votes.most_common(1)[0][0])

    return final_prediction

stumps=[]
predictions=[]
for i in range(100):
    stumps.append(DecisionTreeClassifier(criterion="entropy",max_depth=3))
    # Each tree sees its own random half of the training rows; seeding with the
    # loop index keeps every subsample reproducible yet different per tree.
    X_train_frac=X_train.sample(frac=0.5, random_state=i)
    Y_train_frac=Y_train.loc[X_train_frac.index]
    # Fit on the drawn subsample (not the full training set), with the target
    # flattened to 1-D so a single-column frame and a Series behave the same.
    stumps[i].fit(X_train_frac,np.ravel(Y_train_frac))
    predicts=stumps[i].predict(X_test)
    predictions.append(predicts)

final_prediction=max_vote(predictions)
# Flatten the true labels before comparing: an (n, 1) column compared with an
# (n,) vector would broadcast to an (n, n) matrix and give a wrong accuracy.
accuracy=np.mean(np.array(final_prediction)==np.ravel(Y_test))

print("Accuracy: "+str(accuracy))

In [259]:
# ADABOOST

estimators = [4, 8, 10, 15, 20]
testAcc=[]
trainAcc=[]
valAcc=[]

# fit() expects a 1-D target; passing the single-column frame triggers the
# column_or_1d conversion warning on every fit, so flatten it once up front.
y_train_1d = np.ravel(Y_train)

# NOTE(review): the recorded run shows high training accuracy but validation and
# test accuracy below 0.1 — that gap looks like a problem with the splits fed to
# this cell rather than ordinary overfitting; verify the split before trusting it.
for i in estimators:
    # `i` drives both the number of boosting rounds and the depth of each base tree.
    # NOTE: newer scikit-learn releases rename `base_estimator` to `estimator`;
    # the original keyword is kept for the version this notebook was run with.
    adaboost_tree = AdaBoostClassifier(n_estimators=i, base_estimator=DecisionTreeClassifier(criterion="entropy", max_depth=i))
    adaboost_tree.fit(X_train,y_train_1d)
    testAcc.append(adaboost_tree.score(X_test,Y_test))
    trainAcc.append(adaboost_tree.score(X_train,Y_train))
    valAcc.append(adaboost_tree.score(X_val,Y_val))
    print("Accuracy For Estimate: ",i)
    print("Training Accuracy: ", trainAcc[-1])
    print("Validating Accuracy: ", valAcc[-1])
    print("Testing Accuracy: ", testAcc[-1])
    

  y = column_or_1d(y, warn=True)


Accuracy For Estimate:  4
Training Accuracy:  0.9774064290951552
Validating Accuracy:  2.0571240491515506e-05
Testing Accuracy:  5.9427756419340534e-05


  y = column_or_1d(y, warn=True)


Accuracy For Estimate:  8
Training Accuracy:  0.9386213459751667
Validating Accuracy:  0.0007588502047981276
Testing Accuracy:  0.005803348982642524


  y = column_or_1d(y, warn=True)


Accuracy For Estimate:  10
Training Accuracy:  0.9481859854130432
Validating Accuracy:  0.009741625219426566
Testing Accuracy:  0.02413681183800908


  y = column_or_1d(y, warn=True)


Accuracy For Estimate:  15
Training Accuracy:  0.9896967556731272
Validating Accuracy:  0.02513805588063195
Testing Accuracy:  0.027821332736008193


  y = column_or_1d(y, warn=True)


Accuracy For Estimate:  20
Training Accuracy:  0.9995361678846953
Validating Accuracy:  0.0992653781451141
Testing Accuracy:  0.09312329430910662
