# Risk on the Blockchain for Bitcoin

In [2]:
# Import Lib's 
import pandas as pd
import hashlib
from pandas.io import gbq
from random import shuffle
from functools import reduce
import random
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA 
from sklearn.pipeline import Pipeline
from sklearn import metrics as mt

In [3]:
## Insert the BigQuery Project ID
projectid = "capstone-202618"

#### Query the tx table from BigQuery 

In [4]:
## Query the table
tx = gbq.read_gbq('SELECT * FROM Data.tx LIMIT 10000000', projectid)

In [6]:
## Rename the columns
tx = tx.rename({'int64_field_0':'txID', 'int64_field_1':'blockID', 
                'int64_field_2':'n_inputs', 'int64_field_3':'n_outputs'}, axis=1)

In [7]:
## Check the data
tx.head()

Unnamed: 0,txID,blockID,n_inputs,n_outputs
0,272843234,495187,1,2
1,272845022,495188,1,2
2,272845795,495189,1,2
3,272849076,495191,2,2
4,272849362,495191,1,2


#### Query the txout table from BigQuery 

In [10]:
## Query the txout table
txout = gbq.read_gbq('SELECT * FROM Data.txout LIMIT 10000000', projectid)

In [12]:
## Rename the columns
txout = txout.rename({'int64_field_0':'txID', 'int64_field_1':'output_seq', 
                    'int64_field_2':'addrID', 'int64_field_3':'sum'}, axis=1)

#### Query the Stone Man Loss Incident from both the tx and txout tables 

In [25]:
## Query the Stone Man Loss Incident 
incident1 = gbq.read_gbq('SELECT * FROM Data.tx WHERE int64_field_0 == 101661', projectid)

In [26]:
## Rename the columns
incident1 = incident1.rename({'int64_field_0':'txID', 'int64_field_1':'blockID', 
                'int64_field_2':'n_inputs', 'int64_field_3':'n_outputs'}, axis=1)

In [27]:
## Check the data
incident1.head()

Unnamed: 0,txID,blockID,n_inputs,n_outputs
0,101661,73272,1,2


In [28]:
## Append the incident1(Stone Man Loss) to the raw tx data
tx = incident1.append(tx)

In [30]:
## Check data
tx.head()

Unnamed: 0,txID,blockID,n_inputs,n_outputs
0,101661,73272,1,2
0,272843234,495187,1,2
1,272845022,495188,1,2
2,272845795,495189,1,2
3,272849076,495191,2,2


In [17]:
## Query the txout table for the Stone Man Loss Incident
incident1 = gbq.read_gbq('SELECT * FROM Data.txout WHERE int64_field_0 == 101661', projectid)

In [19]:
## Rename the columns
incident1 = incident1.rename({'int64_field_0':'txID', 'int64_field_1':'output_seq', 
                    'int64_field_2':'addrID', 'int64_field_3':'sum'}, axis=1)

In [20]:
## Check the data 
incident1.head()

Unnamed: 0,txID,output_seq,addrID,sum
0,101661,0,98866,100000000
1,101661,1,98910,899900000000


In [22]:
## Append the Stone Man Incident to the txout raw data
txout = incident1.append(txout)

In [29]:
## Check the data
txout.head()

Unnamed: 0,txID,output_seq,addrID,sum
0,101661,0,98866,100000000
1,101661,1,98910,899900000000
0,128712579,15,66206876,10031
1,128712579,16,137355495,62286
2,128712579,17,109004316,38045


#### Query the ___ Incident from both the tx and txout tables 

#### Merge the two data frames that have everything appened to them

In [31]:
data = pd.merge(tx, txout)

In [36]:
data.head()

Unnamed: 0,txID,blockID,n_inputs,n_outputs,output_seq,addrID,sum,risk
0,101661,73272,1,2,0,98866,100000000,high
1,101661,73272,1,2,1,98910,899900000000,high
2,101661,73272,1,2,0,98866,100000000,high
3,101661,73272,1,2,1,98910,899900000000,high
4,67819649,355201,2,2,0,71517379,6980000,low


In [35]:
data['risk'] = np.where(data['txID'] == 101661, 'high', 'low')

In [49]:
# data['risk'] = np.where(data['sum'] <= 265884431, 'low', 'high')

In [38]:
data.head()

Unnamed: 0,txID,blockID,n_inputs,n_outputs,output_seq,addrID,sum,risk
0,101661,73272,1,2,0,98866,100000000,high
1,101661,73272,1,2,1,98910,899900000000,high
2,101661,73272,1,2,0,98866,100000000,high
3,101661,73272,1,2,1,98910,899900000000,high
4,67819649,355201,2,2,0,71517379,6980000,low


In [37]:
data.groupby('risk').count()

Unnamed: 0_level_0,txID,blockID,n_inputs,n_outputs,output_seq,addrID,sum
risk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
high,4,4,4,4,4,4,4
low,309287,309287,309287,309287,309287,309287,309287


In [11]:
######################################################################################################################

Query the Google BigQuery DB


In [10]:
## Mt. Gox
mt_Gox_Hack = gbq.read_gbq('SELECT timestamp, transaction_id, inputs.input_pubkey_base58, outputs.output_satoshis, outputs.output_pubkey_base58 FROM (FLATTEN(DB.transactions_all, outputs.output_satoshis)) WHERE inputs.input_pubkey_base58 = "1LNWw6yCxkUmkhArb2Nf2MPw6vG7u5WG7q"', projectid)

In [21]:
# mt_Gox_Hack = mt_Gox_Hack.rename(columns={'transaction_id':'mt_Gox_transaction_id','inputs_input_pubkey_base58':'mt_Gox_input_pubkey', 'outputs_output_satoshis':'mt_Gox_satoshis', 
#                                           'outputs_output_pubkey_base58':'mt_Gox_output_pubkey'})

In [40]:
mt_Gox_Hack = mt_Gox_Hack.rename(columns={'inputs_input_pubkey_base58':'input_pubkey', 'outputs_output_satoshis':'satoshis', 
                                          'outputs_output_pubkey_base58':'output_pubkey'})

In [10]:
# mt_Gox_Hack = mt_Gox_Hack.drop(columns=['mt_Gox_satoshis', 'timestamp'])

In [41]:
mt_Gox_Hack['risk'] = 'high'

In [42]:
mt_Gox_Hack.head()

Unnamed: 0,timestamp,transaction_id,input_pubkey,satoshis,output_pubkey,risk
0,1374760111000,20c745ad71edc1797ece5584f2fc6e5095d514c4f7588e...,1LNWw6yCxkUmkhArb2Nf2MPw6vG7u5WG7q,298349503,1DCAJRhqWj8QnqSyBFGWe5KugTysRWqNUJ,high
1,1374760111000,20c745ad71edc1797ece5584f2fc6e5095d514c4f7588e...,1LNWw6yCxkUmkhArb2Nf2MPw6vG7u5WG7q,30000000,18QYuFhuu4ygG421QRwZz2oxHVWCFu7AGQ,high
2,1370629170000,9fc9675753309d4bff2bd820c8e8481323c5a4056d9921...,1LNWw6yCxkUmkhArb2Nf2MPw6vG7u5WG7q,55000000,1LhUCHFiwGqC27uhqqQ4v5X8Lgbtxk6KwD,high
3,1370629170000,9fc9675753309d4bff2bd820c8e8481323c5a4056d9921...,1LNWw6yCxkUmkhArb2Nf2MPw6vG7u5WG7q,347739650,16jWwu9PrwMaqpZqmX5CWLRKBPHfmUDrkU,high
4,1363581739000,0b15e65d7d277a63961bd92b04466a010f04c574bacea8...,1LNWw6yCxkUmkhArb2Nf2MPw6vG7u5WG7q,1000000000,1FEJoFr5uWKo1dCi5sE6HXwEoV8AEe9z6B,high


In [50]:
## May 2012 Bitcoinica Hack
# index = Transactions 
bitcoinica_hack = gbq.read_gbq('SELECT timestamp, transaction_id, inputs.input_pubkey_base58, outputs.output_satoshis, outputs.output_pubkey_base58 FROM (FLATTEN(DB.transactions_all, outputs.output_satoshis)) WHERE transaction_id = "7a22917744aa9ed740faf3068a2f895424ed816ed1a04012b47df7a493f056e8"', projectid)

In [25]:
# bitcoinica_hack = bitcoinica_hack.rename(columns={'transaction_id':'bitcoinica_hack_transaction_id','inputs_input_pubkey_base58':'bitcoinica_hack_input_pubkey', 'outputs_output_satoshis':'bitcoinica_hack_satoshis', 
#                                                   'outputs_output_pubkey_base58':'bitcoinica_hack_output_pubkey'})

In [51]:
bitcoinica_hack = bitcoinica_hack.rename(columns={'inputs_input_pubkey_base58':'input_pubkey', 'outputs_output_satoshis':'satoshis', 
                                                'outputs_output_pubkey_base58':'output_pubkey'})

In [14]:
# bitcoinica_hack = bitcoinica_hack.drop(columns=['bitcoinica_hack_satoshis', 'timestamp'])

In [52]:
bitcoinica_hack['risk'] = 'high'

In [56]:
bitcoinica_hack.head()

Unnamed: 0,timestamp,transaction_id,input_pubkey,satoshis,output_pubkey,risk
0,1336739433000,7a22917744aa9ed740faf3068a2f895424ed816ed1a040...,1KgTc9RSE91fS4Cfc48rbkkkGEHhjLhe7V,1854766867623,182tGyiczhXSSCTciVujNRkkMw1zQxUVhp,high
1,1336739433000,7a22917744aa9ed740faf3068a2f895424ed816ed1a040...,1CMKwkqWVD6BiuHWtrBFTiiCSuSxyyN677,1854766867623,182tGyiczhXSSCTciVujNRkkMw1zQxUVhp,high
2,1336739433000,7a22917744aa9ed740faf3068a2f895424ed816ed1a040...,1KsssDbhj8sW5rvcQ6NHiNoxU2wmSVUrQT,1854766867623,182tGyiczhXSSCTciVujNRkkMw1zQxUVhp,high
3,1336739433000,7a22917744aa9ed740faf3068a2f895424ed816ed1a040...,1Fs1ixzNdPDqLcvsNieUtxd6nX5mZ67SR3,1854766867623,182tGyiczhXSSCTciVujNRkkMw1zQxUVhp,high
4,1336739433000,7a22917744aa9ed740faf3068a2f895424ed816ed1a040...,1PqfGjpgZpdd4gbbGUsUWPJkVaVqoSLizY,1854766867623,182tGyiczhXSSCTciVujNRkkMw1zQxUVhp,high


In [27]:
# bitcoinica_hack_inpub_list = bitcoinica_hack['bitcoinica_hack_input_pubkey'].unique


In [108]:
## Get transaction table form BigQuery 
transaction = gbq.read_gbq('SELECT timestamp, transaction_id, inputs.input_pubkey_base58, outputs.output_satoshis, outputs.output_pubkey_base58 FROM (FLATTEN(DB.transactions_all, outputs.output_satoshis)) WHERE timestamp BETWEEN 1388534400000 and 1514764799000 LIMIT 1250000', projectid)
# gbq.to_gbq(transaction, 'DB.Transactions', projectid, if_exists='replace')



In [93]:
transaction.count()

timestamp                       4570156
transaction_id                  4570156
inputs_input_pubkey_base58      3937315
outputs_output_satoshis         4570156
outputs_output_pubkey_base58    4532291
dtype: int64

In [110]:
transaction.head()

Unnamed: 0,timestamp,transaction_id,inputs_input_pubkey_base58,outputs_output_satoshis,outputs_output_pubkey_base58,risk
0,1416201207000,bbc288593aeda0f05b168866fda9a3e49247606023adcc...,14JE4yVMGq4pwAb4k2QKdGKo8mjo75hWgG,1281422340,115rpX7VgS8ZQ92FAfLTqWh7VPvnFjX5W2,high
1,1416201207000,bbc288593aeda0f05b168866fda9a3e49247606023adcc...,14JE4yVMGq4pwAb4k2QKdGKo8mjo75hWgG,66088,1FZSudc5nKym1NVyqjadhbRyUSbp7YqFMM,low
2,1415976676000,084cf258db2814b34eb9cf3ffb9ab5788b5bb297a21753...,1HqKBGuXcEQr2C9NXbiM7ZdF7EFtHMLQD2,8022220904,1Do3NWv8TkZVsCmnex8puuymJMVTUFUpef,high
3,1415976676000,084cf258db2814b34eb9cf3ffb9ab5788b5bb297a21753...,1HqKBGuXcEQr2C9NXbiM7ZdF7EFtHMLQD2,40000,14Z6TmPGkmogxsJ9gP65g866Vv19UgsSvy,low
4,1415317375000,59ed5fc9ff157c030423a5216bc1f11a00a418514a6a60...,1LuckyR1fFHEsXYyx5QK4UFzv3PEAepPMK,240000,1NxaBCFQwejSZbQfWcYNwgqML5wWoE3rK4,low


Sample the data 

In [95]:
trans = transaction.sample(1000000)

In [28]:
trans = pd.read_csv("Trans_sampled.csv")

In [96]:
trans.count()

timestamp                       1000000
transaction_id                  1000000
inputs_input_pubkey_base58       861799
outputs_output_satoshis         1000000
outputs_output_pubkey_base58     991698
dtype: int64

In [30]:
trans['risk'] = ''

In [34]:
trans.head()

Unnamed: 0,timestamp,transaction_id,input_pubkey,satoshis,output_pubkey,risk
0,1454622682000,7924596930a0fc18247cf08ee9cf3d4dfa8b839050f3fb...,17m5KxzT5ypkUSY68DHGjzfvYcQsfKWmxY,85000,135zDqhbNcmPk3gbyeJmH75yiLdVZechsK,
1,1508292791000,e7382000d1651be0788926151c9638f3811eec702fcaf7...,1J37CY8hcdUXQ1KfBhMCsUVafa8XjDsdCn,2554569711,1J37CY8hcdUXQ1KfBhMCsUVafa8XjDsdCn,
2,1466508756000,58f4400b2256e518ee3f2faf87bc1e151a0f7a1fa44a75...,,1178910381,3M9Du3hTieNnE25tjpZaK5XgpuBxYieMHC,
3,1411046836000,bd39e3d7d1f5b23c7f881bf5ba05d636de3c5a7c20e287...,1417UB82UpzZAawfe2RNWkvfiZ24RDoFCj,43085024,1NaZM3q7cURUrr4zrZwWomDNvz28JPk9We,
4,1454474864000,6759e1c98419ed10963ce8f591377d53092137d268314e...,,18175000,15C7EGQ2cbhzJjtAQibWVwM8xUrABnByiH,


In [32]:
trans = trans.rename(columns={'Unnamed: 0':'Sample_ID', 'inputs_input_pubkey_base58':'input_pubkey', 'outputs_output_satoshis':'satoshis', 
                            'outputs_output_pubkey_base58':'output_pubkey'})

In [33]:
trans = trans.drop(columns=['Sample_ID'])

In [3]:
# trans.to_csv("Trans_sampled.csv")

In [2]:
trans = pd.read_csv("Trans_sampled.csv")

In [59]:
# dfs = [mt_Gox_Hack, bitcoinica_hack, trans]
# df_final = reduce(lambda left,right: pd.concat(left,right), dfs)

In [61]:
result = trans.append(mt_Gox_Hack)

In [62]:
result.head()

Unnamed: 0,timestamp,transaction_id,input_pubkey,satoshis,output_pubkey,risk
0,1454622682000,7924596930a0fc18247cf08ee9cf3d4dfa8b839050f3fb...,17m5KxzT5ypkUSY68DHGjzfvYcQsfKWmxY,85000,135zDqhbNcmPk3gbyeJmH75yiLdVZechsK,
1,1508292791000,e7382000d1651be0788926151c9638f3811eec702fcaf7...,1J37CY8hcdUXQ1KfBhMCsUVafa8XjDsdCn,2554569711,1J37CY8hcdUXQ1KfBhMCsUVafa8XjDsdCn,
2,1466508756000,58f4400b2256e518ee3f2faf87bc1e151a0f7a1fa44a75...,,1178910381,3M9Du3hTieNnE25tjpZaK5XgpuBxYieMHC,
3,1411046836000,bd39e3d7d1f5b23c7f881bf5ba05d636de3c5a7c20e287...,1417UB82UpzZAawfe2RNWkvfiZ24RDoFCj,43085024,1NaZM3q7cURUrr4zrZwWomDNvz28JPk9We,
4,1454474864000,6759e1c98419ed10963ce8f591377d53092137d268314e...,,18175000,15C7EGQ2cbhzJjtAQibWVwM8xUrABnByiH,


In [63]:
result.groupby('risk').count()

Unnamed: 0_level_0,timestamp,transaction_id,input_pubkey,satoshis,output_pubkey
risk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,1000000,1000000,861366,1000000,991698
high,41211,41211,41211,41211,41211


In [64]:
result2 = result.append(bitcoinica_hack)

In [65]:
result2.head()

Unnamed: 0,timestamp,transaction_id,input_pubkey,satoshis,output_pubkey,risk
0,1454622682000,7924596930a0fc18247cf08ee9cf3d4dfa8b839050f3fb...,17m5KxzT5ypkUSY68DHGjzfvYcQsfKWmxY,85000,135zDqhbNcmPk3gbyeJmH75yiLdVZechsK,
1,1508292791000,e7382000d1651be0788926151c9638f3811eec702fcaf7...,1J37CY8hcdUXQ1KfBhMCsUVafa8XjDsdCn,2554569711,1J37CY8hcdUXQ1KfBhMCsUVafa8XjDsdCn,
2,1466508756000,58f4400b2256e518ee3f2faf87bc1e151a0f7a1fa44a75...,,1178910381,3M9Du3hTieNnE25tjpZaK5XgpuBxYieMHC,
3,1411046836000,bd39e3d7d1f5b23c7f881bf5ba05d636de3c5a7c20e287...,1417UB82UpzZAawfe2RNWkvfiZ24RDoFCj,43085024,1NaZM3q7cURUrr4zrZwWomDNvz28JPk9We,
4,1454474864000,6759e1c98419ed10963ce8f591377d53092137d268314e...,,18175000,15C7EGQ2cbhzJjtAQibWVwM8xUrABnByiH,


In [67]:
result2.groupby('risk').count()

Unnamed: 0_level_0,timestamp,transaction_id,input_pubkey,satoshis,output_pubkey
risk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,1000000,1000000,861366,1000000,991698
high,41241,41241,41241,41241,41241


In [None]:
trans_list = ['7a22917744aa9ed740faf3068a2f895424ed816ed1a040', ]

In [121]:
## Both Mt. Gox and Bitcoinica
inpub_list = ['1LNWw6yCxkUmkhArb2Nf2MPw6vG7u5WG7q', '1CMKwkqWVD6BiuHWtrBFTiiCSuSxyyN677', '1KsssDbhj8sW5rvcQ6NHiNoxU2wmSVUrQT',
              '1PqfGjpgZpdd4gbbGUsUWPJkVaVqoSLizY', '1PMCKJc5dy5qiXEjCLE7rNN38F117txf8D', '1Fy7piz4aEJBNLUZKUfA6fDQuYyavbod7m',
              '1FnkfHqrBVWVSRxqkKW72frCTujmo2tt9R', '1A62DnxpWARroVYXM7X7huZe7kysMug3mM', '12t1Dp1Cat6z7cnMGP5oGE1LsxD8wNJZTE', 
              '15EKqUPkSb6DfoMmKgatMskiAkwCdh8n7b', '1KuNvyWFqc5wBSWUCyJLDgv12vequwMMCS', '1A62DnxpWARroVYXM7X7huZe7kysMug3mM', 
              '1KxFvsetNsiPP6oKcHAe3D29Wq2R2Kzfxu', '1DMcPxrjDxQ7DpnPhNVnx2Pgs8jiwk7EiX', '1JRvN52epYmWdk3bZXE3sZxgVeTo5qN3LS',
              '1HPnh4o5pmmK1784TjtPyVTFj3mhyiBajn', '16ighKEKdAPqPDhw3r8KfmjhQWVHSBzBeU', '19WsmzLuZW25WtvGLmhzVjCcRDf4KAmjYz', 
              '1kvhbNMVbwEV1dPhh5QKfPdf8Z5DoW7zc', '16ywQheQqeGMjjBeREPQ5motMQZfHghDKi', '142xFX8VJNmdkPTvMrff4rRG5d11NHEAA8',
              '1KuNvyWFqc5wBSWUCyJLDgv12vequwMMCS', '16Htoo4s3jMmRFzCqU32HtQhpWJnF7FJ21', '1JfAAbfKAK6nNVNr9bdRuMwzM7ojuBS6gh',
              '18MsaLYNa5a6Un5qauip1EChxQ4ibrNBhj', '1vgTYAxdF1DaKXUP1SAAe4QmJ95MJh5HG', '1EdgCDGYBn4twN1doYzQ9bUozBsuCJFY93',
              '1kvhbNMVbwEV1dPhh5QKfPdf8Z5DoW7zc', '18RoAhyH8FsFWXCF54owJzReKch5MAMHsg', '1KgTc9RSE91fS4Cfc48rbkkkGEHhjLhe7V',
              '1KgTc9RSE91fS4Cfc48rbkkkGEHhjLhe7V']

In [124]:
inpub_list = abs(hash(inpub_list))

In [None]:
trans['hack'] = np.where(trans['input_pubkey'] == 610780436, 'low', 'high')

Now we, flag each of the transactions in the training set with high/low risk

In [5]:
## The average transaction is worth 0.92 BTC, which is equal to 92,000,000 satoshis. 
## Create a variable to hold the average satosshis
avg_sat = 610780436
trans['risk'] = np.where(trans['satoshis'] <= 610780436, 'low', 'high')

In [15]:
# Convert transaction_id to integer
for y in trans['transaction_id']:
    trans['transaction_id'] = abs(hash(y))

In [103]:
n = 'None'
abs(hash(n))

3253103400149891883

In [102]:
NaN = 'NaN'
abs(hash(NaN))

5894463979814294565

In [22]:
# Convert input_pubkey to integer
for w in trans['input_pubkey']:
    trans['input_pubkey'] = abs(hash(w))

In [29]:
# Convert output_pubkey to integer
for z in trans['output_pubkey']:
    trans['output_pubkey'] = abs(hash(z))

In [77]:
trans.to_csv("trans_flagged.csv")

In [3]:
trans = pd.read_csv("trans_flagged.csv")

In [5]:
trans = trans.drop(columns='Unnamed: 0')

In [6]:
trans.head()

Unnamed: 0,timestamp,transaction_id,input_pubkey,satoshis,output_pubkey,risk
0,1454622682000,1556704086554120743,5610622358866825713,85000,6389026776455554319,low
1,1508292791000,1556704086554120743,5610622358866825713,2554569711,6389026776455554319,high
2,1466508756000,1556704086554120743,5610622358866825713,1178910381,6389026776455554319,high
3,1411046836000,1556704086554120743,5610622358866825713,43085024,6389026776455554319,low
4,1454474864000,1556704086554120743,5610622358866825713,18175000,6389026776455554319,low


In [68]:
trans["satoshis"].mean()

610780436.731065

In [69]:
trans["satoshis"].std()

10209459050.342167

In [76]:
trans.groupby('risk').count()

Unnamed: 0_level_0,timestamp,transaction_id,input_pubkey,satoshis,output_pubkey
risk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
high,87514,87514,87514,87514,87514
low,912486,912486,912486,912486,912486


In [11]:
######################################################################################################################

# Modeling

## Random Forest 

In [52]:
data.head()

Unnamed: 0,txID,blockID,n_inputs,n_outputs,output_seq,addrID,sum,risk
0,67819649,355201,2,2,0,71517379,6980000,low
1,67819649,355201,2,2,1,74772360,5820000,low
2,67820418,355202,9,2,0,74773993,722000000,high
3,67820418,355202,9,2,1,74773994,743700000,high
4,67821808,355203,1,2,0,68844506,810000,low


Split the data for modleing 

In [54]:
y = data['risk']

In [55]:
y.head()

0     low
1     low
2    high
3    high
4     low
Name: risk, dtype: object

In [91]:
X = data.drop(['risk', 'output_seq', 'blockID', 'sum'], axis=1)

In [92]:
X.head()

Unnamed: 0,txID,n_inputs,n_outputs,addrID
0,67819649,2,2,71517379
1,67819649,2,2,74772360
2,67820418,9,2,74773993
3,67820418,9,2,74773994
4,67821808,1,2,68844506


In [93]:
# Split the data for Cross-Validation 
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

In [94]:
# Create an random forest instance 
rf = RandomForestClassifier(n_estimators=25, random_state=11)

In [95]:
rf.fit(x_train,y_train)  # train object

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=1,
            oob_score=False, random_state=11, verbose=0, warm_start=False)

In [96]:
y_hat = rf.predict(x_test) # get test set precitions

In [97]:
pred = rf.predict(x_test)

In [98]:
acc=rf.score(x_test,y_test)
print(acc)

0.924237184432444


KNN

In [107]:
KNN = KNeighborsClassifier(n_neighbors=10, n_jobs=5)

In [108]:
# Train the reusable KNN classifier on the training data
KNN.fit(X,y)  # train object
y_hat = KNN.predict(x_test) # get test set precitions

In [109]:
# Accuracy for the iterations of training/testing
accuracy_KNN = mt.accuracy_score(y_test,y_hat)
print(accuracy_KNN)

0.937777659546168


In [110]:
#Metric report 
metrics_KNN = classification_report(y_test,y_hat)
print(metrics_KNN)

             precision    recall  f1-score   support

       high       0.67      0.44      0.53      6018
        low       0.95      0.98      0.97     69164

avg / total       0.93      0.94      0.93     75182

