### Import settings

In [1]:
import os
# Must be set before the SparkContext is created further down: the value is
# read from the process environment, not from SparkConf.
os.environ['PYSPARK_SUBMIT_ARGS'] = 'pyspark-shell'

In [2]:
import findspark
findspark.init('/opt/spark')
from pyspark import SparkContext,SparkConf
# Build the Spark configuration one setting at a time, then start the context.
conf = SparkConf()
conf.setMaster("spark://10.200.5.39:7077")
conf.setAppName("exportapp")
# Memory sizing for executors and for the driver.
conf.set("spark.executor.memory","30g")
conf.set('spark.driver.memory', '60G')
# "-1" turns automatic broadcast joins off.
conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")
sc = SparkContext(conf=conf)

In [3]:
import pyspark
import binascii
from pyspark.sql import SQLContext
from functools import reduce
import pygraphviz
import pyspark.sql.functions as f
from IPython.display import Image
from networkx.drawing.nx_pydot import write_dot
# SQL entry point built on the SparkContext above; every `read` in this
# notebook goes through it.
sqlContext = SQLContext(sc)

In [4]:
# Base HDFS location; all input and output paths below are built by
# appending a relative path to this prefix.
path = "hdfs://10.200.5.25:9001/user/titanium/"

In [5]:
#############################################################
#            LOAD BLOCKS
#############################################################
#df_blocks_load = sqlContext.read\
#         .format("com.databricks.spark.csv")\
#         .option("header", "true")\
#     .option("inferSchema", "true")\
#         .load(path+"block/*.csv.gz")


In [6]:
#############################################################
#            LOAD LABEL
#############################################################
# Read every CSV under walletexp_data/ with a header row and inferred
# column types.
df_label_load = (
    sqlContext.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(path + "walletexp_data/*.csv")
)


In [7]:
#############################################################
#            LOAD TRANSACTIONS
#############################################################
# Read the transaction/part2 CSV data with a header row and inferred
# column types.
df_transactions = (
    sqlContext.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(path + "transaction/part2")
)


In [50]:
# Preview the loaded transactions (prints a truncated table of rows).
df_transactions.show()

+-----------+------+--------------------+--------+------+----------+--------------------+---------+-----------+--------+--------------------+--------------------+--------+
|block_group|height|             tx_hash|coinbase|  size| timestamp|               tx_id|tx_number|     amount|vout_idx|             address|            vin_txid|vin_vout|
+-----------+------+--------------------+--------+------+----------+--------------------+---------+-----------+--------+--------------------+--------------------+--------+
|         19|190000|008DAA58386B9CC3E...|   false|129080|1342824816|008DAA58386B9CC3E...|        0| 2680640283|       0|1dice97ECuByXAvqX...|DD67E4B4E1DAABC6F...|       0|
|         19|190000|030E731152EF47BE7...|   false|129080|1342824816|030E731152EF47BE7...|        0|   10265680|       0|1F7WsJ8EDTQz4Tavh...|1966B8191E9ACAD4D...|       0|
|         19|190000|030E731152EF47BE7...|   false|129080|1342824816|030E731152EF47BE7...|        0|   20000000|       1|1dice8EMZmqKvrGE4...

In [None]:
#blk_max = df_blocks_load.agg({"height": "max"}).collect()[0]
#print(blk_max)
#print(df_blocks_load.agg({"height": "min"}).collect()[0])

In [None]:
#blk_start=520000
#blk_end=blk_max['max(height)']
#df_blocks_load=df_blocks_load.where((f.col("height")>=blk_start)&(f.col("height")<=blk_end))
#df_transactions=df_transactions.where((f.col("height")>=blk_start)&(f.col("height")<=blk_end))

In [30]:
#############################################################
#           OUTPUT ADDRESS DATAFRAME CREATION
#############################################################
# Project the transaction frame down to the columns used for outputs.
df_output_addresses = df_transactions.select(
    'address', 'vout_idx', 'amount', 'tx_id',
    'tx_hash', 'size', 'coinbase', 'height',
)

In [31]:
#############################################################
#           REMOVE DUPLICATE OUTPUT DATAFRAME
#############################################################
# Two rows count as duplicates when they agree on address, output index,
# amount and transaction id; the other columns are not compared.
df_output_addresses = df_output_addresses.dropDuplicates(
    subset=['address', 'vout_idx', 'amount', 'tx_id']
)

In [32]:
#############################################################
#   INPUT ADDRESS DATAFRAME CREATION AND REMOVE DUPLICATE
#############################################################
# Keep one row per (vin_txid, vin_vout) pair, then look that pair up among
# the outputs (tx_id, vout_idx). The left outer join keeps inputs whose
# referenced output is not in df_output_addresses; their 'b' columns are null.
df_input_addresses = df_transactions.dropDuplicates(["vin_txid", "vin_vout"])
df_input_addresses = (
    df_input_addresses.alias('a')
    .join(
        df_output_addresses.alias('b'),
        (f.col('a.vin_txid') == f.col('b.tx_id'))
        & (f.col('a.vin_vout') == f.col('b.vout_idx')),
        "leftouter",
    )
    .select(
        f.col('b.address'),
        f.col('a.vin_vout'),
        f.col('b.amount'),
        # tx_id comes from the spending row ('a'), not from the referenced output.
        f.col('a.tx_id').alias('tx_id'),
        f.col('b.tx_hash'),
        f.col('b.coinbase'),
        f.col('b.height'),
        f.col('b.size'),
    )
)

In [33]:
#############################################################
#           REMOVE DUPLICATE LABEL DATAFRAME
#############################################################
# Drops rows that are identical across all columns. An address that appears
# with two different labels still yields two rows here.
df_label=df_label_load.dropDuplicates()

In [34]:
#############################################################
#           JOIN ADDRESSES IN/OUT WITH LABEL DATAFRAME
#############################################################
# Left-join the label table onto the outputs; the label is exposed as `user`
# and is null for addresses with no matching label row.
df_output_addresses_tag = (
    df_output_addresses.alias('a')
    .join(
        df_label.alias('b'),
        f.col('a.address') == f.col('b.address'),
        "leftouter",
    )
    .select(
        f.col('b.label').alias("user"),
        f.col('a.address'),
        f.col('a.amount'),
        f.col('a.coinbase'),
        f.col('a.tx_id'),
        f.col('a.vout_idx'),
        f.col('a.height'),
    )
)

# Same join for the inputs. vin_vout is renamed to vout_idx, and no height
# column is carried on this side.
df_input_addresses_tag = (
    df_input_addresses.alias('a')
    .join(
        df_label.alias('b'),
        f.col('b.address') == f.col('a.address'),
        "leftouter",
    )
    .select(
        f.col('b.label').alias("user"),
        f.col('a.address'),
        f.col('a.amount'),
        f.col('a.coinbase'),
        f.col('a.tx_id'),
        f.col('a.vin_vout').alias('vout_idx'),
    )
)

In [35]:
# Per address: number of output rows (`rep`) and one of its `user` values,
# listed with the most frequent addresses first.
same_address = (
    df_output_addresses_tag
    .groupby("address")
    .agg(
        f.count("address").alias("rep"),
        f.first("user").alias("user"),
    )
    .select("rep", "address", "user")
    .sort(f.col("rep").desc())
)

In [36]:
# Preview the most frequently reused output addresses.
same_address.show()

+------+--------------------+----+
|   rep|             address|user|
+------+--------------------+----+
|520326|1dice8EMZmqKvrGE4...|null|
|471725|1VayNert3x1KzbpzM...|null|
|192513|1dice97ECuByXAvqX...|null|
|149585|1dice9wcMu5hLF4g8...|null|
| 83296|1dice6YgEVBf88erB...|null|
| 80459|1dice7fUkz5h4z2wP...|null|
| 60112|1dice7W2AicHosf5E...|null|
| 44131|17qq5A3XKfrxpJRSC...|null|
| 44030|19ngVyAav9JLE6gVf...|null|
| 44028|14gZfnEn8Xd3ofkjr...|null|
| 43957|1PG1DB6uKdT9uwPBo...|null|
| 43956|13ARRimWwGhXt7ozf...|null|
| 43923|1KyYkZ8wJ7ybvGWxS...|null|
| 43902|1HZK8q2RhY718CZee...|null|
| 43891|13c7aMAEoS1QkwK49...|null|
| 43883|1PU4vjyEnMTVCmcoA...|null|
| 43883|15tvWYtQq8A4m6N1Q...|null|
| 43801|15svFBR3qDuXoqTR3...|null|
| 43766|1MtPYAjqohLH5gMq3...|null|
| 43750|1MBtmmai5T9kx5Lxh...|null|
+------+--------------------+----+
only showing top 20 rows



In [37]:
# Give every unlabelled address a generated id in place of the null `user`;
# labelled addresses keep their label.
# NOTE(review): same_address is joined more than once further down and is not
# cached here -- confirm the generated ids are identical on each evaluation.
same_address = same_address.withColumn(
    "user",
    f.when(
        f.col("user").isNull(),
        f.monotonically_increasing_id(),
    ).otherwise(f.col("user")),
)

In [None]:
# Attach the per-address user from same_address as `newuser`, use it wherever
# the row's own `user` is null, then discard the helper column.
df_output_addresses_tag = (
    df_output_addresses_tag.alias('a')
    .join(same_address.alias('b'), ['address'], 'leftouter')
    .select(
        f.col("b.user").alias("newuser"),
        f.col("a.user").alias("user"),
        f.col('a.address'),
        f.col('a.amount'),
        f.col('a.coinbase'),
        f.col('a.tx_id'),
        f.col('a.vout_idx'),
        f.col('a.height'),
    )
    .withColumn(
        "user",
        f.when(f.col("user").isNull(), f.col("newuser")).otherwise(f.col("user")),
    )
    .drop("newuser")
)

In [None]:
# Same back-fill for the inputs: take the per-address user from same_address
# when the input row has no `user` of its own.
df_input_addresses_tag = (
    df_input_addresses_tag.alias('a')
    .join(same_address.alias('b'), ['address'], 'leftouter')
    .select(
        f.col("b.user").alias("newuser"),
        f.col("a.user").alias("user"),
        f.col('a.address'),
        f.col('a.amount'),
        f.col('a.coinbase'),
        f.col('a.tx_id'),
        f.col('a.vout_idx'),
    )
    .withColumn(
        "user",
        f.when(f.col("user").isNull(), f.col("newuser")).otherwise(f.col("user")),
    )
    .drop("newuser")
)

In [46]:
################################################################
#   MARK TRANSACTIONS AND USE LABELLED ADDRESS WHERE IS POSSIBLE
################################################################
# Transactions with exactly one counted output address, each given a
# generated "<id>_aaa" value in `newuser`.
tx_one_out = (
    df_output_addresses
    .groupby("tx_id")
    .agg(f.count("address").alias("cnt"))
    .where(f.col("cnt") == 1)
    .withColumn(
        "newuser",
        f.concat(f.monotonically_increasing_id(), f.lit('_aaa')),
    )
)

In [49]:
# NOTE(review): the saved output of this cell lists a `user` column, while the
# cell that builds tx_one_out names the generated column `newuser` -- the
# output looks stale; re-run to refresh it.
tx_one_out.printSchema()

root
 |-- tx_id: string (nullable = true)
 |-- cnt: long (nullable = false)
 |-- user: string (nullable = false)



In [None]:
# Bring the `user` of each input row onto its single-output transaction.
# The left outer join yields one row per matching input, so a tx_id can
# appear several times afterwards.
tx_one_out = (
    tx_one_out.alias('a')
    .join(df_input_addresses_tag.alias('b'), ['tx_id'], 'leftouter')
    .select("a.tx_id", "a.cnt", "a.newuser", "b.user")
)


In [None]:
# Back-fill input rows that have no `user` from tx_one_out, matched on tx_id.
# NOTE(review): the fallback is b.user (the value copied from the inputs);
# the generated b.newuser ("<id>_aaa") is not read in this cell -- confirm
# that is intended.
df_input_addresses_tag = (
    df_input_addresses_tag.alias('a')
    .join(tx_one_out.alias('b'), ['tx_id'], 'leftouter')
    .select(
        f.col("b.user").alias("newuser"),
        f.col("a.user").alias("user"),
        f.col('a.address'),
        f.col('a.amount'),
        f.col('a.coinbase'),
        f.col('a.tx_id'),
        f.col('a.vout_idx'),
    )
    .withColumn(
        "user",
        f.when(f.col("user").isNull(), f.col("newuser")).otherwise(f.col("user")),
    )
    .drop("newuser")
)

In [None]:
# Join same_address with tx_label on `address` and project the result.
# The expression is wrapped in parentheses: as originally written, the
# `.select(...)` sat on its own line with no continuation, which is a
# SyntaxError.
# NOTE(review): no earlier cell visible here assigns `tx_label`, and the
# select uses the qualifiers 'a' and 'b' although neither side of this join
# is aliased -- confirm whether this cell is still needed and which frames
# it was meant to combine.
same_address = (
    same_address.join(tx_label, "address", "leftouter")
    .select("b.new_label","b.user","b.address","b.amount","b.coinbase","a.tx_id","b.vout_idx","a.cnt")
)

In [19]:
#############################################################
#   CHECK SAME ADDRESSES IN INPUT AND MARK THEM WITH LABEL
#############################################################
# Rebuild the per-address summary from the tagged outputs (row count plus one
# `user` value, most frequent first) ...
same_address = (
    df_output_addresses_tag
    .groupby("address")
    .agg(
        f.count("address").alias("rep"),
        f.first("user").alias("user"),
    )
    .select("rep", "address", "user")
    .sort(f.col("rep").desc())
)
# ... and replace a null `user` with a generated id.
same_address = same_address.withColumn(
    "user",
    f.when(
        f.col("user").isNull(),
        f.monotonically_increasing_id(),
    ).otherwise(f.col("user")),
)

In [20]:
#############################################################
#   JOIN ADDRESSES MARKED WITH THE INPUT DATAFRAME
#############################################################
# NOTE(review): despite the header, the frame joined here is the tagged
# OUTPUT frame; the per-address user from same_address is added as
# `new_label` next to the existing `user` column.
df_output_addresses_tag = (
    df_output_addresses_tag.alias('a')
    .join(
        same_address.alias('b'),
        f.col('a.address') == f.col('b.address'),
        'leftouter',
    )
    .select(
        f.col("b.user").alias("new_label"),
        f.col('a.user'),
        f.col('a.address'),
        f.col('a.amount'),
        f.col('a.coinbase'),
        f.col('a.tx_id'),
        f.col('a.vout_idx'),
        f.col('a.height'),
    )
)



In [22]:

#############################################################
#   ONE LABEL PER TRANSACTION
#############################################################
# For each tx_id: the smallest `user` among its rows becomes `new_label`, and
# `naddress` counts its addresses. Transactions whose rows all have a null
# `user` receive a generated "<id>_aaa" label instead.
#
# NOTE(review): the original statement was `tx_label = tx_label.groupby(...)`,
# but no earlier cell defines `tx_label`, so it raised NameError on a fresh
# kernel. df_input_addresses_tag is used as the source because it carries the
# three columns aggregated here (tx_id, user, address) and the next cell joins
# tx_label back onto the input rows by tx_id -- confirm this matches the
# frame the original analysis used.
tx_label = df_input_addresses_tag.groupby("tx_id").agg(
    f.min("user").alias("new_label"),
    f.count("address").alias("naddress"),
)
tx_label = tx_label.withColumn(
    "new_label",
    f.when(
        f.col("new_label").isNull(),
        f.concat(f.monotonically_increasing_id(), f.lit('_aaa')),
    ).otherwise(f.col("new_label")),
)

In [23]:
#############################################################
#  REMOVE TRANSACTION WITH INPUT MORE THAN 100
#############################################################
# NOTE(review): nothing in this cell filters on the number of inputs; the
# statement below only (re)assigns the six column names positionally.
df_input_addresses_tag_user = df_input_addresses_tag.toDF(
    "user", "address", "amount", "coinbase", "tx_id", "vout_idx"
)

#############################################################
#   JOIN THE TX LABEL WITH THE INPUT DATAFRAME
#############################################################
# Replace each input row's `user` with the transaction-level `new_label`
# (null when the tx_id has no row in tx_label).
df_input_addresses_tag_user = (
    df_input_addresses_tag_user.alias('a')
    .join(tx_label.alias('b'), ['tx_id'], 'leftouter')
    .select(
        f.col("b.new_label").alias("user"),
        f.col('a.address'),
        f.col('a.amount'),
        f.col('a.coinbase'),
        f.col('a.tx_id'),
        f.col('a.vout_idx'),
    )
)

# Keep only the inputs that ended up with a label.
df_input_addresses_tag_reduced = df_input_addresses_tag_user.where(
    f.col("user").isNotNull()
)

In [24]:
#############################################################
#           LIST WITH ALL INPUTS LABEL
#############################################################
# One row per input address, carrying one of the `user` values seen for it
# (exposed as `label`).
list_address_user = (
    df_input_addresses_tag_reduced
    .groupby("address")
    .agg(f.first("user").alias("label"))
)

In [25]:
#############################################################
#       JOIN LIST INPUTS LABEL WITH OUTPUT ADDRESS
#############################################################
# Attach the input-derived label to every output row with the same address,
# use it where the output has no `user` yet, then drop the helper column.
df_output_addresses_tag = (
    df_output_addresses_tag.alias('a')
    .join(list_address_user.alias('b'), ['address'], 'leftouter')
    .select(
        f.col('b.label'),
        f.col('a.user'),
        f.col('a.address'),
        f.col('a.amount'),
        f.col('a.coinbase'),
        f.col('a.tx_id'),
        f.col('a.vout_idx'),
        f.col('a.height'),
    )
    .withColumn(
        "user",
        f.when(f.col("user").isNull(), f.col("label")).otherwise(f.col("user")),
    )
    .drop(f.col('label'))
)

# Keep a handle on the frame as it is before the filtering steps below.
df_output_addresses_tag_original = df_output_addresses_tag

In [32]:
#############################################################
#       SELECT ONLY TRANSACTION WITH 2 OUTPUT (LOOKING FOR OTC)
#############################################################
# tx_id plus its output-address count, restricted to a count of exactly 2.
df_output_addresses_tag_copy = (
    df_output_addresses_tag
    .groupby("tx_id")
    .agg(f.count("address").alias("cnt_out"))
    .where(f.col("cnt_out") == 2)
)

In [33]:
#############################################################
#       COUNT THE INPUT IN EACH TRANSACTION
#############################################################
# Number of input addresses per transaction.
tx_input_schema = (
    df_input_addresses_tag
    .groupby("tx_id")
    .agg(f.count("address").alias("cnt_in"))
)

# Add cnt_in to the two-output transactions and keep those whose input count
# differs from 2. Rows where cnt_in is null (no match in the left outer join)
# do not satisfy the comparison and are dropped as well.
df_output_addresses_tag_copy = (
    df_output_addresses_tag_copy.alias('a')
    .join(tx_input_schema.alias('b'), ['tx_id'], 'leftouter')
    .select(
        f.col('b.cnt_in'),
        f.col('a.cnt_out'),
        f.col('a.tx_id'),
    )
    .where(f.col("cnt_in") != 2)
)

In [34]:
#############################################################
#       REMOVE ALL OTHER TX IN!=2 OUT=2
#############################################################
# The right outer join keeps every transaction selected above (2 outputs,
# input count != 2) together with its output rows; output rows of all other
# transactions disappear. tx_id is taken from the selection side ('b').
df_output_addresses_tag = (
    df_output_addresses_tag.alias('a')
    .join(df_output_addresses_tag_copy.alias('b'), ['tx_id'], 'rightouter')
    .select(
        f.col('b.cnt_in'),
        f.col('b.cnt_out'),
        f.col('a.user'),
        f.col('a.address'),
        f.col('a.amount'),
        f.col('a.coinbase'),
        f.col('b.tx_id'),
        f.col('a.vout_idx'),
        f.col('a.height'),
    )
)

In [35]:
# Smallest height at which each address occurs in the (already filtered)
# output frame.
first_show = (
    df_output_addresses_tag
    .groupby("address")
    .agg(f.min("height").alias("show"))
)

# Flag rows sitting at that smallest height: first = 1, otherwise 0.
# NOTE(review): the minimum is taken over the filtered frame, not over
# df_output_addresses_tag_original -- confirm that is the intended scope.
df_output_addresses_tag = (
    df_output_addresses_tag.alias("a")
    .join(first_show.alias('b'), ['address'], 'leftouter')
    .select(
        f.col('b.show'),
        f.col('a.cnt_in'),
        f.col('a.cnt_out'),
        f.col('a.user'),
        f.col('a.address'),
        f.col('a.amount'),
        f.col('a.coinbase'),
        f.col('a.tx_id'),
        f.col('a.vout_idx'),
        f.col('a.height'),
    )
    .withColumn(
        "first",
        f.when(f.col("show") == f.col("height"), 1).otherwise(0),
    )
    .drop("show")
)

In [36]:
# One `user` value per transaction, taken from its labelled input rows.
input_tx = (
    df_input_addresses_tag_user
    .groupby("tx_id")
    .agg(f.first("user").alias("inuser"))
)

# Attach that value to the output rows as `inuser` and drop the rows of
# transactions for which none is available.
df_output_addresses_tag = (
    df_output_addresses_tag.alias("a")
    .join(input_tx.alias('b'), ['tx_id'], 'leftouter')
    .select(
        f.col('b.inuser'),
        f.col('a.first'),
        f.col('a.cnt_in'),
        f.col('a.cnt_out'),
        f.col('a.user'),
        f.col('a.address'),
        f.col('a.amount'),
        f.col('a.coinbase'),
        f.col('a.tx_id'),
        f.col('a.vout_idx'),
        f.col('a.height'),
    )
    .where(f.col("inuser").isNotNull())
)

In [37]:
# processing = 1 on output rows whose user equals the transaction's inuser.
df_output_addresses_tag = df_output_addresses_tag.withColumn(
    "processing",
    f.when(f.col("inuser") == f.col("user"), 1).otherwise(0),
)

# Per transaction: keep those where no output matches the input user
# (sum < 1) and exactly one output row carries first = 1.
df_processing = (
    df_output_addresses_tag
    .groupby("tx_id")
    .agg(
        f.first("inuser").alias("label"),
        f.sum("processing").alias("sum"),
        f.sum("first").alias("first_sum"),
    )
    .where((f.col("sum") < 1) & (f.col("first_sum") == 1))
)

# Only the first = 1 rows of those transactions receive the input user as
# `label`; it then fills `user` where that is still null. The helper columns
# `label` and `processing` are not kept.
df_output_addresses_tag = (
    df_output_addresses_tag.alias('a')
    .join(
        df_processing.alias('b'),
        (f.col("a.tx_id") == f.col("b.tx_id")) & (f.col("first") == 1),
        "leftouter",
    )
    .select(
        f.col('b.label'),
        f.col('a.first'),
        f.col('a.cnt_in'),
        f.col('a.cnt_out'),
        f.col('a.user'),
        f.col('a.address'),
        f.col('a.amount'),
        f.col('a.coinbase'),
        f.col('a.tx_id'),
        f.col('a.vout_idx'),
        f.col('a.height'),
    )
    .withColumn(
        "user",
        f.when(f.col("user").isNull(), f.col("label")).otherwise(f.col("user")),
    )
    .drop(f.col('label'))
)

# Final output set: rows that ended up with a user.
df_output_addresses_tag_reduced = df_output_addresses_tag.where(
    f.col("user").isNotNull()
)

In [None]:
#############################################################
#           STORE INPUT ADDRESSES FEATURE IN HDFS
#############################################################
# Write the labelled inputs as CSV with a header row under
# part6/df_input_addresses_tag.
(
    df_input_addresses_tag_reduced.write
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .save(path + "part6/df_input_addresses_tag")
)

In [None]:
#############################################################
#           STORE OUTPUT ADDRESSES FEATURE IN HDFS
#############################################################
# Write the labelled outputs as CSV with a header row under
# part6/df_output_addresses_tag.
(
    df_output_addresses_tag_reduced.write
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .save(path + "part6/df_output_addresses_tag")
)