### Import settings

In [1]:
import os

# Export the submit arguments before any Spark context is created.
os.environ.update(PYSPARK_SUBMIT_ARGS='pyspark-shell')

In [2]:
import findspark
findspark.init('/media/datasets/spark-2.4.0')
from pyspark import SparkContext,SparkConf

# Build the configuration step by step: cluster master and application
# name first, then the plain key/value settings in one call.
conf = SparkConf()
conf.setMaster("spark://10.200.5.25:7077")
conf.setAppName("newanalysis")
conf.setAll([
    ("spark.driver.host", "10.200.5.25"),
    ("spark.executor.memory", "25g"),
    ('spark.driver.memory', '30G'),
])

# Single SparkContext shared by every cell of the notebook.
sc = SparkContext(conf=conf)

In [3]:
import pyspark
import binascii
from pyspark.sql import SQLContext
from functools import reduce
#import pygraphviz
import pyspark.sql.functions as f
from IPython.display import Image
#from networkx.drawing.nx_pydot import write_dot
# SQL entry point used by every read in this notebook; wraps the SparkContext created above.
sqlContext = SQLContext(sparkContext=sc)

In [4]:
# `path` is the HDFS root every dataset is read from; `pathDir` is the
# sub-directory (relative to `path`) that receives the parquet outputs.
path, pathDir = "hdfs://10.200.5.25:9001/user/titanium/", "analysis"

In [5]:
# Class Definition
# 1: exchange
# 2: gambling
# 3: market
# 4: service
# 5: pool

In [6]:
#############################################################
#            LOAD LABEL
#############################################################
# One sub-directory of CSV files per class. The 1-based position of a
# directory in this list is the numeric class id written into the
# temporary "xxx" column, so the order of the entries matters.
dirt = ["exchange", "gambling", "market", "service", "pool"]

# Load every class directory and tag its rows with the class id. The list
# is sized by `dirt` itself, so adding or removing a class only requires
# editing the list above.
df_label_load_part = [
    sqlContext.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(path + "walletexp_data/" + class_dir + "/*.csv")
    .withColumn("xxx", f.lit(class_id))
    for class_id, class_dir in enumerate(dirt, start=1)
]

# Stack all per-class frames into one. DataFrame.union matches columns by
# position, so every class directory is expected to share the same CSV
# layout - NOTE(review): confirm against the data.
df_label_load = reduce(lambda stacked, part: stacked.union(part), df_label_load_part)


In [7]:
# Collapse the labels to one row per address: keep the first "label" value
# of each group and expose the temporary "xxx" class id under the name "class".
df_label_load = (
    df_label_load
    .groupBy("address")
    .agg(
        f.first("label").alias("label"),
        f.first("xxx").alias("class")
    )
)

In [8]:
#############################################################
#           REMOVE DUPLICATE LABEL DATAFRAME
#############################################################
# Drop rows that are identical in every column; the result is the label
# table used by all later cells.
df_label = df_label_load.drop_duplicates()

In [None]:
#############################################################
#            LOAD TRANSACTIONS
#############################################################
# Transaction dumps are stored as CSV under transaction/part<N>.
# Batches noted by the original author: 1-4, 4, 5, 6, 7, 8, 9-11.
# NOTE(review): the loop this replaces used range(9, 11), i.e. parts 9 and
# 10 only - confirm whether a part11 exists and should be included.
#
# Only the batch whose results are saved by this cell is loaded. The
# results below are written as "...tag9.parquet", and the later cells
# build batch 9 from part9 + part10, so the same two parts are used here.
# To process another batch, change this list AND the output file names at
# the end of the cell together.
transaction_parts = [9, 10]

df_transactions_parts = [
    sqlContext.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(path + "transaction/part" + str(part_no))
    for part_no in transaction_parts
]

# Stack the parts in list order (DataFrame.union matches columns by position).
df_transactions = reduce(lambda stacked, part: stacked.union(part), df_transactions_parts)

In [None]:
def _cols(*names):
    """Return f.col(name) for each given (optionally alias-qualified) column name."""
    return [f.col(name) for name in names]


##############################################################
#           OUTPUT ADDRESS DATAFRAME CREATION
#############################################################
# Output-side columns of the transaction table ...
df_output_addresses = df_transactions.select(
    'address', 'vout_idx', 'amount', 'tx_id',
    'tx_hash', 'size', 'coinbase', 'height'
)

#############################################################
#           REMOVE DUPLICATE OUTPUT DATAFRAME
#############################################################
# ... keeping one row per (address, vout_idx, amount, tx_id).
df_output_addresses = df_output_addresses.dropDuplicates(
    ['address', 'vout_idx', 'amount', 'tx_id']
)

#############################################################
#   INPUT ADDRESS DATAFRAME CREATION AND REMOVE DUPLICATE
#############################################################
# One row per distinct (vin_txid, vin_vout) reference, left-joined to the
# output row it points at (tx_id / vout_idx). Address, amount, tx_hash,
# coinbase, height and size are taken from that output row and are null
# when no such row exists in df_output_addresses.
spends_output = (
    (f.col('a.vin_txid') == f.col('b.tx_id'))
    & (f.col('a.vin_vout') == f.col('b.vout_idx'))
)
df_input_addresses = (
    df_transactions
    .dropDuplicates(["vin_txid", "vin_vout"])
    .alias('a')
    .join(df_output_addresses.alias('b'), spends_output, "leftouter")
    .select(
        f.col('b.address'),
        f.col('a.vin_vout'),
        f.col('b.amount'),
        f.col('a.tx_id').alias('tx_id'),
        *_cols('b.tx_hash', 'b.coinbase', 'b.height', 'b.size')
    )
)

#############################################################
#           JOIN ADDRESSES IN/OUT WITH LABEL DATAFRAME
#############################################################
# Left joins: rows whose address has no label keep null class/user.
label_columns = [f.col('b.class').alias("class"), f.col('b.label').alias("user")]

df_output_addresses_tag = (
    df_output_addresses.alias('a')
    .join(df_label.alias('b'), (f.col('a.address') == f.col('b.address')), "leftouter")
    .select(*(
        label_columns
        + _cols('a.address', 'a.amount', 'a.coinbase', 'a.tx_id', 'a.vout_idx', 'a.height')
    ))
)
df_input_addresses_tag = (
    df_input_addresses.alias('a')
    .join(df_label.alias('b'), (f.col('b.address') == f.col('a.address')), "leftouter")
    .select(*(
        label_columns
        + _cols('a.address', 'a.amount', 'a.coinbase', 'a.tx_id')
        + [f.col('a.vin_vout').alias('vout_idx')]
    ))
)

# Run both pipelines once.
df_output_addresses_tag.count()
df_input_addresses_tag.count()

########################################################################
# Per-transaction counts of non-null addresses on the input side (cnt_in)
# and on the output side (cnt_out), combined into one row per output tx_id.
tx_input_schema = (
    df_input_addresses_tag
    .groupby("tx_id")
    .agg(f.count("address").alias("cnt_in"))
)
df_output_addresses_tag_copy = (
    df_output_addresses_tag
    .groupby("tx_id")
    .agg(f.count("address").alias("cnt_out"))
)
df_output_addresses_tag_copy = (
    df_output_addresses_tag_copy.alias('a')
    .join(tx_input_schema.alias('b'), ['tx_id'], 'leftouter')
    .select(*_cols('b.cnt_in', 'a.cnt_out', 'a.tx_id'))
)

df_output_addresses_tag_copy.count()

# Attach the two counts to every tagged output row.
df_output_addresses_tag = (
    df_output_addresses_tag.alias("a")
    .join(df_output_addresses_tag_copy.alias('b'), ['tx_id'], 'leftouter')
    .select(*_cols(
        'b.cnt_in', 'b.cnt_out', 'a.user', 'a.class', 'a.address',
        'a.amount', 'a.coinbase', 'a.tx_id', 'a.vout_idx', 'a.height'
    ))
)

# "show" = smallest height at which each address occurs in this frame.
first_show = (
    df_output_addresses_tag
    .groupby("address")
    .agg(f.min("height").alias("show"))
)

df_output_addresses_tag = (
    df_output_addresses_tag.alias("a")
    .join(first_show.alias('b'), ['address'], 'leftouter')
    .select(*_cols(
        'b.show', 'a.cnt_in', 'a.cnt_out', 'a.class', 'a.user', 'a.address',
        'a.amount', 'a.coinbase', 'a.tx_id', 'a.vout_idx', 'a.height'
    ))
    .cache()
)

# Materialise the cached frame.
df_output_addresses_tag.count()

# first = 1 on rows whose height equals the address' minimum height, else 0;
# the helper column "show" is dropped afterwards.
is_first_height = f.col("show") == f.col("height")
df_output_addresses_tag = (
    df_output_addresses_tag
    .withColumn("first", f.when(is_first_height, 1).otherwise(0))
    .drop("show")
)

# Persist both tagged frames under the analysis directory.
output_dir = path + pathDir
df_output_addresses_tag.write.parquet(output_dir + "/df_output_addresses_tag9.parquet")
df_input_addresses_tag.write.parquet(output_dir + "/df_input_addresses_tag9.parquet")

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

In [None]:
# Reload the tagged input addresses saved for batch 1.
# Further batches ("...tag2.parquet", ...) can be read the same way and
# appended with union() when several batches must be inspected together.
input_tag_location = path + pathDir + "/df_input_addresses_tag1.parquet"
df_input_addresses_tag = sqlContext.read.parquet(input_tag_location)

In [None]:
# Reload the tagged output addresses saved for batch 1.
# Further batches ("...tag2.parquet", ...) can be read the same way and
# appended with union() when several batches must be inspected together.
output_tag_location = path + pathDir + "/df_output_addresses_tag1.parquet"
df_output_addresses_tag = sqlContext.read.parquet(output_tag_location)

In [None]:
# Show the column layout, then fetch the smallest and largest block height
# present in the loaded batch (each as a one-field Row).
df_output_addresses_tag.printSchema()
row1 = df_output_addresses_tag.groupBy().agg({"height": "min"}).collect()[0]
row2 = df_output_addresses_tag.groupBy().agg({"height": "max"}).collect()[0]

In [None]:
# Print the minimum-height row first, then the maximum-height row.
for height_bound in (row1, row2):
    print(height_bound)

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

In [None]:
# Class id of every labelled user. It does not depend on the batch, so it
# is defined once instead of on every iteration.
label_class = df_label.groupby("label").agg(f.first("class").alias("class"))

# For every batch i, read the tagged input/output address tables and write
# two feature tables: address_feature<i>.parquet and entity_feature<i>.parquet.
#
# No intermediate count() actions are issued: none of these frames is
# cached, so such an action would only recompute the whole lineage before
# the write does it again.
for i in range(3,10):
    df_output_addresses_tag = sqlContext.read.parquet(path+pathDir+"/df_output_addresses_tag"+str(i)+".parquet")
    df_input_addresses_tag = sqlContext.read.parquet(path+pathDir+"/df_input_addresses_tag"+str(i)+".parquet")

    # Rows with a null user get the placeholder user "Unknow".
    df_output_addresses_tag_reduced = df_output_addresses_tag.fillna("Unknow", subset=["user"])
    df_input_addresses_tag_reduced = df_input_addresses_tag.fillna("Unknow", subset=["user"])

    #############################################################
    #   Aggregate input and output labeled dataframe for distinct address
    #############################################################
    # Per address: number of rows, summed amount, and the first user/class seen.
    per_address_aggs = [
        f.count('address').alias("count"),
        f.sum('amount').alias("totamount"),
        f.first(f.col("user")).alias("user"),
        f.first(f.col("class")).alias("class"),
    ]
    # The summed amount is rounded and divided by 100000000
    # (presumably satoshi -> BTC; confirm the unit of "amount").
    df_output_addresses_tag_grpby_addr = (
        df_output_addresses_tag_reduced
        .groupby("address")
        .agg(*per_address_aggs)
        .withColumn("totamount", f.round(f.col("totamount")) / 100000000)
    )
    df_input_addresses_tag_grpby_addr = (
        df_input_addresses_tag_reduced
        .groupby("address")
        .agg(*per_address_aggs)
        .withColumn("totamount", f.round(f.col("totamount")) / 100000000)
    )

    #############################################################
    # Aggregate input and output labeled dataframe for distinct user
    #############################################################
    # Per user: number of addresses and total amount received / sent.
    # The placeholder user "Unknow" is excluded.
    df_output_addresses_tag_grpby_user = (
        df_output_addresses_tag_grpby_addr
        .groupby("user")
        .agg(f.count('address').alias("naddress"), f.sum('totamount').alias("balancerecv"))
        .where(f.col("user") != "Unknow")
    )
    df_input_addresses_tag_grpby_user = (
        df_input_addresses_tag_grpby_addr
        .groupby("user")
        .agg(f.count('address').alias("naddress"), f.sum('totamount').alias("balancesend"))
        .where(f.col("user") != "Unknow")
    )

    #############################################################
    #           BALANCE ESTIMATION
    #############################################################
    # Users seen on only one side get a zero-valued placeholder row on the
    # other side, so both per-user tables cover the same set of users.
    # The per-user tables already hold one row per user, so their "user"
    # column is the list of distinct users.
    list_unique_input_user = df_input_addresses_tag_grpby_user.select("user")
    list_unique_output_user = df_output_addresses_tag_grpby_user.select("user")

    # Senders that never received -> placeholder rows for the output table.
    # The placeholder columns carry the same names and order as the table
    # they are appended to (union matches columns by position).
    user_out_toadd = (
        list_unique_input_user.alias("a")
        .join(list_unique_output_user.alias("b"), f.col("a.user") == f.col("b.user"), "left_anti")
        .withColumn("naddress", f.lit(0))
        .withColumn("balancerecv", f.lit(0))
    )
    # Receivers that never sent -> placeholder rows for the input table.
    user_in_toadd = (
        list_unique_output_user.alias("a")
        .join(list_unique_input_user.alias("b"), f.col("a.user") == f.col("b.user"), "left_anti")
        .withColumn("naddress", f.lit(0))
        .withColumn("balancesend", f.lit(0))
    )

    df_input_addresses_tag_grpby_user_filled = df_input_addresses_tag_grpby_user.union(user_in_toadd)
    df_output_addresses_tag_grpby_user_filled = df_output_addresses_tag_grpby_user.union(user_out_toadd)

    # balance = received - sent per user; magnitudes below 1e-8 are snapped to 0.
    df_user_balance = (
        df_output_addresses_tag_grpby_user_filled.alias('a')
        .join(df_input_addresses_tag_grpby_user_filled.alias('b'), "user", "leftouter")
        .select(f.col('a.user'), f.col('a.balancerecv'), f.col('b.balancesend'))
        .fillna(0, subset=["balancerecv", "balancesend"])
        .withColumn("balance", f.col("balancerecv") - f.col("balancesend"))
        .sort(f.col("balance").desc())
    )
    df_user_balance = df_user_balance.withColumn(
        "balance",
        f.when(f.abs(f.col("balance")) < 0.00000001, 0).otherwise(f.col("balance"))
    )

    #############################################################
    #           COMPUTE ADDRESS FEATURE
    #############################################################
    # Full outer join of the per-address tables; address/user/class come
    # from the output side when present, otherwise from the input side.
    # Numeric columns missing on one side are then set to 0.
    address_feature = (
        df_output_addresses_tag_grpby_addr.alias("a")
        .join(df_input_addresses_tag_grpby_addr.alias("b"), f.col("a.address") == f.col("b.address"), "outer")
        .select(
            f.coalesce(f.col('a.class'), f.col('b.class')).alias("class"),
            f.coalesce(f.col('a.address'), f.col('b.address')).alias("address"),
            f.coalesce(f.col('a.user'), f.col('b.user')).alias("user"),
            f.col('a.count').alias("count_rec"),
            f.col('a.totamount').alias("totamount_rec"),
            f.col('b.count').alias("count_sent"),
            f.col('b.totamount').alias("totamount_sent")
        )
        .fillna(0)
    )
    address_feature = address_feature.withColumn("balance", f.col("totamount_rec") - f.col("totamount_sent"))
    # unique = 1 when the address occurs fewer than twice on each side.
    address_feature = address_feature.withColumn(
        "unique",
        f.when((f.col("count_rec") < 2) & (f.col("count_sent") < 2), 1).otherwise(0)
    )

    # sibling = number of receiving addresses of the same user (0 if none).
    address_feature = (
        address_feature.alias("a")
        .join(df_output_addresses_tag_grpby_user.alias("b"), f.col("a.user") == f.col("b.user"), "leftouter")
        .select(
            f.col('a.class'), f.col('a.address'), f.col('a.user'),
            f.col('a.count_rec'), f.col("totamount_rec"),
            f.col('a.count_sent'), f.col('a.totamount_sent'),
            f.col('a.balance'), f.col('a.unique'),
            f.col('b.naddress').alias('sibling')
        )
        .fillna(0)
        .where(f.col("user") != "Unknow")
    )

    address_feature.write.parquet(path+pathDir+"/address_feature"+str(i)+".parquet")

    #############################################################
    #           COMPUTE ENTITY FEATURE
    #############################################################
    # Distinct addresses and distinct transactions per user, per side.
    recv_stats = df_output_addresses_tag_reduced.groupBy("user").agg(
        f.countDistinct("address").alias("add_recv"),
        f.countDistinct("tx_id").alias("count_recv")
    )
    sent_stats = df_input_addresses_tag_reduced.groupBy("user").agg(
        f.countDistinct("address").alias("add_sent"),
        f.countDistinct("tx_id").alias("count_sent")
    )

    entity_feature = (
        df_user_balance.alias("a")
        .join(recv_stats.alias("b"), f.col("a.user") == f.col("b.user"), "leftouter")
        .select(
            f.col('a.user'), f.col('a.balancerecv'), f.col('a.balancesend'),
            f.col('a.balance'), f.col('b.count_recv'), f.col('b.add_recv')
        )
    )
    entity_feature = (
        entity_feature.alias("a")
        .join(sent_stats.alias("b"), f.col("a.user") == f.col("b.user"), "leftouter")
        .select(
            f.col('a.user'), f.col('a.balancerecv'), f.col('a.balancesend'),
            f.col('a.balance'), f.col('a.count_recv'), f.col('b.count_sent'),
            f.col('a.add_recv'), f.col('b.add_sent')
        )
    )

    # A user missing on one side has no counters there. All four counters
    # are set to 0 in that case, so the address counts are treated the same
    # way as the transaction counts instead of being left null.
    entity_feature = entity_feature.fillna(0, subset=["count_recv", "count_sent", "add_recv", "add_sent"])
    entity_feature = entity_feature.where(f.col("user") != "Unknow")

    # Attach the class id of each user; it is stored in a column named "label".
    entity_feature = (
        entity_feature.alias('a')
        .join(label_class.alias('b'), f.col("a.user") == f.col("b.label"), "leftouter")
        .select("b.class", "a.user", "a.balancerecv", "a.balancesend", "a.balance",
                "a.count_recv", "a.count_sent", 'a.add_recv', 'a.add_sent')
        .withColumnRenamed("class", "label")
    )

    entity_feature.write.parquet(path+pathDir+"/entity_feature"+str(i)+".parquet")

In [None]:
def _read_transactions_part(part_no):
    """Load transaction/part<part_no> as a CSV DataFrame (header row, inferred schema)."""
    return (
        sqlContext.read
        .format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(path + "transaction/part" + str(part_no))
    )


def _match_ab(*names):
    """AND together `a.<name> == b.<name>` for every given column name."""
    return reduce(
        lambda combined, extra: combined & extra,
        [f.col("a." + name) == f.col("b." + name) for name in names]
    )


for j in range(9,10):
    if j != 9:
        df_transactions = _read_transactions_part(j)
    else:
        # Batch 9 is the union of part9 and part10.
        for i in range(9, 11):
            df_transactions_part = _read_transactions_part(i)
            if i == 9:
                df_transactions = df_transactions_part
            else:
                df_transactions = df_transactions.union(df_transactions_part)

    # Base table: every transaction row with the label ("outuser") and class
    # of its address; both stay null for unlabelled addresses.
    df_transactions_general = (
        df_transactions.alias('a')
        .join(df_label.alias('b'), f.col('a.address') == f.col('b.address'), "leftouter")
        .select(
            f.col("a.height"), f.col("a.coinbase"), f.col("a.timestamp"),
            f.col("a.tx_id"), f.col("a.tx_number"), f.col("a.address"),
            f.col("a.amount"), f.col("a.vout_idx"), f.col("a.vin_txid"),
            f.col("a.vin_vout"),
            f.col("b.label").alias("outuser"),
            f.col("b.class").alias("class")
        )
        .cache()
    )
    df_transactions_general.count()

    # One row per (tx_id, address, vout_idx) with its amount and owner; this
    # is what an input reference (vin_txid, vin_vout) is resolved against.
    df_transactions_general_join_amount = (
        df_transactions_general
        .groupBy("tx_id", "address", "vout_idx")
        .agg(
            f.first("amount").alias("unique_amount"),
            f.first("outuser").alias("inuser")
        )
    )

    # Resolve each row's input reference to the amount, address and user of
    # the referenced output.
    references_output = (
        (f.col('a.vin_txid') == f.col('b.tx_id'))
        & (f.col('a.vin_vout') == f.col('b.vout_idx'))
    )
    df_transactions_general_information = (
        df_transactions_general.alias('a')
        .join(df_transactions_general_join_amount.alias('b'), references_output, "leftouter")
        .select(
            f.col("a.height"), f.col("a.coinbase"), f.col("a.timestamp"),
            f.col("a.tx_id"), f.col("a.amount"), f.col("a.outuser"),
            f.col("a.class"), f.col("a.address"), f.col("a.vin_txid"),
            f.col("a.vin_vout"),
            f.col("b.unique_amount").alias("amount_sent"),
            f.col("b.address").alias("address_sent"),
            f.col("b.inuser").alias("inuser_old")
        )
    )

    # Keep only rows whose receiving user is labelled.
    df_transactions_general_information = df_transactions_general_information.filter(
        f.col("outuser").isNotNull()
    )

    # A missing sending user is replaced by "Coinbase" whenever the row has
    # an address; otherwise the resolved user is kept.
    sender_is_missing = f.col("inuser_old").isNull() & f.col("address").isNotNull()
    df_transactions_general_information = df_transactions_general_information.withColumn(
        "inuser",
        f.when(sender_is_missing, "Coinbase").otherwise(f.col("inuser_old"))
    )
    df_transactions_general_information = (
        df_transactions_general_information
        .drop(f.col("inuser_old"))
        .cache()
    )
    df_transactions_general_information.count()

    # Number of distinct transactions per (outuser, inuser) pair.
    df_inuser_outuser_numtx = (
        df_transactions_general_information
        .groupby("outuser", "inuser")
        .agg(f.countDistinct("tx_id"))
    )

    #############################################################
    #           COMPUTE MOTIFS1 FEATURE
    #############################################################
    pair_tx_key = ["outuser", "inuser", "tx_id"]

    # One row per (outuser, inuser, tx_id) with the number of distinct
    # receiving addresses.
    motifs_1 = (
        df_transactions_general_information
        .groupBy("outuser", "inuser", 'tx_id')
        .agg(
            f.countDistinct("address").alias("address_recv_dist"),
            f.first("class").alias("class")
        )
        .select("class", "outuser", "inuser", 'tx_id', "address_recv_dist")
        .cache()
    )

    # Received amount per (outuser, inuser, tx_id): each receiving address
    # is counted once, then the amounts are summed.
    amount_out_processing = (
        df_transactions_general_information
        .groupBy("outuser", "inuser", "tx_id", "address")
        .agg(f.first("amount").alias("amount_recv"))
        .groupBy(*pair_tx_key)
        .agg(f.sum("amount_recv").alias("amount_recv"))
    )
    amount_out_processing = amount_out_processing.fillna("Unknow")

    # Sent side per (outuser, inuser, tx_id): number of input references,
    # summed sent amount and number of distinct sending addresses.
    amount_in_processing = (
        df_transactions_general_information
        .groupBy(*pair_tx_key)
        .agg(
            f.count("vin_txid").alias("tx_sent"),
            f.sum("amount_sent").alias("amount_sent"),
            f.countDistinct("address_sent").alias("address_sent")
        )
        .groupBy(*pair_tx_key)
        .agg(
            f.sum("tx_sent").alias("tx_sent"),
            f.sum("amount_sent").alias("amount_sent"),
            f.sum("address_sent").alias("address_sent_dist")
        )
    )
    amount_in_processing = amount_in_processing.fillna("Unknow")

    # Total received amount per transaction.
    amount_out_processing_tx = (
        amount_out_processing
        .groupBy("tx_id")
        .agg(f.sum("amount_recv").alias("total_recv_amount"))
    )
    # Total sent amount per transaction, counting every referenced output once.
    amount_in_processing_tx = (
        df_transactions_general_information
        .groupBy("tx_id", "vin_txid", "vin_vout")
        .agg(f.first("amount_sent").alias("amount_sent"))
        .groupBy("tx_id")
        .agg(f.sum("amount_sent").alias("total_sent_amount"))
    )
    # Same fill as above, applied a second time.
    amount_out_processing = amount_out_processing.fillna("Unknow")
    amount_out_processing.count()

    # fees = total sent - total received, for transactions present on both sides.
    fee_tx = (
        amount_out_processing_tx.alias('a')
        .join(amount_in_processing_tx.alias('b'), _match_ab("tx_id"))
        .select("a.tx_id", "total_recv_amount", "total_sent_amount")
    )
    fee_tx = fee_tx.withColumn("fees", f.col("total_sent_amount") - f.col("total_recv_amount"))

    # Assemble the motifs-1 table with a chain of inner joins.
    key_columns = ["a.class", "a.outuser", "a.inuser", "a.tx_id"]

    # + received amount
    motifs_1 = (
        motifs_1.alias("a")
        .join(amount_out_processing.alias("b"), _match_ab(*pair_tx_key))
        .select(*(key_columns + ["address_recv_dist", "amount_recv"]))
    )
    # + sent-side figures
    motifs_1 = (
        motifs_1.alias("a")
        .join(amount_in_processing.alias("b"), _match_ab(*pair_tx_key))
        .select(*(key_columns + [
            "a.address_recv_dist", "a.amount_recv",
            "b.tx_sent", "b.address_sent_dist", "amount_sent",
        ]))
    )
    # + number of distinct transactions of each (outuser, inuser) pair
    tx_per_pair = (
        motifs_1
        .groupBy("outuser", "inuser")
        .agg(f.countDistinct("tx_id").alias("tx_recv_tot"))
        .fillna("Unknow")
    )
    motifs_1 = (
        motifs_1.alias('a')
        .join(tx_per_pair.alias('b'), _match_ab("outuser", "inuser"))
        .select(*(key_columns + [
            "a.address_recv_dist", "a.amount_recv", "a.tx_sent",
            "a.address_sent_dist", "a.amount_sent", "tx_recv_tot",
        ]))
    )
    # + fee of the transaction
    motifs_1 = (
        motifs_1.alias('a')
        .join(fee_tx.alias('b'), (_match_ab("tx_id")))
        .select(*(key_columns + [
            "a.address_recv_dist", "a.amount_recv", "a.tx_sent",
            "a.address_sent_dist", "a.amount_sent", "a.tx_recv_tot", "b.fees",
        ]))
    )

    # loop_in_out = 1 when sender and receiver are the same user;
    # direct_in_out is its complement.
    same_user = f.col("outuser") == f.col("inuser")
    motifs_1 = motifs_1.withColumn("loop_in_out", f.when(same_user, 1).otherwise(0))
    motifs_1 = motifs_1.withColumn("direct_in_out", f.when(same_user, 0).otherwise(1)).cache()

    # Null amounts/fees become 0.
    motifs_1 = motifs_1.fillna(0, subset=['amount_sent', 'fees'])
    motifs_1.count()

    motifs_1.write.parquet(path+pathDir+"/motifs1_"+str(j)+".parquet")

In [11]:
# Build the 2-hop motif table (outuser <- tx <- miduser <- tx <- inuser) for
# batch j and write it to motifs2_<j>.parquet. Batch 9 is the union of the
# part9 and part10 transaction dumps; any other batch is the single dump part<j>.
for j in range(9,10):
    if(j==9):
        for i in range(9,11):
            df_transactions_part = sqlContext.read\
                     .format("csv")\
                     .option("header", "true")\
                     .option("inferSchema", "true")\
                     .load(path+"transaction/part"+str(i))
            if(i==9):
                df_transactions=df_transactions_part
            else:
                df_transactions=df_transactions.union(df_transactions_part)
    else:
        df_transactions = sqlContext.read\
                     .format("csv")\
                     .option("header", "true")\
                     .option("inferSchema", "true")\
                     .load(path+"transaction/part"+str(j))

    #############################################################
    #           COMPUTE MOTIFS2 FEATURE
    #############################################################
    # Base table: every transaction row with the label ("outuser") and class
    # of its address; both are null for unlabelled addresses (left outer join).
    df_transactions_general = df_transactions.alias('a').join(df_label.alias('b'),f.col('a.address')==f.col('b.address'),"leftouter")\
    .select(f.col("a.height"),f.col("a.coinbase"),f.col("a.timestamp"),f.col("a.tx_id"),f.col("a.tx_number"),f.col("a.address"),f.col("a.amount"),f.col("a.vout_idx"),f.col("a.vin_txid"),f.col("a.vin_vout"),f.col("b.label").alias("outuser"),f.col("b.class").alias("class")).cache()
    # Materialise the cached base table.
    df_transactions_general.count()
    
    # One row per (tx_id, address, vout_idx): the owner of that output
    # ("miduser") plus the first input reference (vin_txid, vin_vout) seen for
    # it. Null users are NOT removed here; they are filtered further down.
    df_transactions_general_join_motifs2 = df_transactions_general.groupBy("tx_id","address","vout_idx")\
    .agg(f.first("outuser").alias("miduser"),f.first("vin_txid").alias("vin_txid"),f.first("vin_vout").alias("vin_vout"))


    # First hop: resolve each row's input reference to the output it points
    # at. That gives the user one step back (miduser) and that output's own
    # input reference (tx_id_in, vin_vout_idx_in) for the next hop.
    df_transactions_general_information2 = df_transactions_general.alias('a').join(df_transactions_general_join_motifs2.alias('b'),(f.col('a.vin_txid')==f.col('b.tx_id'))&(f.col('a.vin_vout')==f.col('b.vout_idx')),"leftouter")\
    .select(f.col("a.tx_id"),f.col("a.outuser"),f.col("a.address"),f.col("a.vin_txid").alias("tx_id_mid"),f.col("a.vin_vout").alias("vin_vout_idx_mid"),f.col("b.vin_txid").alias("tx_id_in"),f.col("b.vin_vout").alias("vin_vout_idx_in"),f.col("b.miduser"))

    # Second hop: the same lookup applied to (tx_id_in, vin_vout_idx_in) gives
    # the user two steps back, kept for now as "inuser_old".
    df_transactions_general_info_deep = df_transactions_general_information2.alias('a').join(df_transactions_general_join_motifs2.alias('b'),(f.col('a.tx_id_in')==f.col('b.tx_id'))&(f.col('a.vin_vout_idx_in')==f.col('b.vout_idx')),"leftouter")\
    .select(f.col("a.tx_id"),f.col("a.outuser"),f.col("a.address"),f.col("a.tx_id_mid"),f.col("a.miduser"),f.col("b.miduser").alias("inuser_old"))

    # Keep only chains whose receiving user and middle user are both labelled.
    df_transactions_general_info_deep = df_transactions_general_info_deep.filter(f.col("outuser").isNotNull())
    df_transactions_general_info_deep = df_transactions_general_info_deep.filter(f.col("miduser").isNotNull())

    # A missing first user is replaced by "Coinbase" whenever the row has an
    # address; otherwise the resolved user is kept.
    df_transactions_general_info_deep = df_transactions_general_info_deep.withColumn("inuser",f.when((f.col("inuser_old").isNull())&(f.col("address").isNotNull()),"Coinbase").otherwise(f.col("inuser_old")))
    df_transactions_general_info_deep = df_transactions_general_info_deep.drop(f.col("inuser_old"))

    # One row per distinct chain outuser -> tx_id -> miduser -> tx_id_mid -> inuser.
    # The count aggregate is only there to collapse duplicates; it is not selected.
    motifs_2 = df_transactions_general_info_deep.groupBy("outuser","miduser","inuser","tx_id","tx_id_mid")\
    .agg(f.count("address"))\
    .select("outuser","tx_id","miduser","tx_id_mid","inuser")

    # loop_x_y = 1 when users x and y are the same, else 0.
    motifs_2 = motifs_2.withColumn("loop_mid_out", f.when(f.col("outuser")==f.col("miduser"),1).otherwise(0))
    motifs_2 = motifs_2.withColumn("loop_in_mid", f.when(f.col("miduser")==f.col("inuser"),1).otherwise(0))
    motifs_2 = motifs_2.withColumn("loop_in_out", f.when(f.col("outuser")==f.col("inuser"),1).otherwise(0))

    # direct_x_y is the complement of loop_x_y.
    motifs_2 = motifs_2.withColumn("direct_mid_out", f.when(f.col("outuser")==f.col("miduser"),0).otherwise(1))
    motifs_2 = motifs_2.withColumn("direct_in_mid", f.when(f.col("miduser")==f.col("inuser"),0).otherwise(1))
    motifs_2 = motifs_2.withColumn("direct_in_out", f.when(f.col("outuser")==f.col("inuser"),0).otherwise(1))

    # Enrich the chains with the motifs-1 figures. Every motifs1_<k>.parquet
    # for k = 3..j is read and stacked, so all of those files must already
    # exist (they are written by the motifs-1 cell).
    for k in range(3,j+1):
        motifs_1_part1=sqlContext.read.parquet(path+pathDir+"/motifs1_"+str(k)+".parquet")
        if(k==3):
            motifs_1=motifs_1_part1
        else:
            motifs_1=motifs_1.union(motifs_1_part1)
    
    # toDF assigns column names by position; the names given here are the same
    # ones motifs_2 already has, in the same order.
    # NOTE(review): presumably done to work on a renamed copy before the joins
    # below - confirm whether it is still needed.
    motifs_2_cloned = motifs_2.toDF("outuser","tx_id","miduser","tx_id_mid","inuser","loop_mid_out","loop_in_mid","loop_in_out","direct_mid_out","direct_in_mid","direct_in_out")

    # Hop miduser -> outuser: attach the motifs-1 row whose (outuser, inuser,
    # tx_id) equals this chain's (outuser, miduser, tx_id). Its columns get
    # the suffixes *_to_out / *_from_mid and its fee becomes "fee2". Chains
    # with no match keep nulls (left outer join).
    motifs_2_cloned= motifs_2_cloned.alias("a").join(motifs_1.alias("b"),(f.col("a.outuser")==f.col("b.outuser"))&(f.col("a.miduser")==f.col("b.inuser"))&(f.col("a.tx_id")==f.col("b.tx_id")),"leftouter")\
    .select("b.class","a.outuser","a.tx_id","b.address_recv_dist","b.amount_recv","b.fees","b.tx_sent","b.address_sent_dist","b.amount_sent","a.miduser","a.tx_id_mid","a.inuser","a.loop_mid_out","a.loop_in_mid","a.loop_in_out","a.direct_mid_out","a.direct_in_mid","a.direct_in_out")\
    .withColumnRenamed("fees","fee2")\
    .withColumnRenamed("address_recv_dist","address_recv_dist_to_out")\
    .withColumnRenamed("amount_recv","amount_recv_to_out")\
    .withColumnRenamed("tx_sent","tx_sent_from_mid")\
    .withColumnRenamed("address_sent_dist","address_sent_from_mid")\
    .withColumnRenamed("amount_sent","amount_sent_from_mid").cache()

    # Materialise the cached intermediate result before the second join.
    motifs_2_cloned.count()


    # Hop inuser -> miduser: attach the motifs-1 row whose (outuser, inuser,
    # tx_id) equals this chain's (miduser, inuser, tx_id_mid). Its columns get
    # the suffixes *_to_mid / *_from_in and its fee becomes "fee1". The
    # "class" column kept here is the one attached by the first join.
    motifs_2_cloned= motifs_2_cloned.alias("a").join(motifs_1.alias("b"),(f.col("a.inuser")==f.col("b.inuser"))&(f.col("a.miduser")==f.col("b.outuser"))&(f.col("a.tx_id_mid")==f.col("b.tx_id")),"leftouter")\
    .select("a.class","a.outuser","a.tx_id","a.address_recv_dist_to_out","a.amount_recv_to_out","a.fee2","a.tx_sent_from_mid","a.address_sent_from_mid","a.amount_sent_from_mid","a.miduser","a.tx_id_mid","b.address_recv_dist","b.amount_recv","b.tx_sent","b.address_sent_dist","b.amount_sent","b.fees","a.inuser","a.loop_mid_out","a.loop_in_mid","a.loop_in_out","a.direct_mid_out","a.direct_in_mid","a.direct_in_out")\
    .withColumnRenamed("fees","fee1")\
    .withColumnRenamed("address_recv_dist","address_recv_to_mid")\
    .withColumnRenamed("amount_recv","amount_recv_to_mid")\
    .withColumnRenamed("tx_sent","tx_sent_from_in")\
    .withColumnRenamed("address_sent_dist","address_sent_from_in")\
    .withColumnRenamed("amount_sent","amount_sent_from_in")

    motifs_2_cloned.count()

    # Persist the enriched 2-hop motif table for this batch.
    motifs_2_cloned.write.parquet(path+pathDir+"/motifs2_"+str(j)+".parquet")
